-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
40 lines (37 loc) · 1.35 KB
/
main.py
File metadata and controls
40 lines (37 loc) · 1.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from src.functions import *
if __name__ == '__main__':
    # Script entry point: run the preprocessing helpers on the CSV at
    # DATA_PATH, then train and pickle one tree per entry in TARGETS.
    # All helpers come from the star-import of src.functions.

    # --- Locations (relative paths, resolved against the working directory) ---
    DATA_PATH: str = '../data/ny_cab_csv/nyc_taxi_data_2014.csv'
    OUT_PATH: str = '../models/'

    # --- Preprocessing settings ---
    # Forwarded to load_data(); presumably the fraction of rows to sample
    # -- TODO confirm against src.functions.
    SAMPLE: float = 0.03
    DROP_COLUMNS: list = ['store_and_fwd_flag']
    OUTLIER_COLUMNS: list = [
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ]

    # --- Model inputs and outputs ---
    # NOTE(review): several feature names ('start_hour', 'start_minute',
    # 'day_of_week', 'gps_distance') look like they are produced by
    # create_time_features / create_geo_features -- verify in src.functions.
    FEATURES: list = [
        'start_hour',
        'start_minute',
        'passenger_count',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
        'rate_code',
        'day_of_week',
        'gps_distance',
    ]
    TARGETS: list = ['fare_amount', 'trip_duration']

    # --- Split and training settings ---
    # Both ratios are handed to split_data(); presumably the train share and
    # the validation/test share of the remainder -- TODO confirm.
    TRAIN_SIZE: float = 0.6
    VAL_TEST_SPLIT: float = 0.5
    MAX_DEPTH: int = 10

    # Preprocessing pipeline; the call order matches the original script.
    trips = drop_nulls_and_columns(load_data(DATA_PATH, SAMPLE), DROP_COLUMNS)
    trips = remove_outliers(create_time_features(trips), OUTLIER_COLUMNS)
    trips = create_geo_features(trips)

    # Fixed random_state so the split is requested with the same seed each run.
    partitions = split_data(
        trips,
        FEATURES,
        TARGETS,
        TRAIN_SIZE,
        VAL_TEST_SPLIT,
        random_state=42,
        shuffle=True,
    )
    X_train, X_val, X_test, y_train, y_val, y_test = partitions

    # One model per target column, each written to OUT_PATH as tree_<target>.pkl.
    for target in TARGETS:
        fitted = train_tree(X_train, y_train[target], X_val, y_val[target], MAX_DEPTH)
        pickle_model(fitted, f'{OUT_PATH}tree_{target}.pkl')