11### UNMODIFIABLE IMPORT BEGIN ###
22import random
33from pathlib import Path
4+ import pandas as pd
45import numpy as np
56from typing import Tuple
67from fedot .api .main import Fedot
78from fedot .core .data .data import InputData
8- from fedot .core .repository .tasks import Task
9- from fedot .core .repository .tasks import TaskTypesEnum # classification, regression, ts_forecasting.
10- def train_model (train_features : np .ndarray , train_target : np .ndarray ):
11- input_data = InputData .from_numpy (train_features , train_target , task = Task (TaskTypesEnum .classification ))
9+ from fedot .core .repository .tasks import (
10+ Task ,
11+ TaskTypesEnum ,
12+ ) # classification, regression, ts_forecasting.
def train_model(train_features: np.ndarray | pd.DataFrame,
                train_target: np.ndarray | pd.DataFrame | pd.Series):
    """
    Train a FEDOT classification model on the given features/target.

    Args:
        train_features: training features as a pandas DataFrame or numpy array.
        train_target: training target as a pandas DataFrame/Series or numpy array.
            Feature and target containers must be of matching families
            (both pandas, or both numpy).

    Returns:
        The fitted Fedot model (its pipeline is also saved to PIPELINE_PATH).

    Raises:
        ValueError: if the feature/target container types are unsupported or mixed.
        RuntimeError: if model fitting fails for any reason.
    """
    if isinstance(train_features, pd.DataFrame) and isinstance(train_target, (pd.DataFrame, pd.Series)):
        input_data = InputData.from_dataframe(train_features, train_target,
                                              task=Task(TaskTypesEnum.classification))
    elif isinstance(train_features, np.ndarray) and isinstance(train_target, np.ndarray):
        input_data = InputData.from_numpy(train_features, train_target,
                                          task=Task(TaskTypesEnum.classification))
    else:
        # Fixed: added the missing space before "Got:" so the concatenated
        # message fragments do not run together.
        raise ValueError("Unsupported data types for train_features and train_target. "
                         "Expected pandas DataFrame and (DataFrame or Series), "
                         "or numpy ndarray and numpy ndarray. "
                         f"Got: {type(train_features)} and {type(train_target)}")

    model = Fedot(problem=TaskTypesEnum.classification.value,
                  timeout=10,
                  seed=42,
                  cv_folds=3,
                  preset='best_quality',
                  metric='accuracy',
                  n_jobs=1,
                  with_tuning=True,
                  show_progress=True)

    try:
        model.fit(features=input_data)  # this is the training step, after this step variable 'model' will be a trained model
    except Exception as e:
        raise RuntimeError(
            f"Model training failed. Please check your data preprocessing carefully. "
            f"Common issues include: missing values, incorrect data types, feature scaling problems, "
            f"or incompatible target variable format. Original error: {str(e)}"
        ) from e

    # Save the pipeline
    pipeline = model.current_pipeline
    pipeline.save(path=PIPELINE_PATH, create_subdir=False, is_datetime_in_path=False)

    return model
def evaluate_model(model: Fedot,
                   test_features: np.ndarray | pd.DataFrame | pd.Series,
                   test_target: np.ndarray | pd.DataFrame | pd.Series):
    """
    Evaluate a trained Fedot model on a held-out set and return its metrics.

    Args:
        model: a fitted Fedot model.
        test_features: evaluation features (pandas DataFrame or numpy array).
        test_target: evaluation target (pandas DataFrame/Series or numpy array).

    Returns:
        The metrics dict produced by model.get_metrics().

    Raises:
        ValueError: if the feature/target container types are unsupported or mixed.
    """
    if isinstance(test_features, pd.DataFrame) and isinstance(test_target, (pd.DataFrame, pd.Series)):
        input_data = InputData.from_dataframe(test_features, test_target,
                                              task=Task(TaskTypesEnum.classification))
    elif isinstance(test_features, np.ndarray) and isinstance(test_target, np.ndarray):
        input_data = InputData.from_numpy(test_features, test_target,
                                          task=Task(TaskTypesEnum.classification))
    else:
        # Fixed: added the missing space before "Got:".
        raise ValueError("Unsupported data types for test_features and test_target. "
                         "Expected pandas DataFrame and (DataFrame or Series), "
                         "or numpy ndarray and numpy ndarray. "
                         f"Got: {type(test_features)} and {type(test_target)}")
    # The prediction call is made for its side effect: it populates the
    # model's internal prediction state that get_metrics() reads. The
    # returned probabilities themselves are not used here.
    # NOTE(review): metric is 'accuracy' — confirm get_metrics() handles
    # probability outputs correctly; model.predict() may be required instead.
    model.predict_proba(features=input_data)
    # Compute metrics once instead of twice (was: separate calls for
    # printing and returning).
    metrics = model.get_metrics()
    print("Model metrics: ", metrics)
    return metrics
def automl_predict(model: Fedot, features: np.ndarray | pd.DataFrame | pd.Series) -> np.ndarray:
    """
    Produce class-probability predictions for *features* with a trained model.

    Pandas inputs are converted to a numpy array before being wrapped in
    FEDOT's InputData (no target is supplied at prediction time).
    """
    feature_array = features.to_numpy() if isinstance(features, (pd.DataFrame, pd.Series)) else features
    prediction_input = InputData.from_numpy(feature_array, None,
                                            task=Task(TaskTypesEnum.classification))
    predictions = model.predict_proba(features=prediction_input)
    print(f"Predictions shape: {predictions.shape}")
    return predictions
66+
3967### UNMODIFIABLE IMPORT END ###
4068# USER CODE BEGIN IMPORTS #
4169from sklearn .model_selection import train_test_split
4270from sklearn .impute import SimpleImputer
43- import pandas as pd
4471# USER CODE END IMPORTS #
4572
SEED = 42
EVAL_SET_SIZE = 0.2  # 20% of the data for evaluation
### UNMODIFIABLE CODE END ###

# --- Competition-specific input files, all resolved under DATASET_PATH ---
TRAIN_FILE = DATASET_PATH / "train.csv"
TEST_FILE = DATASET_PATH / "test.csv"
SAMPLE_SUBMISSION_FILE = DATASET_PATH / "gender_submission.csv"
90+
# USER CODE BEGIN LOAD_DATA #
def load_data():
    """Load the raw train and test tables from the local CSV files."""
    train_df = pd.read_csv(TRAIN_FILE)
    test_df = pd.read_csv(TEST_FILE)
    return train_df, test_df
# USER CODE END LOAD_DATA #
64100
101+
def transform_data(dataset: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
    """
    Transform a raw dataset into (features, target) arrays for the model.

    Used on both train and test data. Test data may not contain the target
    column(s), in which case the returned target is None.

    Args:
        dataset: raw input table.

    Returns:
        A tuple (features, target) where features is a 2-D numpy array and
        target is a 1-D numpy array or None.
    """
    target_columns = ['Survived']

    # Separating features and target if present
    data = dataset.copy(deep=True)
    # Fixed: operate only on the target columns actually present. The
    # original dropped/selected ALL of target_columns whenever ANY was
    # present, raising KeyError on a partial match.
    present_targets = [col for col in target_columns if col in data.columns]
    if present_targets:
        features = data.drop(columns=present_targets)
        target = data[present_targets].values.ravel()  # ensure target is 1-D
    else:
        features = data
        target = None

    # Impute missing values per column family.
    # NOTE(review): imputers are fit independently on every call, so train
    # and test are imputed with different statistics — confirm intended.
    numeric_cols = features.select_dtypes(include=[np.number]).columns
    categorical_cols = features.select_dtypes(exclude=[np.number]).columns
    if len(numeric_cols) > 0:
        numeric_imputer = SimpleImputer(strategy="mean")
        features[numeric_cols] = numeric_imputer.fit_transform(features[numeric_cols])
    if len(categorical_cols) > 0:
        categorical_imputer = SimpleImputer(strategy="most_frequent")
        features[categorical_cols] = categorical_imputer.fit_transform(
            features[categorical_cols]
        )

    # Drop identifier/free-text columns that are not useful for predictions.
    # Only drop columns that exist, so missing ones never raise.
    columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
    existing_columns_to_drop = [col for col in columns_to_drop if col in features.columns]
    features = features.drop(columns=existing_columns_to_drop)

    return features.values, target
99139
140+
100141# The main function to orchestrate the data loading, feature engineering, model training and model evaluation
def create_model():
    """
    Execute the full ML pipeline: load data, split, train, evaluate, and
    write the submission file.

    Returns:
        The evaluation metrics produced by evaluate_model().
    """
    # USER CODE BEGIN CREATE MODEL #
    train, X_test = load_data()

    # Hold out EVAL_SET_SIZE of the training data for evaluation,
    # stratified on the categorical target to preserve class balance.
    train_data, eval_test_data = train_test_split(
        train, test_size=EVAL_SET_SIZE, random_state=SEED, stratify=train['Survived']
    )

    train_features, train_target = transform_data(train_data)
    eval_test_features, eval_test_target = transform_data(eval_test_data)
    test_features, _ = transform_data(X_test)

    # Train AutoML model (feature engineering + model search).
    model = train_model(train_features, train_target)

    # Evaluate the trained model on the held-out split.
    model_performance = evaluate_model(model, eval_test_features, eval_test_target)

    # automl_predict() returns class probabilities (predict_proba). Fixed:
    # the original did predictions.flatten().astype(int), which truncates
    # every probability < 1.0 to 0 and would submit an almost-all-zero
    # column; threshold at 0.5 to obtain hard 0/1 labels instead.
    predictions: np.ndarray = automl_predict(model, test_features)
    labels = (predictions.flatten() >= 0.5).astype(int)

    # Reuse the already-loaded test table for the ID column instead of
    # re-reading TEST_FILE from disk; keep the ID as integer.
    output = pd.DataFrame({
        'PassengerId': X_test['PassengerId'].astype(int),
        'Survived': labels,
    })

    # USER CODE END CREATE MODEL #
    output.to_csv(SUBMISSION_PATH, index=False)
    return model_performance
134183
184+
135185### UNMODIFIABLE CODE BEGIN ###
def main():
    """
    Main function to execute the ML pipeline.
    """
    # Echo the configured locations before running, to ease debugging.
    print("Files and directories:")
    for name, path in (
        ("Dataset Path", DATASET_PATH),
        ("Workspace Path", WORKSPACE_PATH),
        ("Pipeline Path", PIPELINE_PATH),
        ("Submission Path", SUBMISSION_PATH),
        ("Train File", TRAIN_FILE),
        ("Test File", TEST_FILE),
        ("Sample Submission File", SAMPLE_SUBMISSION_FILE),
    ):
        print(f"{name}: {path}")

    model_performance = create_model()
    print("Model Performance on Test Set:", model_performance)
142-
205+
206+
# Script entry point: run the pipeline only when executed directly.
if __name__ == "__main__":
    main()
145209### UNMODIFIABLE CODE END ###