Skip to content

Commit 82b5f04

Browse files
authored
Refactoring (#24)
* ref: code structure for improved readability and maintainability * ref: update configuration logic * ref: caching, session * ref: delete unused * fix: loader_test PermissionError
1 parent fef48fe commit 82b5f04

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

68 files changed

+1917
-7375
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,5 +200,7 @@ db/
200200
docker/docker_caches/
201201
temp-*
202202

203+
# Ruff
204+
.ruff_cache/
203205

204206

examples/by_datasets/titanic.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@
311311
],
312312
"metadata": {
313313
"kernelspec": {
314-
"display_name": ".venv",
314+
"display_name": "fedotllm",
315315
"language": "python",
316316
"name": "python3"
317317
},
@@ -325,7 +325,7 @@
325325
"name": "python",
326326
"nbconvert_exporter": "python",
327327
"pygments_lexer": "ipython3",
328-
"version": "3.10.16"
328+
"version": "3.11.11"
329329
}
330330
},
331331
"nbformat": 4,

examples/runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import logging
77
from pathlib import Path
88
from fedotllm.main import FedotAI
9-
from fedotllm.output import JupyterOutput
9+
from fedotllm.handlers import JupyterOutput
1010
from fedotllm.llm import AIInference
1111
from examples.kaggle import download_from_kaggle, submit_to_kaggle
1212
from golem.core.dag.graph_utils import graph_structure
Lines changed: 105 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,73 @@
11
### UNMODIFIABLE IMPORT BEGIN ###
22
import random
33
from pathlib import Path
4+
import pandas as pd
45
import numpy as np
56
from typing import Tuple
67
from fedot.api.main import Fedot
78
from fedot.core.data.data import InputData
8-
from fedot.core.repository.tasks import Task
9-
from fedot.core.repository.tasks import TaskTypesEnum # classification, regression, ts_forecasting.
10-
def train_model(train_features: np.ndarray, train_target: np.ndarray):
11-
input_data = InputData.from_numpy(train_features, train_target, task=Task(TaskTypesEnum.classification))
9+
from fedot.core.repository.tasks import (
10+
Task,
11+
TaskTypesEnum,
12+
) # classification, regression, ts_forecasting.
13+
def train_model(train_features: np.ndarray | pd.DataFrame, train_target: np.ndarray | pd.DataFrame | pd.Series):
14+
if isinstance(train_features, pd.DataFrame) and isinstance(train_target, (pd.DataFrame, pd.Series)):
15+
input_data = InputData.from_dataframe(train_features, train_target, task=Task(TaskTypesEnum.classification))
16+
elif isinstance(train_features, np.ndarray) and isinstance(train_target, np.ndarray):
17+
input_data = InputData.from_numpy(train_features, train_target, task=Task(TaskTypesEnum.classification))
18+
else:
19+
raise ValueError("Unsupported data types for train_features and train_target. "
20+
"Expected pandas DataFrame and (DataFrame or Series), or numpy ndarray and numpy ndarray."
21+
f"Got: {type(train_features)} and {type(train_target)}")
22+
1223
model = Fedot(problem=TaskTypesEnum.classification.value,
13-
timeout=60,
24+
timeout=10,
1425
seed=42,
15-
cv_folds=None,
16-
preset='auto',
26+
cv_folds=3,
27+
preset='best_quality',
1728
metric='accuracy',
1829
n_jobs=1,
1930
with_tuning=True,
2031
show_progress=True)
2132

22-
model.fit(features=input_data) # this is the training step, after this step variable ‘model‘ will be a trained model
33+
try:
34+
model.fit(features=input_data) # this is the training step, after this step variable 'model' will be a trained model
35+
except Exception as e:
36+
raise RuntimeError(
37+
f"Model training failed. Please check your data preprocessing carefully. "
38+
f"Common issues include: missing values, incorrect data types, feature scaling problems, "
39+
f"or incompatible target variable format. Original error: {str(e)}"
40+
) from e
2341

2442
# Save the pipeline
2543
pipeline = model.current_pipeline
2644
pipeline.save(path=PIPELINE_PATH, create_subdir=False, is_datetime_in_path=False)
2745

2846
return model
29-
def evaluate_model(model, test_features: np.ndarray, test_target: np.ndarray):
30-
input_data = InputData.from_numpy(test_features, test_target, task=Task(TaskTypesEnum.classification))
31-
y_pred = model.predict(features=input_data)
47+
def evaluate_model(model: Fedot, test_features: np.ndarray | pd.DataFrame | pd.Series, test_target: np.ndarray | pd.DataFrame | pd.Series):
48+
if isinstance(test_features, pd.DataFrame) and isinstance(test_target, (pd.DataFrame, pd.Series)):
49+
input_data = InputData.from_dataframe(test_features, test_target, task=Task(TaskTypesEnum.classification))
50+
elif isinstance(test_features, np.ndarray) and isinstance(test_target, np.ndarray):
51+
input_data = InputData.from_numpy(test_features, test_target, task=Task(TaskTypesEnum.classification))
52+
else:
53+
raise ValueError("Unsupported data types for test_features and test_target. "
54+
"Expected pandas DataFrame and (DataFrame or Series), or numpy ndarray and numpy ndarray."
55+
f"Got: {type(test_features)} and {type(test_target)}")
56+
y_pred = model.predict_proba(features=input_data)
3257
print("Model metrics: ", model.get_metrics())
3358
return model.get_metrics()
34-
def automl_predict(model, features: np.ndarray) -> np.ndarray:
59+
def automl_predict(model: Fedot, features: np.ndarray | pd.DataFrame | pd.Series) -> np.ndarray:
60+
if isinstance(features, (pd.DataFrame, pd.Series)):
61+
features = features.to_numpy()
3562
input_data = InputData.from_numpy(features, None, task=Task(TaskTypesEnum.classification))
36-
predictions = model.predict(features=input_data)
63+
predictions = model.predict_proba(features=input_data)
3764
print(f"Predictions shape: {predictions.shape}")
3865
return predictions
66+
3967
### UNMODIFIABLE IMPORT END ###
4068
# USER CODE BEGIN IMPORTS #
4169
from sklearn.model_selection import train_test_split
4270
from sklearn.impute import SimpleImputer
43-
import pandas as pd
4471
# USER CODE END IMPORTS #
4572

4673
SEED = 42
@@ -55,28 +82,37 @@ def automl_predict(model, features: np.ndarray) -> np.ndarray:
5582
EVAL_SET_SIZE = 0.2 # 20% of the data for evaluation
5683
### UNMODIFIABLE CODE END ###
5784

85+
# --- TODO: Update these paths for your specific competition ---
86+
TRAIN_FILE = DATASET_PATH / "train.csv" # TODO: Replace with your actual filename
87+
TEST_FILE = DATASET_PATH / "test.csv" # TODO: Replace with your actual filename
88+
SAMPLE_SUBMISSION_FILE = DATASET_PATH / "gender_submission.csv" # TODO: Replace with your actual filename or None
89+
90+
5891
# USER CODE BEGIN LOAD_DATA #
5992
def load_data():
60-
train = pd.read_csv(DATASET_PATH / "train.csv")
61-
X_test = pd.read_csv(DATASET_PATH / "test.csv")
93+
# TODO: this function is for loading a dataset from user’s local storage
94+
train = pd.read_csv(TRAIN_FILE)
95+
X_test = pd.read_csv(TEST_FILE)
6296
return train, X_test
97+
98+
6399
# USER CODE END LOAD_DATA #
64100

101+
65102
def transform_data(dataset: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
66103
"""
67104
Function to transform data into a format that can be used for training the model.
68105
Used on both Train and Test data. Test data may initially not contain target columns.
69106
"""
70107

71-
# TODO: Specify target columns
72-
target_columns = ['Survived']
108+
target_columns = ['Survived'] # TODO: Replace with ACTUAL target columns
73109

74110
# Separating features and target if present
75111
data = dataset.copy(deep=True)
76112
has_target = any(col in data.columns for col in target_columns)
77113
if has_target:
78114
features = data.drop(columns=target_columns)
79-
target = data[target_columns].values
115+
target = data[target_columns].values.ravel() # Ensure target is 1D
80116
else:
81117
features = data
82118
target = None
@@ -85,61 +121,89 @@ def transform_data(dataset: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
85121
numeric_cols = features.select_dtypes(include=[np.number]).columns
86122
categorical_cols = features.select_dtypes(exclude=[np.number]).columns
87123
if len(numeric_cols) > 0:
88-
numeric_imputer = SimpleImputer(strategy='mean')
124+
numeric_imputer = SimpleImputer(strategy="mean")
89125
features[numeric_cols] = numeric_imputer.fit_transform(features[numeric_cols])
90126
if len(categorical_cols) > 0:
91-
categorical_imputer = SimpleImputer(strategy='most_frequent')
92-
features[categorical_cols] = categorical_imputer.fit_transform(features[categorical_cols])
127+
categorical_imputer = SimpleImputer(strategy="most_frequent")
128+
features[categorical_cols] = categorical_imputer.fit_transform(
129+
features[categorical_cols]
130+
)
93131

94-
# Drop all columns from features that are not important for predictions. All other dataset transformations are STRICTLY FORBIDDEN.
132+
# TODO: Drop all columns from features that are not important for predictions. All other dataset transformations are STRICTLY FORBIDDEN.
133+
# TODO: Before any operations, make sure to check whether columns you operate on are present in data. Do not raise exceptions.
95134
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
96-
features = features.drop(columns=[col for col in columns_to_drop if col in features.columns])
135+
existing_columns_to_drop = [col for col in columns_to_drop if col in features.columns]
136+
features = features.drop(columns=existing_columns_to_drop)
97137

98138
return features.values, target
99139

140+
100141
# The main function to orchestrate the data loading, feature engineering, model training and model evaluation
101142
def create_model():
102143
"""
103144
Function to execute the ML pipeline.
104145
"""
105146
# USER CODE BEGIN CREATE MODEL #
147+
# TODO: Step 1. Retrieve or load a dataset from hub (if available) or user’s local storage, start path from the DATASET_PATH
106148
train, X_test = load_data()
107-
108-
# Create a train-test split of the data
109-
train_data, eval_test_data = train_test_split(train, test_size=EVAL_SET_SIZE, random_state=SEED, stratify=train['Survived'])
149+
150+
# TODO: Step 2. Create a train-test split of the data by splitting the `dataset` into train_data and test_data.
151+
# Create a train-validation split
152+
# Note: EVAL_SET_SIZE is a constant defined above, corresponding to 20% of the data for evaluation
153+
# Note: You may need to use stratified sampling if the target is categorical
154+
train_data, eval_test_data = train_test_split(
155+
train, test_size=EVAL_SET_SIZE, random_state=SEED, stratify=train['Survived']
156+
) # corresponding to 80%, 20% of `dataset`
110157

111158
train_features, train_target = transform_data(train_data)
112159
eval_test_features, eval_test_target = transform_data(eval_test_data)
113160
test_features, _ = transform_data(X_test)
114161

115-
# Train AutoML model
162+
# TODO: Step 3. Train AutoML model. AutoML performs feature engineering and model training.
116163
model = train_model(train_features, train_target)
117164

118-
# Evaluate the trained model
165+
# TODO: Step 4. evaluate the trained model using the defined "evaluate_model" function model_performance, model_complexity = evaluate_model()
119166
model_performance = evaluate_model(model, eval_test_features, eval_test_target)
120167

121-
# Evaluate predictions for the test dataset using AutoML Framework
122-
predictions = automl_predict(model, test_features)
123-
124-
# Create a DataFrame for output submission
125-
output = pd.DataFrame({
126-
'PassengerId': X_test['PassengerId'],
127-
'Survived': predictions.flatten().astype(int)
128-
})
129-
130-
# USER CODE END CREATE MODEL #
168+
# TODO: Step 5. Evaluate predictions for the test dataset using AutoML Framework
169+
# **YOU MUST USE automl_predict()**
170+
# Prediction result will not have an ID column, only a column for target (or columns for multiple targets)
171+
# If output submission should have an ID column, add it to the prediction.
172+
# If ID column has numeric type, convert it to integer
173+
predictions: np.ndarray = automl_predict(model, test_features) # returns 2D array
174+
test_passenger_ids = pd.read_csv(TEST_FILE)['PassengerId']
175+
output = pd.DataFrame({'PassengerId': test_passenger_ids, 'Survived': predictions.flatten().astype(int)})
131176

177+
# USER CODE END CREATE MODEL #
178+
# If target submission format is not numeric, convert predictions to expected format
179+
# For example: convert probabilities to class labels, apply inverse transformations,
180+
# or map numeric predictions back to categorical labels if needed
132181
output.to_csv(SUBMISSION_PATH, index=False)
133182
return model_performance
134183

184+
135185
### UNMODIFIABLE CODE BEGIN ###
136186
def main():
137-
"""
187+
"""
138188
Main function to execute the ML pipeline.
139189
"""
190+
print("Files and directories:")
191+
paths = {
192+
"Dataset Path": DATASET_PATH,
193+
"Workspace Path": WORKSPACE_PATH,
194+
"Pipeline Path": PIPELINE_PATH,
195+
"Submission Path": SUBMISSION_PATH,
196+
"Train File": TRAIN_FILE,
197+
"Test File": TEST_FILE,
198+
"Sample Submission File": SAMPLE_SUBMISSION_FILE,
199+
}
200+
for name, path in paths.items():
201+
print(f"{name}: {path}")
202+
140203
model_performance = create_model()
141204
print("Model Performance on Test Set:", model_performance)
142-
205+
206+
143207
if __name__ == "__main__":
144208
main()
145209
### UNMODIFIABLE CODE END ###

0 commit comments

Comments
 (0)