Skip to content

Commit 82b5f04

Browse files
authored
Refactoring (#24)
* ref: code structure for improved readability and maintainability * ref: update configuration logic * ref: caching, session * ref: delete unused * fix: loader_test PermissionError
1 parent fef48fe commit 82b5f04

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

68 files changed

+1917
-7375
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,5 +200,7 @@ db/
200200
docker/docker_caches/
201201
temp-*
202202

203+
# Ruff
204+
.ruff_cache/
203205

204206

examples/by_datasets/titanic.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@
311311
],
312312
"metadata": {
313313
"kernelspec": {
314-
"display_name": ".venv",
314+
"display_name": "fedotllm",
315315
"language": "python",
316316
"name": "python3"
317317
},
@@ -325,7 +325,7 @@
325325
"name": "python",
326326
"nbconvert_exporter": "python",
327327
"pygments_lexer": "ipython3",
328-
"version": "3.10.16"
328+
"version": "3.11.11"
329329
}
330330
},
331331
"nbformat": 4,

examples/runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import logging
77
from pathlib import Path
88
from fedotllm.main import FedotAI
9-
from fedotllm.output import JupyterOutput
9+
from fedotllm.handlers import JupyterOutput
1010
from fedotllm.llm import AIInference
1111
from examples.kaggle import download_from_kaggle, submit_to_kaggle
1212
from golem.core.dag.graph_utils import graph_structure
Lines changed: 105 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,73 @@
11
### UNMODIFIABLE IMPORT BEGIN ###
22
import random
33
from pathlib import Path
4+
import pandas as pd
45
import numpy as np
56
from typing import Tuple
67
from fedot.api.main import Fedot
78
from fedot.core.data.data import InputData
8-
from fedot.core.repository.tasks import Task
9-
from fedot.core.repository.tasks import TaskTypesEnum # classification, regression, ts_forecasting.
10-
def train_model(train_features: np.ndarray, train_target: np.ndarray):
11-
input_data = InputData.from_numpy(train_features, train_target, task=Task(TaskTypesEnum.classification))
9+
from fedot.core.repository.tasks import (
10+
Task,
11+
TaskTypesEnum,
12+
) # classification, regression, ts_forecasting.
13+
def train_model(train_features: np.ndarray | pd.DataFrame, train_target: np.ndarray | pd.DataFrame | pd.Series):
14+
if isinstance(train_features, pd.DataFrame) and isinstance(train_target, (pd.DataFrame, pd.Series)):
15+
input_data = InputData.from_dataframe(train_features, train_target, task=Task(TaskTypesEnum.classification))
16+
elif isinstance(train_features, np.ndarray) and isinstance(train_target, np.ndarray):
17+
input_data = InputData.from_numpy(train_features, train_target, task=Task(TaskTypesEnum.classification))
18+
else:
19+
raise ValueError("Unsupported data types for train_features and train_target. "
20+
"Expected pandas DataFrame and (DataFrame or Series), or numpy ndarray and numpy ndarray."
21+
f"Got: {type(train_features)} and {type(train_target)}")
22+
1223
model = Fedot(problem=TaskTypesEnum.classification.value,
13-
timeout=60,
24+
timeout=10,
1425
seed=42,
15-
cv_folds=None,
16-
preset='auto',
26+
cv_folds=3,
27+
preset='best_quality',
1728
metric='accuracy',
1829
n_jobs=1,
1930
with_tuning=True,
2031
show_progress=True)
2132

22-
model.fit(features=input_data) # this is the training step, after this step variable ‘model‘ will be a trained model
33+
try:
34+
model.fit(features=input_data) # this is the training step, after this step variable 'model' will be a trained model
35+
except Exception as e:
36+
raise RuntimeError(
37+
f"Model training failed. Please check your data preprocessing carefully. "
38+
f"Common issues include: missing values, incorrect data types, feature scaling problems, "
39+
f"or incompatible target variable format. Original error: {str(e)}"
40+
) from e
2341

2442
# Save the pipeline
2543
pipeline = model.current_pipeline
2644
pipeline.save(path=PIPELINE_PATH, create_subdir=False, is_datetime_in_path=False)
2745

2846
return model
29-
def evaluate_model(model, test_features: np.ndarray, test_target: np.ndarray):
30-
input_data = InputData.from_numpy(test_features, test_target, task=Task(TaskTypesEnum.classification))
31-
y_pred = model.predict(features=input_data)
47+
def evaluate_model(model: Fedot, test_features: np.ndarray | pd.DataFrame | pd.Series, test_target: np.ndarray | pd.DataFrame | pd.Series):
48+
if isinstance(test_features, pd.DataFrame) and isinstance(test_target, (pd.DataFrame, pd.Series)):
49+
input_data = InputData.from_dataframe(test_features, test_target, task=Task(TaskTypesEnum.classification))
50+
elif isinstance(test_features, np.ndarray) and isinstance(test_target, np.ndarray):
51+
input_data = InputData.from_numpy(test_features, test_target, task=Task(TaskTypesEnum.classification))
52+
else:
53+
raise ValueError("Unsupported data types for test_features and test_target. "
54+
"Expected pandas DataFrame and (DataFrame or Series), or numpy ndarray and numpy ndarray."
55+
f"Got: {type(test_features)} and {type(test_target)}")
56+
y_pred = model.predict_proba(features=input_data)
3257
print("Model metrics: ", model.get_metrics())
3358
return model.get_metrics()
34-
def automl_predict(model, features: np.ndarray) -> np.ndarray:
59+
def automl_predict(model: Fedot, features: np.ndarray | pd.DataFrame | pd.Series) -> np.ndarray:
60+
if isinstance(features, (pd.DataFrame, pd.Series)):
61+
features = features.to_numpy()
3562
input_data = InputData.from_numpy(features, None, task=Task(TaskTypesEnum.classification))
36-
predictions = model.predict(features=input_data)
63+
predictions = model.predict_proba(features=input_data)
3764
print(f"Predictions shape: {predictions.shape}")
3865
return predictions
66+
3967
### UNMODIFIABLE IMPORT END ###
4068
# USER CODE BEGIN IMPORTS #
4169
from sklearn.model_selection import train_test_split
4270
from sklearn.impute import SimpleImputer
43-
import pandas as pd
4471
# USER CODE END IMPORTS #
4572

4673
SEED = 42
@@ -55,28 +82,37 @@ def automl_predict(model, features: np.ndarray) -> np.ndarray:
5582
EVAL_SET_SIZE = 0.2 # 20% of the data for evaluation
5683
### UNMODIFIABLE CODE END ###
5784

85+
# --- TODO: Update these paths for your specific competition ---
86+
TRAIN_FILE = DATASET_PATH / "train.csv" # TODO: Replace with your actual filename
87+
TEST_FILE = DATASET_PATH / "test.csv" # TODO: Replace with your actual filename
88+
SAMPLE_SUBMISSION_FILE = DATASET_PATH / "gender_submission.csv" # TODO: Replace with your actual filename or None
89+
90+
5891
# USER CODE BEGIN LOAD_DATA #
5992
def load_data():
60-
train = pd.read_csv(DATASET_PATH / "train.csv")
61-
X_test = pd.read_csv(DATASET_PATH / "test.csv")
93+
# TODO: this function is for loading a dataset from user’s local storage
94+
train = pd.read_csv(TRAIN_FILE)
95+
X_test = pd.read_csv(TEST_FILE)
6296
return train, X_test
97+
98+
6399
# USER CODE END LOAD_DATA #
64100

101+
65102
def transform_data(dataset: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
66103
"""
67104
Function to transform data into a format that can be used for training the model.
68105
Used on both Train and Test data. Test data may initially not contain target columns.
69106
"""
70107

71-
# TODO: Specify target columns
72-
target_columns = ['Survived']
108+
target_columns = ['Survived'] # TODO: Replace with ACTUAL target columns
73109

74110
# Separating features and target if present
75111
data = dataset.copy(deep=True)
76112
has_target = any(col in data.columns for col in target_columns)
77113
if has_target:
78114
features = data.drop(columns=target_columns)
79-
target = data[target_columns].values
115+
target = data[target_columns].values.ravel() # Ensure target is 1D
80116
else:
81117
features = data
82118
target = None
@@ -85,61 +121,89 @@ def transform_data(dataset: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
85121
numeric_cols = features.select_dtypes(include=[np.number]).columns
86122
categorical_cols = features.select_dtypes(exclude=[np.number]).columns
87123
if len(numeric_cols) > 0:
88-
numeric_imputer = SimpleImputer(strategy='mean')
124+
numeric_imputer = SimpleImputer(strategy="mean")
89125
features[numeric_cols] = numeric_imputer.fit_transform(features[numeric_cols])
90126
if len(categorical_cols) > 0:
91-
categorical_imputer = SimpleImputer(strategy='most_frequent')
92-
features[categorical_cols] = categorical_imputer.fit_transform(features[categorical_cols])
127+
categorical_imputer = SimpleImputer(strategy="most_frequent")
128+
features[categorical_cols] = categorical_imputer.fit_transform(
129+
features[categorical_cols]
130+
)
93131

94-
# Drop all columns from features that are not important for predictions. All other dataset transformations are STRICTLY FORBIDDEN.
132+
# TODO: Drop all columns from features that are not important for predictions. All other dataset transformations are STRICTLY FORBIDDEN.
133+
# TODO: Before any operations, make sure to check whether columns you operate on are present in data. Do not raise exceptions.
95134
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
96-
features = features.drop(columns=[col for col in columns_to_drop if col in features.columns])
135+
existing_columns_to_drop = [col for col in columns_to_drop if col in features.columns]
136+
features = features.drop(columns=existing_columns_to_drop)
97137

98138
return features.values, target
99139

140+
100141
# The main function to orchestrate the data loading, feature engineering, model training and model evaluation
101142
def create_model():
102143
"""
103144
Function to execute the ML pipeline.
104145
"""
105146
# USER CODE BEGIN CREATE MODEL #
147+
# TODO: Step 1. Retrieve or load a dataset from hub (if available) or user’s local storage, start path from the DATASET_PATH
106148
train, X_test = load_data()
107-
108-
# Create a train-test split of the data
109-
train_data, eval_test_data = train_test_split(train, test_size=EVAL_SET_SIZE, random_state=SEED, stratify=train['Survived'])
149+
150+
# TODO: Step 2. Create a train-test split of the data by splitting the `dataset` into train_data and test_data.
151+
# Create a train-validation split
152+
# Note: EVAL_SET_SIZE is a constant defined above, corresponding to 20% of the data for evaluation
153+
# Note: You may need to use stratified sampling if the target is categorical
154+
train_data, eval_test_data = train_test_split(
155+
train, test_size=EVAL_SET_SIZE, random_state=SEED, stratify=train['Survived']
156+
) # corresponding to 80%, 20% of `dataset`
110157

111158
train_features, train_target = transform_data(train_data)
112159
eval_test_features, eval_test_target = transform_data(eval_test_data)
113160
test_features, _ = transform_data(X_test)
114161

115-
# Train AutoML model
162+
# TODO: Step 3. Train AutoML model. AutoML performs feature engineering and model training.
116163
model = train_model(train_features, train_target)
117164

118-
# Evaluate the trained model
165+
# TODO: Step 4. evaluate the trained model using the defined "evaluate_model" function model_performance, model_complexity = evaluate_model()
119166
model_performance = evaluate_model(model, eval_test_features, eval_test_target)
120167

121-
# Evaluate predictions for the test dataset using AutoML Framework
122-
predictions = automl_predict(model, test_features)
123-
124-
# Create a DataFrame for output submission
125-
output = pd.DataFrame({
126-
'PassengerId': X_test['PassengerId'],
127-
'Survived': predictions.flatten().astype(int)
128-
})
129-
130-
# USER CODE END CREATE MODEL #
168+
# TODO: Step 5. Evaluate predictions for the test dataset using AutoML Framework
169+
# **YOU MUST USE automl_predict()**
170+
# Prediction result will not have an ID column, only a column for target (or columns for multiple targets)
171+
# If output submission should have an ID column, add it to the prediction.
172+
# If ID column has numeric type, convert it to integer
173+
predictions: np.ndarray = automl_predict(model, test_features) # returns 2D array
174+
test_passenger_ids = pd.read_csv(TEST_FILE)['PassengerId']
175+
output = pd.DataFrame({'PassengerId': test_passenger_ids, 'Survived': predictions.flatten().astype(int)})
131176

177+
# USER CODE END CREATE MODEL #
178+
# If target submission format is not numeric, convert predictions to expected format
179+
# For example: convert probabilities to class labels, apply inverse transformations,
180+
# or map numeric predictions back to categorical labels if needed
132181
output.to_csv(SUBMISSION_PATH, index=False)
133182
return model_performance
134183

184+
135185
### UNMODIFIABLE CODE BEGIN ###
136186
def main():
137-
"""
187+
"""
138188
Main function to execute the ML pipeline.
139189
"""
190+
print("Files and directories:")
191+
paths = {
192+
"Dataset Path": DATASET_PATH,
193+
"Workspace Path": WORKSPACE_PATH,
194+
"Pipeline Path": PIPELINE_PATH,
195+
"Submission Path": SUBMISSION_PATH,
196+
"Train File": TRAIN_FILE,
197+
"Test File": TEST_FILE,
198+
"Sample Submission File": SAMPLE_SUBMISSION_FILE,
199+
}
200+
for name, path in paths.items():
201+
print(f"{name}: {path}")
202+
140203
model_performance = create_model()
141204
print("Model Performance on Test Set:", model_performance)
142-
205+
206+
143207
if __name__ == "__main__":
144208
main()
145209
### UNMODIFIABLE CODE END ###

0 commit comments

Comments
 (0)