diff --git a/.github/workflows/integration-build.yml b/.github/workflows/integration-build.yml index 1f7097dff3..03aaa644d7 100644 --- a/.github/workflows/integration-build.yml +++ b/.github/workflows/integration-build.yml @@ -15,7 +15,7 @@ jobs: timeout-minutes: 95 strategy: matrix: - python-version: [ 3.8, 3.9, '3.10' ] + python-version: [ 3.9, '3.10' ] steps: - name: Checkout branch diff --git a/.github/workflows/publish_pypi.yml b/.github/workflows/publish_pypi.yml index 633f8df53b..ef255c4c9d 100644 --- a/.github/workflows/publish_pypi.yml +++ b/.github/workflows/publish_pypi.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [ 3.8 ] + python-version: [ 3.9 ] steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/unit-build.yml b/.github/workflows/unit-build.yml index 1f58fea64f..d9dafeab2e 100644 --- a/.github/workflows/unit-build.yml +++ b/.github/workflows/unit-build.yml @@ -17,7 +17,7 @@ jobs: timeout-minutes: 15 strategy: matrix: - python-version: [ 3.8, 3.9, '3.10' ] + python-version: [ 3.9, '3.10' ] steps: - uses: actions/checkout@v2 diff --git a/.readthedocs.yml b/.readthedocs.yml index b37d7d4c3c..6c32252236 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -8,7 +8,7 @@ version: 2 build: os: ubuntu-22.04 tools: - python: "3.8" + python: "3.9" # Build documentation in the docs/ directory with Sphinx sphinx: diff --git a/.travis.yml b/.travis.yml index 3c2f5e2b50..b58383f3a3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,6 @@ language: python python: - - "3.8" - "3.9" - "3.10" diff --git a/docs/source/introduction/fedot_features/automation_features.rst b/docs/source/introduction/fedot_features/automation_features.rst index 65e2d28807..fe15cfdfb4 100644 --- a/docs/source/introduction/fedot_features/automation_features.rst +++ b/docs/source/introduction/fedot_features/automation_features.rst @@ -173,6 +173,10 @@ Apart from that there are other options whose names speak for themselves: ``'sta 
import os
from typing import Optional

import numpy as np
from tabpfn import TabPFNClassifier, TabPFNRegressor
from tabpfn_extensions.post_hoc_ensembles.sklearn_interface import AutoTabPFNClassifier, AutoTabPFNRegressor

from fedot.core.data.data import InputData, OutputData
from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import ModelImplementation
from fedot.core.operations.operation_parameters import OperationParameters
from fedot.core.utils import default_fedot_data_dir


class FedotTabPFNImplementation(ModelImplementation):
    """ Common wrapper around TabPFN estimators.

    The base class only prepares the keyword arguments for the estimator;
    concrete subclasses create ``self.model`` from ``self.model_params``.

    :param params: operation parameters; every key except the ones listed in
        ``__operation_params`` is forwarded to the estimator constructor
    """

    # Keys handled by the wrapper/strategy and therefore not forwarded as-is
    # to the estimator constructor ('model_path' is re-added after resolving).
    __operation_params = [
        'enable_categorical',
        'max_samples',
        'max_features',
        'model_path'
    ]

    def __init__(self, params: Optional[OperationParameters] = None):
        super().__init__(params)

        self.model_params = {
            k: v for k, v in self.params.to_dict().items() if k not in self.__operation_params
        }

        # "auto" is resolved to a folder inside the FEDOT data directory,
        # an explicit value is passed through, a missing value is not set at all
        model_path = self.params.get('model_path', None)
        if model_path == "auto":
            self.model_params['model_path'] = os.path.join(default_fedot_data_dir(), 'tabpfn')
        elif model_path is not None:
            self.model_params['model_path'] = model_path

        self.model = None
        self.classes_ = None

    def _prepare_input(self, input_data: InputData) -> InputData:
        """ Return the data view the estimator works with.

        :param input_data: data received from the pipeline
        :return: not-encoded data if ``enable_categorical`` is set, otherwise the data unchanged
        """
        if self.params.get('enable_categorical'):
            return input_data.get_not_encoded_data()
        return input_data

    def fit(self, input_data: InputData):
        """ Fit the wrapped estimator.

        :param input_data: train data
        :return: fitted estimator
        """
        input_data = self._prepare_input(input_data)

        # Take the indices from the very same data object whose features are
        # passed to ``fit`` so that they describe that feature matrix.
        # NOTE(review): the attribute name is the one the original code used for
        # every estimator; confirm the AutoTabPFN wrappers read the same name.
        self.model.categorical_features_indices = input_data.categorical_idx

        self.model.fit(X=input_data.features, y=input_data.target)

        return self.model

    def predict(self, input_data: InputData) -> OutputData:
        """ Predict with the fitted estimator.

        :param input_data: data to predict on
        :return: prediction wrapped into output data
        """
        input_data = self._prepare_input(input_data)
        prediction = self.model.predict(input_data.features)
        return self._convert_to_output(input_data=input_data, predict=prediction)

    def predict_proba(self, input_data: InputData) -> OutputData:
        """ Predict class probabilities with the fitted estimator.

        :param input_data: data to predict on
        :return: probabilities wrapped into output data
        """
        input_data = self._prepare_input(input_data)
        prediction = self.model.predict_proba(input_data.features)
        return self._convert_to_output(input_data=input_data, predict=prediction)


class _FedotTabPFNClassifierBase(FedotTabPFNImplementation):
    """ Shared ``fit`` for classifiers: remembers the target labels seen during training. """

    def fit(self, input_data: InputData):
        """ Store unique target values in ``classes_`` and fit the estimator.

        :param input_data: train data
        :return: fitted estimator
        """
        self.classes_ = np.unique(np.array(input_data.target))
        return super().fit(input_data=input_data)


class FedotTabPFNClassificationImplementation(_FedotTabPFNClassifierBase):
    """ ``TabPFNClassifier`` wrapper. """

    def __init__(self, params: Optional[OperationParameters] = None):
        super().__init__(params)
        self.model = TabPFNClassifier(**self.model_params)


class FedotTabPFNRegressionImplementation(FedotTabPFNImplementation):
    """ ``TabPFNRegressor`` wrapper. """

    def __init__(self, params: Optional[OperationParameters] = None):
        super().__init__(params)
        self.model = TabPFNRegressor(**self.model_params)


class FedotAutoTabPFNClassificationImplementation(_FedotTabPFNClassifierBase):
    """ ``AutoTabPFNClassifier`` wrapper. """

    def __init__(self, params: Optional[OperationParameters] = None):
        super().__init__(params)
        self.model = AutoTabPFNClassifier(**self.model_params)


class FedotAutoTabPFNRegressionImplementation(FedotTabPFNImplementation):
    """ ``AutoTabPFNRegressor`` wrapper. """

    def __init__(self, params: Optional[OperationParameters] = None):
        super().__init__(params)
        self.model = AutoTabPFNRegressor(**self.model_params)
from typing import Optional

from fedot.core.data.data import InputData, OutputData
from fedot.core.operations.evaluation.evaluation_interfaces import EvaluationStrategy
from fedot.core.operations.evaluation.operation_implementations.models.tabpfn import \
    FedotTabPFNClassificationImplementation, FedotTabPFNRegressionImplementation, \
    FedotAutoTabPFNClassificationImplementation, FedotAutoTabPFNRegressionImplementation
from fedot.core.operations.operation_parameters import OperationParameters
from fedot.core.repository.tasks import TaskTypesEnum
from fedot.utilities.random import ImplementationRandomStateHandler


class TabPFNStrategy(EvaluationStrategy):
    """ Base evaluation strategy for TabPFN operations.

    :param operation_type: name of the operation, a key of ``_operations_by_types``
    :param params: operation parameters; ``device``, ``max_samples`` and
        ``max_features`` are read here, the rest is consumed by the implementation
    """

    _operations_by_types = {
        'tabpfn': FedotTabPFNClassificationImplementation,
        'tabpfnreg': FedotTabPFNRegressionImplementation,
        'autotabpfn': FedotAutoTabPFNClassificationImplementation,
        'autotabpfnreg': FedotAutoTabPFNRegressionImplementation,
        # The GPU repository registers these names against the same strategy
        # classes; they differ from the CPU operations only in default params.
        'tabpfn_gpu': FedotTabPFNClassificationImplementation,
        'tabpfnreg_gpu': FedotTabPFNRegressionImplementation,
        'autotabpfn_gpu': FedotAutoTabPFNClassificationImplementation,
        'autotabpfnreg_gpu': FedotAutoTabPFNRegressionImplementation,
    }

    def __init__(self, operation_type: str, params: Optional[OperationParameters] = None):
        self.operation_impl = self._convert_to_operation(operation_type)
        super().__init__(operation_type, params)
        # ``params`` is optional, so it must not be dereferenced unconditionally
        self.device = self._read_param(params, 'device', 'auto')
        self.max_samples = self._read_param(params, 'max_samples', 1000)
        self.max_features = self._read_param(params, 'max_features', 500)

    @staticmethod
    def _read_param(params: Optional[OperationParameters], name: str, default):
        """ Return ``params.get(name, default)`` or ``default`` when no params were given. """
        if params is None:
            return default
        return params.get(name, default)

    def fit(self, train_data: InputData):
        """ Validate the data and fit a new operation implementation.

        :param train_data: data used for operation training
        :return: trained implementation
        :raises ValueError: for time series forecasting tasks or when the data exceeds
            ``max_samples`` / ``max_features``
        """
        # Reject the unsupported task first: the size check below inspects the
        # feature matrix and would otherwise hide this clearer error.
        if train_data.task.task_type == TaskTypesEnum.ts_forecasting:
            raise ValueError('Time series forecasting not supported for TabPFN')

        check_data_size(
            data=train_data,
            device=self.device,
            max_samples=self.max_samples,
            max_features=self.max_features,
        )

        operation_implementation = self.operation_impl(self.params_for_fit)

        with ImplementationRandomStateHandler(implementation=operation_implementation):
            operation_implementation.fit(train_data)

        return operation_implementation

    def predict(self, trained_operation, predict_data: InputData) -> OutputData:
        """ Task-specific subclasses provide the prediction logic. """
        raise NotImplementedError()


class TabPFNClassificationStrategy(TabPFNStrategy):
    """ Strategy for TabPFN classifiers. """

    def predict(self, trained_operation, predict_data: InputData) -> OutputData:
        """ Predict labels or probabilities depending on ``self.output_mode``.

        :param trained_operation: fitted classification implementation
        :param predict_data: data to predict on
        :return: labels for ``'labels'``; probabilities for ``'probs'``, ``'full_probs'``
            and ``'default'`` (for two classes only the second column is kept unless
            ``'full_probs'`` is requested)
        :raises ValueError: if the model was trained on a single class or the
            output mode is unknown
        """
        if self.output_mode == 'labels':
            return trained_operation.predict(predict_data)

        if self.output_mode not in ['probs', 'full_probs', 'default']:
            raise ValueError(f'Output model {self.output_mode} is not supported')

        n_classes = len(trained_operation.classes_)
        # Fail before running inference: the result would be rejected anyway
        if n_classes < 2:
            raise ValueError('Data set contain only 1 target class. Please reformat your data.')

        output = trained_operation.predict_proba(predict_data)
        if (n_classes == 2 and self.output_mode != 'full_probs'
                and len(output.predict.shape) > 1):
            output.predict = output.predict[:, 1]

        return output


class TabPFNRegressionStrategy(TabPFNStrategy):
    """ Strategy for TabPFN regressors. """

    def predict(self, trained_operation, predict_data: InputData) -> OutputData:
        """ Return the prediction of the fitted regression implementation. """
        return trained_operation.predict(predict_data)


def check_data_size(
        data: InputData,
        device: str = "auto",
        max_samples: int = 1000,
        max_features: int = 500,
) -> bool:
    """ Check that the feature matrix fits into the configured limits.

    :param data: data whose ``features`` array is inspected
    :param device: device name, used only in the error message
    :param max_samples: maximal allowed number of rows
    :param max_features: maximal allowed number of columns
    :return: ``True`` if both limits are respected
    :raises ValueError: if the number of samples or features exceeds its limit
    """
    shape = data.features.shape
    n_samples = shape[0]
    # A one-dimensional array has no second axis; treat it as a single feature
    n_features = shape[1] if len(shape) > 1 else 1

    if n_samples > max_samples:
        raise ValueError(
            f"Input data has too many samples ({n_samples}), "
            f"maximum is {max_samples} for device '{device}'"
        )
    if n_features > max_features:
        raise ValueError(
            f"Input data has too many features ({n_features}), "
            f"maximum is {max_features}"
        )
    return True
a/fedot/core/pipelines/tuning/search_space.py b/fedot/core/pipelines/tuning/search_space.py index 816475674a..f6fae6e49b 100644 --- a/fedot/core/pipelines/tuning/search_space.py +++ b/fedot/core/pipelines/tuning/search_space.py @@ -930,6 +930,45 @@ def get_parameters_dict(self): 'hyperopt-dist': hp.choice, 'sampling-scope': [['euclidean', 'manhattan', 'cosine']], 'type': 'categorical'} + }, + 'tabpfn': { + 'n_estimators': { + 'hyperopt-dist': hp.uniformint, + 'sampling-scope': [1, 10], + 'type': 'discrete' + }, + 'softmax_temperature': { + 'hyperopt-dist': hp.uniform, + 'sampling-scope': [0.0, 1.0], + 'type': 'continuous' + }, + 'balance_probabilities': { + 'hyperopt-dist': hp.choice, + 'sampling-scope': [[True, False]], + 'type': 'categorical' + }, + 'average_before_softmax': { + 'hyperopt-dist': hp.choice, + 'sampling-scope': [[True, False]], + 'type': 'categorical' + }, + }, + 'tabpfnreg': { + 'n_estimators': { + 'hyperopt-dist': hp.uniformint, + 'sampling-scope': [1, 10], + 'type': 'discrete' + }, + 'softmax_temperature': { + 'hyperopt-dist': hp.uniform, + 'sampling-scope': [0.0, 1.0], + 'type': 'continuous' + }, + 'average_before_softmax': { + 'hyperopt-dist': hp.choice, + 'sampling-scope': [[True, False]], + 'type': 'categorical' + }, } } diff --git a/fedot/core/repository/data/default_operation_params.json b/fedot/core/repository/data/default_operation_params.json index 2363d5450d..3409e687a7 100644 --- a/fedot/core/repository/data/default_operation_params.json +++ b/fedot/core/repository/data/default_operation_params.json @@ -222,5 +222,111 @@ "max_homology_dimension": 1, "metric": "euclidean", "stride": 1 + }, + "tabpfn": { + "n_jobs": 1, + "n_estimators": 4, + "softmax_temperature": 0.9, + "balance_probabilities": false, + "average_before_softmax": false, + "model_path": "auto", + "device": "cpu", + "ignore_pretraining_limits": false, + "inference_precision": "auto", + "fit_mode": "fit_preprocessors", + "memory_saving_mode": "auto", + "inference_config": 
null, + "enable_categorical": false, + "max_samples": 1000, + "max_features": 500 + }, + "tabpfnreg": { + "n_jobs": 1, + "n_estimators": 8, + "softmax_temperature": 0.9, + "average_before_softmax": false, + "model_path": "auto", + "device": "cpu", + "ignore_pretraining_limits": false, + "inference_precision": "auto", + "fit_mode": "fit_preprocessors", + "memory_saving_mode": "auto", + "inference_config": null, + "enable_categorical": true, + "max_samples": 1000, + "max_features": 500 + }, + "autotabpfn": { + "max_time": 30, + "preset": "default", + "ges_scoring_string": "roc", + "device": "cpu", + "ignore_pretraining_limits": false, + "enable_categorical": true, + "max_samples": 1000, + "max_features": 500 + }, + "autotabpfnreg": { + "max_time": 30, + "preset": "default", + "ges_scoring_string": "mse", + "device": "cpu", + "ignore_pretraining_limits": false, + "enable_categorical": true, + "max_samples": 1000, + "max_features": 500 + }, + "tabpfn_gpu": { + "n_jobs": 1, + "n_estimators": 4, + "softmax_temperature": 0.9, + "balance_probabilities": false, + "average_before_softmax": false, + "model_path": "auto", + "device": "cuda", + "ignore_pretraining_limits": false, + "inference_precision": "auto", + "fit_mode": "fit_preprocessors", + "memory_saving_mode": "auto", + "inference_config": null, + "enable_categorical": false, + "max_samples": 10000, + "max_features": 500 + }, + "tabpfnreg_gpu": { + "n_jobs": 1, + "n_estimators": 8, + "softmax_temperature": 0.9, + "average_before_softmax": false, + "model_path": "auto", + "device": "cuda", + "ignore_pretraining_limits": false, + "inference_precision": "auto", + "fit_mode": "fit_preprocessors", + "memory_saving_mode": "auto", + "inference_config": null, + "enable_categorical": true, + "max_samples": 10000, + "max_features": 500 + }, + "autotabpfn_gpu": { + "max_time": 30, + "preset": "default", + "ges_scoring_string": "roc", + "device": "cuda", + "ignore_pretraining_limits": false, + "enable_categorical": true, + 
"max_samples": 10000, + "max_features": 500 + }, + "autotabpfnreg_gpu": { + "max_time": 30, + "preset": "default", + "ges_scoring_string": "mse", + "device": "cuda", + "ignore_pretraining_limits": false, + "enable_categorical": true, + "max_samples": 10000, + "max_features": 500 } } \ No newline at end of file diff --git a/fedot/core/repository/data/gpu_models_repository.json b/fedot/core/repository/data/gpu_models_repository.json index 6fbd2f129b..75b67515f0 100644 --- a/fedot/core/repository/data/gpu_models_repository.json +++ b/fedot/core/repository/data/gpu_models_repository.json @@ -59,6 +59,50 @@ "cuML", "gpu" ] + }, + "tabpfn_gpu_class": { + "tasks": "[TaskTypesEnum.classification]", + "input_type": "[DataTypesEnum.table]", + "output_type": "[DataTypesEnum.table]", + "accepted_node_types": [ + "any" + ], + "description": "TabPFN implementations for classification problems", + "forbidden_node_types": "[]", + "strategies": [ + "fedot.core.operations.evaluation.tabpfn", + "TabPFNClassificationStrategy" + ], + "tags": [ + "ml", + "neural", + "non_linear", + "tabpfn", + "non-default", + "gpu" + ] + }, + "tabpfn_gpu_regr": { + "tasks": "[TaskTypesEnum.regression]", + "input_type": "[DataTypesEnum.table]", + "output_type": "[DataTypesEnum.table]", + "accepted_node_types": [ + "any" + ], + "description": "TabPFN implementations for regression problems", + "forbidden_node_types": "[]", + "strategies": [ + "fedot.core.operations.evaluation.tabpfn", + "TabPFNRegressionStrategy" + ], + "tags": [ + "ml", + "neural", + "non_linear", + "tabpfn", + "non-default", + "gpu" + ] } }, "operations": { @@ -134,6 +178,26 @@ "cd": { "meta": "rapids_gpu_class", "presets": ["gpu"] + }, + "tabpfn_gpu": { + "meta": "tabpfn_gpu_class", + "presets": ["gpu"], + "tags": ["non_auto"] + }, + "tabpfnreg_gpu": { + "meta": "tabpfn_gpu_regr", + "presets": ["gpu"], + "tags": ["non_auto"] + }, + "autotabpfn_gpu": { + "meta": "tabpfn_gpu_class", + "presets": ["gpu"], + "tags": ["auto"] + }, + 
"autotabpfnreg_gpu": { + "meta": "tabpfn_gpu_regr", + "presets": ["gpu"], + "tags": ["auto"] } } } \ No newline at end of file diff --git a/fedot/core/repository/data/model_repository.json b/fedot/core/repository/data/model_repository.json index e456eb3910..ed886f1949 100644 --- a/fedot/core/repository/data/model_repository.json +++ b/fedot/core/repository/data/model_repository.json @@ -129,6 +129,50 @@ ], "tasks": "[TaskTypesEnum.regression]" }, + "tabpfn_class": { + "accepted_node_types": [ + "any" + ], + "description": "TabPFN implementations for classification problems", + "forbidden_node_types": "[]", + "input_type": "[DataTypesEnum.table]", + "output_type": "[DataTypesEnum.table]", + "strategies": [ + "fedot.core.operations.evaluation.tabpfn", + "TabPFNClassificationStrategy" + ], + "tags": [ + "ml", + "neural", + "non_linear", + "tabpfn", + "non-default", + "cpu" + ], + "tasks": "[TaskTypesEnum.classification]" + }, + "tabpfn_regr": { + "accepted_node_types": [ + "any" + ], + "description": "TabPFN implementations for regression problems", + "forbidden_node_types": "[]", + "input_type": "[DataTypesEnum.table]", + "output_type": "[DataTypesEnum.table]", + "strategies": [ + "fedot.core.operations.evaluation.tabpfn", + "TabPFNRegressionStrategy" + ], + "tags": [ + "ml", + "neural", + "non_linear", + "tabpfn", + "non-default", + "cpu" + ], + "tasks": "[TaskTypesEnum.regression]" + }, "ts_model": { "description": "Implementations of the time series models", "input_type": "[DataTypesEnum.ts]", @@ -513,6 +557,22 @@ "custom_model", "non-default" ] - } + }, + "tabpfn": { + "meta": "tabpfn_class", + "tags": ["non_auto"] + }, + "tabpfnreg": { + "meta": "tabpfn_regr", + "tags": ["non_auto"] + }, + "autotabpfn": { + "meta": "tabpfn_class", + "tags": ["auto"] + }, + "autotabpfnreg": { + "meta": "tabpfn_regr", + "tags": ["auto"] + } } } \ No newline at end of file diff --git a/other_requirements/docs.txt b/other_requirements/docs.txt index bb5e855d75..2bcb8cfd39 100644 --- 
def run_tabpfn(
    model_name: str,
    train_data: pd.DataFrame,
    test_data: pd.DataFrame,
):
    """ Fit a single-node pipeline with the given model and sanity-check its prediction.

    Classification is scored with ROC AUC on predicted labels. Regression is
    scored with MSE, because ROC AUC is not defined for continuous targets.

    :param model_name: name of the operation to put into the pipeline
    :param train_data: data for fitting
    :param test_data: data for prediction and scoring
    """
    # NOTE(review): the annotations say DataFrame, but the objects are used via
    # `.task` / `.target` and fed to Pipeline.fit — presumably FEDOT data objects.
    pipeline = PipelineBuilder().add_node(model_name).build()
    pipeline.fit(train_data)

    assert isinstance(pipeline, Pipeline)

    if train_data.task.task_type == TaskTypesEnum.regression:
        predicted_output = pipeline.predict(test_data)
        expected = np.ravel(test_data.target)
        predicted = np.ravel(predicted_output.predict)

        assert predicted.shape[0] == expected.shape[0]
        assert np.isfinite(mean_squared_error(expected, predicted))
    else:
        predicted_output = pipeline.predict(test_data, output_mode='labels')
        metric = roc_auc(test_data.target, predicted_output.predict)

        assert metric > 0.5


def test_tabpfn_classification_operation():
    """ Every CPU TabPFN classifier fits and beats random guessing on a tiny binary dataset. """
    n_samples = 20
    train_data, test_data = get_classification_data(
        classes_amount=2,
        samples_amount=n_samples,
        features_amount=4,
    )

    model_names = OperationTypesRepository().suitable_operation(
        task_type=TaskTypesEnum.classification, tags=['tabpfn', 'cpu']
    )

    for model_name in model_names:
        run_tabpfn(model_name, train_data, test_data)


def test_tabpfn_regression_operation():
    """ Every CPU TabPFN regressor fits and returns finite predictions on a tiny dataset. """
    n_samples = 20
    data = get_synthetic_regression_data(n_samples=n_samples, n_features=4, random_state=42)
    train_data, test_data = train_test_data_setup(data)

    model_names = OperationTypesRepository().suitable_operation(
        task_type=TaskTypesEnum.regression, tags=['tabpfn', 'cpu']
    )

    for model_name in model_names:
        run_tabpfn(model_name, train_data, test_data)
get_classification_dataset(features_options, - 800, 4, - classes_amount) + x_train, y_train, x_test, y_test = get_classification_dataset( + features_options, + samples_amount=samples_amount, + features_amount=features_amount, + classes_amount=classes_amount + ) y_train = y_train.reshape((-1, 1)) y_test = y_test.reshape((-1, 1))