aimclub
diff --git a/fedotllm/agents/automl/automl.py b/fedotllm/agents/automl/automl.py
Lines changed: 18 additions & 1 deletion
diff --git a/fedotllm/agents/automl/nodes.py b/fedotllm/agents/automl/nodes.py
Lines changed: 61 additions & 2 deletions
diff --git a/fedotllm/agents/automl/state.py b/fedotllm/agents/automl/state.py
Lines changed: 2 additions & 1 deletion
diff --git a/fedotllm/agents/automl/structured.py b/fedotllm/agents/automl/structured.py
Lines changed: 62 additions & 1 deletion
@@ -9,6 +9,7 @@
     fix_solution,
     generate_automl_config,
     generate_code,
+    generate_rdkit_config,
     generate_report,
     if_bug,
     init_state,
@@ -26,6 +27,8 @@
 INIT_STATE = "init_state"
 PROBLEM_REFLECTION = "problem_reflection"
 GENERATE_AUTOML_CONFIG = "generate_automl_config"
+GENERATE_RDKIT_CONFIG = "generate_rdkit_config"
+
 SELECT_SKELETON = "select_skeleton"
 INSERT_TEMPLATE_FIRST = "insert_templates_first"
 GENERATE_CODE = "generate_code"
@@ -62,6 +65,14 @@ def create_graph(self):
                 dataset=self.dataset,
             ),
         )
+        workflow.add_node(
+            GENERATE_RDKIT_CONFIG,
+            partial(
+                generate_rdkit_config,
+                inference=self.inference,
+                dataset=self.dataset,
+            ),
+        )
         workflow.add_node(
             SELECT_SKELETON,
             partial(
@@ -103,7 +114,13 @@ def create_graph(self):
         workflow.add_edge(START, INIT_STATE)
         workflow.add_edge(INIT_STATE, PROBLEM_REFLECTION)
         workflow.add_edge(PROBLEM_REFLECTION, GENERATE_AUTOML_CONFIG)
-        workflow.add_edge(GENERATE_AUTOML_CONFIG, SELECT_SKELETON)
+        workflow.add_conditional_edges(
+            GENERATE_AUTOML_CONFIG,
+            lambda state: self.config.automl.templates.code == "skeleton-chem.py", #if chemistry
+            {True: GENERATE_RDKIT_CONFIG, False: SELECT_SKELETON},
+        )
+        workflow.add_edge(GENERATE_RDKIT_CONFIG, SELECT_SKELETON)
+
         workflow.add_edge(SELECT_SKELETON, GENERATE_CODE)
         workflow.add_edge(GENERATE_CODE, INSERT_TEMPLATE_FIRST)
         workflow.add_conditional_edges(
 
@@ -11,7 +11,7 @@
 
 from fedotllm import prompts
 from fedotllm.agents.automl.state import AutoMLAgentState
-from fedotllm.agents.automl.structured import FedotConfig
+from fedotllm.agents.automl.structured import FedotConfig, RDKitConfig
 from fedotllm.agents.automl.templates.load_template import (
     load_template,
     render_template,
@@ -32,12 +32,39 @@
     "predict_proba": "predict_proba(features=input_data)",
 }
 
+RDKIT_DESCRIPTORS_MAP = {  # descriptor name -> source text of the expression computing it; values are strings, not callables
+    "MolWt": "Descriptors.MolWt(mol)",  # expressions reference `Descriptors` and `mol`; presumably both are provided by the rendered template - confirm
+    "HeavyAtomMolWt": "Descriptors.HeavyAtomMolWt(mol)",
+    "HeavyAtomCount": "Descriptors.HeavyAtomCount(mol)",
+    "NumAtoms": "mol.GetNumAtoms()",  # the only entry that calls a method on `mol` instead of a `Descriptors` function
+    "NumValenceElectrons": "Descriptors.NumValenceElectrons(mol)",
+    
+    # Lipophilicity/Hydrophobicity
+    "MolLogP": "Descriptors.MolLogP(mol)",
+    "MolMR": "Descriptors.MolMR(mol)",
+    
+    # Hydrogen Bonding
+    "NumHDonors": "Descriptors.NumHDonors(mol)",
+    "NumHAcceptors": "Descriptors.NumHAcceptors(mol)",
+    
+    # Topology and Connectivity
+    "TPSA": "Descriptors.TPSA(mol)",
+    "NumRotatableBonds": "Descriptors.NumRotatableBonds(mol)",
+    "RingCount": "Descriptors.RingCount(mol)",
+    "NumAromaticRings": "Descriptors.NumAromaticRings(mol)",
+    "NumAliphaticRings": "Descriptors.NumAliphaticRings(mol)",
+    "NumSaturatedRings": "Descriptors.NumSaturatedRings(mol)",
+    "NumHeteroatoms": "Descriptors.NumHeteroatoms(mol)",
+    "NumAmideBonds": "Descriptors.NumAmideBonds(mol)"  # NOTE(review): verify this name exists in the installed RDKit version
+}
+
 
 def init_state(state: AutoMLAgentState):
     return Command(
         update={
             "reflection": None,
             "fedot_config": None,
+            "rdkit_config": None,
             "skeleton": None,
             "raw_code": None,
             "code": None,
@@ -83,6 +110,20 @@ def generate_automl_config(
 
     return Command(update={"fedot_config": fedot_config})
 
+def generate_rdkit_config(  # workflow node: obtains an RDKitConfig via `inference` and writes it to state["rdkit_config"]
+    state: AutoMLAgentState, inference: AIInference, dataset: Dataset  # `dataset` is accepted but not used in this body
+):
+    logger.info("Running generate RDKit config")
+
+    rdkit_config = inference.create(
+        prompts.automl.generate_rdkit_configuration_prompt(
+            reflection=state["reflection"],  # the prompt is built from the stored reflection only
+        ),
+        response_model=RDKitConfig,  # presumably makes the response parse into an RDKitConfig - confirm against AIInference.create
+    )
+
+    return Command(update={"rdkit_config": rdkit_config})  # only the "rdkit_config" state key is updated
+
 
 def select_skeleton(
     state: AutoMLAgentState, app_config: AppConfig, dataset: Dataset, workspace: Path
@@ -126,8 +167,13 @@ def insert_templates(
     logger.info("Running insert templates")
     code = state["raw_code"]
     fedot_config = state["fedot_config"]
+    rdkit_config = state["rdkit_config"]
     predict_method = PREDICT_METHOD_MAP.get(fedot_config.predict_method)
 
+    if rdkit_config is not None:
+        rdkit_decriptor_lines  = [f'"{item.value}": {RDKIT_DESCRIPTORS_MAP.get(item.value)}' for item in rdkit_config.descriptors]
+        rdkit_decriptors_code = "\n,".join(rdkit_decriptor_lines)
+
     predictor_init_kwargs = (
         {
             "problem": str(fedot_config.problem),
@@ -164,14 +210,26 @@ def insert_templates(
             },
         }
 
+        if rdkit_config is not None:
+            smiles_to_features_params = {"descriptors": rdkit_decriptors_code}
+            smiles_to_features_template = {
+                app_config.automl.templates.smiles_to_features: {"params": smiles_to_features_params}
+            }
+            templates.update(smiles_to_features_template)
+
         rendered_templates = []
         for template_name, fconfig in templates.items():
             template = load_template(template_name)
             rendered = render_template(template=template, **fconfig["params"])
             rendered_templates.append(rendered)
 
+        line_to_replace = "from automl import train_model, evaluate_model, automl_predict"
+        
+        if rdkit_config is not None:
+            line_to_replace = "from automl import train_model, evaluate_model, automl_predict, smiles_to_features"
+
         code = code.replace(
-            "from automl import train_model, evaluate_model, automl_predict",
+            line_to_replace,
             "\n".join(rendered_templates),
         )
 
@@ -322,6 +380,7 @@ def test_submission_format(args: tuple) -> Observation:
                     msg=f"Submission file has wrong number of columns. Expected: {sample_df.shape[1]}, Got: {submission_df.shape[1]}",
                 )
 
+
             # LLM validation for deeper format checking
             try:
                 submission_sample = submission_df.head(3).to_string(
 
@@ -1,11 +1,12 @@
-from fedotllm.agents.automl.structured import FedotConfig
+from fedotllm.agents.automl.structured import FedotConfig, RDKitConfig
 from fedotllm.agents.base import FedotLLMAgentState
 from fedotllm.enviroments import Observation
 
 
 class AutoMLAgentState(FedotLLMAgentState):
     reflection: str
     fedot_config: FedotConfig
+    rdkit_config: RDKitConfig
     skeleton: str
     raw_code: str | None
     code: str | None
 
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Literal, Optional, Union
+from typing import Literal, Optional, Union, List
 
 from fedot.core.repository.tasks import TaskTypesEnum
 from pydantic import BaseModel, ConfigDict, Field
@@ -91,3 +91,64 @@ class FedotConfig(BaseModel):
         ...,
         description="Method for prediction: predict - for classification and regression, predict_proba - for classification, forecast - for time series forecasting",
     )
+
+# RDKit descriptors
+
+class RDKitDescriptorsEnum(str, Enum):  # closed set of selectable descriptor names; element type of RDKitConfig.descriptors
+    MOLWT = "MolWt"  # each value is looked up as a key in RDKIT_DESCRIPTORS_MAP (nodes.py), so the spellings must stay in sync
+    HEAVYATOMMOLWT = "HeavyAtomMolWt"
+    HEAVYATOMCOUNT = "HeavyAtomCount"
+    NUMATOMS = "NumAtoms"
+    NUMVALENCEELECTRONS = "NumValenceElectrons"
+
+    # Lipophilicity/Hydrophobicity
+    MOLLOGP = "MolLogP"
+    MOLMR = "MolMR"
+
+    # Hydrogen Bonding
+    NUMHDONORS = "NumHDonors"
+    NUMHACCEPTORS = "NumHAcceptors"
+
+    # Topology and Connectivity
+    TPSA = "TPSA"
+    NUMROTATABLEBONDS = "NumRotatableBonds"
+    RINGCOUNT = "RingCount"
+    NUMAROMATICRINGS = "NumAromaticRings"
+    NUMALIPHATICRINGS = "NumAliphaticRings"
+    NUMSATURATEDRINGS = "NumSaturatedRings"
+    NUMHETEROATOMS = "NumHeteroatoms"
+    NUMAMIDEBONDS = "NumAmideBonds"
+
+class RDKitConfig(BaseModel):  # structured model holding the list of RDKit descriptors to compute; used as `response_model` in generate_rdkit_config
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    descriptors: List[RDKitDescriptorsEnum] = Field(  # required field (default is `...`); entries are restricted to RDKitDescriptorsEnum members
+        ..., description=(  # the description below is runtime text attached to the field, not a comment - edit with care
+            """Here's a list of some of the most popular RDKit descriptors to choose from, with short explanations:
+
+--Basic Molecular Properties--
+MolWt -> Molecular Weight. The average molecular weight of the molecule (sum of atomic weights).
+HeavyAtomMolWt -> Heavy Atom Molecular Weight. The molecular weight considering only non-hydrogen atoms.
+HeavyAtomCount -> Number of Heavy Atoms. The count of non-hydrogen atoms in the molecule.
+NumAtoms -> Number of Atoms. The total count of atoms in the molecule (including hydrogens if they are explicitly present).
+NumValenceElectrons -> Number of Valence Electrons. The sum of valence electrons of all atoms in the molecule.
+
+--Lipophilicity/Hydrophobicity--
+MolLogP -> Molecular LogP (octanol-water partition coefficient). A measure of a molecule's lipophilicity, indicating its preference for a lipid (fat) environment over an aqueous (water) environment.
+MolMR -> Molar Refractivity. A measure of the total polarizability of a molecule, related to its volume and electronic properties.
+
+--Hydrogen Bonding--
+NumHDonors -> Number of Hydrogen Bond Donors. The count of atoms capable of donating a hydrogen bond (typically N-H and O-H groups).
+NumHAcceptors -> Number of Hydrogen Bond Acceptors. The count of atoms capable of accepting a hydrogen bond (typically O, N, F atoms with lone pairs).
+
+--Topology and Connectivity--
+TPSA -> Topological Polar Surface Area - The sum of polar surface areas of polar atoms (nitrogen, oxygen, and their attached hydrogens). It's a useful descriptor for predicting drug absorption and blood-brain barrier penetration.
+NumRotatableBonds -> Number of Rotatable Bonds. The count of single bonds between two non-terminal heavy atoms, excluding amide C-N bonds and bonds to terminal acetylenes. This descriptor relates to molecular flexibility.
+RingCount -> Total number of Rings.
+NumAromaticRings -> Number of Aromatic Rings. The count of aromatic ring systems in the molecule.
+NumAliphaticRings -> Number of Aliphatic Rings. The count of non-aromatic (aliphatic) ring systems.
+NumSaturatedRings -> Number of Saturated Rings. The count of fully saturated ring systems.
+NumHeteroatoms -> Number of Heteroatoms. The count of non-carbon and non-hydrogen atoms (e.g., O, N, S, P, halogens).
+NumAmideBonds -> Number of Amide Bonds. The count of amide functional groups."""
+        ),
+    )