Skip to content

Commit 9ee62d0

Browse files
committed
Feat: added RDKit pipeline generation
1 parent 82b5f04 commit 9ee62d0

File tree

14 files changed

+398
-11
lines changed

14 files changed

+398
-11
lines changed

fedotllm/agents/automl/automl.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
fix_solution,
1010
generate_automl_config,
1111
generate_code,
12+
generate_rdkit_config,
1213
generate_report,
1314
if_bug,
1415
init_state,
@@ -26,6 +27,8 @@
2627
INIT_STATE = "init_state"
2728
PROBLEM_REFLECTION = "problem_reflection"
2829
GENERATE_AUTOML_CONFIG = "generate_automl_config"
30+
GENERATE_RDKIT_CONFIG = "generate_rdkit_config"
31+
2932
SELECT_SKELETON = "select_skeleton"
3033
INSERT_TEMPLATE_FIRST = "insert_templates_first"
3134
GENERATE_CODE = "generate_code"
@@ -62,6 +65,14 @@ def create_graph(self):
6265
dataset=self.dataset,
6366
),
6467
)
68+
workflow.add_node(
69+
GENERATE_RDKIT_CONFIG,
70+
partial(
71+
generate_rdkit_config,
72+
inference=self.inference,
73+
dataset=self.dataset,
74+
),
75+
)
6576
workflow.add_node(
6677
SELECT_SKELETON,
6778
partial(
@@ -103,7 +114,13 @@ def create_graph(self):
103114
workflow.add_edge(START, INIT_STATE)
104115
workflow.add_edge(INIT_STATE, PROBLEM_REFLECTION)
105116
workflow.add_edge(PROBLEM_REFLECTION, GENERATE_AUTOML_CONFIG)
106-
workflow.add_edge(GENERATE_AUTOML_CONFIG, SELECT_SKELETON)
117+
workflow.add_conditional_edges(
118+
GENERATE_AUTOML_CONFIG,
119+
lambda state: self.config.automl.templates.code == "skeleton-chem.py", #if chemistry
120+
{True: GENERATE_RDKIT_CONFIG, False: SELECT_SKELETON},
121+
)
122+
workflow.add_edge(GENERATE_RDKIT_CONFIG, SELECT_SKELETON)
123+
107124
workflow.add_edge(SELECT_SKELETON, GENERATE_CODE)
108125
workflow.add_edge(GENERATE_CODE, INSERT_TEMPLATE_FIRST)
109126
workflow.add_conditional_edges(

fedotllm/agents/automl/nodes.py

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
from fedotllm import prompts
1313
from fedotllm.agents.automl.state import AutoMLAgentState
14-
from fedotllm.agents.automl.structured import FedotConfig
14+
from fedotllm.agents.automl.structured import FedotConfig, RDKitConfig
1515
from fedotllm.agents.automl.templates.load_template import (
1616
load_template,
1717
render_template,
@@ -32,12 +32,39 @@
3232
"predict_proba": "predict_proba(features=input_data)",
3333
}
3434

35+
# Maps a descriptor name to the source text of the expression that computes it.
# The values are code snippets (strings), not callables: insert_templates()
# formats each entry as `"<name>": <expression>` and passes the joined text to
# the smiles_to_features template as its "descriptors" parameter.
# NOTE(review): every expression references a variable named `mol` and most
# reference `Descriptors`; presumably the template defines both - confirm.
RDKIT_DESCRIPTORS_MAP = {
    # Basic molecular properties
    "MolWt": "Descriptors.MolWt(mol)",
    "HeavyAtomMolWt": "Descriptors.HeavyAtomMolWt(mol)",
    "HeavyAtomCount": "Descriptors.HeavyAtomCount(mol)",
    "NumAtoms": "mol.GetNumAtoms()",
    "NumValenceElectrons": "Descriptors.NumValenceElectrons(mol)",

    # Lipophilicity/Hydrophobicity
    "MolLogP": "Descriptors.MolLogP(mol)",
    "MolMR": "Descriptors.MolMR(mol)",

    # Hydrogen Bonding
    "NumHDonors": "Descriptors.NumHDonors(mol)",
    "NumHAcceptors": "Descriptors.NumHAcceptors(mol)",

    # Topology and Connectivity
    "TPSA": "Descriptors.TPSA(mol)",
    "NumRotatableBonds": "Descriptors.NumRotatableBonds(mol)",
    "RingCount": "Descriptors.RingCount(mol)",
    "NumAromaticRings": "Descriptors.NumAromaticRings(mol)",
    "NumAliphaticRings": "Descriptors.NumAliphaticRings(mol)",
    "NumSaturatedRings": "Descriptors.NumSaturatedRings(mol)",
    "NumHeteroatoms": "Descriptors.NumHeteroatoms(mol)",
    "NumAmideBonds": "Descriptors.NumAmideBonds(mol)",
}
60+
3561

3662
def init_state(state: AutoMLAgentState):
3763
return Command(
3864
update={
3965
"reflection": None,
4066
"fedot_config": None,
67+
"rdkit_config": None,
4168
"skeleton": None,
4269
"raw_code": None,
4370
"code": None,
@@ -83,6 +110,20 @@ def generate_automl_config(
83110

84111
return Command(update={"fedot_config": fedot_config})
85112

113+
def generate_rdkit_config(
    state: AutoMLAgentState, inference: AIInference, dataset: Dataset
):
    """Request an RDKit descriptor configuration from the inference backend.

    Args:
        state: Agent state; only ``state["reflection"]`` is read.
        inference: Client whose ``create`` method receives the RDKit
            configuration prompt and ``RDKitConfig`` as the response model.
        dataset: Not used by this node. It stays in the signature because the
            graph registers the node with ``partial(..., dataset=...)``.

    Returns:
        A ``Command`` whose update stores the result of ``inference.create``
        under the ``"rdkit_config"`` state key.
    """
    logger.info("Running generate RDKit config")

    rdkit_config = inference.create(
        prompts.automl.generate_rdkit_configuration_prompt(
            reflection=state["reflection"],
        ),
        response_model=RDKitConfig,
    )

    return Command(update={"rdkit_config": rdkit_config})
126+
86127

87128
def select_skeleton(
88129
state: AutoMLAgentState, app_config: AppConfig, dataset: Dataset, workspace: Path
@@ -126,8 +167,13 @@ def insert_templates(
126167
logger.info("Running insert templates")
127168
code = state["raw_code"]
128169
fedot_config = state["fedot_config"]
170+
rdkit_config = state["rdkit_config"]
129171
predict_method = PREDICT_METHOD_MAP.get(fedot_config.predict_method)
130172

173+
if rdkit_config is not None:
174+
rdkit_decriptor_lines = [f'"{item.value}": {RDKIT_DESCRIPTORS_MAP.get(item.value)}' for item in rdkit_config.descriptors]
175+
rdkit_decriptors_code = "\n,".join(rdkit_decriptor_lines)
176+
131177
predictor_init_kwargs = (
132178
{
133179
"problem": str(fedot_config.problem),
@@ -164,14 +210,26 @@ def insert_templates(
164210
},
165211
}
166212

213+
if rdkit_config is not None:
214+
smiles_to_features_params = {"descriptors": rdkit_decriptors_code}
215+
smiles_to_features_template = {
216+
app_config.automl.templates.smiles_to_features: {"params": smiles_to_features_params}
217+
}
218+
templates.update(smiles_to_features_template)
219+
167220
rendered_templates = []
168221
for template_name, fconfig in templates.items():
169222
template = load_template(template_name)
170223
rendered = render_template(template=template, **fconfig["params"])
171224
rendered_templates.append(rendered)
172225

226+
line_to_replace = "from automl import train_model, evaluate_model, automl_predict"
227+
228+
if rdkit_config is not None:
229+
line_to_replace = "from automl import train_model, evaluate_model, automl_predict, smiles_to_features"
230+
173231
code = code.replace(
174-
"from automl import train_model, evaluate_model, automl_predict",
232+
line_to_replace,
175233
"\n".join(rendered_templates),
176234
)
177235

@@ -322,6 +380,7 @@ def test_submission_format(args: tuple) -> Observation:
322380
msg=f"Submission file has wrong number of columns. Expected: {sample_df.shape[1]}, Got: {submission_df.shape[1]}",
323381
)
324382

383+
325384
# LLM validation for deeper format checking
326385
try:
327386
submission_sample = submission_df.head(3).to_string(

fedotllm/agents/automl/state.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
from fedotllm.agents.automl.structured import FedotConfig
1+
from fedotllm.agents.automl.structured import FedotConfig, RDKitConfig
22
from fedotllm.agents.base import FedotLLMAgentState
33
from fedotllm.enviroments import Observation
44

55

66
class AutoMLAgentState(FedotLLMAgentState):
77
reflection: str
88
fedot_config: FedotConfig
9+
rdkit_config: RDKitConfig
910
skeleton: str
1011
raw_code: str | None
1112
code: str | None

fedotllm/agents/automl/structured.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from enum import Enum
2-
from typing import Literal, Optional, Union
2+
from typing import Literal, Optional, Union, List
33

44
from fedot.core.repository.tasks import TaskTypesEnum
55
from pydantic import BaseModel, ConfigDict, Field
@@ -91,3 +91,64 @@ class FedotConfig(BaseModel):
9191
...,
9292
description="Method for prediction: predict - for classification and regression, predict_proba - for classification, forecast - for time series forecasting",
9393
)
94+
95+
# RDKit descriptors
96+
97+
class RDKitDescriptorsEnum(str, Enum):
    """Closed set of RDKit descriptor names accepted by ``RDKitConfig``.

    Each member's value is the descriptor name as a string. Because the enum
    also subclasses ``str``, a member compares equal to its value.
    """

    # Basic molecular properties
    MOLWT = "MolWt"
    HEAVYATOMMOLWT = "HeavyAtomMolWt"
    HEAVYATOMCOUNT = "HeavyAtomCount"
    NUMATOMS = "NumAtoms"
    NUMVALENCEELECTRONS = "NumValenceElectrons"

    # Lipophilicity/Hydrophobicity
    MOLLOGP = "MolLogP"
    MOLMR = "MolMR"

    # Hydrogen Bonding
    NUMHDONORS = "NumHDonors"
    NUMHACCEPTORS = "NumHAcceptors"

    # Topology and Connectivity
    TPSA = "TPSA"
    NUMROTATABLEBONDS = "NumRotatableBonds"
    RINGCOUNT = "RingCount"
    NUMAROMATICRINGS = "NumAromaticRings"
    NUMALIPHATICRINGS = "NumAliphaticRings"
    NUMSATURATEDRINGS = "NumSaturatedRings"
    NUMHETEROATOMS = "NumHeteroatoms"
    NUMAMIDEBONDS = "NumAmideBonds"
121+
122+
class RDKitConfig(BaseModel):
    """Model holding the list of RDKit descriptors selected for a task.

    The ``descriptors`` field description is a catalogue of the allowed
    descriptor names with short explanations.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # The description text is runtime data, so its continuation lines stay
    # flush-left: indenting them would add whitespace to the string itself.
    descriptors: List[RDKitDescriptorsEnum] = Field(
        ...,
        description=(
            """Here's a list of some of the most popular RDKit descriptors to choose from, with short explanations:

--Basic Molecular Properties--
MolWt -> Molecular Weight. The average molecular weight of the molecule (sum of atomic weights).
HeavyAtomMolWt -> Heavy Atom Molecular Weight. The molecular weight considering only non-hydrogen atoms.
HeavyAtomCount -> Number of Heavy Atoms. The count of non-hydrogen atoms in the molecule.
NumAtoms -> Number of Atoms. The total count of atoms in the molecule (including hydrogens if they are explicitly present).
NumValenceElectrons -> Number of Valence Electrons. The sum of valence electrons of all atoms in the molecule.

--Lipophilicity/Hydrophobicity--
MolLogP -> Molecular LogP (octanol-water partition coefficient). A measure of a molecule's lipophilicity, indicating its preference for a lipid (fat) environment over an aqueous (water) environment.
MolMR -> Molar Refractivity. A measure of the total polarizability of a molecule, related to its volume and electronic properties.

--Hydrogen Bonding--
NumHDonors -> Number of Hydrogen Bond Donors. The count of atoms capable of donating a hydrogen bond (typically N-H and O-H groups).
NumHAcceptors -> Number of Hydrogen Bond Acceptors. The count of atoms capable of accepting a hydrogen bond (typically O, N, F atoms with lone pairs).

--Topology and Connectivity--
TPSA -> Topological Polar Surface Area - The sum of polar surface areas of polar atoms (nitrogen, oxygen, and their attached hydrogens). It's a useful descriptor for predicting drug absorption and blood-brain barrier penetration.
NumRotatableBonds -> Number of Rotatable Bonds. The count of single bonds between two non-terminal heavy atoms, excluding amide C-N bonds and bonds to terminal acetylenes. This descriptor relates to molecular flexibility.
RingCount -> Total number of Rings.
NumAromaticRings -> Number of Aromatic Rings. The count of aromatic ring systems in the molecule.
NumAliphaticRings -> Number of Aliphatic Rings. The count of non-aromatic (aliphatic) ring systems.
NumSaturatedRings -> Number of Saturated Rings. The count of fully saturated ring systems.
NumHeteroatoms -> Number of Heteroatoms. The count of non-carbon and non-hydrogen atoms (e.g., O, N, S, P, halogens).
NumAmideBonds -> Number of Amide Bonds. The count of amide functional groups."""
        ),
    )

0 commit comments

Comments
 (0)