Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 182 additions & 0 deletions pysmiles/cheminfo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
"""
Some simple cheminformatics tools.
"""
from collections import defaultdict
import numpy as np
import networkx as nx
from .smiles_helper import PTE, remove_explicit_hydrogens

def tanimoto_similarity(binarr1, binarr2):
"""
Compute Tanimoto similarity between two binary arrays.

Parameters
----------
binarr1: np.array
binarr2: np.array

Returns
-------
float
the tanimoto similarity
"""
intersection = np.sum(binarr1 & binarr2)
union = np.sum(binarr1) + np.sum(binarr2) - intersection
return intersection / union if union != 0 else 0.0

def features_to_binary(feature_list, size=1024):
"""
Converte a list of features to a binary array of
given `size`.

Parameters
----------
feature_list: abc.iteratable[int]
a iteratable with integer numbers
size: int
size of the binary array; default=1024

Returns
-------
np.array
"""
fp_array = np.zeros(size, dtype=int)
nonzero = np.array(feature_list, dtype=int) % size
fp_array[nonzero] = 1
Comment on lines +44 to +45
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't quite see how this affects the length of the feature_list

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure I understand the question.

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The goals is to fold feature_list into a binary array of a fixed size, no? Maybe I'm just unclear what feature_list is

return fp_array

def _aggregate_features(graph, cycles=4, label='fp_invariant'):
"""
Given atomic invariants in a molecule stored under the `label`
keyword, iteratively aggregate those features for a given number
of `cycles`. It returns a feature list, which is composed of the
hashes of the aggregated features.
Comment on lines +49 to +53
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also describe a bit about what you do with 'subgraph'


Parameters
----------
graph: networkx.Graph
cycles: int
number of cycles to run; default is 4
label: abc.hashable
the keyword under which the inital invariants are stored

Returns
-------
list[int]
list of the hashes of unique features
"""
feature_list = list(nx.get_node_attributes(graph, label).values())
substructures = set(nx.get_node_attributes(graph, 'subgraph').values())
for idx in range(0, cycles):
for node in graph.nodes:
subgraph = set(graph.nodes[node]['subgraph'])
feat_list_iter = [(idx, graph.nodes[node][label])]
for neigh in graph.neighbors(node):
order = graph.edges[(node, neigh)].get('order', 1)
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'order' needs to be a function argument. Maybe even a list of bond-features to consider. Maybe create initial bond-invariants as well

feat_list_iter.append((order, graph.nodes[neigh][label]))
subgraph = subgraph | graph.nodes[neigh]['subgraph']
graph.nodes[node]['subgraph'] = subgraph
feat_list_iter = sorted(feat_list_iter)
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this correct? Shouldn't the central node be the first one, and then sort only the neighbours? Actually, you now sort by bond order...

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, better idea! Don't make a list of tuples, but a set of tuples. Hashing a set will deal with the fact that the order doesn't matter

feat_tuple_iter = tuple(feat for sub_feat in feat_list_iter for feat in sub_feat)
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wut?

new_hash = hash(feat_tuple_iter)
graph.nodes[node]['fp_invariant'] = new_hash
if frozenset(subgraph) not in substructures:
feature_list.append(new_hash)
substructures.add(frozenset(subgraph))
Comment on lines +81 to +85
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You don't use the substructures, do you?
Instead of just doing new_hash = hash(feat_tuple_iter), first try to lookup the hash for this fragment in your substructures. Substructures should then be a dict of frozenset(subgraph) -> hash

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You do because the hashes will be different depending on which atom you start even if they generate the same substructure

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right. I missed that you append to feature_list here, only if your subgraph is new

return feature_list

def _assign_invariants(graph, node_feats):
"""
Assign inital atom invariants from the features
provided in `node_feats`. Invarients get annotated
under the `fp_invarient` keyword. It also stores all
edges under the keyword `subraph`.
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
edges under the keyword `subraph`.
edges under the keyword `subgraph`.


Parameters
----------
graph: networkx.graph
molecule graph
node_feats: list[abc.hashable]
features used to assgin the invariants
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
features used to assgin the invariants
features used to assign the invariants

"""
for node in graph.nodes:
feat_list = []
for feat in node_feats:
feat_list.append(graph.nodes[node][feat])
_hash = hash(tuple(feat_list))
graph.nodes[node]['fp_invariant'] = _hash
graph.nodes[node]['subgraph'] = frozenset(graph.edges(node))
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would just store node indices. It's going to be an induced subgraph anyway


def _reduced_valency(graph):
"""
Compute the reduced valency for fingerprint calculations.
The reduced valency is given as the valency minus the
hydrogen atoms.
"""
red_valency = defaultdict(int)
for e1, e2, order in graph.edges(data='order'):
red_valency[e1] += order
red_valency[e2] += order
nx.set_node_attributes(graph, red_valency, 'red_valency')
return graph
Comment on lines +110 to +121
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make red_valency and order arguments


def annotate_rings(graph):
"""
Find all nodes that are part of a ring and
set the ring attribute to True otherwise
it is set to False.

Parameters
----------
graph: :class:`networkx.Graph`

Returns
-------
:class:`networkx.Graph`
"""
nx.set_node_attributes(graph, 0, 'ring')
for cycle in nx.cycle_basis(graph):
for node in cycle:
graph.nodes[node]['ring'] = 1
return graph
Comment on lines +123 to +141
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make the attribute name an argument


def _prepare_atomic_molecule(graph):
"""
Set some attributes required for fingerprint calculations.
"""
remove_explicit_hydrogens(graph)
graph = _reduced_valency(graph)
annotate_rings(graph)
for node in graph.nodes:
element = graph.nodes[node]['element']
graph.nodes[node]['atomic_num'] = PTE[element.capitalize()]['AtomicNumber']
graph.nodes[node]['mass'] = PTE[element.capitalize()]['AtomicMass']
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SMILES can contain isotope information

graph.nodes[node]['degree'] = graph.degree(node)

return graph

def extended_connectivity_fp(graph,
node_feats=['red_valency',
'hcount',
'degree',
'atomic_num',
'mass',
'charge']):
"""
Computes an extended connectivity fingerprint from a networkx graph.
Follows the workflow and explanation outliend in:
https://chemicbook.com/2021/03/25/a-beginners-guide-for-understanding-extended-connectivity-fingerprints.html

Parameters
----------
graph: networkx.graph
the molecule graph
node_feats: list[abc.hashable]
features to use for invariant computations

"""
graph = _prepare_atomic_molecule(graph)
_assign_invariants(graph, node_feats)
feature_list = _aggregate_features(graph, cycles=4)
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

iterations needs to be an argument

fp_array = features_to_binary(feature_list)
return fp_array
Loading