Merge pull request #1 from m-bone/AmbiguousAtomControl

m-bone · web-flow · commit c10281ac8849 · 2021-03-16T10:15:08.000Z
Ambiguous atom control
diff --git a/AtomMapping.py b/AtomMapping.py
@@ -1,58 +1,88 @@
 import os
+import math
 import numpy as np
+from natsort import natsorted
 from sklearn.metrics import mean_absolute_error
 
 from BondDistanceMatrix import bond_distance_matrix
 from MappingFunctions import element_atomID_dict, element_validation, get_atomIDs
 
-# File search constants and user inputs
-DATA_DIR = os.getcwd() + '/Test_Cases'
-PRE_FILE_NAME = 'new_start_molecule.data'
-POST_FILE_NAME = 'new_post_rx1_molecule.data'
-ELEMENT_BY_TYPE = ['H', 'H', 'C', 'C', 'N', 'O', 'O', 'O']
-PRE_BONDING_ATOMS = ['28', '62']
-POST_BONDING_ATOMS = ['32', '15']
-POST_MAJOR_MOVED_ATOMS = ['33']
-POST_MINOR_MOVED_ATOMS = ['16']
-_WEIGHT_COEFF = 0.0
-
-# Get elements of atom IDs for pre and post molecules
-preElements = element_atomID_dict(DATA_DIR, PRE_FILE_NAME, ELEMENT_BY_TYPE)
-postElements = element_atomID_dict(DATA_DIR, POST_FILE_NAME, ELEMENT_BY_TYPE)
-
-# Get atomIDs - using existing function for now 
-preAtomIDs = get_atomIDs(DATA_DIR, PRE_FILE_NAME)
-postAtomIDs = get_atomIDs(DATA_DIR, POST_FILE_NAME)
-
-# Calculate bond distance matrices for pre and post molecule
-preBondDistMat = bond_distance_matrix(DATA_DIR, PRE_FILE_NAME, PRE_BONDING_ATOMS)
-postBondDistMat = bond_distance_matrix(DATA_DIR, POST_FILE_NAME, POST_BONDING_ATOMS) 
-
-# Set value for hydrogen that moves to epoxide ring to zero - this will be automated / user supplied info in the future
-for atom in POST_MAJOR_MOVED_ATOMS:
-    atomIndex = postAtomIDs.index(atom)
-    for index, atomRow in enumerate(postBondDistMat):
-        postBondDistMat[index][atomIndex] = 0.0
-
-# Sample weight matrix - lower weight for atoms with significant movement
-sampleWeights = np.ones(len(postAtomIDs))
-for atom in POST_MINOR_MOVED_ATOMS:
-    atomIndex = postAtomIDs.index(atom)
-    sampleWeights[atomIndex] = _WEIGHT_COEFF
-
-mappedIDList = []
-for searchIndex, searchRow in enumerate(preBondDistMat):
-    # Shortcircuit this search if search atom is a bonding atom
-    if preAtomIDs[searchIndex] in PRE_BONDING_ATOMS:
-        bondingIndex = PRE_BONDING_ATOMS.index(preAtomIDs[searchIndex])
-        bondingPostAtomID = POST_BONDING_ATOMS[bondingIndex]
-        mappedIDList.append([preAtomIDs[searchIndex], bondingPostAtomID])
-
-    else:
-        # Sort search row arrays from smallest to largest
-        searchRowIndex = np.argsort(searchRow)
-        searchRowSorted = np.take_along_axis(searchRow, searchRowIndex, axis=0)
+
+def atom_mapping(DATA_DIR, PRE_FILE_NAME, POST_FILE_NAME, ELEMENT_BY_TYPE, PRE_BONDING_ATOMS, POST_BONDING_ATOMS, POST_MAJOR_MOVED_ATOMS, POST_MINOR_MOVED_ATOMS):
+    _WEIGHT_COEFF = 0.0
+    # Get elements of atom IDs for pre and post molecules
+    preElements = element_atomID_dict(DATA_DIR, PRE_FILE_NAME, ELEMENT_BY_TYPE)
+    postElements = element_atomID_dict(DATA_DIR, POST_FILE_NAME, ELEMENT_BY_TYPE)
+
+    # Get atomIDs
+    preAtomIDs = get_atomIDs(DATA_DIR, PRE_FILE_NAME)
+    postAtomIDs = get_atomIDs(DATA_DIR, POST_FILE_NAME)
+
+    # Calculate bond distance matrices for pre and post molecule
+    preBondDistMat = bond_distance_matrix(DATA_DIR, PRE_FILE_NAME, PRE_BONDING_ATOMS, powerBonds=False)
+    postBondDistMat = bond_distance_matrix(DATA_DIR, POST_FILE_NAME, POST_BONDING_ATOMS, powerBonds=False) 
+
+    # Set value for hydrogen that moves to epoxide ring to zero - this will be automated / user supplied info in the future
+    for atom in POST_MAJOR_MOVED_ATOMS:
+        atomIndex = postAtomIDs.index(atom)
+        for index, _ in enumerate(postBondDistMat):
+            postBondDistMat[index][atomIndex] = 0.0
+
+    # Sample weight matrix - lower weight for atoms with significant movement
+    sampleWeights = np.ones(len(postAtomIDs))
+    for atom in POST_MINOR_MOVED_ATOMS:
+        atomIndex = postAtomIDs.index(atom)
+        sampleWeights[atomIndex] = _WEIGHT_COEFF
+
+    mappedIDList = []
+    mappedPostAtomsIndex = []
+    for searchIndex, searchRow in enumerate(preBondDistMat):
+        # Shortcircuit this search if search atom is a bonding atom
+        if preAtomIDs[searchIndex] in PRE_BONDING_ATOMS:
+            bondingIndex = PRE_BONDING_ATOMS.index(preAtomIDs[searchIndex])
+            bondingPostAtomID = POST_BONDING_ATOMS[bondingIndex]
+            mappedIDList.append([preAtomIDs[searchIndex], bondingPostAtomID])
+
+        else:
+            # Sort search row arrays from smallest to largest
+            searchRowIndex = np.argsort(searchRow)
+            searchRowSorted = np.take_along_axis(searchRow, searchRowIndex, axis=0)
+            
+            distDifference = []
+            for row in postBondDistMat:
+                # Sort row arrays from smallest to largest
+                rowIndex = np.argsort(row)
+                rowSorted = np.take_along_axis(row, rowIndex, axis=0)
+                
+                # Sort sample weight matrix the same as row
+                sampleWeightsSorted = np.take_along_axis(sampleWeights, rowIndex, axis=0)
+
+                # MAE
+                finalVal = mean_absolute_error(searchRowSorted, rowSorted, sample_weight=sampleWeightsSorted)
+
+                # Append - abs to get smallest value closest to zero
+                distDifference.append(abs(finalVal))
+
+            mappedPreAtomID, mappedPostAtomID, postAtomIDIndex = element_validation(preAtomIDs[searchIndex], postAtomIDs, distDifference, preElements, postElements, POST_BONDING_ATOMS)
+
+            mappedIDList.append([mappedPreAtomID, mappedPostAtomID])
+            mappedPostAtomsIndex.append(postAtomIDIndex)
+
+    # Ambiguous Atom Group Processing
+    # Gather all the pairs
+    mappedPostAtomIDs = [val[1] for val in mappedIDList]
+    repeatedPostIDSet = natsorted(set([val for val in mappedPostAtomIDs if mappedPostAtomIDs.count(val) > 1]))
+    repeatedIndexes = [postAtomIDs.index(ID) for ID in repeatedPostIDSet]
+
+    ambiguousGroupPairs = []
+    # Loop through all post atoms to find similar
+    for index in repeatedIndexes:
+        matchArray = postBondDistMat[index]
         
+        # Sort search row arrays from smallest to largest
+        searchRowIndex = np.argsort(matchArray)
+        searchRowSorted = np.take_along_axis(matchArray, searchRowIndex, axis=0)
+            
         distDifference = []
         for row in postBondDistMat:
             # Sort row arrays from smallest to largest
@@ -68,26 +98,23 @@
             # Append - abs to get smallest value closest to zero
             distDifference.append(abs(finalVal))
 
-        mappedPreAtomID, mappedPostAtomID = element_validation(preAtomIDs[searchIndex], postAtomIDs, distDifference, preElements, postElements)
-
-        mappedIDList.append([mappedPreAtomID, mappedPostAtomID])
-
+        # Set repeatedIndex value to nan as it will always be zero
+        distDifference[index] = math.nan
+        _, smallestIndex = min((val, idx) for (idx, val) in enumerate(distDifference))
 
+        ambiguousGroupPairs.append([postAtomIDs[index], postAtomIDs[smallestIndex]])
+        # print(f'Atom {postAtomIDs[index]} is paired to atom {postAtomIDs[smallestIndex]}')
 
-# Print test report
-for mappedPair in mappedIDList:
-    print(f'Atom {mappedPair[0]} is mapped to atom {mappedPair[1]}')
+    # Update mappedIDList based on the ambiguousGroupPairs values
+    # Interestingly, mappedIDList can be updated with the iterator, but ambiguousGroupPairs needs to be deleted with the index value
+    for mappedID in mappedIDList:
+        if mappedID[1] in repeatedPostIDSet: # If mappedPostAtomID is one that is repeated
+            for index, groupPair in enumerate(ambiguousGroupPairs):
+                if groupPair[0] == mappedID[1]: # If groupPair is a matching PostAtomID
+                    mappedID[1] = groupPair[1]
+                    del ambiguousGroupPairs[index]
+                    break
 
-correctPostAtomIDs = [['38'], ['39'], ['35'], ['41', '42'], ['42', '41'], ['32'], ['16'], ['5', '36'], ['36', '5'], ['37'], ['6', '9'], ['4'], ['1', '3'], ['3', '1'], ['9', '6'], ['17', '23'], ['23', '17'], ['15'], ['33', '34'], ['34', '33']]
-totalAtoms = len(correctPostAtomIDs)
-correctAtoms = 0
-incorrectPreAtomsList = []
-for index, atom in enumerate(mappedIDList):
-    if atom[1] in correctPostAtomIDs[index]:
-        correctAtoms += 1
-    else:
-        incorrectPreAtomsList.append(atom[0])
+    return mappedIDList
+# This needs to include a check to make sure it's not updating the atom to an ID already assigned - might be best to have an unassigned-postAtomIDList
 
-print(f'Test Results: Weight coeff is {_WEIGHT_COEFF}')
-print(f'Correct atoms: {correctAtoms}. Accuracy: {round(correctAtoms / totalAtoms * 100, 1)}%')
-print(f'Incorrect premolecule atomIDs: {incorrectPreAtomsList}')
diff --git a/BondDistanceMatrix.py b/BondDistanceMatrix.py
@@ -51,7 +51,7 @@ def get_bond_path(atomList, bonds):
 
     return bondIDList
 
-def calc_path_distance(bondList, bondDict):
+def calc_path_distance(bondList, bondDict, powers):
     # If bondList is empty return zero
     if len(bondList) == 0:
         return 0.0
@@ -60,6 +60,9 @@ def calc_path_distance(bondList, bondDict):
     for bondID in bondList:
         bondDistList.append(bondDict[bondID])
 
+    if powers:
+        bondDistList = [bond ** (index + 1) for index, bond in enumerate(bondDistList)]
+
     bondDistMultiple = reduce((lambda x, y: x * y), bondDistList)
     return bondDistMultiple
 
@@ -139,7 +142,7 @@ def breadth_first_search(graph, start, target):
 
     return path
 
-def bond_distance_matrix(directory, fileName, bondingAtoms):
+def bond_distance_matrix(directory, fileName, bondingAtoms, powerBonds=False):
     os.chdir(directory)
 
     # Load molecule file
@@ -180,7 +183,7 @@ def bond_distance_matrix(directory, fileName, bondingAtoms):
         for otherAtom in atomIDs:
             atomPath = breadth_first_search(moleculeGraph, startAtom, otherAtom)
             bondPath = get_bond_path(atomPath, bonds)
-            pathDistance = calc_path_distance(bondPath, bondLengthDict)
+            pathDistance = calc_path_distance(bondPath, bondLengthDict, powerBonds)
             atomBondDistanceList.append(pathDistance)
         
         totalBondDistanceList.append(atomBondDistanceList)
diff --git a/DetailedTesting.py b/DetailedTesting.py
@@ -0,0 +1,52 @@
+from AtomMapping import atom_mapping
+
+class Reaction:
+    def __init__(self, directory, preFileName, postFileName, elementByType, preBondingAtoms, postBondingAtoms, postMajorMovedAtoms, postMinorMovedAtoms):
+        self.mappedIDList = atom_mapping(directory, preFileName, postFileName, elementByType, preBondingAtoms, postBondingAtoms, postMajorMovedAtoms, postMinorMovedAtoms)
+
+    def test_report(self, correctPostAtomIDs, reactionName):
+        print(f'\n\nReaction: {reactionName}')
+        # Print test report
+        for mappedPair in self.mappedIDList:
+            print(f'Atom {mappedPair[0]} is mapped to atom {mappedPair[1]}')
+
+        
+        totalAtoms = len(correctPostAtomIDs)
+        correctAtoms = 0
+        incorrectPreAtomsList = []
+        for index, atom in enumerate(self.mappedIDList):
+            if atom[1] in correctPostAtomIDs[index]:
+                correctAtoms += 1
+            else:
+                incorrectPreAtomsList.append(atom[0])
+
+        mappedPostAtomsList = [val[1] for val in self.mappedIDList]
+        repeatedPostIDs = [val for val in mappedPostAtomsList if mappedPostAtomsList.count(val) > 1]
+
+        print(f'Total atoms: {totalAtoms}. Correct atoms: {correctAtoms}. Accuracy: {round(correctAtoms / totalAtoms * 100, 1)}%')
+        print(f'Incorrect premolecule atomIDs: {incorrectPreAtomsList}')
+        print(f'Repeated Atoms: {repeatedPostIDs}, Count: {len(repeatedPostIDs)}')
+
+# DGEBA-DETDA
+dgebaDetda = Reaction('/home/matt/Documents/Oct20-Dec20/Bonding_Test/DGEBA_DETDA/Reaction', 'new_start_molecule.data', 'new_post_rx1_molecule.data', ['H', 'H', 'C', 'C', 'N', 'O', 'O', 'O'],
+['28', '62'], ['32', '15'], ['33'], ['16'])
+correctDgebaDetda = [['38'], ['39'], ['35'], ['41', '42'], ['42', '41'], ['32'], ['16'], ['5', '36'], ['36', '5'], ['37'], ['6', '9'], ['4'], ['1', '3'], ['3', '1'], ['9', '6'], ['17', '23'], ['23', '17'], ['15'], ['33', '34'], ['34', '33']]
+dgebaDetda.test_report(correctDgebaDetda, 'DGEBA-DETDA')
+
+# Ethyl Ethanoate
+ethylEthanoate = Reaction('/home/matt/Documents/Oct20-Dec20/Bonding_Test/Ethyl_Ethanoate/Reaction', 'pre-molecule.data', 'post-molecule.data', ['H', 'H', 'C', 'C', 'O', 'O', 'O', 'O'], ['6', '11'], ['7', '2'], [], [])
+correctEthylEthanoate = [['9'], ['8'], ['12', '13', '14'], ['13', '12', '14'], ['14', '12', '13'], ['7'], ['10', '11'], ['11', '10'], ['17', '16'], ['1'], ['2'], ['3', '4', '5'], ['4', '3', '5'], ['5', '3', '4'], ['15'], ['16', '17'], ['6']]
+ethylEthanoate.test_report(correctEthylEthanoate, 'Ethyl Ethanoate')
+
+
+# Nothing reasonable got given 13, too many 14 including some across the molecule boundary
+# 15 given 2 should be impossible for multiple reasons - 15 is O, 2 is C and 2 is a bonding atom
+
+# Validation idea
+# Search for ambiguous groups in the post molecule by comparing post atom to all post atoms
+# I can find this easily and it can confirm if something should be an ambiguous group
+# Could cause issues if the BPDM manages to split two things that should be pairs - this check may find things that the BPDM doesn't
+# Tool would help explain why BPDM works in some cases but less in others
+# I can also predict how many ambiguous groups in my pre and post molecule with this method
+# This could check if I have as many as I expect and it may help identify atoms that have moved - can use ambiguous pairs as a useful tool
+# Can I use ambiguous pairs that don't exist before but do after, and vice versa, to identify moved atoms?
diff --git a/MappingFunctions.py b/MappingFunctions.py
@@ -40,22 +40,24 @@ def element_atomID_dict(directory, fileName, elementsByType):
 
     return elementIDDict
 
-def element_validation(preAtomID, postAtomIDList, differenceList, preElementDict, postElementDict):
+def element_validation(preAtomID, postAtomIDList, differenceList, preElementDict, postElementDict, postBondingAtoms):
     # Make a copy of unchanged differenceList
     originalDifferenceList = differenceList.copy()
 
-    checkElement = 1
-
     # Find lowest difference post atom ID that is the same element as the pre atom ID
+    checkElement = 1
     while checkElement:
         # Find smallest value and corresponding index
         val, idx = min((val, idx) for (idx, val) in enumerate(differenceList))
         # Find the smallest value's index in the original list
         originalIndex = originalDifferenceList.index(val)
 
-        # If elements are the same return the pre and post atom IDs 
-        if preElementDict[preAtomID] == postElementDict[postAtomIDList[originalIndex]]:
-            return preAtomID, postAtomIDList[originalIndex]
+        if postAtomIDList[originalIndex] in postBondingAtoms:
+            # If chosen ID is one of the bondingAtoms, it's wrong so can be removed
+            del differenceList[idx]
+        elif preElementDict[preAtomID] == postElementDict[postAtomIDList[originalIndex]]:
+            # If elements are the same return the pre and post atom IDs
+            return preAtomID, postAtomIDList[originalIndex], originalIndex
         else:
             # If the elements are different delete the smallest value by index and try again
             del differenceList[idx]
diff --git a/test_AtomMapping.py b/test_AtomMapping.py
@@ -0,0 +1,33 @@
+from AtomMapping import atom_mapping
+
+def validation_function(mappedIDList, correctPostAtomIDs):
+    # Calculate accuracy
+    totalAtoms = len(correctPostAtomIDs)
+    correctAtoms = 0
+    incorrectPreAtomsList = []
+    for index, atom in enumerate(mappedIDList):
+        if atom[1] in correctPostAtomIDs[index]:
+            correctAtoms += 1
+        else:
+            incorrectPreAtomsList.append(atom[0])
+
+    accuracy = round(correctAtoms / totalAtoms * 100, 1)
+
+    # Calculate multiple assignment atoms
+    mappedPostAtomsList = [val[1] for val in mappedIDList]
+    repeatedPostIDs = [val for val in mappedPostAtomsList if mappedPostAtomsList.count(val) > 1]
+    countRepeatedPostIDs = len(repeatedPostIDs)
+
+    return accuracy, countRepeatedPostIDs
+
+def test_dgeba_detda():
+    mappedIDList = atom_mapping('/home/matt/Documents/Oct20-Dec20/Bonding_Test/DGEBA_DETDA/Reaction', 'new_start_molecule.data', 'new_post_rx1_molecule.data', ['H', 'H', 'C', 'C', 'N', 'O', 'O', 'O'],
+    ['28', '62'], ['32', '15'], ['33'], ['16'])
+    correctPostAtomIDs = [['38'], ['39'], ['35'], ['41', '42'], ['42', '41'], ['32'], ['16'], ['5', '36'], ['36', '5'], ['37'], ['6', '9'], ['4'], ['1', '3'], ['3', '1'], ['9', '6'], ['17', '23'], ['23', '17'], ['15'], ['33', '34'], ['34', '33']]
+    acc, repeatCount = validation_function(mappedIDList, correctPostAtomIDs)
+
+    # Check accuracy and number of repeated IDs are as expected
+    checkValues = [acc, repeatCount] 
+    expected = [95, 2]
+
+    assert checkValues == expected