1818import warnings
1919import gzip
2020import tarfile
21+ import h5py
22+ from scipy .special import logsumexp
2123
2224d = {
2325 type_definitions .gromos_atp .name [i ]: type_definitions .gromos_atp .rc_c12 [i ]
@@ -47,11 +49,29 @@ def read_mat(name, protein_ref_indices, args, cumulative=False):
4749 if args .tar :
4850 with tarfile .open (args .histo , "r:*" ) as tar :
4951 ref_df = pd .read_csv (tar .extractfile (name ), header = None , sep = "\s+" , usecols = [0 , * protein_ref_indices ])
52+ ref_df_columns = ["distance" , * [str (x ) for x in protein_ref_indices ]]
53+ ref_df .columns = ref_df_columns
54+ ref_df .set_index ("distance" , inplace = True )
5055 else :
51- ref_df = pd .read_csv (f"{ path_prefix } /{ name } " , header = None , sep = "\s+" , usecols = [0 , * protein_ref_indices ])
52- ref_df_columns = ["distance" , * [str (x ) for x in protein_ref_indices ]]
53- ref_df .columns = ref_df_columns
54- ref_df .set_index ("distance" , inplace = True )
56+ if not args .h5 :
57+ ref_df = pd .read_csv (f"{ path_prefix } /{ name } " , header = None , sep = "\s+" , usecols = [0 , * protein_ref_indices ])
58+ ref_df_columns = ["distance" , * [str (x ) for x in protein_ref_indices ]]
59+ ref_df .columns = ref_df_columns
60+ ref_df .set_index ("distance" , inplace = True )
61+ else :
62+ with h5py .File (f"{ path_prefix } /{ name } " , "r" ) as f :
63+ if "density" not in f :
64+ raise KeyError (f"Dataset 'density' not found in { name } " )
65+
66+ data = f ["density" ][:] # Read full dataset
67+ # Extract the first column (distance) and the relevant protein_ref_indices columns
68+ distances = data [:, 0 ] # First column is distance
69+ protein_data = data [:, protein_ref_indices ] # Select the relevant protein reference indices
70+ # Create a DataFrame
71+ ref_df = pd .DataFrame (protein_data , columns = [str (i ) for i in protein_ref_indices ])
72+ ref_df ["distance" ] = distances
73+ # Set 'distance' as the index
74+ ref_df .set_index ("distance" , inplace = True )
5575
5676 return ref_df
5777
@@ -100,6 +120,7 @@ def run_mat_(arguments):
100120 # We do not consider old histograms
101121 frac_target_list = [x for x in frac_target_list if x [0 ] != "#" and x [- 1 ] != "#" ]
102122 for i , ref_f in enumerate (frac_target_list ):
123+ print (f"\r Progress: { ref_f } " , end = "" , flush = True )
103124 results_df = pd .DataFrame ()
104125 ai = ref_f .split ("." )[- 2 ].split ("_" )[- 1 ]
105126
@@ -148,6 +169,7 @@ def run_mat_(arguments):
148169 if not results_df .empty :
149170 df = pd .concat ([df , results_df ])
150171
172+ print ("done." )
151173 df .fillna (0 ).infer_objects (copy = False )
152174 out_path = f"mat_{ process .pid } _t{ time .time ()} .part"
153175 df .to_csv (out_path , index = False )
@@ -415,7 +437,10 @@ def c12_avg(values, weights):
415437 w = w [r [0 ][0 ]:w .size ]
416438 # fmt: on
417439 res = np .maximum (cutoff / 4.5 , 0.1 )
418- exp_aver = (1.0 / res ) / np .log (np .sum (w * np .exp (1.0 / v / res )) / norm )
440+ log_exp_sum = logsumexp (1.0 / v / res , b = w ) - np .log (norm )
441+ exp_aver = (1.0 / res ) / log_exp_sum
442+ if exp_aver < 0.01 :
443+ exp_aver = 0
419444
420445 return exp_aver
421446
@@ -549,9 +574,12 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
549574 )
550575 if args .tar :
551576 with tarfile .open (args .histo , "r:*" ) as tar :
552- target_list = [x .name for x in tar .getmembers () if prefix in x .name ]
577+ target_list = [x .name for x in tar .getmembers () if prefix in x .name and x . name . endswith ( ".dat" ) ]
553578 else :
554- target_list = [x for x in os .listdir (args .histo ) if prefix in x ]
579+ if not args .h5 :
580+ target_list = [x for x in os .listdir (args .histo ) if prefix in x and x .endswith (".dat" )]
581+ else :
582+ target_list = [x for x in os .listdir (args .histo ) if prefix in x and x .endswith (".h5" )]
555583
556584 protein_mego_i = topology_mego .molecules [list (topology_mego .molecules .keys ())[mol_i - 1 ]][0 ]
557585 protein_mego_j = topology_mego .molecules [list (topology_mego .molecules .keys ())[mol_j - 1 ]][0 ]
@@ -714,7 +742,10 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
714742
715743 dict_m_m_r = {}
716744 for target in target_list :
717- target_fields = target .replace (".dat" , "" ).split ("_" )
745+ if not args .h5 or args .tar :
746+ target_fields = target .replace (".dat" , "" ).split ("_" )
747+ else :
748+ target_fields = target .replace (".h5" , "" ).split ("_" )
718749 mi = int (target_fields [- 4 ])
719750 mj = int (target_fields [- 3 ])
720751 ai = int (target_fields [- 1 ])
@@ -740,7 +771,6 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
740771 df ["c12dist" ] = 0.0
741772 df ["p" ] = 0.0
742773 df ["cutoff" ] = [c12_cutoff [i , j ] for i in range (len (protein_ref_indices_i )) for j in range (len (protein_ref_indices_j ))]
743-
744774 else :
745775 if not args .residue :
746776 chunks = np .array_split (target_list , args .num_threads )
@@ -877,6 +907,11 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
877907 action = "store_true" ,
878908 help = "Read from tar file instead of directory" ,
879909 )
910+ parser .add_argument (
911+ "--h5" ,
912+ action = "store_true" ,
913+ help = "Read from h5 file instead of text (.dat)" ,
914+ )
880915 parser .add_argument (
881916 "--custom_c12" ,
882917 type = str ,
@@ -914,6 +949,10 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
914949 print (f"The path '{ args .histo } ' is not a tar file." )
915950 sys .exit ()
916951
952+ if args .tar and args .h5 :
953+ printf ("cannot use --tar and --h5, chose one." )
954+ sys .exit ()
955+
917956 # Sets mode
918957 modes = np .array (args .mode .split ("+" ), dtype = str )
919958 modes_possible = np .array (["intra" , "same" , "cross" ])
0 commit comments