Skip to content

Commit 4fd8017

Browse files
make_mat.py with h5 input
1 parent 48cf506 commit 4fd8017

File tree

3 files changed

+53
-10
lines changed

3 files changed

+53
-10
lines changed

conda/environment.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,5 @@ dependencies:
99
- parmed
1010
- gitpython
1111
- pyyaml
12+
- scipy
13+
- h5py

requirements.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,6 @@ ParmEd>=3.4.3
22
numpy>=1.26.2
33
pandas>=2.1.3
44
pyyaml>=6.0.1
5-
gitpython>=3.1.43
5+
gitpython>=3.1.43
6+
h5py>=3.11.0
7+
scipy>=1.13.0

tools/make_mat/make_mat.py

Lines changed: 48 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
import warnings
1919
import gzip
2020
import tarfile
21+
import h5py
22+
from scipy.special import logsumexp
2123

2224
d = {
2325
type_definitions.gromos_atp.name[i]: type_definitions.gromos_atp.rc_c12[i]
@@ -47,11 +49,29 @@ def read_mat(name, protein_ref_indices, args, cumulative=False):
4749
if args.tar:
4850
with tarfile.open(args.histo, "r:*") as tar:
4951
ref_df = pd.read_csv(tar.extractfile(name), header=None, sep="\s+", usecols=[0, *protein_ref_indices])
52+
ref_df_columns = ["distance", *[str(x) for x in protein_ref_indices]]
53+
ref_df.columns = ref_df_columns
54+
ref_df.set_index("distance", inplace=True)
5055
else:
51-
ref_df = pd.read_csv(f"{path_prefix}/{name}", header=None, sep="\s+", usecols=[0, *protein_ref_indices])
52-
ref_df_columns = ["distance", *[str(x) for x in protein_ref_indices]]
53-
ref_df.columns = ref_df_columns
54-
ref_df.set_index("distance", inplace=True)
56+
if not args.h5:
57+
ref_df = pd.read_csv(f"{path_prefix}/{name}", header=None, sep="\s+", usecols=[0, *protein_ref_indices])
58+
ref_df_columns = ["distance", *[str(x) for x in protein_ref_indices]]
59+
ref_df.columns = ref_df_columns
60+
ref_df.set_index("distance", inplace=True)
61+
else:
62+
with h5py.File(f"{path_prefix}/{name}", "r") as f:
63+
if "density" not in f:
64+
raise KeyError(f"Dataset 'density' not found in {name}")
65+
66+
data = f["density"][:] # Read full dataset
67+
# Extract the first column (distance) and the relevant protein_ref_indices columns
68+
distances = data[:, 0] # First column is distance
69+
protein_data = data[:, protein_ref_indices] # Select the relevant protein reference indices
70+
# Create a DataFrame
71+
ref_df = pd.DataFrame(protein_data, columns=[str(i) for i in protein_ref_indices])
72+
ref_df["distance"] = distances
73+
# Set 'distance' as the index
74+
ref_df.set_index("distance", inplace=True)
5575

5676
return ref_df
5777

@@ -100,6 +120,7 @@ def run_mat_(arguments):
100120
# We do not consider old histograms
101121
frac_target_list = [x for x in frac_target_list if x[0] != "#" and x[-1] != "#"]
102122
for i, ref_f in enumerate(frac_target_list):
123+
print(f"\rProgress: {ref_f} ", end="", flush=True)
103124
results_df = pd.DataFrame()
104125
ai = ref_f.split(".")[-2].split("_")[-1]
105126

@@ -148,6 +169,7 @@ def run_mat_(arguments):
148169
if not results_df.empty:
149170
df = pd.concat([df, results_df])
150171

172+
print("done.")
151173
df.fillna(0).infer_objects(copy=False)
152174
out_path = f"mat_{process.pid}_t{time.time()}.part"
153175
df.to_csv(out_path, index=False)
@@ -415,7 +437,10 @@ def c12_avg(values, weights):
415437
w = w[r[0][0]:w.size]
416438
# fmt: on
417439
res = np.maximum(cutoff / 4.5, 0.1)
418-
exp_aver = (1.0 / res) / np.log(np.sum(w * np.exp(1.0 / v / res)) / norm)
440+
log_exp_sum = logsumexp(1.0 / v / res, b=w) - np.log(norm)
441+
exp_aver = (1.0 / res) / log_exp_sum
442+
if exp_aver < 0.01:
443+
exp_aver = 0
419444

420445
return exp_aver
421446

@@ -549,9 +574,12 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
549574
)
550575
if args.tar:
551576
with tarfile.open(args.histo, "r:*") as tar:
552-
target_list = [x.name for x in tar.getmembers() if prefix in x.name]
577+
target_list = [x.name for x in tar.getmembers() if prefix in x.name and x.name.endswith(".dat")]
553578
else:
554-
target_list = [x for x in os.listdir(args.histo) if prefix in x]
579+
if not args.h5:
580+
target_list = [x for x in os.listdir(args.histo) if prefix in x and x.endswith(".dat")]
581+
else:
582+
target_list = [x for x in os.listdir(args.histo) if prefix in x and x.endswith(".h5")]
555583

556584
protein_mego_i = topology_mego.molecules[list(topology_mego.molecules.keys())[mol_i - 1]][0]
557585
protein_mego_j = topology_mego.molecules[list(topology_mego.molecules.keys())[mol_j - 1]][0]
@@ -714,7 +742,10 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
714742

715743
dict_m_m_r = {}
716744
for target in target_list:
717-
target_fields = target.replace(".dat", "").split("_")
745+
if not args.h5 or args.tar:
746+
target_fields = target.replace(".dat", "").split("_")
747+
else:
748+
target_fields = target.replace(".h5", "").split("_")
718749
mi = int(target_fields[-4])
719750
mj = int(target_fields[-3])
720751
ai = int(target_fields[-1])
@@ -740,7 +771,6 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
740771
df["c12dist"] = 0.0
741772
df["p"] = 0.0
742773
df["cutoff"] = [c12_cutoff[i, j] for i in range(len(protein_ref_indices_i)) for j in range(len(protein_ref_indices_j))]
743-
744774
else:
745775
if not args.residue:
746776
chunks = np.array_split(target_list, args.num_threads)
@@ -877,6 +907,11 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
877907
action="store_true",
878908
help="Read from tar file instead of directory",
879909
)
910+
parser.add_argument(
911+
"--h5",
912+
action="store_true",
913+
help="Read from h5 file instead of text (.dat)",
914+
)
880915
parser.add_argument(
881916
"--custom_c12",
882917
type=str,
@@ -914,6 +949,10 @@ def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, pref
914949
print(f"The path '{args.histo}' is not a tar file.")
915950
sys.exit()
916951

952+
if args.tar and args.h5:
953+
printf("cannot use --tar and --h5, chose one.")
954+
sys.exit()
955+
917956
# Sets mode
918957
modes = np.array(args.mode.split("+"), dtype=str)
919958
modes_possible = np.array(["intra", "same", "cross"])

0 commit comments

Comments
 (0)