-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathpackage_hubmap.py
More file actions
154 lines (139 loc) · 5.39 KB
/
package_hubmap.py
File metadata and controls
154 lines (139 loc) · 5.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import os
import argparse
import frictionless
from frictionless import fields, Schema
import pandas as pd
'''
Takes as input the three main HubMAP metadata files (datasets, donors, and samples) and
packages them into a frictionless data package.
'''
def load_metadata(file_path):
    """
    Load metadata from a CSV file into a pandas DataFrame.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pd.DataFrame with the parsed file contents.

    Raises:
        ValueError: If the file cannot be opened or parsed. The original
            exception is chained (``from e``) so the root cause is kept in
            the traceback.
    """
    try:
        return pd.read_csv(file_path)
    except Exception as e:
        # Chain the underlying error instead of discarding it.
        raise ValueError(f"Error loading file {file_path}: {e}") from e
def get_df_schema(df, primary_keys, foreign_keys, descriptions):
    """
    Build a frictionless Schema for a DataFrame.

    Args:
        df: DataFrame whose columns define the schema fields.
        primary_keys: List of column names forming the primary key.
        foreign_keys: List of frictionless foreign-key mappings.
        descriptions: Mapping of column name -> description text.

    Returns:
        frictionless.Schema with one field per DataFrame column.
    """
    # Named `field_definitions` (not `fields`) so we do not shadow the
    # `frictionless.fields` module imported at the top of the file.
    field_definitions = get_field_definitions(df, descriptions)
    return Schema(
        fields=field_definitions,
        primary_key=primary_keys,
        foreign_keys=foreign_keys,
    )
def get_field_definitions(df, descriptions):
    """
    Map each DataFrame column to a frictionless field definition.

    Columns with pandas dtype ``object`` become StringField, numeric
    dtypes become NumberField, and anything else falls back to AnyField.

    Args:
        df: DataFrame whose columns are being described.
        descriptions: Mapping of column name -> description text; entries
            may be NaN, and columns may be absent entirely.

    Returns:
        List of frictionless field objects, one per column of ``df``.
    """
    field_definitions = []
    for col in df.columns:
        dtype = df[col].dtype
        # Treat a missing key the same as NaN: no description available.
        description = descriptions.get(col)
        if pd.isna(description):
            description = ""
        if dtype == "object":
            field = fields.StringField(name=col, description=description)
        elif pd.api.types.is_numeric_dtype(dtype):
            field = fields.NumberField(name=col, description=description)
        else:
            field = fields.AnyField(name=col, description=description)
        field_definitions.append(field)
    return field_definitions
def create_frictionless_package(input_folder, output_path):
    """
    Create a frictionless data package from HubMAP metadata files.

    Reads the three timestamped HubMAP TSV exports (datasets, donors,
    samples) from ``input_folder``, captures each file's column-description
    row, rewrites the data rows as ``<key>.csv`` in the same folder, and
    writes a frictionless package descriptor to ``output_path``.

    Args:
        input_folder: Folder containing the timestamped HubMAP TSV exports;
            also receives the derived ``<key>.csv`` files.
        output_path: Path where the package JSON descriptor is written.
    """
    # Timestamped source files as exported from HubMAP.
    timestamped_data = {
        "datasets": "hubmap-datasets-metadata-2025-05-06_04-30-42.tsv",
        "donors": "hubmap-donors-metadata-2025-05-06_04-29-48.tsv",
        "samples": "hubmap-samples-metadata-2025-05-06_04-29-50.tsv",
    }

    description_lookup = {}
    for key, filename in timestamped_data.items():
        file_path = os.path.join(input_folder, filename)
        df = pd.read_csv(file_path, sep="\t")
        # Row 0 holds per-column description text; keep it for the schema.
        description_lookup[key] = df.iloc[0].to_dict()
        # NOTE(review): only one non-header row is skipped here, although the
        # original comment said the first TWO rows were metadata -- confirm
        # against the actual export format.
        df.iloc[1:].to_csv(os.path.join(input_folder, f"{key}.csv"), index=False)

    # Per-resource key configuration. The donors entry originally listed the
    # same foreign key twice; a single entry is sufficient.
    dataset_info = [
        {
            "name": "datasets.csv",
            "primary_keys": ["hubmap_id"],
            "foreign_keys": [
                {
                    "fields": ["donor.hubmap_id"],
                    "reference": {
                        "resource": "donors",
                        "fields": ["hubmap_id"],
                    },
                }
            ],
        },
        {
            "name": "donors.csv",
            "primary_keys": ["hubmap_id"],
            "foreign_keys": [
                {
                    "fields": ["hubmap_id"],
                    "reference": {
                        "resource": "datasets",
                        "fields": ["donor.hubmap_id"],
                    },
                },
            ],
        },
        {
            "name": "samples.csv",
            "primary_keys": ["hubmap_id"],
            "foreign_keys": [
                {
                    "fields": ["donor.hubmap_id"],
                    "reference": {
                        "resource": "donors",
                        "fields": ["hubmap_id"],
                    },
                }
            ],
        },
    ]

    resources = []
    for dataset in dataset_info:
        name = dataset["name"]
        dataset_path = os.path.join(input_folder, name)
        df = load_metadata(dataset_path)
        key = os.path.splitext(name)[0]
        # Derive the schema (field types + descriptions + keys) from the data.
        schema = get_df_schema(
            df,
            dataset["primary_keys"],
            dataset["foreign_keys"],
            description_lookup[key],
        )
        resources.append(frictionless.Resource(path=name, schema=schema))

    package = frictionless.Package(resources=resources, name="hubmap_metadata")
    package.to_json(output_path)
    print(f"Frictionless package created at {output_path}")
if __name__ == "__main__":
    # Command-line entry point: read the input folder and output path from
    # argv and build the package.
    arg_parser = argparse.ArgumentParser(
        description="Package HubMAP metadata into a frictionless data package."
    )
    arg_parser.add_argument("input_folder", help="Path to the hubmap folder")
    arg_parser.add_argument(
        "output_path", help="Path to save the frictionless data package (ZIP)"
    )
    parsed = arg_parser.parse_args()
    create_frictionless_package(parsed.input_folder, parsed.output_path)