-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathpackage_hubmap.py
More file actions
154 lines (139 loc) · 5.39 KB
/
package_hubmap.py
File metadata and controls
154 lines (139 loc) · 5.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import os
import argparse
import frictionless
from frictionless import fields, Schema
import pandas as pd
'''
Takes as input the three main HubMAP metadata files (datasets, donors, and samples) and
packages them into a frictionless data package.
'''
def load_metadata(file_path):
    """
    Load metadata from a CSV file into a pandas DataFrame.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pd.DataFrame with the parsed file contents.

    Raises:
        ValueError: If the file cannot be opened or parsed. The original
            exception is chained (``from e``) so the root cause is kept in
            the traceback.
    """
    try:
        return pd.read_csv(file_path)
    except Exception as e:
        # Chain the underlying error instead of discarding it.
        raise ValueError(f"Error loading file {file_path}: {e}") from e
def get_df_schema(df, primary_keys, foreign_keys, descriptions):
    """
    Build a frictionless Schema for a DataFrame.

    Args:
        df: DataFrame whose columns define the schema fields.
        primary_keys: List of column names forming the primary key.
        foreign_keys: List of frictionless foreign-key mappings.
        descriptions: Mapping of column name -> description text.

    Returns:
        frictionless.Schema with one field per DataFrame column.
    """
    # Named `field_definitions` (not `fields`) so we do not shadow the
    # `frictionless.fields` module imported at the top of the file.
    field_definitions = get_field_definitions(df, descriptions)
    return Schema(
        fields=field_definitions,
        primary_key=primary_keys,
        foreign_keys=foreign_keys,
    )
def get_field_definitions(df, descriptions):
    """
    Map each DataFrame column to a frictionless field definition.

    Columns with pandas dtype ``object`` become StringField, numeric
    dtypes become NumberField, and anything else falls back to AnyField.

    Args:
        df: DataFrame whose columns are being described.
        descriptions: Mapping of column name -> description text; entries
            may be NaN, and columns may be absent entirely.

    Returns:
        List of frictionless field objects, one per column of ``df``.
    """
    field_definitions = []
    for col in df.columns:
        dtype = df[col].dtype
        # Treat a missing key the same as NaN: no description available.
        description = descriptions.get(col)
        if pd.isna(description):
            description = ""
        if dtype == "object":
            field = fields.StringField(name=col, description=description)
        elif pd.api.types.is_numeric_dtype(dtype):
            field = fields.NumberField(name=col, description=description)
        else:
            field = fields.AnyField(name=col, description=description)
        field_definitions.append(field)
    return field_definitions
def create_frictionless_package(input_folder, output_path):
    """
    Create a frictionless data package from HubMAP metadata files.

    Reads the three timestamped HubMAP TSV exports (datasets, donors,
    samples) from ``input_folder``, captures each file's column-description
    row, rewrites the data rows as ``<key>.csv`` in the same folder, and
    writes a frictionless package descriptor to ``output_path``.

    Args:
        input_folder: Folder containing the timestamped HubMAP TSV exports;
            also receives the derived ``<key>.csv`` files.
        output_path: Path where the package JSON descriptor is written.
    """
    # Timestamped source files as exported from HubMAP.
    timestamped_data = {
        "datasets": "hubmap-datasets-metadata-2025-05-06_04-30-42.tsv",
        "donors": "hubmap-donors-metadata-2025-05-06_04-29-48.tsv",
        "samples": "hubmap-samples-metadata-2025-05-06_04-29-50.tsv",
    }

    description_lookup = {}
    for key, filename in timestamped_data.items():
        file_path = os.path.join(input_folder, filename)
        df = pd.read_csv(file_path, sep="\t")
        # Row 0 holds per-column description text; keep it for the schema.
        description_lookup[key] = df.iloc[0].to_dict()
        # NOTE(review): only one non-header row is skipped here, although the
        # original comment said the first TWO rows were metadata -- confirm
        # against the actual export format.
        df.iloc[1:].to_csv(os.path.join(input_folder, f"{key}.csv"), index=False)

    # Per-resource key configuration. The donors entry originally listed the
    # same foreign key twice; a single entry is sufficient.
    dataset_info = [
        {
            "name": "datasets.csv",
            "primary_keys": ["hubmap_id"],
            "foreign_keys": [
                {
                    "fields": ["donor.hubmap_id"],
                    "reference": {
                        "resource": "donors",
                        "fields": ["hubmap_id"],
                    },
                }
            ],
        },
        {
            "name": "donors.csv",
            "primary_keys": ["hubmap_id"],
            "foreign_keys": [
                {
                    "fields": ["hubmap_id"],
                    "reference": {
                        "resource": "datasets",
                        "fields": ["donor.hubmap_id"],
                    },
                },
            ],
        },
        {
            "name": "samples.csv",
            "primary_keys": ["hubmap_id"],
            "foreign_keys": [
                {
                    "fields": ["donor.hubmap_id"],
                    "reference": {
                        "resource": "donors",
                        "fields": ["hubmap_id"],
                    },
                }
            ],
        },
    ]

    resources = []
    for dataset in dataset_info:
        name = dataset["name"]
        dataset_path = os.path.join(input_folder, name)
        df = load_metadata(dataset_path)
        key = os.path.splitext(name)[0]
        # Derive the schema (field types + descriptions + keys) from the data.
        schema = get_df_schema(
            df,
            dataset["primary_keys"],
            dataset["foreign_keys"],
            description_lookup[key],
        )
        resources.append(frictionless.Resource(path=name, schema=schema))

    package = frictionless.Package(resources=resources, name="hubmap_metadata")
    package.to_json(output_path)
    print(f"Frictionless package created at {output_path}")
if __name__ == "__main__":
    # Command-line entry point: read the input folder and output path from
    # argv and build the package.
    arg_parser = argparse.ArgumentParser(
        description="Package HubMAP metadata into a frictionless data package."
    )
    arg_parser.add_argument("input_folder", help="Path to the hubmap folder")
    arg_parser.add_argument(
        "output_path", help="Path to save the frictionless data package (ZIP)"
    )
    parsed = arg_parser.parse_args()
    create_frictionless_package(parsed.input_folder, parsed.output_path)