Skip to content

Commit 1f8b7fe

Browse files
authored
feat: pull protocols.io AIND workspace, create ProtocolModel (#253)
* feat: pull protocols.io AIND workspace, create ProtocolModel * tests: coverage on protocols * chore: lint * chore: typo * refactor: title -> name * tests: fix test * chore: docstrings * fix: API pagination doesn't work * fix: properly deal with versions by suffixing them to title * feat: from_doi/from_url functions * refactor: use proper regex for URLs * chore: lint * chore: docstring * fix: type errors * feat: added version * refactor: remove authors, add version * chore: unused import
1 parent 4afe331 commit 1f8b7fe

File tree

11 files changed

+1618
-0
lines changed

11 files changed

+1618
-0
lines changed

scripts/get_protocols.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
"""Script to grab protocol information from protocols.IO using the official API.
2+
3+
Requires the PROTOCOLS_CLIENT_TOKEN environment variable to be set to your API token, obtained from
4+
protocols.io/developers
5+
"""
6+
7+
import re
8+
import requests
9+
import pandas as pd
10+
import os
11+
12+
13+
# Path of the CSV file that main() writes the parsed protocol rows to.
OUTPUT_CSV = "src/aind_data_schema_models/_generators/models/protocols.csv"
14+
# Workspace identifier that main() passes to get_workspace_protocols(), where it is placed in the request URL path.
WORKSPACE_URI = "allen-institute-for-neural-dynamics"
15+
# Common URL prefix for the protocols.io API requests built in this script.
API_BASE = "https://www.protocols.io/api"
16+
17+
18+
def get_access_token():
    """Return the API token held in the PROTOCOLS_CLIENT_TOKEN environment variable.

    Raises
    ------
    RuntimeError
        If the variable is unset or empty.
    """
    api_token = os.environ.get("PROTOCOLS_CLIENT_TOKEN")
    if api_token:
        return api_token
    raise RuntimeError("PROTOCOLS_CLIENT_TOKEN environment variable not set.")
24+
25+
26+
def get_workspace_protocols(token, workspace_uri, timeout=30):
    """Get protocols from a given workspace.

    Pages are requested one at a time until a page comes back without items or the
    ``total_pages`` value reported by the response has been reached.

    Parameters
    ----------
    token
        API token, sent as a Bearer ``Authorization`` header.
    workspace_uri
        Workspace identifier placed in the request URL path.
    timeout
        Seconds to wait for each HTTP response (default 30).

    Returns
    -------
    list
        The ``items`` entries of every fetched page, concatenated in page order.

    Raises
    ------
    requests.HTTPError
        If a page request returns an error status (via ``raise_for_status``).
    """
    headers = {"Authorization": f"Bearer {token}"}
    all_items = []
    page = 1
    while True:
        url = f"{API_BASE}/v3/workspaces/{workspace_uri}/protocols?page={page}&page_size=10000"
        # requests has no default timeout; without one a stalled connection blocks forever
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
        data = resp.json()
        items = data.get("items", [])
        if not items:
            break
        all_items.extend(items)
        # Check pagination info; tolerate a missing or null "pagination" entry
        pagination = data.get("pagination") or {}
        total_pages = pagination.get("total_pages", 1)
        if page >= total_pages:
            break
        page += 1
    return all_items
47+
48+
49+
def parse_protocol(protocol: dict) -> list[dict]:
    """Parse protocol information to extract title, DOI and version number.

    Parameters
    ----------
    protocol : dict
        Protocol record; its ``title`` and ``versions`` entries are read.

    Returns
    -------
    list[dict]
        One ``{"title", "DOI", "version"}`` dict per usable entry in ``versions``.
        A leading ``doi.org`` / ``dx.doi.org`` URL prefix is stripped from the DOI,
        and a missing DOI becomes an empty string. Entries whose ``version_id`` is
        missing or not an integer are skipped.
    """
    title = protocol.get("title", "")

    parsed = []
    # "versions" may be absent or null; treat both as "no versions"
    for version in protocol.get("versions") or []:
        try:
            version_number = int(version.get("version_id")) + 1  # version_id is 0-based
        except (TypeError, ValueError):
            # Without a usable version_id the row cannot be numbered, so drop it
            continue

        # "doi" may be absent or null; re.sub needs a string
        version_doi = re.sub(r"^(https?://)?(dx\.)?doi\.org/", "", version.get("doi") or "")

        parsed.append({"title": title, "DOI": version_doi, "version": version_number})

    return parsed
65+
66+
67+
def main():
    """Main function to fetch protocols and save to CSV.

    Fetches every protocol of WORKSPACE_URI, expands each into one row per version,
    keeps the first row seen for each non-empty DOI, and writes the rows to OUTPUT_CSV
    with the columns ``title``, ``DOI`` and ``version``.
    """
    token = get_access_token()
    print("Fetching protocols from workspace...")
    protocols = get_workspace_protocols(token, WORKSPACE_URI)
    print(f"Found {len(protocols)} protocols.")
    # Deduplicate by DOI: the first row seen for a DOI wins, rows without a DOI are dropped
    results_by_doi = {}
    for p in protocols:
        print(f"Processing protocol: {p.get('title', '')}")
        for proto in parse_protocol(p):
            doi = proto["DOI"]
            if doi:
                results_by_doi.setdefault(doi, proto)
    results = list(results_by_doi.values())
    # Ensure output directory exists (os.makedirs rejects an empty path)
    output_dir = os.path.dirname(OUTPUT_CSV)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # Write CSV using pandas; explicit columns keep the header row even when there are no results
    df = pd.DataFrame(results, columns=["title", "DOI", "version"])
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"Saved {len(results)} protocols to {OUTPUT_CSV}")
89+
90+
91+
if __name__ == "__main__":
92+
main()

src/aind_data_schema_models/_generators/generator.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,16 @@ def load_data(data_type: str, root_path: str) -> pd.DataFrame:
3434
return data
3535

3636

37+
def regex_search(value, pattern):
    """Search ``value`` for ``pattern`` and return the match's groups.

    Returns the tuple from ``match.groups()`` when the pattern is found,
    otherwise an empty list.
    """
    import re

    found = re.search(pattern, value)
    return found.groups() if found else []
45+
46+
3747
def generate_code(data_type: str, root_path: str, isort: bool = True, black: bool = True):
3848
"""Generate code from the template type
3949
@@ -62,6 +72,8 @@ def generate_code(data_type: str, root_path: str, isort: bool = True, black: boo
6272
env.filters["to_class_name"] = to_class_name
6373
env.filters["to_class_name_underscored"] = to_class_name_underscored
6474
env.filters["unique_rows"] = lambda data, key: data.drop_duplicates(subset=key)
75+
76+
env.filters["regex_search"] = regex_search
6577
rendered_template = env.from_string(template)
6678

6779
# Render template with data
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
title,DOI,version
2+
Solenoid Valve Calibration for Behavior Rigs Utilizing Water Reward,10.17504/protocols.io.261gerq7dl47/v1,1
3+
Running a Dynamic Foraging Behavior Task in Mice,10.17504/protocols.io.5jyl8p4m6g2w/v1,1
4+
Whole Brain Embedding for SmartSPIM - EasyIndex with 2% Agarose,10.17504/protocols.io.3byl4jpn8lo5/v1,1
5+
Stereotaxic Injection by Nanoject Protocol,10.17504/protocols.io.bd8qi9vw,1
6+
Stereotaxic Injection by Nanoject Protocol,10.17504/protocols.io.besfjebn,2
7+
Stereotaxic Injection by Nanoject Protocol,10.17504/protocols.io.bgpujvnw,3
8+
Stereotaxic Injection by Nanoject Protocol,10.17504/protocols.io.bp2l6nr7kgqe/v4,4
9+
Stereotaxic Injection by Nanoject Protocol,10.17504/protocols.io.bp2l6nr7kgqe/v5,5
10+
Stereotaxic Injection by Nanoject Protocol,10.17504/protocols.io.bp2l6nr7kgqe/v6,6
11+
Stereotaxic Injection by Nanoject Protocol,10.17504/protocols.io.bp2l6nr7kgqe/v7,7
12+
Stereotaxic Injection by Iontophoresis,10.17504/protocols.io.bd8ti9wn,1
13+
Stereotaxic Injection by Iontophoresis,10.17504/protocols.io.besgjebw,2
14+
Stereotaxic Injection by Iontophoresis,10.17504/protocols.io.bgpvjvn6,3
15+
Stereotaxic Injection by Iontophoresis,10.17504/protocols.io.14egn8ewzg5d/v4,4
16+
Stereotaxic Injection by Iontophoresis,10.17504/protocols.io.14egn8ewzg5d/v5,5
17+
Stereotaxic Injection by Iontophoresis,10.17504/protocols.io.14egn8ewzg5d/v6,6
18+
Stereotaxic Injection by Iontophoresis,10.17504/protocols.io.14egn8ewzg5d/v7,7
19+
Mouse Habituation - Head Fixation into Tube,10.17504/protocols.io.rm7vzxd74gx1/v1,1
20+
Mouse Water Restriction,10.17504/protocols.io.x54v9pn34g3e/v1,1
21+
Modified Frame-projected Independent Fiber Photometry (FIP) System_Hardware,10.17504/protocols.io.261ge39edl47/v1,1
22+
Modified Frame-projected Independent Fiber Photometry (FIP) System_Hardware,10.17504/protocols.io.261ge39edl47/v2,2
23+
Making Agarose for use in acute in vivo Electrophysiology Experiments ,10.17504/protocols.io.5jyl8py89g2w/v1,1
24+
Plug Removal for acute in vivo Electrophysiology Experiments,10.17504/protocols.io.eq2lywz8qvx9/v1,1
25+
Intraperitoneal Injection in an Adult Mouse,10.17504/protocols.io.bfzgjp3w,1
26+
Intraperitoneal Injection in an Adult Mouse,10.17504/protocols.io.5qpvo5w8dl4o/v2,2
27+
Multiplexed RNA FISH on Expanded Mouse Brain Slices,10.17504/protocols.io.dm6gpzj28lzp/v1,1
28+
General Setup and Takedown Procedures for Rodent Neurosurgery,10.17504/protocols.io.kqdg392o7g25/v1,1
29+
General Setup and Takedown Procedures for Rodent Neurosurgery,10.17504/protocols.io.kqdg392o7g25/v2,2
30+
Dual Hemisphere Craniotomy for Electrophysiology,10.17504/protocols.io.rm7vzjoe2lx1/v1,1
31+
Aqueous (SBiP) Delipidation for Whole Mouse Brain After morphoFISH Perfusion ,10.17504/protocols.io.rm7vzjz54lx1/v1,1
32+
Mouse Habituation - Head Fixation on Disk,10.17504/protocols.io.j8nlkojmxv5r/v1,1
33+
Mouse Habituation - Head Fixation on Disk,10.17504/protocols.io.j8nlkojmxv5r/v2,2
34+
Preparation of Lipopolysaccharide for Intraperitoneal Injection,10.17504/protocols.io.14egn9y1ml5d/v1,1
35+
Temporal Assessment of Immune Response,10.17504/protocols.io.5jyl8dqx6g2w/v1,1
36+
Mouse VAB Catheter Maintenance,10.17504/protocols.io.8epv52o5dv1b/v1,1
37+
Processing Blood Intended for Olink Assay,10.17504/protocols.io.261ger81jl47/v1,1
38+
BARseq 2.5,10.17504/protocols.io.kqdg3ke9qv25/v1,1
39+
Aqueous (SBiP) Delipidation of a Whole Mouse Brain,10.17504/protocols.io.n2bvj81mwgk5/v1,1
40+
Aqueous (SBiP) Delipidation of a Whole Mouse Brain,10.17504/protocols.io.n2bvj81mwgk5/v2,2
41+
Tetrahydrofuran and Dichloromethane Delipidation of a Whole Mouse Brain,10.17504/protocols.io.36wgqj1kxvk5/v1,1
42+
Tetrahydrofuran and Dichloromethane Delipidation of a Whole Mouse Brain,10.17504/protocols.io.36wgqj1kxvk5/v2,2
43+
Whole Mouse Brain Delipidation - Dichloromethane,10.17504/protocols.io.dm6gpj7n5gzp/v1,1
44+
"Whole Mouse Brain Delipidation, Immunolabeling, and Expansion Microscopy",10.17504/protocols.io.n92ldpwjxl5b/v1,1
45+
Immunolabeling of a Whole Mouse Brain,10.17504/protocols.io.ewov1okwylr2/v1,1
46+
Structural MRI Using the University of Washington 14T Vertical Bore Bruker MRI,10.17504/protocols.io.3byl4j8p2lo5/v1,1
47+
Duragel Application for Acute Electrophysiology Recordings,10.17504/protocols.io.14egn2dwqg5d/v1,1
48+
SmartSPIM setup and alignment,10.17504/protocols.io.5jyl8jyb7g2w/v1,1
49+
Refractive Index Matching - EasyIndex,10.17504/protocols.io.kxygx965kg8j/v1,1
50+
Preparing a 3D Printed Implant for Acute In Vivo Electrophysiology,10.17504/protocols.io.6qpvr4jmogmk/v1,1
51+
Imaging cleared mouse brains on SmartSPIM,10.17504/protocols.io.3byl4jo1rlo5/v1,1
52+
Refractive Index Matching - Ethyl Cinnamate,10.17504/protocols.io.n2bvj8k4bgk5/v1,1
53+
DAPI Staining Mouse Brain Sections,10.17504/protocols.io.3byl4jm6rlo5/v1,1
54+
Modified Frame-projected Independent Fiber Photometry (FIP) System_Triggering system,10.17504/protocols.io.kxygx3e6wg8j/v1,1
55+
Multi-Site Optic Fiber Implants,10.17504/protocols.io.6qpvr3dqovmk/v1,1
56+
Immunohistochemistry (IHC) Staining Mouse Brain Sections,10.17504/protocols.io.5qpvo3b7bv4o/v1,1
57+
Sectioning Mouse Brain with Sliding Microtome,10.17504/protocols.io.5jyl8p97rg2w/v1,1
58+
Mounting and Coverslipping Mouse Brain Sections,10.17504/protocols.io.n92ldmpy7l5b/v1,1
59+
Stereotactic Injections with Headframe Implant,10.17504/protocols.io.eq2lyj72elx9/v1,1
60+
"Protocol Collection: Perfusing, Sectioning, IHC, Mounting and Coverslipping Mouse Brain Specimens",10.17504/protocols.io.kxygx3yxkg8j/v1,1
61+
Mouse Cardiac Perfusion Fixation and Brain Collection,10.17504/protocols.io.bd8vi9w6,1
62+
Mouse Cardiac Perfusion Fixation and Brain Collection,10.17504/protocols.io.besijece,2
63+
Mouse Cardiac Perfusion Fixation and Brain Collection,10.17504/protocols.io.beudjes6,3
64+
Mouse Cardiac Perfusion Fixation and Brain Collection,10.17504/protocols.io.be2djga6,4
65+
Mouse Cardiac Perfusion Fixation and Brain Collection,10.17504/protocols.io.bg5vjy66,5
66+
Mouse Cardiac Perfusion Fixation and Brain Collection,10.17504/protocols.io.8epv51bejl1b/v6,6
67+
Mouse Cardiac Perfusion Fixation and Brain Collection,10.17504/protocols.io.8epv51bejl1b/v7,7
68+
Mouse Cardiac Perfusion Fixation and Brain Collection,10.17504/protocols.io.8epv51bejl1b/v8,8

src/aind_data_schema_models/_generators/models/registries.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
name,abbreviation
22
Addgene,ADDGENE
3+
Digital Object Identifier,DOI
34
Edinburgh Mouse Atlas Project,EMAPA
45
Mouse Genome Informatics,MGI
56
National Center for Biotechnology Information,NCBI
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
"""Protocols"""
{% raw -%}
from pydantic import Field, ConfigDict
from typing import Optional, Union
from typing_extensions import Annotated
from aind_data_schema_models.pid_names import BaseName
from aind_data_schema_models.registries import Registry
import re
{% endraw %}

class ProtocolModel(BaseName):
    """Base model for protocol"""
    model_config = ConfigDict(frozen=True)
    name: str
    version: int
    registry: Registry
    registry_identifier: str


{# One model class per CSV row; the version suffix keeps class names unique across versions of a title. #}
{% for _, row in data.iterrows() %}
class {{ row['title'] | to_class_name_underscored }}_V{{ row['version'] }}(ProtocolModel):
    """Model {{ row['title'] }}"""
    name: str = "{{ row['title'] }}"
    version: int = {{ row['version'] }}
    registry: Registry = Registry.DOI
    registry_identifier: str = "{{ row['DOI'] }}"


{% endfor %}

class Protocols:
    """Protocols"""
    {% for _, row in data.iterrows() %}
    {{ row['title'] | to_class_name | upper }}_V{{ row['version'] }} = {{ row['title'] | to_class_name_underscored }}_V{{ row['version'] }}()
    {% endfor %}

    ALL = tuple(ProtocolModel.__subclasses__())

    {# ProtocolModel has no "title" field, so the discriminator must name an existing one. #}
    {# NOTE(review): "name" is typed str and repeats across versions; confirm pydantic accepts it as a discriminator. #}
    ONE_OF = Annotated[Union[tuple(ProtocolModel.__subclasses__())], Field(discriminator="name")]

    doi_map = {m().registry_identifier: m() for m in ALL if getattr(m(), "registry_identifier", None)}

    @classmethod
    def from_doi(cls, doi: str) -> Optional[ProtocolModel]:
        """Return protocol model by DOI, or None if the DOI is not in doi_map."""
        return cls.doi_map.get(doi, None)

    @classmethod
    def from_url(cls, url: str) -> Optional[ProtocolModel]:
        """Return protocol model by DOI, stripping URL prefixes; None if the DOI is not in doi_map."""
        # Remove any leading protocol/domain up to the DOI
        doi = re.sub(r'^(https?://)?(dx\.)?doi\.org/', '', url)
        return cls.from_doi(doi)

0 commit comments

Comments
 (0)