Skip to content

Commit 8d4ae2b

Browse files
committed
Add huggingface dataset type
1 parent 1bcab24 commit 8d4ae2b

File tree

4 files changed

+118
-4
lines changed

4 files changed

+118
-4
lines changed

docs/source/api.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ User Classes
4343
proj.documentation.RTD
4444
proj.git.GitRepo
4545
proj.helm.HelmChart
46+
proj.hf.HuggingFaceDataset
4647
proj.hf.HuggingFaceRepo
4748
proj.ide.JetbrainsIDE
4849
proj.ide.NvidiaAIWorkbench
@@ -81,6 +82,7 @@ User Classes
8182
.. autoclass:: projspec.proj.documentation.RTD
8283
.. autoclass:: projspec.proj.git.GitRepo
8384
.. autoclass:: projspec.proj.helm.HelmChart
85+
.. autoclass:: projspec.proj.hf.HuggingFaceDataset
8486
.. autoclass:: projspec.proj.hf.HuggingFaceRepo
8587
.. autoclass:: projspec.proj.ide.JetbrainsIDE
8688
.. autoclass:: projspec.proj.ide.NvidiaAIWorkbench

src/projspec/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def conf_dir():
1515
def defaults():
1616
return {
1717
"library_path": f"{conf_dir()}/library.json",
18-
"scan_types": [".py", ".yaml", ".yml", ".toml", ".json"],
18+
"scan_types": [".py", ".yaml", ".yml", ".toml", ".json", ".md"],
1919
"scan_max_files": 100,
2020
"scan_max_size": 5 * 2**10,
2121
"remote_artifact_status": False,

src/projspec/proj/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,8 @@ def scanned_files(self):
135135
return self._scanned_files
136136

137137
def get_file(self, name: str, text=True) -> io.IOBase:
138+
# TODO: possibly cache files that are read by some parser and might be
139+
# needed again
138140
if name in self.scanned_files:
139141
if text:
140142
return io.StringIO(self.scanned_files[name].decode())

src/projspec/proj/hf.py

Lines changed: 113 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,21 @@
22

33
from projspec.proj import ProjectSpec, ParseFailed
44

5+
# Metadata keys that are specific to dataset cards and never appear on model cards.
6+
# Used to distinguish the two repo types when both have a README.md.
7+
_DATASET_DISCRIMINATORS = frozenset(
8+
[
9+
"task_categories",
10+
"task_ids",
11+
"dataset_info",
12+
"size_categories",
13+
"annotations_creators",
14+
"language_creators",
15+
"source_datasets",
16+
"configs",
17+
]
18+
)
19+
520

621
class HuggingFaceRepo(ProjectSpec):
722
spec_doc = "https://huggingface.co/docs/hub/en/model-cards"
@@ -21,9 +36,7 @@ def parse(self) -> None:
2136
from projspec.content.metadata import DescriptiveMetadata, License
2237
import yaml
2338

24-
readme = f"{self.proj.url}/README.md"
25-
26-
with self.proj.fs.open(readme, "rt") as f:
39+
with self.get_file("README.md") as f:
2740
txt = f.read()
2841
if txt.count("---\n") < 2:
2942
raise ParseFailed
@@ -65,3 +78,100 @@ def _create(path: str) -> None:
6578
---
6679
"""
6780
)
81+
82+
83+
class HuggingFaceDataset(ProjectSpec):
    """A dataset repository hosted on the Hugging Face Hub.

    A HuggingFace dataset repo is identified by a ``README.md`` whose YAML
    front-matter contains at least one dataset-specific key (e.g.
    ``task_categories``, ``dataset_info``, ``size_categories``). The card
    format is defined at
    https://huggingface.co/docs/hub/datasets-cards and the full metadata
    specification is at
    https://github.com/huggingface/hub-docs/blob/main/datasetcard.md

    Parsed contents
    ---------------
    descriptive_metadata
        Carries ``pretty_name``, ``language``, ``tags``, ``task_categories``,
        ``task_ids``, ``size_categories``, ``source_datasets``,
        ``annotations_creators``, ``language_creators`` and
        ``paperswithcode_id`` — whichever are present in the card.
    license *(optional)*
        Present when a ``license`` key is found in the front-matter.
    """

    spec_doc = "https://huggingface.co/docs/hub/datasets-cards"

    def match(self) -> bool:
        """Return True when the project has a ``README.md`` at all.

        This is only a pre-filter on file names; whether the README really
        is a dataset card is decided in :meth:`parse`.
        """
        return "README.md" in self.proj.basenames

    def parse(self) -> None:
        """Read the dataset card and populate ``self._contents``.

        Raises
        ------
        ParseFailed
            If ``README.md`` cannot be read, does not begin with a YAML
            front-matter block, the block is not valid YAML or is not a
            mapping, or the mapping has none of the dataset-specific keys
            listed in ``_DATASET_DISCRIMINATORS``.
        """
        import yaml
        from projspec.content.metadata import DescriptiveMetadata, License

        try:
            with self.get_file("README.md") as f:
                txt = f.read()
        except OSError as exc:
            raise ParseFailed(f"Could not read README.md: {exc}") from exc

        if txt.count("---\n") < 2:
            raise ParseFailed("README.md has no YAML front-matter")
        preamble, front_matter = txt.split("---\n", 2)[:2]
        # Front-matter must open the file.  Without this check, two markdown
        # horizontal rules further down the README would be taken as the
        # delimiters and the prose between them fed to the YAML parser.
        if preamble.lstrip("\ufeff").strip():
            raise ParseFailed("README.md does not start with YAML front-matter")
        try:
            # safe_load accepts a str directly and never builds arbitrary
            # Python objects from the (untrusted) card.
            meta = yaml.safe_load(front_matter)
        except yaml.YAMLError as exc:
            raise ParseFailed(f"Invalid YAML front-matter: {exc}") from exc
        if not isinstance(meta, dict):
            raise ParseFailed("YAML front-matter did not parse to a mapping")

        # Model cards also have a README.md with front-matter; only accept
        # the card when it has at least one key that is specific to datasets.
        if _DATASET_DISCRIMINATORS.isdisjoint(meta):
            raise ParseFailed("README.md front-matter has no dataset-specific keys")

        if "license" in meta:
            self._contents["license"] = License(
                proj=self.proj,
                shortname=meta["license"],
                fullname=meta.get("license_name", "unknown"),
                url=meta.get("license_link", ""),
            )

        descriptive_keys = [
            "pretty_name",
            "language",
            "tags",
            "task_categories",
            "task_ids",
            "size_categories",
            "source_datasets",
            "annotations_creators",
            "language_creators",
            "paperswithcode_id",
        ]
        # Copy only the keys that the card actually defines.
        card_meta = {k: meta[k] for k in descriptive_keys if k in meta}
        self._contents["descriptive_metadata"] = DescriptiveMetadata(
            proj=self.proj,
            meta=card_meta,
        )

    @staticmethod
    def _create(path: str) -> None:
        """Scaffold a minimal but valid HuggingFace dataset card.

        Writes ``README.md`` into the existing directory ``path``,
        overwriting any file of that name.  The generated front-matter
        includes ``task_categories`` and ``size_categories`` so the card is
        recognisable as a dataset card.
        """
        with open(f"{path}/README.md", "w", encoding="utf-8") as f:
            f.write(
                """\
---
pretty_name: My Dataset
license: apache-2.0
language:
- en
tags:
- text
task_categories:
- text-classification
size_categories:
- n<1K
---

# My Dataset

A short description of the dataset.
"""
            )

0 commit comments

Comments
 (0)