Add huggingface dataset type

martindurant · martindurant · commit 8d4ae2bd9216 · 2026-03-30T16:54:27.000-04:00
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -43,6 +43,7 @@ User Classes
     proj.documentation.RTD
     proj.git.GitRepo
     proj.helm.HelmChart
+    proj.hf.HuggingFaceDataset
     proj.hf.HuggingFaceRepo
     proj.ide.JetbrainsIDE
     proj.ide.NvidiaAIWorkbench
@@ -81,6 +82,7 @@ User Classes
 .. autoclass:: projspec.proj.documentation.RTD
 .. autoclass:: projspec.proj.git.GitRepo
 .. autoclass:: projspec.proj.helm.HelmChart
+.. autoclass:: projspec.proj.hf.HuggingFaceDataset
 .. autoclass:: projspec.proj.hf.HuggingFaceRepo
 .. autoclass:: projspec.proj.ide.JetbrainsIDE
 .. autoclass:: projspec.proj.ide.NvidiaAIWorkbench
diff --git a/src/projspec/config.py b/src/projspec/config.py
@@ -15,7 +15,7 @@ def conf_dir():
 def defaults():
     return {
         "library_path": f"{conf_dir()}/library.json",
-        "scan_types": [".py", ".yaml", ".yml", ".toml", ".json"],
+        "scan_types": [".py", ".yaml", ".yml", ".toml", ".json", ".md"],
         "scan_max_files": 100,
         "scan_max_size": 5 * 2**10,
         "remote_artifact_status": False,
diff --git a/src/projspec/proj/base.py b/src/projspec/proj/base.py
@@ -135,6 +135,8 @@ def scanned_files(self):
         return self._scanned_files
 
     def get_file(self, name: str, text=True) -> io.IOBase:
+        # TODO: possibly cache files that are read by some parser and might be
+        #  needed again
         if name in self.scanned_files:
             if text:
                 return io.StringIO(self.scanned_files[name].decode())
diff --git a/src/projspec/proj/hf.py b/src/projspec/proj/hf.py
@@ -2,6 +2,21 @@
 
 from projspec.proj import ProjectSpec, ParseFailed
 
+# Front-matter keys that only dataset cards carry; model cards never use them.
+# Their presence tells a dataset README.md apart from a model-card README.md.
+_DATASET_DISCRIMINATORS = frozenset(
+    {
+        "task_categories",
+        "task_ids",
+        "dataset_info",
+        "size_categories",
+        "annotations_creators",
+        "language_creators",
+        "source_datasets",
+        "configs",
+    }
+)
+
 
 class HuggingFaceRepo(ProjectSpec):
     spec_doc = "https://huggingface.co/docs/hub/en/model-cards"
@@ -21,9 +36,7 @@ def parse(self) -> None:
         from projspec.content.metadata import DescriptiveMetadata, License
         import yaml
 
-        readme = f"{self.proj.url}/README.md"
-
-        with self.proj.fs.open(readme, "rt") as f:
+        with self.get_file("README.md") as f:
             txt = f.read()
         if txt.count("---\n") < 2:
             raise ParseFailed
@@ -65,3 +78,123 @@ def _create(path: str) -> None:
 ---
 """
             )
+
+
+class HuggingFaceDataset(ProjectSpec):
+    """A dataset repository hosted on the Hugging Face Hub.
+
+    A HuggingFace dataset repo is identified by a ``README.md`` whose YAML
+    front-matter contains at least one dataset-specific key (e.g.
+    ``task_categories``, ``dataset_info``, ``size_categories``; the full set
+    is ``_DATASET_DISCRIMINATORS``).  A README whose front-matter has none of
+    them is rejected with ``ParseFailed`` so that model cards are not
+    mistaken for datasets.  The card format is defined at
+    https://huggingface.co/docs/hub/datasets-cards and the full metadata
+    specification is at
+    https://github.com/huggingface/hub-docs/blob/main/datasetcard.md
+
+    Parsed contents
+    ---------------
+    descriptive_metadata
+        Carries ``pretty_name``, ``language``, ``tags``, ``task_categories``,
+        ``task_ids``, ``size_categories``, ``source_datasets``,
+        ``annotations_creators``, ``language_creators`` and
+        ``paperswithcode_id`` — whichever are present in the card.
+    license *(optional)*
+        Present when a ``license`` key is found in the front-matter.
+    """
+
+    spec_doc = "https://huggingface.co/docs/hub/datasets-cards"
+
+    def match(self) -> bool:
+        """Cheap pre-check: ``README.md`` is among the project's basenames.
+
+        Only file names are inspected here; whether the README is actually a
+        dataset card is decided in :meth:`parse`.
+        """
+        return "README.md" in self.proj.basenames
+
+    def parse(self) -> None:
+        """Read the card's YAML front-matter and populate ``_contents``.
+
+        Raises
+        ------
+        ParseFailed
+            If ``README.md`` cannot be read, has no ``---`` delimited
+            front-matter, the front-matter is not a valid YAML mapping, or it
+            contains none of the dataset-specific keys.
+        """
+        import yaml
+        from projspec.content.metadata import DescriptiveMetadata, License
+
+        try:
+            with self.get_file("README.md") as f:
+                txt = f.read()
+        except OSError as exc:
+            raise ParseFailed(f"Could not read README.md: {exc}") from exc
+
+        if txt.count("---\n") < 2:
+            raise ParseFailed("README.md has no YAML front-matter")
+        try:
+            # safe_load accepts the text directly; no stream wrapper needed.
+            meta = yaml.safe_load(txt.split("---\n")[1])
+        except yaml.YAMLError as exc:
+            raise ParseFailed(f"Invalid YAML front-matter: {exc}") from exc
+        if not isinstance(meta, dict):
+            raise ParseFailed("YAML front-matter did not parse to a mapping")
+        # Model cards and dataset cards share README.md and many keys
+        # (license, language, tags); without this check every model card
+        # would also be reported as a dataset.
+        if _DATASET_DISCRIMINATORS.isdisjoint(meta):
+            raise ParseFailed("README.md front-matter has no dataset-specific keys")
+
+        if "license" in meta:
+            self._contents["license"] = License(
+                proj=self.proj,
+                shortname=meta["license"],
+                fullname=meta.get("license_name", "unknown"),
+                url=meta.get("license_link", ""),
+            )
+
+        descriptive_keys = [
+            "pretty_name",
+            "language",
+            "tags",
+            "task_categories",
+            "task_ids",
+            "size_categories",
+            "source_datasets",
+            "annotations_creators",
+            "language_creators",
+            "paperswithcode_id",
+        ]
+        card_meta = {k: meta[k] for k in descriptive_keys if k in meta}
+        self._contents["descriptive_metadata"] = DescriptiveMetadata(
+            proj=self.proj,
+            meta=card_meta,
+        )
+
+    @staticmethod
+    def _create(path: str) -> None:
+        """Scaffold a minimal but valid HuggingFace dataset card."""
+        with open(f"{path}/README.md", "w") as f:
+            f.write(
+                """\
+---
+pretty_name: My Dataset
+license: apache-2.0
+language:
+- en
+tags:
+- text
+task_categories:
+- text-classification
+size_categories:
+- n<1K
+---
+
+# My Dataset
+
+A short description of the dataset.
+"""
+            )