22
33from projspec .proj import ProjectSpec , ParseFailed
44
# Front-matter keys that only dataset cards carry; model cards never use them.
# When a repo of either type ships a README.md, the presence of any of these
# keys is what tells the two apart.
_DATASET_DISCRIMINATORS = frozenset(
    {
        "annotations_creators",
        "configs",
        "dataset_info",
        "language_creators",
        "size_categories",
        "source_datasets",
        "task_categories",
        "task_ids",
    }
)
19+
520
621class HuggingFaceRepo (ProjectSpec ):
722 spec_doc = "https://huggingface.co/docs/hub/en/model-cards"
@@ -21,9 +36,7 @@ def parse(self) -> None:
2136 from projspec .content .metadata import DescriptiveMetadata , License
2237 import yaml
2338
24- readme = f"{ self .proj .url } /README.md"
25-
26- with self .proj .fs .open (readme , "rt" ) as f :
39+ with self .get_file ("README.md" ) as f :
2740 txt = f .read ()
2841 if txt .count ("---\n " ) < 2 :
2942 raise ParseFailed
@@ -65,3 +78,100 @@ def _create(path: str) -> None:
6578---
6679"""
6780 )
81+
82+
class HuggingFaceDataset(ProjectSpec):
    """A dataset repository hosted on the Hugging Face Hub.

    A HuggingFace dataset repo is identified by a ``README.md`` whose YAML
    front-matter contains at least one dataset-specific key (e.g.
    ``task_categories``, ``dataset_info``, ``size_categories``).  The card
    format is defined at
    https://huggingface.co/docs/hub/datasets-cards and the full metadata
    specification is at
    https://github.com/huggingface/hub-docs/blob/main/datasetcard.md

    Parsed contents
    ---------------
    descriptive_metadata
        Carries ``pretty_name``, ``language``, ``tags``, ``task_categories``,
        ``task_ids``, ``size_categories``, ``source_datasets``,
        ``annotations_creators``, ``language_creators`` and
        ``paperswithcode_id`` — whichever are present in the card.
    license *(optional)*
        Present when a ``license`` key is found in the front-matter.
    """

    spec_doc = "https://huggingface.co/docs/hub/datasets-cards"

    def match(self) -> bool:
        """Return True when the project root contains a ``README.md``.

        This is only a cheap pre-filter; ``parse`` inspects the front-matter
        and rejects READMEs that are not dataset cards.
        """
        return "README.md" in self.proj.basenames

    def parse(self) -> None:
        """Read the dataset card and populate ``self._contents``.

        Raises
        ------
        ParseFailed
            If ``README.md`` cannot be read, has no YAML front-matter, the
            front-matter is not valid YAML or not a mapping, or it contains
            none of the dataset-specific keys.
        """
        import yaml
        from projspec.content.metadata import DescriptiveMetadata, License

        try:
            with self.get_file("README.md") as f:
                txt = f.read()
        except OSError as exc:
            raise ParseFailed(f"Could not read README.md: {exc}") from exc

        # Front-matter is delimited by two "---" lines; fewer than two means
        # there is nothing to parse.
        if txt.count("---\n") < 2:
            raise ParseFailed("README.md has no YAML front-matter")
        try:
            # safe_load accepts a str directly, so no stream wrapper is needed.
            meta = yaml.safe_load(txt.split("---\n")[1])
        except yaml.YAMLError as exc:
            raise ParseFailed(f"Invalid YAML front-matter: {exc}") from exc
        if not isinstance(meta, dict):
            raise ParseFailed("YAML front-matter did not parse to a mapping")

        # A README with front-matter may just as well be a model card; only
        # accept it as a dataset when at least one dataset-only key is present.
        if _DATASET_DISCRIMINATORS.isdisjoint(meta):
            raise ParseFailed(
                "README.md front-matter contains no dataset-specific keys"
            )

        if "license" in meta:
            self._contents["license"] = License(
                proj=self.proj,
                shortname=meta["license"],
                fullname=meta.get("license_name", "unknown"),
                url=meta.get("license_link", ""),
            )

        descriptive_keys = [
            "pretty_name",
            "language",
            "tags",
            "task_categories",
            "task_ids",
            "size_categories",
            "source_datasets",
            "annotations_creators",
            "language_creators",
            "paperswithcode_id",
        ]
        # Copy over only the keys the card actually defines.
        card_meta = {k: meta[k] for k in descriptive_keys if k in meta}
        self._contents["descriptive_metadata"] = DescriptiveMetadata(
            proj=self.proj,
            meta=card_meta,
        )

    @staticmethod
    def _create(path: str) -> None:
        """Scaffold a minimal but valid HuggingFace dataset card.

        Writes ``README.md`` under ``path``; the front-matter includes
        dataset-specific keys so the result parses as a dataset card.
        """
        with open(f"{path}/README.md", "w", encoding="utf-8") as f:
            f.write(
                """\
---
pretty_name: My Dataset
license: apache-2.0
language:
- en
tags:
- text
task_categories:
- text-classification
size_categories:
- n<1K
---

# My Dataset

A short description of the dataset.
"""
            )
0 commit comments