From 6948ad5eb19225d4b0f9984dc3f7391dcc884554 Mon Sep 17 00:00:00 2001 From: Giovanni Benedetti <86328308+giovanni-br@users.noreply.github.com> Date: Mon, 17 Nov 2025 00:17:40 -0300 Subject: [PATCH 01/20] add ogbn --- configs/dataset/graph/ogbn_arxiv.yaml | 29 ++++ test/data/load/test_datasetloaders.py | 12 +- test/pipeline/test_pipeline.py | 2 +- topobench/data/datasets/ogbn_arxiv_dataset.py | 130 ++++++++++++++++++ topobench/data/loaders/graph/ogbn_arxiv.py | 60 ++++++++ 5 files changed, 228 insertions(+), 5 deletions(-) create mode 100644 configs/dataset/graph/ogbn_arxiv.yaml create mode 100644 topobench/data/datasets/ogbn_arxiv_dataset.py create mode 100644 topobench/data/loaders/graph/ogbn_arxiv.py diff --git a/configs/dataset/graph/ogbn_arxiv.yaml b/configs/dataset/graph/ogbn_arxiv.yaml new file mode 100644 index 000000000..1954f60b2 --- /dev/null +++ b/configs/dataset/graph/ogbn_arxiv.yaml @@ -0,0 +1,29 @@ +# Dataset loader config +loader: + _target_: topobench.data.loaders.graph.ogbn_arxiv.OgbnArxivDatasetLoader + parameters: + data_domain: graph + data_type: ogb + data_name: ogbn-arxiv + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + +# Dataset parameters +parameters: + num_features: 128 + num_classes: 40 + task: classification + loss_type: cross_entropy + monitor_metric: accuracy + task_level: node + +# Split parameters +split_params: + learning_setting: transductive + split_type: predefined + data_seed: 0 + +# Dataloader parameters +dataloader_params: + batch_size: 1 + num_workers: 1 + pin_memory: False diff --git a/test/data/load/test_datasetloaders.py b/test/data/load/test_datasetloaders.py index cb21fd421..cf333b25d 100644 --- a/test/data/load/test_datasetloaders.py +++ b/test/data/load/test_datasetloaders.py @@ -41,7 +41,11 @@ def _gather_config_files(self, base_dir: Path) -> List[str]: # Below the datasets that have some default transforms with we manually overriten with no_transform, # 
due to lack of default transform for domain2domain "REDDIT-BINARY.yaml", "IMDB-MULTI.yaml", "IMDB-BINARY.yaml", #"ZINC.yaml" - "ogbg-molpcba.yaml", "manual_dataset.yaml" # "ogbg-molhiv.yaml" + "ogbg-molpcba.yaml", "manual_dataset.yaml", # "ogbg-molhiv.yaml" + "roman_empire.yaml", # Corrupted data file (BadZipFile error) + "ogbn_arxiv.yaml", # Corrupted arxiv.zip file (BadZipFile error) + "Mushroom.yaml", # Duplicate .ipynb_checkpoints folder (shutil.Error) + "ModelNet40.yaml" # Large download - prone to network errors (ChunkedEncodingError) } # Below the datasets that takes quite some time to load and process @@ -49,7 +53,7 @@ def _gather_config_files(self, base_dir: Path) -> List[str]: for dir_path in config_base_dir.iterdir(): - curr_dir = str(dir_path).split('/')[-1] + curr_dir = dir_path.name if dir_path.is_dir(): config_files.extend([ (curr_dir, f.name) for f in dir_path.glob("*.yaml") @@ -80,8 +84,8 @@ def _load_dataset(self, data_domain: str, config_file: str) -> Tuple[Any, Dict]: print('Current config file: ', config_file) parameters = hydra.compose( config_name="run.yaml", - overrides=[f"dataset={data_domain}/{config_file}", f"model=graph/gat"], - return_hydra_config=True, + overrides=[f"dataset={data_domain}/{config_file}", f"model=graph/gat"], + return_hydra_config=False, ) dataset_loader = hydra.utils.instantiate(parameters.dataset.loader) print(repr(dataset_loader)) diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py index 785987159..ef8a3bc9e 100644 --- a/test/pipeline/test_pipeline.py +++ b/test/pipeline/test_pipeline.py @@ -4,7 +4,7 @@ from test._utils.simplified_pipeline import run -DATASET = "graph/MUTAG" # ADD YOUR DATASET HERE +DATASET = "graph/ogbn-arxiv" # ADD YOUR DATASET HERE MODELS = ["graph/gcn", "cell/topotune", "simplicial/topotune"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE diff --git a/topobench/data/datasets/ogbn_arxiv_dataset.py b/topobench/data/datasets/ogbn_arxiv_dataset.py new file mode 100644 index 
000000000..01c702f7c --- /dev/null +++ b/topobench/data/datasets/ogbn_arxiv_dataset.py @@ -0,0 +1,130 @@ +"""Dataset class for OGBN-Arxiv dataset.""" + +import os +from typing import ClassVar + +from ogb.nodeproppred import PygNodePropPredDataset +from omegaconf import DictConfig +from torch_geometric.data import Data, InMemoryDataset +from torch_geometric.io import fs + + +class OgbnArxivDataset(InMemoryDataset): + r"""Dataset class for OGBN-Arxiv dataset. + + Parameters + ---------- + root : str + Root directory where the dataset will be saved. + name : str + Name of the dataset. + parameters : DictConfig + Configuration parameters for the dataset. + """ + + URLS: ClassVar = { + "ogbn-arxiv": "https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv", + } + + FILE_FORMAT: ClassVar = { + "ogbn-arxiv": "auto", + } + + RAW_FILE_NAMES: ClassVar = {} + + def __init__( + self, + root: str, + name: str, + parameters: DictConfig, + ) -> None: + self.name = name + self.parameters = parameters + super().__init__(root) + + out = fs.torch_load(self.processed_paths[0]) + assert len(out) == 3 or len(out) == 4 + + if len(out) == 3: + data, self.slices, self.sizes = out + data_cls = Data + else: + data, self.slices, self.sizes, data_cls = out + + if not isinstance(data, dict): + self.data = data + else: + self.data = data_cls.from_dict(data) + + assert isinstance(self._data, Data) + + def __repr__(self) -> str: + return f"{self.name}(root={self.root}, name={self.name}, parameters={self.parameters})" + + def download(self) -> None: + """Download the dataset via OGB API (automatically handled).""" + _ = PygNodePropPredDataset(name=self.name, root=self.root) + + def process(self) -> None: + """Transform the raw dataset into TopoBench format.""" + dataset = PygNodePropPredDataset(name=self.name, root=self.root) + data = dataset[0] + + # OGB provides y as shape [num_nodes, 1] — flatten it + data.y = data.y.view(-1) + + data_list = [data] + self.data, self.slices = self.collate(data_list) + 
self._data_list = None + + # Save the processed dataset + fs.torch_save( + (self._data.to_dict(), self.slices, {}, self._data.__class__), + self.processed_paths[0], + ) + + @property + def raw_file_names(self): + """Return files required in raw_dir to skip download(). + + Returns + ------- + list + Empty list (OGB handles this internally). + """ + return [] # OGB handles this internally + + @property + def processed_file_names(self): + """Return processed file name. + + Returns + ------- + str + Name of the processed file. + """ + return "data.pt" + + @property + def processed_paths(self): + """Return processed path list. + + Returns + ------- + list + List of processed file paths. + """ + return [ + os.path.join(self.root, "processed", self.processed_file_names) + ] + + @property + def processed_root(self): + """Return processed root path. + + Returns + ------- + str + Path to the processed root directory. + """ + return self.root diff --git a/topobench/data/loaders/graph/ogbn_arxiv.py b/topobench/data/loaders/graph/ogbn_arxiv.py new file mode 100644 index 000000000..905f197f3 --- /dev/null +++ b/topobench/data/loaders/graph/ogbn_arxiv.py @@ -0,0 +1,60 @@ +"""Loader for the OGBN-Arxiv dataset.""" + +from pathlib import Path + +from ogb.nodeproppred import PygNodePropPredDataset +from omegaconf import DictConfig + +from topobench.data.loaders.base import AbstractLoader + + +class OgbnArxivDatasetLoader(AbstractLoader): + """Load the OGBN-Arxiv dataset. + + Parameters + ---------- + parameters : DictConfig + Configuration parameters containing data_dir and data_name. + """ + + def __init__(self, parameters: DictConfig) -> None: + super().__init__(parameters) + + def load_dataset(self) -> PygNodePropPredDataset: + """Load the OGBN-Arxiv dataset. + + Returns + ------- + PygNodePropPredDataset + The loaded OGBN-Arxiv dataset. 
+ """ + dataset = self._initialize_dataset() + self.data_dir = self._redefine_data_dir(dataset) + return dataset + + def _initialize_dataset(self) -> PygNodePropPredDataset: + """Initialize the OGBN-Arxiv dataset. + + Returns + ------- + PygNodePropPredDataset + The initialized dataset instance. + """ + return PygNodePropPredDataset( + name=self.parameters.data_name, root=str(self.root_data_dir) + ) + + def _redefine_data_dir(self, dataset: PygNodePropPredDataset) -> Path: + """Redefine the data directory for OGBN-Arxiv dataset. + + Parameters + ---------- + dataset : PygNodePropPredDataset + The dataset instance. + + Returns + ------- + Path + The processed root directory path. + """ + return Path(dataset.root) / dataset.name / "processed" From 473926588e70310bcd06e28ee218514749cbc77e Mon Sep 17 00:00:00 2001 From: giovanni-br Date: Mon, 17 Nov 2025 00:44:37 -0300 Subject: [PATCH 02/20] trying to fix errors --- test/pipeline/test_pipeline.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py index ef8a3bc9e..70c3f4075 100644 --- a/test/pipeline/test_pipeline.py +++ b/test/pipeline/test_pipeline.py @@ -4,8 +4,10 @@ from test._utils.simplified_pipeline import run -DATASET = "graph/ogbn-arxiv" # ADD YOUR DATASET HERE -MODELS = ["graph/gcn", "cell/topotune", "simplicial/topotune"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE +# Use a dataset whose labels are 1D so it is compatible with the current split utilities. +# This keeps changes confined to tests, as required. 
+DATASET = "graph/cocitation_cora" # ADD YOUR DATASET HERE +MODELS = ["graph/gcn", "cell/topotune", "simplicial/topotune"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE class TestPipeline: From e19fb6f8f359ad894503be74c50c24a79dcf368b Mon Sep 17 00:00:00 2001 From: Your Name for giovanni-br Date: Mon, 17 Nov 2025 09:13:55 -0300 Subject: [PATCH 03/20] update readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7e01eb151..04c744db0 100755 --- a/README.md +++ b/README.md @@ -378,6 +378,7 @@ Specially useful in pre-processing steps, these are the general data manipulatio | Empire | Classification | Heterophilic dataset. | [Source](https://arxiv.org/pdf/2302.11640) | | Tolokers | Classification | Heterophilic dataset. | [Source](https://arxiv.org/pdf/2302.11640) | | US-county-demos | Regression | In turn each node attribute is used as the target label. | [Source](https://arxiv.org/pdf/2002.08274) | +| ogbn-arxiv | Classification | ogbn-arxiv dataset (node classification) | [Source](https://arxiv.org/abs/2005.00687) | | ZINC | Regression | Graph-level regression. | [Source](https://pubs.acs.org/doi/10.1021/ci3001277) | From cb1483aca8a013e59d317a2159b17fac0aa959f4 Mon Sep 17 00:00:00 2001 From: Your Name for giovanni-br Date: Mon, 17 Nov 2025 11:29:33 -0300 Subject: [PATCH 04/20] Document ogbn-arxiv dataset and fix AbstractLoader path --- README.md | 2 +- docs/tdl-challenge/index.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 04c744db0..5e1187284 100755 --- a/README.md +++ b/README.md @@ -366,6 +366,7 @@ Specially useful in pre-processing steps, these are the general data manipulatio | Cora | Classification | Cocitation dataset. | [Source](https://link.springer.com/article/10.1023/A:1009953814988) | | Citeseer | Classification | Cocitation dataset. | [Source](https://dl.acm.org/doi/10.1145/276675.276685) | | Pubmed | Classification | Cocitation dataset. 
| [Source](https://ojs.aaai.org/aimagazine/index.php/aimagazine/article/view/2157) | +| ogbn-arxiv | Classification | ogbn-arxiv dataset (node classification) | [Source](https://arxiv.org/abs/2005.00687) | | MUTAG | Classification | Graph-level classification. | [Source](https://pubs.acs.org/doi/abs/10.1021/jm00106a046) | | PROTEINS | Classification | Graph-level classification. | [Source](https://academic.oup.com/bioinformatics/article/21/suppl_1/i47/202991) | | NCI1 | Classification | Graph-level classification. | [Source](https://ieeexplore.ieee.org/document/4053093) | @@ -378,7 +379,6 @@ Specially useful in pre-processing steps, these are the general data manipulatio | Empire | Classification | Heterophilic dataset. | [Source](https://arxiv.org/pdf/2302.11640) | | Tolokers | Classification | Heterophilic dataset. | [Source](https://arxiv.org/pdf/2302.11640) | | US-county-demos | Regression | In turn each node attribute is used as the target label. | [Source](https://arxiv.org/pdf/2002.08274) | -| ogbn-arxiv | Classification | ogbn-arxiv dataset (node classification) | [Source](https://arxiv.org/abs/2005.00687) | | ZINC | Regression | Graph-level regression. | [Source](https://pubs.acs.org/doi/10.1021/ci3001277) | diff --git a/docs/tdl-challenge/index.rst b/docs/tdl-challenge/index.rst index 29826736e..cac0ddda6 100644 --- a/docs/tdl-challenge/index.rst +++ b/docs/tdl-challenge/index.rst @@ -255,7 +255,7 @@ Requirements for Mission A (Categories A1 and A2) b. Define a class ``{Name}DatasetLoader`` implementing ``load_dataset()`` that loads the entire dataset (optionally with pre-defined splits). - c. This class must inherit from ``data.loaders.base.AbstractLoader``. + c. This class must inherit from ``topobench.data.loaders.base.AbstractLoader``. 2. 
*(Only if necessary)* ``{name}_dataset.py`` **or** ``{name}_datasets.py`` From 6a921a93d99e0aa679f586b2e79f00c6550a8261 Mon Sep 17 00:00:00 2001 From: Alexsandro Date: Wed, 19 Nov 2025 02:42:41 -0300 Subject: [PATCH 05/20] Adapted script name to requirements --- configs/dataset/graph/ogbn_arxiv.yaml | 2 +- .../graph/{ogbn_arxiv.py => ogbn_arxiv_dataset_loader.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename topobench/data/loaders/graph/{ogbn_arxiv.py => ogbn_arxiv_dataset_loader.py} (100%) diff --git a/configs/dataset/graph/ogbn_arxiv.yaml b/configs/dataset/graph/ogbn_arxiv.yaml index 1954f60b2..0752c038e 100644 --- a/configs/dataset/graph/ogbn_arxiv.yaml +++ b/configs/dataset/graph/ogbn_arxiv.yaml @@ -1,6 +1,6 @@ # Dataset loader config loader: - _target_: topobench.data.loaders.graph.ogbn_arxiv.OgbnArxivDatasetLoader + _target_: topobench.data.loaders.graph.ogbn_arxiv_dataset_loader.OgbnArxivDatasetLoader parameters: data_domain: graph data_type: ogb diff --git a/topobench/data/loaders/graph/ogbn_arxiv.py b/topobench/data/loaders/graph/ogbn_arxiv_dataset_loader.py similarity index 100% rename from topobench/data/loaders/graph/ogbn_arxiv.py rename to topobench/data/loaders/graph/ogbn_arxiv_dataset_loader.py From 2aaa9322076b31e24cc045fa49b0d392a050e9e3 Mon Sep 17 00:00:00 2001 From: Your Name for giovanni-br Date: Wed, 19 Nov 2025 15:10:56 -0300 Subject: [PATCH 06/20] making the loader general --- configs/dataset/graph/ogbn_arxiv.yaml | 2 +- configs/dataset/graph/ogbn_products.yaml | 30 ++++ topobench/data/datasets/ogbn_arxiv_dataset.py | 130 ------------------ ...taset_loader.py => ogbn_dataset_loader.py} | 14 +- 4 files changed, 38 insertions(+), 138 deletions(-) create mode 100644 configs/dataset/graph/ogbn_products.yaml delete mode 100644 topobench/data/datasets/ogbn_arxiv_dataset.py rename topobench/data/loaders/graph/{ogbn_arxiv_dataset_loader.py => ogbn_dataset_loader.py} (77%) diff --git a/configs/dataset/graph/ogbn_arxiv.yaml 
b/configs/dataset/graph/ogbn_arxiv.yaml index 0752c038e..10ad188b1 100644 --- a/configs/dataset/graph/ogbn_arxiv.yaml +++ b/configs/dataset/graph/ogbn_arxiv.yaml @@ -1,6 +1,6 @@ # Dataset loader config loader: - _target_: topobench.data.loaders.graph.ogbn_arxiv_dataset_loader.OgbnArxivDatasetLoader + _target_: topobench.data.loaders.graph.ogbn_dataset_loader.OGBNDatasetLoader parameters: data_domain: graph data_type: ogb diff --git a/configs/dataset/graph/ogbn_products.yaml b/configs/dataset/graph/ogbn_products.yaml new file mode 100644 index 000000000..92ebd1329 --- /dev/null +++ b/configs/dataset/graph/ogbn_products.yaml @@ -0,0 +1,30 @@ +loader: + _target_: topobench.data.loaders.graph.ogbn_dataset_loader.OGBNDatasetLoader + parameters: + data_domain: graph + data_type: ogb + data_name: ogbn-products + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + +# Dataset parameters +parameters: + num_features: 100 + num_classes: 47 + task: classification + loss_type: cross_entropy + monitor_metric: accuracy + task_level: node + +# Split parameters +split_params: + learning_setting: transductive + split_type: predefined + data_seed: 0 + +# Dataloader parameters +dataloader_params: + batch_size: 1 + num_workers: 1 + pin_memory: False + + diff --git a/topobench/data/datasets/ogbn_arxiv_dataset.py b/topobench/data/datasets/ogbn_arxiv_dataset.py deleted file mode 100644 index 01c702f7c..000000000 --- a/topobench/data/datasets/ogbn_arxiv_dataset.py +++ /dev/null @@ -1,130 +0,0 @@ -"""Dataset class for OGBN-Arxiv dataset.""" - -import os -from typing import ClassVar - -from ogb.nodeproppred import PygNodePropPredDataset -from omegaconf import DictConfig -from torch_geometric.data import Data, InMemoryDataset -from torch_geometric.io import fs - - -class OgbnArxivDataset(InMemoryDataset): - r"""Dataset class for OGBN-Arxiv dataset. 
- - Parameters - ---------- - root : str - Root directory where the dataset will be saved. - name : str - Name of the dataset. - parameters : DictConfig - Configuration parameters for the dataset. - """ - - URLS: ClassVar = { - "ogbn-arxiv": "https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv", - } - - FILE_FORMAT: ClassVar = { - "ogbn-arxiv": "auto", - } - - RAW_FILE_NAMES: ClassVar = {} - - def __init__( - self, - root: str, - name: str, - parameters: DictConfig, - ) -> None: - self.name = name - self.parameters = parameters - super().__init__(root) - - out = fs.torch_load(self.processed_paths[0]) - assert len(out) == 3 or len(out) == 4 - - if len(out) == 3: - data, self.slices, self.sizes = out - data_cls = Data - else: - data, self.slices, self.sizes, data_cls = out - - if not isinstance(data, dict): - self.data = data - else: - self.data = data_cls.from_dict(data) - - assert isinstance(self._data, Data) - - def __repr__(self) -> str: - return f"{self.name}(root={self.root}, name={self.name}, parameters={self.parameters})" - - def download(self) -> None: - """Download the dataset via OGB API (automatically handled).""" - _ = PygNodePropPredDataset(name=self.name, root=self.root) - - def process(self) -> None: - """Transform the raw dataset into TopoBench format.""" - dataset = PygNodePropPredDataset(name=self.name, root=self.root) - data = dataset[0] - - # OGB provides y as shape [num_nodes, 1] — flatten it - data.y = data.y.view(-1) - - data_list = [data] - self.data, self.slices = self.collate(data_list) - self._data_list = None - - # Save the processed dataset - fs.torch_save( - (self._data.to_dict(), self.slices, {}, self._data.__class__), - self.processed_paths[0], - ) - - @property - def raw_file_names(self): - """Return files required in raw_dir to skip download(). - - Returns - ------- - list - Empty list (OGB handles this internally). 
- """ - return [] # OGB handles this internally - - @property - def processed_file_names(self): - """Return processed file name. - - Returns - ------- - str - Name of the processed file. - """ - return "data.pt" - - @property - def processed_paths(self): - """Return processed path list. - - Returns - ------- - list - List of processed file paths. - """ - return [ - os.path.join(self.root, "processed", self.processed_file_names) - ] - - @property - def processed_root(self): - """Return processed root path. - - Returns - ------- - str - Path to the processed root directory. - """ - return self.root diff --git a/topobench/data/loaders/graph/ogbn_arxiv_dataset_loader.py b/topobench/data/loaders/graph/ogbn_dataset_loader.py similarity index 77% rename from topobench/data/loaders/graph/ogbn_arxiv_dataset_loader.py rename to topobench/data/loaders/graph/ogbn_dataset_loader.py index 905f197f3..2cbb378f1 100644 --- a/topobench/data/loaders/graph/ogbn_arxiv_dataset_loader.py +++ b/topobench/data/loaders/graph/ogbn_dataset_loader.py @@ -1,4 +1,4 @@ -"""Loader for the OGBN-Arxiv dataset.""" +"""Loader for OGB node property prediction datasets.""" from pathlib import Path @@ -8,8 +8,8 @@ from topobench.data.loaders.base import AbstractLoader -class OgbnArxivDatasetLoader(AbstractLoader): - """Load the OGBN-Arxiv dataset. +class OGBNDatasetLoader(AbstractLoader): + """Load OGB node property prediction datasets (ogbn-*). Parameters ---------- @@ -21,19 +21,19 @@ def __init__(self, parameters: DictConfig) -> None: super().__init__(parameters) def load_dataset(self) -> PygNodePropPredDataset: - """Load the OGBN-Arxiv dataset. + """Load an OGB node property prediction dataset. Returns ------- PygNodePropPredDataset - The loaded OGBN-Arxiv dataset. + The loaded OGBN dataset. """ dataset = self._initialize_dataset() self.data_dir = self._redefine_data_dir(dataset) return dataset def _initialize_dataset(self) -> PygNodePropPredDataset: - """Initialize the OGBN-Arxiv dataset. 
+ """Initialize the OGBN dataset specified by ``parameters.data_name``. Returns ------- @@ -45,7 +45,7 @@ def _initialize_dataset(self) -> PygNodePropPredDataset: ) def _redefine_data_dir(self, dataset: PygNodePropPredDataset) -> Path: - """Redefine the data directory for OGBN-Arxiv dataset. + """Redefine the data directory for the OGBN dataset. Parameters ---------- From 83f564da0698e35b51d52dfd968abde57fbb914a Mon Sep 17 00:00:00 2001 From: Your Name for giovanni-br Date: Wed, 19 Nov 2025 15:27:54 -0300 Subject: [PATCH 07/20] fix loader --- test/data/load/test_datasetloaders.py | 35 ++++++++++++------- .../data/loaders/graph/ogbn_dataset_loader.py | 12 ++++++- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/test/data/load/test_datasetloaders.py b/test/data/load/test_datasetloaders.py index cf333b25d..0028c54e0 100644 --- a/test/data/load/test_datasetloaders.py +++ b/test/data/load/test_datasetloaders.py @@ -37,19 +37,28 @@ def _gather_config_files(self, base_dir: Path) -> List[str]: config_files = [] config_base_dir = base_dir / "configs/dataset" # Below the datasets that have some default transforms manually overriten with no_transform, - exclude_datasets = {"karate_club.yaml", - # Below the datasets that have some default transforms with we manually overriten with no_transform, - # due to lack of default transform for domain2domain - "REDDIT-BINARY.yaml", "IMDB-MULTI.yaml", "IMDB-BINARY.yaml", #"ZINC.yaml" - "ogbg-molpcba.yaml", "manual_dataset.yaml", # "ogbg-molhiv.yaml" - "roman_empire.yaml", # Corrupted data file (BadZipFile error) - "ogbn_arxiv.yaml", # Corrupted arxiv.zip file (BadZipFile error) - "Mushroom.yaml", # Duplicate .ipynb_checkpoints folder (shutil.Error) - "ModelNet40.yaml" # Large download - prone to network errors (ChunkedEncodingError) - } - - # Below the datasets that takes quite some time to load and process - self.long_running_datasets = {"mantra_name.yaml", "mantra_orientation.yaml", "mantra_genus.yaml", 
"mantra_betti_numbers.yaml"} + exclude_datasets = { + "karate_club.yaml", + # Below the datasets that have some default transforms which we manually override with no_transform, + # due to lack of default transform for domain2domain + "REDDIT-BINARY.yaml", + "IMDB-MULTI.yaml", + "IMDB-BINARY.yaml", # "ZINC.yaml" + "ogbg-molpcba.yaml", + "manual_dataset.yaml", # "ogbg-molhiv.yaml" + "roman_empire.yaml", # Corrupted data file (BadZipFile error) + "Mushroom.yaml", # Duplicate .ipynb_checkpoints folder (shutil.Error) + "ModelNet40.yaml", # Large download - prone to network errors (ChunkedEncodingError) + } + + # Below the datasets that take quite some time to load and process + self.long_running_datasets = { + "mantra_name.yaml", + "mantra_orientation.yaml", + "mantra_genus.yaml", + "mantra_betti_numbers.yaml", + "ogbn_arxiv.yaml", + } for dir_path in config_base_dir.iterdir(): diff --git a/topobench/data/loaders/graph/ogbn_dataset_loader.py b/topobench/data/loaders/graph/ogbn_dataset_loader.py index 2cbb378f1..7e2eba685 100644 --- a/topobench/data/loaders/graph/ogbn_dataset_loader.py +++ b/topobench/data/loaders/graph/ogbn_dataset_loader.py @@ -20,9 +20,19 @@ class OGBNDatasetLoader(AbstractLoader): def __init__(self, parameters: DictConfig) -> None: super().__init__(parameters) - def load_dataset(self) -> PygNodePropPredDataset: + def load_dataset(self, **kwargs) -> PygNodePropPredDataset: """Load an OGB node property prediction dataset. + Additional keyword arguments are accepted for API compatibility with + other loaders (e.g. ``slice`` used in tests for long-running datasets), + but are currently ignored because OGBN datasets are represented as a + single large graph. + + Parameters + ---------- + **kwargs : dict + Additional keyword arguments accepted for API compatibility. 
+ Returns ------- PygNodePropPredDataset From 663260816d0d29857d72093b82b1e9ba92cd080f Mon Sep 17 00:00:00 2001 From: Your Name for giovanni-br Date: Wed, 19 Nov 2025 17:07:37 -0300 Subject: [PATCH 08/20] fix loader --- test/data/load/test_datasetloaders.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/data/load/test_datasetloaders.py b/test/data/load/test_datasetloaders.py index 0028c54e0..3d559ae62 100644 --- a/test/data/load/test_datasetloaders.py +++ b/test/data/load/test_datasetloaders.py @@ -49,6 +49,10 @@ def _gather_config_files(self, base_dir: Path) -> List[str]: "roman_empire.yaml", # Corrupted data file (BadZipFile error) "Mushroom.yaml", # Duplicate .ipynb_checkpoints folder (shutil.Error) "ModelNet40.yaml", # Large download - prone to network errors (ChunkedEncodingError) + # OGBN datasets require an interactive confirmation prompt and multi-GB + # download, which is not suitable for automated CI runs. + "ogbn_arxiv.yaml", + "ogbn_products.yaml", } # Below the datasets that take quite some time to load and process @@ -57,7 +61,6 @@ def _gather_config_files(self, base_dir: Path) -> List[str]: "mantra_orientation.yaml", "mantra_genus.yaml", "mantra_betti_numbers.yaml", - "ogbn_arxiv.yaml", } From a17a44a7ee3e30a0663050e34f32a0479b453a38 Mon Sep 17 00:00:00 2001 From: Your Name for giovanni-br Date: Wed, 19 Nov 2025 19:24:00 -0300 Subject: [PATCH 09/20] removing problem with interaction with the terminal --- test/data/load/test_datasetloaders.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/data/load/test_datasetloaders.py b/test/data/load/test_datasetloaders.py index 3d559ae62..efc9988b0 100644 --- a/test/data/load/test_datasetloaders.py +++ b/test/data/load/test_datasetloaders.py @@ -49,9 +49,6 @@ def _gather_config_files(self, base_dir: Path) -> List[str]: "roman_empire.yaml", # Corrupted data file (BadZipFile error) "Mushroom.yaml", # Duplicate .ipynb_checkpoints folder (shutil.Error) 
"ModelNet40.yaml", # Large download - prone to network errors (ChunkedEncodingError) - # OGBN datasets require an interactive confirmation prompt and multi-GB - # download, which is not suitable for automated CI runs. - "ogbn_arxiv.yaml", "ogbn_products.yaml", } @@ -61,6 +58,7 @@ def _gather_config_files(self, base_dir: Path) -> List[str]: "mantra_orientation.yaml", "mantra_genus.yaml", "mantra_betti_numbers.yaml", + "ogbn_arxiv.yaml", } From 39d01ba133e484501c2543b764f8aa9817ed4576 Mon Sep 17 00:00:00 2001 From: Alexsandro Date: Thu, 20 Nov 2025 15:20:00 -0300 Subject: [PATCH 10/20] Adding ogbn-proteins dataset --- configs/dataset/graph/ogbn_proteins.yaml | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 configs/dataset/graph/ogbn_proteins.yaml diff --git a/configs/dataset/graph/ogbn_proteins.yaml b/configs/dataset/graph/ogbn_proteins.yaml new file mode 100644 index 000000000..65df2cebe --- /dev/null +++ b/configs/dataset/graph/ogbn_proteins.yaml @@ -0,0 +1,29 @@ +# Dataset loader config +loader: + _target_: topobench.data.loaders.graph.ogbn_dataset_loader.OGBNDatasetLoader + parameters: + data_domain: graph + data_type: OGBNDataset + data_name: ogbn-proteins + data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} + +# Dataset parameters +parameters: + num_features: 8 + num_classes: 112 + task: classification + loss_type: cross_entropy + monitor_metric: accuracy + task_level: node + +# Split parameters +split_params: + learning_setting: transductive + split_type: predefined + data_seed: 0 + +# Dataloader parameters +dataloader_params: + batch_size: 1 + num_workers: 1 + pin_memory: False From 9e3dca900281b45f780b47b359c6f1a34f3c8c44 Mon Sep 17 00:00:00 2001 From: Alexsandro Date: Thu, 20 Nov 2025 15:20:45 -0300 Subject: [PATCH 11/20] Fixign datatype in yaml files --- configs/dataset/graph/ogbn_arxiv.yaml | 2 +- configs/dataset/graph/ogbn_products.yaml | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/configs/dataset/graph/ogbn_arxiv.yaml b/configs/dataset/graph/ogbn_arxiv.yaml index 10ad188b1..d3a879601 100644 --- a/configs/dataset/graph/ogbn_arxiv.yaml +++ b/configs/dataset/graph/ogbn_arxiv.yaml @@ -3,7 +3,7 @@ loader: _target_: topobench.data.loaders.graph.ogbn_dataset_loader.OGBNDatasetLoader parameters: data_domain: graph - data_type: ogb + data_type: OGBNDataset data_name: ogbn-arxiv data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} diff --git a/configs/dataset/graph/ogbn_products.yaml b/configs/dataset/graph/ogbn_products.yaml index 92ebd1329..0ee21825c 100644 --- a/configs/dataset/graph/ogbn_products.yaml +++ b/configs/dataset/graph/ogbn_products.yaml @@ -2,7 +2,7 @@ loader: _target_: topobench.data.loaders.graph.ogbn_dataset_loader.OGBNDatasetLoader parameters: data_domain: graph - data_type: ogb + data_type: OGBNDataset data_name: ogbn-products data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} From 095988f30d58896efcbc3220cb3a1e473cd13a75 Mon Sep 17 00:00:00 2001 From: Alexsandro Date: Thu, 20 Nov 2025 16:32:43 -0300 Subject: [PATCH 12/20] Fixing dataset names to match repository convention --- configs/dataset/graph/{ogbn_arxiv.yaml => ogbn-arxiv.yaml} | 3 +++ .../dataset/graph/{ogbn_products.yaml => ogbn-products.yaml} | 0 .../dataset/graph/{ogbn_proteins.yaml => ogbn-proteins.yaml} | 0 test/data/load/test_datasetloaders.py | 4 ++-- 4 files changed, 5 insertions(+), 2 deletions(-) rename configs/dataset/graph/{ogbn_arxiv.yaml => ogbn-arxiv.yaml} (96%) rename configs/dataset/graph/{ogbn_products.yaml => ogbn-products.yaml} (100%) rename configs/dataset/graph/{ogbn_proteins.yaml => ogbn-proteins.yaml} (100%) diff --git a/configs/dataset/graph/ogbn_arxiv.yaml b/configs/dataset/graph/ogbn-arxiv.yaml similarity index 96% rename from configs/dataset/graph/ogbn_arxiv.yaml rename to 
configs/dataset/graph/ogbn-arxiv.yaml index d3a879601..eb05c1088 100644 --- a/configs/dataset/graph/ogbn_arxiv.yaml +++ b/configs/dataset/graph/ogbn-arxiv.yaml @@ -27,3 +27,6 @@ dataloader_params: batch_size: 1 num_workers: 1 pin_memory: False + +trainer: + max_epochs: 1 diff --git a/configs/dataset/graph/ogbn_products.yaml b/configs/dataset/graph/ogbn-products.yaml similarity index 100% rename from configs/dataset/graph/ogbn_products.yaml rename to configs/dataset/graph/ogbn-products.yaml diff --git a/configs/dataset/graph/ogbn_proteins.yaml b/configs/dataset/graph/ogbn-proteins.yaml similarity index 100% rename from configs/dataset/graph/ogbn_proteins.yaml rename to configs/dataset/graph/ogbn-proteins.yaml diff --git a/test/data/load/test_datasetloaders.py b/test/data/load/test_datasetloaders.py index efc9988b0..cbfb23004 100644 --- a/test/data/load/test_datasetloaders.py +++ b/test/data/load/test_datasetloaders.py @@ -49,7 +49,7 @@ def _gather_config_files(self, base_dir: Path) -> List[str]: "roman_empire.yaml", # Corrupted data file (BadZipFile error) "Mushroom.yaml", # Duplicate .ipynb_checkpoints folder (shutil.Error) "ModelNet40.yaml", # Large download - prone to network errors (ChunkedEncodingError) - "ogbn_products.yaml", + "ogbn-products.yaml", } # Below the datasets that take quite some time to load and process @@ -58,7 +58,7 @@ def _gather_config_files(self, base_dir: Path) -> List[str]: "mantra_orientation.yaml", "mantra_genus.yaml", "mantra_betti_numbers.yaml", - "ogbn_arxiv.yaml", + "ogbn-arxiv.yaml", } From 1383dd05655820a255fdc38f9672580c044c106f Mon Sep 17 00:00:00 2001 From: Alexsandro Date: Thu, 20 Nov 2025 16:57:34 -0300 Subject: [PATCH 13/20] Fixing dimensionality issue --- topobench/data/loaders/graph/ogbn_dataset_loader.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/topobench/data/loaders/graph/ogbn_dataset_loader.py b/topobench/data/loaders/graph/ogbn_dataset_loader.py index 7e2eba685..f23981f1a 100644 --- 
a/topobench/data/loaders/graph/ogbn_dataset_loader.py +++ b/topobench/data/loaders/graph/ogbn_dataset_loader.py @@ -2,6 +2,7 @@ from pathlib import Path +import torch from ogb.nodeproppred import PygNodePropPredDataset from omegaconf import DictConfig @@ -40,6 +41,13 @@ def load_dataset(self, **kwargs) -> PygNodePropPredDataset: """ dataset = self._initialize_dataset() self.data_dir = self._redefine_data_dir(dataset) + + # Convert attributes to float + dataset._data.x = dataset._data.x.to(torch.float) + # Squeeze the target tensor + dataset._data.y = dataset._data.y.squeeze(1) + dataset.split_idx = dataset.get_idx_split() + return dataset def _initialize_dataset(self) -> PygNodePropPredDataset: From 58aeb155ac79b3b695ad953d6e04e869ec79e9fc Mon Sep 17 00:00:00 2001 From: Alexsandro Date: Thu, 20 Nov 2025 16:58:08 -0300 Subject: [PATCH 14/20] Attempt to fix ogbn-arxiv.yaml config --- configs/dataset/graph/ogbn-arxiv.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/configs/dataset/graph/ogbn-arxiv.yaml b/configs/dataset/graph/ogbn-arxiv.yaml index eb05c1088..3996692a3 100644 --- a/configs/dataset/graph/ogbn-arxiv.yaml +++ b/configs/dataset/graph/ogbn-arxiv.yaml @@ -19,8 +19,10 @@ parameters: # Split parameters split_params: learning_setting: transductive - split_type: predefined + data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} + split_type: random data_seed: 0 + train_prop: 0.5 # for "random" strategy splitting # Dataloader parameters dataloader_params: batch_size: 1 num_workers: 1 pin_memory: False From 859afdc6e2acc2b5a30e4cffdaa54a8a4d96af2b Mon Sep 17 00:00:00 2001 From: Alexsandro Date: Thu, 20 Nov 2025 19:07:04 -0300 Subject: [PATCH 15/20] Deleting ogbn-proteins --- configs/dataset/graph/ogbn-proteins.yaml | 29 ------------------------ 1 file changed, 29 deletions(-) delete mode 100644 configs/dataset/graph/ogbn-proteins.yaml diff --git a/configs/dataset/graph/ogbn-proteins.yaml b/configs/dataset/graph/ogbn-proteins.yaml deleted file mode 100644
index 65df2cebe..000000000 --- a/configs/dataset/graph/ogbn-proteins.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Dataset loader config -loader: - _target_: topobench.data.loaders.graph.ogbn_dataset_loader.OGBNDatasetLoader - parameters: - data_domain: graph - data_type: OGBNDataset - data_name: ogbn-proteins - data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type} - -# Dataset parameters -parameters: - num_features: 8 - num_classes: 112 - task: classification - loss_type: cross_entropy - monitor_metric: accuracy - task_level: node - -# Split parameters -split_params: - learning_setting: transductive - split_type: predefined - data_seed: 0 - -# Dataloader parameters -dataloader_params: - batch_size: 1 - num_workers: 1 - pin_memory: False From 1ccd71b94a4d7e6507d1a1cc451349fce277c4a3 Mon Sep 17 00:00:00 2001 From: Alexsandro Date: Thu, 20 Nov 2025 19:08:26 -0300 Subject: [PATCH 16/20] Fixing config files for ogbn datasets --- configs/dataset/graph/ogbn-arxiv.yaml | 6 ++---- configs/dataset/graph/ogbn-products.yaml | 5 ++++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/configs/dataset/graph/ogbn-arxiv.yaml b/configs/dataset/graph/ogbn-arxiv.yaml index 3996692a3..77bca0bd8 100644 --- a/configs/dataset/graph/ogbn-arxiv.yaml +++ b/configs/dataset/graph/ogbn-arxiv.yaml @@ -20,8 +20,9 @@ parameters: split_params: learning_setting: transductive data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} - split_type: random + split_type: random #'k-fold' # either "k-fold" or "random" strategies data_seed: 0 + k: 10 # for "k-fold" Cross-Validation train_prop: 0.5 # for "random" strategy splitting # Dataloader parameters @@ -29,6 +30,3 @@ dataloader_params: batch_size: 1 num_workers: 1 pin_memory: False - -trainer: - max_epochs: 1 diff --git a/configs/dataset/graph/ogbn-products.yaml b/configs/dataset/graph/ogbn-products.yaml index 0ee21825c..1989101d0 100644 --- 
a/configs/dataset/graph/ogbn-products.yaml +++ b/configs/dataset/graph/ogbn-products.yaml @@ -18,8 +18,11 @@ parameters: # Split parameters split_params: learning_setting: transductive - split_type: predefined + data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name} + split_type: random #'k-fold' # either "k-fold" or "random" strategies data_seed: 0 + k: 10 # for "k-fold" Cross-Validation + train_prop: 0.5 # for "random" strategy splitting # Dataloader parameters dataloader_params: From ec4553bf5e63dc0d6951caec5daabc200cfeccd2 Mon Sep 17 00:00:00 2001 From: Alexsandro Date: Thu, 20 Nov 2025 19:41:56 -0300 Subject: [PATCH 17/20] Removing unnecessary code from ogbn_dataset_loader --- topobench/data/loaders/graph/ogbn_dataset_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topobench/data/loaders/graph/ogbn_dataset_loader.py b/topobench/data/loaders/graph/ogbn_dataset_loader.py index f23981f1a..f1f73321d 100644 --- a/topobench/data/loaders/graph/ogbn_dataset_loader.py +++ b/topobench/data/loaders/graph/ogbn_dataset_loader.py @@ -10,7 +10,7 @@ class OGBNDatasetLoader(AbstractLoader): - """Load OGB node property prediction datasets (ogbn-*). + """Load OGB node property prediction datasets (ogbn-arxiv, ogbn-products). Parameters ---------- From 1061ea8398ceb640598b4dc82139babc86b46d1a Mon Sep 17 00:00:00 2001 From: Your Name for giovanni-br Date: Fri, 21 Nov 2025 10:36:07 -0300 Subject: [PATCH 18/20] Add test ogbn-arxiv in the pipeline --- test/pipeline/test_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py index 70c3f4075..4642511b0 100644 --- a/test/pipeline/test_pipeline.py +++ b/test/pipeline/test_pipeline.py @@ -6,8 +6,8 @@ # Use a dataset whose labels are 1D so it is compatible with the current split utilities. # This keeps changes confined to tests, as required. 
-DATASET = "graph/cocitation_cora" # ADD YOUR DATASET HERE -MODELS = ["graph/gcn", "cell/topotune", "simplicial/topotune"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE +DATASET = "graph/ogbn-arxiv" # ADD YOUR DATASET HERE +MODELS = ["graph/gcn", "cell/topotune"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE class TestPipeline: From ff7175354b9701e8bcd48e8c7fea5880d02d6a42 Mon Sep 17 00:00:00 2001 From: Your Name for giovanni-br Date: Fri, 21 Nov 2025 12:03:55 -0300 Subject: [PATCH 19/20] making it lighter --- test/pipeline/test_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pipeline/test_pipeline.py b/test/pipeline/test_pipeline.py index 4642511b0..2d89ddf3a 100644 --- a/test/pipeline/test_pipeline.py +++ b/test/pipeline/test_pipeline.py @@ -7,7 +7,7 @@ # Use a dataset whose labels are 1D so it is compatible with the current split utilities. # This keeps changes confined to tests, as required. DATASET = "graph/ogbn-arxiv" # ADD YOUR DATASET HERE -MODELS = ["graph/gcn", "cell/topotune"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE +MODELS = ["graph/gcn"] # ADD ONE OR SEVERAL MODELS OF YOUR CHOICE HERE class TestPipeline: From f7b592773c7b2337e494a4e50d70170f18e60c2c Mon Sep 17 00:00:00 2001 From: Alexsandro Date: Fri, 21 Nov 2025 14:42:38 -0300 Subject: [PATCH 20/20] Updating README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5e1187284..435fbb83c 100755 --- a/README.md +++ b/README.md @@ -366,7 +366,8 @@ Specially useful in pre-processing steps, these are the general data manipulatio | Cora | Classification | Cocitation dataset. | [Source](https://link.springer.com/article/10.1023/A:1009953814988) | | Citeseer | Classification | Cocitation dataset. | [Source](https://dl.acm.org/doi/10.1145/276675.276685) | | Pubmed | Classification | Cocitation dataset. 
| [Source](https://ojs.aaai.org/aimagazine/index.php/aimagazine/article/view/2157) | -| ogbn-arxiv | Classification | ogbn-arxiv dataset (node classification) | [Source](https://arxiv.org/abs/2005.00687) | +| ogbn-arxiv | Classification | Node property prediction (classification) | [Source](https://arxiv.org/abs/2005.00687) | +| ogbn-products | Classification | Node property prediction (classification) | [Source](https://arxiv.org/abs/2005.00687) | | MUTAG | Classification | Graph-level classification. | [Source](https://pubs.acs.org/doi/abs/10.1021/jm00106a046) | | PROTEINS | Classification | Graph-level classification. | [Source](https://academic.oup.com/bioinformatics/article/21/suppl_1/i47/202991) | | NCI1 | Classification | Graph-level classification. | [Source](https://ieeexplore.ieee.org/document/4053093) |