diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 18f3b4f2..0c7e321e 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,4 +1,4 @@
-### All Submissions
+### All submissions
 
 - [ ] Have you followed the guidelines in our [Contributing](../../CONTRIBUTING.md) document?
 - [ ] Have you checked to ensure there are no other open [Pull Requests](../../../pulls) for the same changes?
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 4f6af7ea..a61bcb44 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -25,7 +25,6 @@ jobs:
         make build
 
     - name: Check docstrings
-      continue-on-error: true   # TODO: remove when all docstring issues are fixed
       run: |
         make docstring-check
 
diff --git a/Makefile b/Makefile
index 7605111e..07d75216 100644
--- a/Makefile
+++ b/Makefile
@@ -27,7 +27,7 @@ setup-tensorboard:
 	@echo '=== Setup TensorBoard ==='
 	$(UV) pip install -e ".[tensorboard]"
 
-check: lint format typecheck
+check: typecheck format lint docstring-check
 
 format:
 	@echo '=== Formatting ==='
@@ -73,8 +73,6 @@ si-test:
 	@echo '=== Running single integration test for $(T) ==='
 	$(UV) run $(PYTEST) -n auto -s $(PROJECT_NAME)/integration_tests/$(T) -m "integration"
 
-
-
 # If the first argument is run...
 ifeq ($(firstword $(MAKECMDGOALS)),run)
   # use the rest as arguments for run...
diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md
index 4cbdc772..7502350b 100644
--- a/docs/getting-started/installation.md
+++ b/docs/getting-started/installation.md
@@ -58,7 +58,7 @@ HyperBench declares compatibility ranges for direct dependencies in `pyproject.t
 
 | Dependency | Supported range | Notes |
 | --- | --- | --- |
-| mkdocstrings[python] | `>=1.0.4,<2.0.0` |  |
+| mkdocstrings\[python\] | `>=1.0.4,<2.0.0` |  |
 | pre-commit | `>=4.5.1,<5.0.0` |  |
 | pytest | `>=9.0.3,<10.0.0` |  |
 | pytest-cov | `>=7.1.0,<8.0.0` |  |
diff --git a/examples/early_stopping.py b/examples/early_stopping.py
index 03e9c149..b0070fb5 100644
--- a/examples/early_stopping.py
+++ b/examples/early_stopping.py
@@ -81,7 +81,8 @@
     train_dataset.enrich_node_features(
         enricher=LaplacianPositionalEncodingEnricher(
             num_features=num_features,
-            # In transductive setting, use total number of nodes to ensure consistent encoding across splits
+            # In transductive setting, use total number of nodes to ensure consistent encoding
+            # across splits
             # as the train dataset contain all nodes but may have no hyperedges where they appear
             num_nodes=train_dataset.hdata.num_nodes,
         ),
diff --git a/examples/external_dataset.py b/examples/external_dataset.py
index b5b0b305..13d6e4d0 100644
--- a/examples/external_dataset.py
+++ b/examples/external_dataset.py
@@ -81,7 +81,8 @@
     train_dataset.enrich_node_features(
         enricher=LaplacianPositionalEncodingEnricher(
             num_features=num_features,
-            # In transductive setting, use total number of nodes to ensure consistent encoding across splits
+            # In transductive setting, use total number of nodes to ensure consistent encoding
+            # across splits
             # as the train dataset contain all nodes but may have no hyperedges where they appear
             num_nodes=train_dataset.hdata.num_nodes,
         ),
diff --git a/hyperbench/__init__.py b/hyperbench/__init__.py
index aaa13ee1..b13aa47e 100644
--- a/hyperbench/__init__.py
+++ b/hyperbench/__init__.py
@@ -10,5 +10,9 @@
 
 warnings.filterwarnings(
     "ignore",
-    message="ignore:Failing to pass a value to the 'type_params' parameter of 'typing._eval_type' is deprecated.*",
+    # ``message`` is a regex matched against the start of the warning text (no action prefix).
+    message=(
+        "Failing to pass a value to the 'type_params' parameter of "
+        "'typing._eval_type' is deprecated.*"
+    ),
 )
diff --git a/hyperbench/data/dataset.py b/hyperbench/data/dataset.py
index 6dbbe5a0..230f1484 100644
--- a/hyperbench/data/dataset.py
+++ b/hyperbench/data/dataset.py
@@ -30,7 +30,8 @@ class Dataset(TorchDataset):
 
     Args:
         hdata: The processed hypergraph data in HData format.
-        sampling_strategy: The strategy used for sampling sub-hypergraphs (e.g., by node IDs or hyperedge IDs).
+        sampling_strategy: The strategy used for sampling sub-hypergraphs
+            (e.g., by node IDs or hyperedge IDs).
             If not provided, defaults to ``SamplingStrategy.HYPEREDGE``.
     """
 
@@ -44,10 +45,11 @@ def __init__(
 
         Args:
             hdata: Optional HData object to initialize the dataset with.
-                If provided, the dataset will be initialized with this data instead of loading and processing from HIF. Must be provided if prepare is set to ``False``.
-            sampling_strategy: The sampling strategy to use for the dataset. If not provided, defaults to ``SamplingStrategy.HYPEREDGE``.
+                If provided, the dataset will be initialized with this data instead of loading and
+                processing from HIF. Must be provided if prepare is set to ``False``.
+            sampling_strategy: The sampling strategy to use for the dataset. If not provided,
+                defaults to ``SamplingStrategy.HYPEREDGE``.
         """
-
         self.__sampler = create_sampler_from_strategy(sampling_strategy)
         self.sampling_strategy = sampling_strategy
         self.hdata = hdata if hdata is not None else HData.empty()
@@ -58,18 +60,23 @@ def __len__(self) -> int:
     def __getitem__(self, index: int | list[int]) -> HData:
         """
         Sample a sub-hypergraph based on the sampling strategy and return it as HData.
+
         If:
-        - Sampling by node IDs, the sub-hypergraph will contain all hyperedges incident to the sampled nodes and all nodes incident to those hyperedges.
-        - Sampling by hyperedge IDs, the sub-hypergraph will contain all nodes incident to the sampled hyperedges.
+            - Sampling by node IDs, the sub-hypergraph will contain all hyperedges incident to the
+              sampled nodes and all nodes incident to those hyperedges.
+            - Sampling by hyperedge IDs, the sub-hypergraph will contain all nodes incident to the
+              sampled hyperedges.
 
         Args:
-            index: An integer or a list of integers representing node or hyperedge IDs to sample, depending on the sampling strategy.
+            index: An integer or a list of integers representing node or hyperedge IDs to sample,
+                depending on the sampling strategy.
 
         Returns:
             hdata: An HData instance containing the sampled sub-hypergraph.
 
         Raises:
-            ValueError: If the provided index is invalid (e.g., empty list or list length exceeds number of nodes/hyperedges).
+            ValueError: If the provided index is invalid (e.g., empty list or list length exceeds
+                number of nodes/hyperedges).
             IndexError: If any node/hyperedge ID is out of bounds.
         """
         return self.__sampler.sample(index, self.hdata)
@@ -85,7 +92,8 @@ def from_hdata(
 
         Args:
             hdata: `HData` object containing the hypergraph data.
-            sampling_strategy: The sampling strategy to use for the dataset. If not provided, defaults to ``SamplingStrategy.HYPEREDGE``.
+            sampling_strategy: The sampling strategy to use for the dataset. If not provided,
+                defaults to ``SamplingStrategy.HYPEREDGE``.
 
         Returns:
             dataset: The `Dataset` instance with the provided `HData`.
@@ -100,11 +108,13 @@ def from_url(
         save_on_disk: bool = False,
     ) -> Dataset:
         """
-        Create a `Dataset` instance by loading a hypergraph from a URL pointing to a .json or .json.zst file in HIF format.
+        Create a `Dataset` instance by loading a hypergraph from a URL pointing to a .json or
+        .json.zst file in HIF format.
 
         Args:
             url: The URL to the .json or .json.zst file containing the HIF hypergraph data.
-            sampling_strategy: The sampling strategy to use for the dataset. If not provided, defaults to ``SamplingStrategy.HYPEREDGE``.
+            sampling_strategy: The sampling strategy to use for the dataset. If not provided,
+                defaults to ``SamplingStrategy.HYPEREDGE``.
             save_on_disk: Whether to save the downloaded file on disk.
 
         Returns:
@@ -121,11 +131,14 @@ def from_path(
         sampling_strategy: SamplingStrategy = SamplingStrategy.HYPEREDGE,
     ) -> Dataset:
         """
-        Create a `Dataset` instance by loading a hypergraph from a local file path pointing to a .json or .json.zst file in HIF format.
+        Create a `Dataset` instance by loading a hypergraph from a local file path pointing to a
+        .json or .json.zst file in HIF format.
 
         Args:
-            filepath: The local file path to the .json or .json.zst file containing the HIF hypergraph data.
-            sampling_strategy: The sampling strategy to use for the dataset. If not provided, defaults to ``SamplingStrategy.HYPEREDGE``.
+            filepath: The local file path to the .json or .json.zst file containing the
+                HIF hypergraph data.
+            sampling_strategy: The sampling strategy to use for the dataset. If not provided,
+                defaults to ``SamplingStrategy.HYPEREDGE``.
 
         Returns:
             dataset: The `Dataset` instance with the loaded hypergraph data.
@@ -143,7 +156,8 @@ def enrich_node_features(
         Enrich node features using the provided node feature enricher.
 
         Args:
-            enricher: An instance of NodeEnricher to generate structural node features from hypergraph topology.
+            enricher: An instance of NodeEnricher to generate structural node features from
+                hypergraph topology.
             enrichment_mode: How to combine generated features with existing ``hdata.x``.
                 ``concatenate`` appends new features to the existing ones as additional columns.
                 ``replace`` substitutes ``hdata.x`` entirely.
@@ -168,18 +182,22 @@ def enrich_node_features_from(
             >>> test_dataset.enrich_node_features_from(
             ...     train_dataset,
             ...     node_space_setting="inductive",
-            ...     fill_value=0.0,  # torch.tensor(0.0) also works and will be broadcast to the appropriate shape
+            ...     fill_value=0.0,  # torch.tensor(0.0) also works and will be broadcast to the
+            ...     # appropriate shape
             ... )
 
         Args:
             dataset_with_features: Source dataset providing node features.
             node_space_setting: The setting for the node space, determining how nodes are handled.
                 ``transductive`` (default) preserves the full node space of the target dataset.
-                ``inductive`` allows the target dataset to have a different node space, filling missing features with ``fill_value``.
-            fill_value: Scalar or vector used to fill missing node features when ``node_space_setting`` is not transductive.
+                ``inductive`` allows the target dataset to have a different node space, filling
+                missing features with ``fill_value``.
+            fill_value: Scalar or vector used to fill missing node features when
+                ``node_space_setting`` is not transductive.
 
         Raises:
-            ValueError: If the source dataset's node features cannot be aligned with the target dataset's nodes.
+            ValueError: If the source dataset's node features cannot be aligned with the target
+                dataset's nodes.
         """
         self.hdata = self.hdata.enrich_node_features_from(
             hdata_with_features=dataset_with_features.hdata,
@@ -196,8 +214,10 @@ def enrich_hyperedge_attr(
         Enrich hyperedge attributes using the provided hyperedge feature enricher.
 
         Args:
-            enricher: An instance of HyperedgeEnricher to generate structural hyperedge attributes from hypergraph topology.
-            enrichment_mode: How to combine generated attributes with existing ``hdata.hyperedge_attr``.
+            enricher: An instance of HyperedgeEnricher to generate structural hyperedge
+                attributes from hypergraph topology.
+            enrichment_mode: How to combine generated attributes with existing
+                ``hdata.hyperedge_attr``.
                 ``concatenate`` appends new attributes to the existing ones as additional columns.
                 ``replace`` substitutes ``hdata.hyperedge_attr`` entirely.
                 Defaults to ``replace`` if not provided.
@@ -213,8 +233,10 @@ def enrich_hyperedge_weights(
         Enrich hyperedge weights using the provided hyperedge weight enricher.
 
         Args:
-            enricher: An instance of HyperedgeEnricher to generate structural hyperedge weights from hypergraph topology.
-            enrichment_mode: How to combine generated weights with existing ``hdata.hyperedge_weights``.
+            enricher: An instance of HyperedgeEnricher to generate structural hyperedge weights
+                from hypergraph topology.
+            enrichment_mode: How to combine generated weights with existing
+                ``hdata.hyperedge_weights``.
                 ``concatenate`` appends new weights to the existing ones as additional columns.
                 ``replace`` substitutes ``hdata.hyperedge_weights`` entirely.
                 Defaults to ``replace`` if not provided.
@@ -242,7 +264,8 @@ def add_negative_samples(
         Create a new `Dataset` with sampled negative hyperedges added.
 
         Args:
-            negative_sampler: Sampler used to generate negative hyperedges from this dataset's ``hdata``.
+            negative_sampler: Sampler used to generate negative hyperedges from
+                this dataset's ``hdata``.
             seed: Optional random seed used for both negative sampling and the final shuffle.
 
         Returns:
@@ -265,10 +288,11 @@ def remove_hyperedges_with_fewer_than_k_nodes(
 
         Args:
             k: The minimum number of nodes a hyperedge must have to be retained.
-            preserve_global_node_ids: Whether to preserve the global node IDs after removing hyperedges. Defaults to ``False``.
-                If ``False``, the global node IDs will be reindexed to be contiguous after removing hyperedges.
-                If ``True``, the global node IDs will be preserved, which may cause some models to raise
-                as they may expect contiguous global node IDs.
+            preserve_global_node_ids: Whether to preserve the global node IDs
+                after removing hyperedges. Defaults to ``False``. If ``False``, the global node IDs
+                will be reindexed to be contiguous after removing hyperedges.
+                If ``True``, the global node IDs will be preserved, which may cause some models
+                to raise as they may expect contiguous global node IDs.
         """
         self.hdata = self.hdata.remove_hyperedges_with_fewer_than_k_nodes(
             k, preserve_global_node_ids
@@ -313,7 +337,9 @@ def split(
 
         Args:
             ratios: List of floats summing to ``1.0``, e.g., ``[0.8, 0.1, 0.1]``.
-            shuffle: Whether to shuffle hyperedges before splitting. Defaults to ``False`` for deterministic splits.
+            shuffle: Whether to shuffle hyperedges before splitting. Defaults to ``False``
+                for deterministic splits. Pass ``seed`` to make a shuffled split
+                reproducible.
             node_space_setting: Whether to preserve the full node space in the splits.
                 ``transductive`` (default) preserves the full node space on the
                 first split. ``inductive`` keeps each split's local node space.
@@ -324,9 +350,11 @@ def split(
             train_split_idx: The index of the split to treat as the train split. Defaults to ``0``,
                 so the first split is the train split that gets the full node space in the
                 transductive setting and is optionally rebalanced to cover all nodes.
-                This is used only when ``node_space_setting=="transductive"`` and ``cover_all_nodes_in_train_split==True``,
+                This is used only when ``node_space_setting=="transductive"``
+                and ``cover_all_nodes_in_train_split==True``,
                 to determine which split should be rebalanced to cover all nodes.
-                For the 'inductive' setting, splits are always returned based on the provided ratios.
+                For the 'inductive' setting, splits are always returned based on
+                the provided ratios.
             seed: Optional random seed for reproducibility. Ignored if shuffle is set to ``False``.
             splitter: Optional dataset splitter. When provided, it owns split
                 construction and final-ratio reporting.
@@ -396,10 +424,13 @@ def split_with_ratios(
             train_split_idx: The index of the split to treat as the train split. Defaults to ``0``,
                 so the first split is the train split that gets the full node space in the
                 transductive setting and is optionally rebalanced to cover all nodes.
-                This is used only when ``node_space_setting=="transductive"`` and ``cover_all_nodes_in_train_split==True``,
+                This is used only when ``node_space_setting=="transductive"``
+                and ``cover_all_nodes_in_train_split==True``,
                 to determine which split should be rebalanced to cover all nodes.
-                For the 'inductive' setting, splits are always returned based on the provided ratios.
-            seed: Optional random seed for reproducibility. Ignored if ``shuffle`` is set to ``False``.
+                For the 'inductive' setting, splits are always returned based on
+                the provided ratios.
+            seed: Optional random seed for reproducibility. Ignored if ``shuffle``
+                is set to ``False``.
 
         Returns:
             datasets_and_ratios: A tuple containing the split datasets and their
@@ -451,27 +482,35 @@ def transform_hyperedge_attrs(
     def stats(self) -> dict[str, Any]:
         """
         Compute statistics for the dataset.
+
         This method currently delegates to the underlying HData's stats method.
-        The fields returned in the dictionary include:
-        - ``shape_x``: The shape of the node feature matrix ``x``.
-        - ``shape_hyperedge_attr``: The shape of the hyperedge attribute matrix, or ``None`` if hyperedge attributes are not present.
-        - ``num_nodes``: The number of nodes in the hypergraph.
-        - ``num_hyperedges``: The number of hyperedges in the hypergraph.
-        - ``avg_degree_node_raw``: The average degree of nodes, calculated as the mean number of hyperedges each node belongs to.
-        - ``avg_degree_node``: The floored node average degree.
-        - ``avg_degree_hyperedge_raw``: The average size of hyperedges, calculated as the mean number of nodes each hyperedge contains.
-        - ``avg_degree_hyperedge``: The floored hyperedge average size.
-        - ``node_degree_max``: The maximum degree of any node in the hypergraph.
-        - ``hyperedge_degree_max``: The maximum size of any hyperedge in the hypergraph.
-        - ``node_degree_median``: The median degree of nodes in the hypergraph.
-        - ``hyperedge_degree_median``: The median size of hyperedges in the hypergraph.
-        - ``distribution_node_degree``: A list where the value at index ``i`` represents the count of nodes with degree ``i``.
-        - ``distribution_hyperedge_size``: A list where the value at index ``i`` represents the count of hyperedges with size ``i``.
-        - ``distribution_node_degree_hist``: A dictionary where the keys are node degrees and the values are the count of nodes with that degree.
-        - ``distribution_hyperedge_size_hist``: A dictionary where the keys are hyperedge sizes and the values are the count of hyperedges with that size.
+
+        Fields:
+            - ``shape_x``: The shape of the node feature matrix ``x``.
+            - ``shape_hyperedge_attr``: The shape of the hyperedge attribute matrix, or ``None``
+              if hyperedge attributes are not present.
+            - ``num_nodes``: The number of nodes in the hypergraph.
+            - ``num_hyperedges``: The number of hyperedges in the hypergraph.
+            - ``avg_degree_node_raw``: The average degree of nodes, calculated as the mean number
+              of hyperedges each node belongs to.
+            - ``avg_degree_node``: The floored node average degree.
+            - ``avg_degree_hyperedge_raw``: The average size of hyperedges, calculated as the
+              mean number of nodes each hyperedge contains.
+            - ``avg_degree_hyperedge``: The floored hyperedge average size.
+            - ``node_degree_max``: The maximum degree of any node in the hypergraph.
+            - ``hyperedge_degree_max``: The maximum size of any hyperedge in the hypergraph.
+            - ``node_degree_median``: The median degree of nodes in the hypergraph.
+            - ``hyperedge_degree_median``: The median size of hyperedges in the hypergraph.
+            - ``distribution_node_degree``: A list where the value at index ``i`` represents
+              the count of nodes with degree ``i``.
+            - ``distribution_hyperedge_size``: A list where the value at index ``i`` represents
+              the count of hyperedges with size ``i``.
+            - ``distribution_node_degree_hist``: A dictionary where the keys are node degrees
+              and the values are the count of nodes with that degree.
+            - ``distribution_hyperedge_size_hist``: A dictionary where the keys are hyperedge
+              sizes and the values are the count of hyperedges with that size.
 
         Returns:
             stats: A dictionary containing various statistics about the hypergraph.
         """
-
         return self.hdata.stats()
diff --git a/hyperbench/data/enricher.py b/hyperbench/data/enricher.py
index 5d57ea3c..8178bcfe 100644
--- a/hyperbench/data/enricher.py
+++ b/hyperbench/data/enricher.py
@@ -30,7 +30,8 @@ class _VilLainTrainer:
     Args:
         num_features: Dimensionality of the embeddings to generate.
         num_nodes: Total number of nodes, including isolated nodes missing from ``hyperedge_index``.
-        num_hyperedges: Total number of hyperedges, including empty hyperedges missing from ``hyperedge_index``.
+        num_hyperedges: Total number of hyperedges, including empty hyperedges missing
+            from ``hyperedge_index``.
         labels_per_subspace: Number of virtual labels per VilLain subspace.
         training_steps: Propagation steps used for VilLain self-supervised loss.
         generation_steps: Propagation steps averaged for final embeddings.
@@ -93,7 +94,8 @@ def _num_hyperedges(self, hyperedge_index: Tensor) -> int:
         Return the explicit hyperedge count or infer it from ``hyperedge_index``.
 
         Args:
-            hyperedge_index: Hyperedge index tensor used to infer the hyperedge count when no explicit count was provided.
+            hyperedge_index: Hyperedge index tensor used to infer the hyperedge count when
+                no explicit count was provided.
 
         Returns:
             Total number of hyperedges to preserve during VilLain propagation.
@@ -109,14 +111,15 @@ def _num_nodes(self, hyperedge_index: Tensor) -> int:
         Return the explicit node count or infer it from ``hyperedge_index``.
 
         Args:
-            hyperedge_index: Hyperedge index tensor used to infer the node count when no explicit count was provided.
+            hyperedge_index: Hyperedge index tensor used to infer the node count when
+                no explicit count was provided.
 
         Returns:
             Total number of nodes to preserve during VilLain training and embedding generation.
         """
         return HyperedgeIndex(hyperedge_index).num_nodes_if_isolated_exist(self.num_nodes)
 
-    def _train(self, hyperedge_index: Tensor):
+    def _train(self, hyperedge_index: Tensor) -> VilLain:
         """
         Train a VilLain model on the provided hypergraph topology.
 
@@ -247,7 +250,8 @@ def enrich(self, hyperedge_index: Tensor) -> Tensor:
             hyperedge_index: Hyperedge index tensor of shape ``(2, num_hyperedges)``.
 
         Returns:
-            hyperedge_attr: Tensor of shape ``(num_hyperedges, 1)`` containing the generated attribute for each hyperedge.
+            hyperedge_attr: Tensor of shape ``(num_hyperedges, 1)`` containing
+                the generated attribute for each hyperedge.
         """
         num_hyperedges = HyperedgeIndex(hyperedge_index).num_hyperedges
         hyperedge_attrs = torch.full(
@@ -265,8 +269,10 @@ class VilLainHyperedgeAttrsEnricher(_VilLainTrainer, HyperedgeAttrsEnricher):
 
     Args:
         num_features: Dimensionality of the hyperedge embeddings to generate.
-        num_nodes: Total number of nodes, including isolated nodes that do not appear in ``hyperedge_index``.
-        num_hyperedges: Total number of hyperedges, including empty hyperedges that do not appear in ``hyperedge_index``.
+        num_nodes: Total number of nodes, including isolated nodes that do not
+            appear in ``hyperedge_index``.
+        num_hyperedges: Total number of hyperedges, including empty hyperedges that
+            do not appear in ``hyperedge_index``.
         labels_per_subspace: Number of virtual labels per subspace. Defaults to ``2``.
         training_steps: Propagation steps used for VilLain self-supervised loss. Defaults to ``4``.
         generation_steps: Propagation steps averaged for final embeddings. Defaults to ``100``.
@@ -320,7 +326,8 @@ def enrich(self, hyperedge_index: Tensor) -> Tensor:
             hyperedge_index: Hyperedge index tensor of shape ``(2, num_hyperedges)``.
 
         Returns:
-            hyperedge_embeddings: Tensor of shape ``(num_hyperedges, num_features)`` containing VilLain hyperedge embeddings.
+            hyperedge_embeddings: Tensor of shape ``(num_hyperedges, num_features)``
+                containing VilLain hyperedge embeddings.
         """
         num_hyperedges = self._num_hyperedges(hyperedge_index)
         if num_hyperedges == 0:
@@ -347,8 +354,10 @@ class ABHyperedgeWeightsEnricher(HyperedgeWeightsEnricher):
 
     Args:
         cache_dir: Directory for saving/loading cached features. If ``None``, caching is disabled.
-        alpha: Scaling factor for the random component added to weights. Must be between 0.0 and 1.0.
-        beta: If provided, the random component is alpha * beta. If None, no random component is added.
+        alpha: Scaling factor for the random component added to weights.
+            Must be between 0.0 and 1.0.
+        beta: If provided, the random component is alpha * beta.
+            If None, no random component is added.
     """
 
     def __init__(
@@ -373,10 +382,13 @@ def enrich(self, hyperedge_index: Tensor) -> Tensor:
             hyperedge_index: Hyperedge index tensor of shape ``(2, num_hyperedges)``.
 
         Returns:
-            hyperedge_weight: Tensor of shape ``(num_hyperedges,)`` containing the weight of each hyperedge.
+            hyperedge_weight: Tensor of shape ``(num_hyperedges,)`` containing
+                the weight of each hyperedge.
         """
-        # Count the number of nodes in each hyperedge by counting occurrences of each hyperedge index.
-        # Example: if hyperedge_index[1] = [0, 0, 1, 1, 1], then we have 2 nodes in hyperedge 0 and 3 nodes in hyperedge 1.
+        # Count the number of nodes in each hyperedge by counting occurrences of
+        # each hyperedge index.
+        # Example: if hyperedge_index[1] = [0, 0, 1, 1, 1], then we have 2 nodes
+        # in hyperedge 0 and 3 nodes in hyperedge 1.
         num_hyperedges = int(hyperedge_index[1].max().item()) + 1
         weights = torch.bincount(hyperedge_index[1], minlength=num_hyperedges).float()
 
@@ -388,40 +400,53 @@ def enrich(self, hyperedge_index: Tensor) -> Tensor:
 
 class Node2VecEnricher(NodeEnricher):
     """
-    Enrich node features using Node2Vec embeddings computed from the clique expansion of the hypergraph.
+    Enrich node features using Node2Vec embeddings computed from the clique expansion of the
+    hypergraph.
 
     Args:
         num_features: Dimensionality of the node embeddings to generate.
         walk_length: Length of each random walk.
-        context_size: Window size for the skip-gram model (number of neighbors in the walk considered as context).
-            For example, if ``context_size=2`` and ``walk_length=5``, then for a random walk ``[v0, v1, v2, v3, v4]``,
-            the context for ``v2`` would be ``[v0, v1, v3, v4]`` as we take neighbors within distance 2 in the walk.
+        context_size: Window size for the skip-gram model
+            (number of neighbors in the walk considered as context).
+            For example, if ``context_size=2`` and ``walk_length=5``, then for
+            a random walk ``[v0, v1, v2, v3, v4]``,
+            the context for ``v2`` would be ``[v0, v1, v3, v4]`` as we take neighbors within
+            distance 2 in the walk.
             The pairs generated by skip-gram would be ``[(v2, v0), (v2, v1), (v2, v3), (v2, v4)]``.
-            Rule of thumb: Graphs with strong local structure (5-10), Graphs with communities/long-range patterns (10-20).
+            Rule of thumb: Graphs with strong local structure (5-10), Graphs with
+            communities/long-range patterns (10-20).
             Defaults to ``10``.
         num_walks_per_node: Number of random walks to start at each node.
         p: Return hyperparameter for Node2Vec. Default is ``1.0`` (unbiased).
             This controls the probability of stepping back to the node visited in the previous step.
-            Lower values of ``p`` make immediate backtracking more likely, which keeps walks closer to the
-            local neighborhood. Higher values of ``p`` discourage returning to the previous node, so walks
+            Lower values of ``p`` make immediate backtracking more likely,
+            which keeps walks closer to the local neighborhood.
+            Higher values of ``p`` discourage returning
+            to the previous node, so walks
             are less likely to bounce back and forth across the same edge.
         q: In-out hyperparameter for Node2Vec. Default is ``1.0`` (unbiased).
             This controls whether walks stay near the source node or explore further outward.
-            Lower values of ``q`` bias the walk toward outward exploration, behaving more like DFS and
+            Lower values of ``q`` bias the walk toward outward exploration,
+            behaving more like DFS and
             emphasizing structural roles. Higher values of ``q`` bias the walk toward nearby nodes,
             behaving more like BFS and emphasizing community structure and homophily.
         num_negative_samples: Number of negative samples to use for training the skip-gram model.
-            If set to ``X``, then for each positive pair ``(u, v)`` generated from the random walks, ``X`` negative pairs ``(u, v_neg)`` will be generated,
+            If set to ``X``, then for each positive pair ``(u, v)`` generated from the random walks,
+            ``X`` negative pairs ``(u, v_neg)`` will be generated,
             where ``v_neg`` is a node sampled uniformly at random from all nodes in the graph.
             Defaults to ``1``, meaning one negative sample per positive pair.
-        num_nodes: Total number of nodes in the graph. If not provided, it will be inferred from the hyperedge_index.
-            This is only needed if the hyperedge_index does not include all nodes (e.g., some isolated nodes are missing).
-        graph_reduction_strategy: Strategy for reducing the hyperedge graph. Defaults to ``clique_expansion``.
+        num_nodes: Total number of nodes in the graph. If not provided, it will be inferred from
+            the hyperedge_index.
+            This is only needed if the hyperedge_index does not include all nodes
+            (e.g., some isolated nodes are missing).
+        graph_reduction_strategy: Strategy for reducing the hyperedge graph.
+            Defaults to ``clique_expansion``.
         num_epochs: Number of epochs used to optimize Node2Vec embeddings. Defaults to ``5``.
         learning_rate: Learning rate for embedding optimization. Defaults to ``0.01``.
         batch_size: Batch size used by the random-walk loader. Defaults to ``128``.
         sparse: Whether Node2Vec embeddings should use sparse gradients.
-        cache_dir: Optional directory to cache computed embeddings. If ``None``, caching is disabled.
+        cache_dir: Optional directory to cache computed embeddings. If ``None``, caching
+            is disabled.
         verbose: Whether to print verbose output during training. Defaults to ``False``.
     """
 
@@ -465,15 +490,17 @@ def enrich(self, hyperedge_index: Tensor) -> Tensor:
         """
         Compute Node2Vec embeddings from the clique expansion of the hypergraph.
 
-        The hypergraph is converted to a regular graph via clique expansion, where each hyperedge of size k
-        contributes a k x k block of edges between its member nodes.
-        The resulting ``edge_index`` is then used to train a Node2Vec model using random walks and the skip-gram objective.
+        The hypergraph is converted to a regular graph via clique expansion, where each hyperedge
+        of size k contributes a k x k block of edges between its member nodes.
+        The resulting ``edge_index`` is then used to train a Node2Vec model using random walks
+        and the skip-gram objective.
 
         Args:
             hyperedge_index: Hyperedge index tensor of shape ``(2, num_hyperedges)``.
 
         Returns:
-            x: Tensor of shape ``(num_nodes, embedding_dim)`` containing the Node2Vec embeddings for each node.
+            x: Tensor of shape ``(num_nodes, embedding_dim)`` containing the Node2Vec embeddings
+                for each node.
         """
         device = hyperedge_index.device
 
@@ -497,7 +524,9 @@ def enrich(self, hyperedge_index: Tensor) -> Tensor:
         edge_index_wrapper = EdgeIndex(reduced_edge_index).remove_selfloops()
         if edge_index_wrapper.num_edges == 0:
             warnings.warn(
-                "Clique expansion produced no non-self-loop edges. Returning zero node features.",
+                "Clique expansion produced no "
+                "non-self-loop edges. "
+                "Returning zero node features.",
                 category=UserWarning,
                 stacklevel=2,
             )
@@ -581,13 +610,17 @@ def __validate(self) -> None:
 
 class LaplacianPositionalEncodingEnricher(NodeEnricher):
     """
-    Enrich node features with Laplacian Positional Encodings computed from the symmetric normalized Laplacian of the clique expansion of the hypergraph.
+    Enrich node features with Laplacian Positional Encodings computed from the symmetric normalized
+    Laplacian of the clique expansion of the hypergraph.
 
     Args:
         num_features: Number of positional encoding features to generate for each node.
-        num_nodes: Total number of nodes in the graph. If not provided, it will be inferred from the hyperedge_index.
-            This is only needed if the hyperedge_index does not include all nodes (e.g., some isolated nodes are missing).
-            Another instance is when the setting is transductive and the hyperedge index contains some hyperedges
+        num_nodes: Total number of nodes in the graph. If not provided, it will be inferred
+            from the hyperedge_index.
+            This is only needed if the hyperedge_index does not include all nodes
+            (e.g., some isolated nodes are missing).
+            Another instance is when the setting is transductive and the hyperedge index
+            contains some hyperedges
             that do not contain all the nodes in the node space.
         cache_dir: Optional directory to cache computed features. If ``None``, caching is disabled.
     """
@@ -658,7 +691,8 @@ def enrich(self, hyperedge_index: Tensor) -> Tensor:
             return eigenvectors[:, 1 : self.num_features + 1]
 
         # If the graph has fewer usable eigenvectors than requested
-        # (e.g., num_features = 5 but only 2 available), we create a zero-padded tensor and fill what we have.
+        # (e.g., num_features = 5 but only 2 available), we create a zero-padded tensor
+        # and fill what we have.
         # Example: num_nontrivial_eigenvectors = 2, num_features = 5
         #          -> shape (3, 5)  # columns 0-1 filled, 2-4 are zeros.
         x = torch.zeros(
@@ -676,8 +710,10 @@ class VilLainEnricher(_VilLainTrainer, NodeEnricher):
 
     Args:
         num_features: Dimensionality of the node embeddings to generate.
-        num_nodes: Total number of nodes, including isolated nodes that do not appear in ``hyperedge_index``.
-        num_hyperedges: Total number of hyperedges, including empty hyperedges that do not appear in ``hyperedge_index``.
+        num_nodes: Total number of nodes, including isolated nodes that do not appear
+            in ``hyperedge_index``.
+        num_hyperedges: Total number of hyperedges, including empty hyperedges that
+            do not appear in ``hyperedge_index``.
         labels_per_subspace: Number of virtual labels per subspace. Defaults to ``2``.
         training_steps: Propagation steps used for VilLain self-supervised loss. Defaults to ``4``.
         generation_steps: Propagation steps averaged for final embeddings. Defaults to ``100``.
@@ -731,7 +767,8 @@ def enrich(self, hyperedge_index: Tensor) -> Tensor:
             hyperedge_index: Hyperedge index tensor of shape ``(2, num_hyperedges)``.
 
         Returns:
-            node_embeddings: Tensor of shape ``(num_nodes, num_features)`` containing VilLain node embeddings.
+            node_embeddings: Tensor of shape ``(num_nodes, num_features)`` containing
+                VilLain node embeddings.
         """
         num_nodes = self._num_nodes(hyperedge_index)
         if num_nodes == 0:
diff --git a/hyperbench/data/hif.py b/hyperbench/data/hif.py
index a19278d6..f4fc50df 100644
--- a/hyperbench/data/hif.py
+++ b/hyperbench/data/hif.py
@@ -25,7 +25,9 @@
 
 
 class HIFProcessor:
-    """A utility class to process HIF hypergraph data into `HData` format."""
+    """
+    A utility class to process HIF hypergraph data into `HData` format.
+    """
 
     @staticmethod
     def transform_attrs(
@@ -34,11 +36,13 @@ def transform_attrs(
     ) -> Tensor:
         """
         Extract and encode numeric attributes to tensor.
+
         Non-numeric attributes are discarded. Missing attributes are filled with ``0.0``.
 
         Args:
             attrs: Dictionary of attributes
-            attr_keys: Optional list of attribute keys to encode. If provided, ensures consistent ordering and fill missing with ``0.0``.
+            attr_keys: Optional list of attribute keys to encode. If provided,
+                ensures consistent ordering and fill missing with ``0.0``.
 
         Returns:
             attrs: Tensor of numeric attribute values
@@ -67,7 +71,6 @@ def process_hypergraph(cls, hypergraph: HIFHypergraph) -> HData:
         Returns:
             hdata: The processed hypergraph data.
         """
-
         num_nodes = len(hypergraph.nodes)
         x = cls.__process_x(hypergraph, num_nodes)
 
@@ -93,7 +96,8 @@ def process_hypergraph(cls, hypergraph: HIFHypergraph) -> HData:
                 )
 
             if hyperedge_id not in hyperedge_id_to_idx:
-                # Hyperedges start from 0 and are assigned IDs in the order they are first encountered in incidences
+                # Hyperedges start from 0 and are assigned IDs in the order they are
+                # first encountered in incidences
                 hyperedge_id_to_idx[hyperedge_id] = len(hyperedge_id_to_idx)
 
             node_ids.append(node_id_to_idx[node_id])
@@ -244,7 +248,9 @@ def __process_hyperedge_weights(
 
 
 class HIFLoader:
-    """A utility class to load hypergraphs from HIF format."""
+    """
+    A utility class to load hypergraphs from HIF format.
+    """
 
     @classmethod
     def load_from_url(cls, url: str, save_on_disk: bool = False) -> HData:
@@ -263,7 +269,8 @@ def load_from_url(cls, url: str, save_on_disk: bool = False) -> HData:
         response = requests.get(url, timeout=20)
         if response.status_code != 200:
             raise ValueError(
-                f"Failed to download dataset from URL {url!r} with status code {response.status_code}"
+                f"Failed to download dataset from URL {url!r} "
+                f"with status code {response.status_code}"
             )
 
         if not url.endswith((".json.zst", ".json")):
@@ -273,7 +280,9 @@ def load_from_url(cls, url: str, save_on_disk: bool = False) -> HData:
 
         if os.path.basename(url).count(".") > 2:
             raise ValueError(
-                f"URL {url!r} has an unexpected filename format. Expected at most one dot in the base filename before the extension (e.g., dataset.json or dataset.json.zst)."
+                f"URL {url!r} has an unexpected filename format. "
+                "Expected at most one dot in the base filename before the "
+                "extension (e.g., dataset.json or dataset.json.zst)."
             )
 
         if url.endswith(".json.zst"):
@@ -298,7 +307,8 @@ def load_from_url(cls, url: str, save_on_disk: bool = False) -> HData:
     @classmethod
     def load_from_path(cls, filepath: str) -> HData:
         """
-        Load a hypergraph from a local file path pointing to a .json or .json.zst file in HIF format.
+        Load a hypergraph from a local file path pointing to a .json or .json.zst file in HIF
+        format.
 
         Args:
             filepath: The local file path to the .json or .json.zst file
@@ -337,7 +347,10 @@ def load_by_name(
             hif_data = from_zst_file_to_json(zst_filename)
             return cls.__process_hif_data(hif_data, dataset_name)
 
-        github_url = f"https://raw.githubusercontent.com/hypernetwork-research-group/datasets/{GITHUB_COMMIT_SHA}/{dataset_name}.json.zst"
+        github_url = (
+            f"https://raw.githubusercontent.com/hypernetwork-research-group/datasets/"
+            f"{GITHUB_COMMIT_SHA}/{dataset_name}.json.zst"
+        )
         response = requests.get(github_url, timeout=20)
         if response.status_code == 200:
             dataset_bytes = response.content
@@ -348,7 +361,8 @@ def load_by_name(
             return hdata
 
         warnings.warn(
-            f"GitHub raw download failed for dataset {dataset_name!r} with status code {response.status_code}\n"
+            f"GitHub raw download failed for dataset {dataset_name!r} "
+            f"with status code {response.status_code}\n"
             "Falling back to Hugging Face Hub download for dataset",
             category=UserWarning,
             stacklevel=2,
@@ -356,7 +370,8 @@ def load_by_name(
 
         if hf_sha is None:
             raise ValueError(
-                f"Failed to download dataset {dataset_name!r} from GitHub with status code {response.status_code} "
+                f"Failed to download dataset {dataset_name!r} from GitHub "
+                f"with status code {response.status_code} "
                 f"and no SHA provided for Hugging Face Hub fallback."
             )
 
@@ -384,7 +399,8 @@ def load_by_name(
                 shutil.copyfile(downloaded_path, zst_filename)
             except Exception as e:
                 raise ValueError(
-                    f"Failed to save downloaded dataset {dataset_name!r} to disk at {zst_filename!r}: {e!s}."
+                    f"Failed to save downloaded dataset {dataset_name!r} to disk at "
+                    f"{zst_filename!r}: {e!s}."
                 ) from e
 
         if os.path.isdir(hf_cache_dir):
@@ -394,7 +410,8 @@ def load_by_name(
                 shutil.rmtree(os.path.join(hf_cache_dir, ".locks", path_prefix))
             except Exception as e:
                 warnings.warn(
-                    f"Failed to clean up Hugging Face Hub cache after downloading dataset {dataset_name!r}: {e!s}.",
+                    f"Failed to clean up Hugging Face Hub cache after downloading "
+                    f"dataset {dataset_name!r}: {e!s}.",
                     category=UserWarning,
                     stacklevel=2,
                 )
diff --git a/hyperbench/data/loader.py b/hyperbench/data/loader.py
index 77e79b44..e0c60dd3 100644
--- a/hyperbench/data/loader.py
+++ b/hyperbench/data/loader.py
@@ -35,12 +35,14 @@ def collate(self, batch: list[HData]) -> HData:
         """
         Collates a list of `HData` objects into a single batched `HData` object.
 
-        This function combines multiple separate samples into a single batched representation suitable for mini-batch training.
-        It handles:
-        - Concatenating node features from all samples.
-        - Concatenating and offsetting hyperedges from all samples.
-        - Concatenating hyperedge attributes from all samples, if present.
-        - Concatenating hyperedge weights from all samples, if present.
+        This function combines multiple separate samples into a single batched representation
+        suitable for mini-batch training.
+
+        Handles:
+            - Concatenating node features from all samples.
+            - Concatenating and offsetting hyperedges from all samples.
+            - Concatenating hyperedge attributes from all samples, if present.
+            - Concatenating hyperedge weights from all samples, if present.
 
         Examples:
             Given ``batch = [HData_0, HData_1]``:
@@ -56,7 +58,7 @@ def collate(self, batch: list[HData]) -> HData:
             - ``HData_0`` (3 nodes, 2 hyperedges):
 
             >>> hyperedge_index = [[0, 1, 1, 2],  # Nodes 0, 1, 1, 2
-            ...                    [0, 0, 1, 1]]  # Hyperedge 0 contains {0,1}, Hyperedge 1 contains {1,2}
+            ...                    [0, 0, 1, 1]]  # Hyperedge 0 = {0,1}, hyperedge 1 = {1,2}
 
             - ``HData_1`` (2 nodes, 1 hyperedge):
 
diff --git a/hyperbench/data/negative_sampler.py b/hyperbench/data/negative_sampler.py
index 3cf485d0..299108a1 100644
--- a/hyperbench/data/negative_sampler.py
+++ b/hyperbench/data/negative_sampler.py
@@ -18,8 +18,10 @@ class NegativeSampler(ABC):
 
     Args:
         return_0based_negatives:
-            - If ``True``, the negative samples returned by the ``sample`` method will have 0-based node and hyperedge IDs.
-            - If ``False``, the negative samples will retain the original global node and hyperedge IDs from the input data.
+            - If ``True``, the negative samples returned by the ``sample`` method
+                will have 0-based node and hyperedge IDs.
+            - If ``False``, the negative samples will retain the original global node
+                and hyperedge IDs from the input data.
     """
 
     def __init__(self, return_0based_negatives: bool = False):
@@ -59,8 +61,10 @@ def _new_negative_hyperedge_index(
 
         Returns:
             hyperedge_index: The concatenated, sorted, and remapped hyperedge index tensor.
-            If ``self.return_0based_negatives`` is ``True``, the returned tensor will have 0-based node and hyperedge IDs.
-            Otherwise, it will retain the original global node and hyperedge IDs from the input data.
+                If ``self.return_0based_negatives`` is ``True``, the returned tensor will
+                have 0-based node and hyperedge IDs.
+                Otherwise, it will retain the original global node and hyperedge IDs
+                from the input data.
         """
         negative_hyperedge_index = torch.cat(sampled_hyperedge_indexes, dim=1)
         if not self.return_0based_negatives:
@@ -86,7 +90,8 @@ def _new_global_node_ids(
             negative_node_ids: Tensor of negative node IDs.
 
         Returns:
-            global_node_ids: The global node IDs for the negative samples, or ``None`` if the input global node IDs are ``None``.
+            global_node_ids: The global node IDs for the negative samples, or ``None`` if
+                the input global node IDs are ``None``.
         """
         if global_node_ids is None:
             return None
@@ -122,11 +127,13 @@ def _new_enriched_hyperedge_attr(
         Generate enriched hyperedge attributes for the negative samples.
 
         Args:
-            hyperedge_attr_enricher: An optional `HyperedgeAttrsEnricher` to generate attributes for the new hyperedges.
+            hyperedge_attr_enricher: An optional `HyperedgeAttrsEnricher` to generate attributes
+                for the new hyperedges.
             negative_hyperedge_index: The index tensor for the negative hyperedges.
 
         Returns:
-            hyperedge_attr: The enriched hyperedge attribute tensor for the negative samples, or ``None`` if the enricher is not provided.
+            hyperedge_attr: The enriched hyperedge attribute tensor for the negative samples,
+                or ``None`` if the enricher is not provided.
         """
         if hyperedge_attr_enricher is None:
             return None
@@ -145,11 +152,13 @@ def _new_enriched_hyperedge_weights(
         Generate enriched hyperedge weights for the negative samples.
 
         Args:
-            hyperedge_weights_enricher: An optional `HyperedgeWeightsEnricher` to generate weights for the new hyperedges.
+            hyperedge_weights_enricher: An optional `HyperedgeWeightsEnricher` to
+                generate weights for the new hyperedges.
             negative_hyperedge_index: The index tensor for the negative hyperedges.
 
         Returns:
-            hyperedge_weights: The enriched hyperedge weight tensor for the negative samples, or ``None`` if the enricher is not provided.
+            hyperedge_weights: The enriched hyperedge weight tensor for the negative samples,
+                or ``None`` if the enricher is not provided.
         """
         if hyperedge_weights_enricher is None:
             return None
@@ -168,7 +177,8 @@ def _new_x(self, x: Tensor, negative_node_ids: Tensor) -> tuple[Tensor, int]:
             negative_node_ids: Tensor of negative node IDs.
 
         Returns:
-            x_and_num_negative_nodes: The node feature matrix for the negative samples and the number of negative nodes.
+            x: The node feature matrix for the negative samples.
+            num_negative_nodes: The number of negative nodes.
         """
         return x[negative_node_ids], len(negative_node_ids)
 
@@ -245,11 +255,15 @@ class SameNodeSpaceNegativeSampler(NegativeSampler, ABC):
     Base class for negative samplers that sample only from existing nodes.
 
     Args:
-        hyperedge_attr_enricher: An optional `HyperedgeAttrsEnricher` to generate attributes for the new hyperedges.
-        hyperedge_weights_enricher: An optional `HyperedgeWeightsEnricher` to generate weights for the new hyperedges.
+        hyperedge_attr_enricher: An optional `HyperedgeAttrsEnricher` to generate attributes for
+            the new hyperedges.
+        hyperedge_weights_enricher: An optional `HyperedgeWeightsEnricher` to generate weights
+            for the new hyperedges.
         return_0based_negatives:
-            - If ``True``, the negative samples returned by the ``sample`` method will have 0-based node and hyperedge IDs.
-            - If ``False``, the negative samples will retain the original global node and hyperedge IDs from the input data.
+            - If ``True``, the negative samples returned by the ``sample`` method will have
+                0-based node and hyperedge IDs.
+            - If ``False``, the negative samples will retain the original global node
+                and hyperedge IDs from the input data.
     """
 
     def __init__(
@@ -269,11 +283,15 @@ class GeneratedNodesNegativeSampler(NegativeSampler, ABC):
 
     Args:
         node_feature_enricher: A `NodeEnricher` to generate features for the new nodes.
-        hyperedge_attr_enricher: An optional `HyperedgeAttrsEnricher` to generate attributes for the new hyperedges.
-        hyperedge_weights_enricher: An optional `HyperedgeWeightsEnricher` to generate weights for the new hyperedges.
+        hyperedge_attr_enricher: An optional `HyperedgeAttrsEnricher` to generate attributes
+            for the new hyperedges.
+        hyperedge_weights_enricher: An optional `HyperedgeWeightsEnricher` to generate weights
+            for the new hyperedges.
         return_0based_negatives:
-            - If ``True``, the negative samples returned by the ``sample`` method will have 0-based node and hyperedge IDs.
-            - If ``False``, the negative samples will retain the original global node and hyperedge IDs from the input data.
+            - If ``True``, the negative samples returned by the ``sample`` method will have
+                0-based node and hyperedge IDs.
+            - If ``False``, the negative samples will retain the original global node and
+                hyperedge IDs from the input data.
     """
 
     def __init__(
@@ -291,23 +309,30 @@ def __init__(
 
 class RandomNegativeSampler(SameNodeSpaceNegativeSampler):
     """
-    A random negative sampler. Negatives generated with ``return_0based_negatives = False`` aren't usable standalone
-    as they have global node and hyperedge IDs. They must be concatenated with the original `HData` object
-    that is provided as input to the ``sample`` method, as it contains the global node and hyperedge IDs and features
-    that can be indexed with the negative samples' IDs.
+    A random negative sampler. Negatives generated with ``return_0based_negatives = False``
+    aren't usable standalone as they have global node and hyperedge IDs. They must be concatenated
+    with the original `HData` object that is provided as input to the ``sample`` method, as it
+    contains the global node and hyperedge IDs and features that can be indexed with
+    the negative samples' IDs.
 
     Args:
         num_negative_samples: Number of negative hyperedges to generate.
         num_nodes_per_sample: Number of nodes per negative hyperedge.
-        hyperedge_attr_enricher: An optional `HyperedgeAttrsEnricher` to generate attributes for the new hyperedges.
-            If not provided, random attributes will be generated for the negative hyperedges if the input data has hyperedge attributes.
-        hyperedge_weights_enricher: An optional `HyperedgeEnricher` to generate weights for the new hyperedges.
-            If not provided, the negative hyperedges will not have weights.
+        hyperedge_attr_enricher: An optional `HyperedgeAttrsEnricher` to generate attributes
+            for the new hyperedges.
+            If not provided, random attributes will be generated for the negative hyperedges if
+            the input data has hyperedge attributes.
+        hyperedge_weights_enricher: An optional `HyperedgeWeightsEnricher` to generate weights
+            for the new hyperedges. If not provided, the negative hyperedges will not have weights.
         return_0based_negatives:
-            - If ``True``, the negative samples returned by the ``sample`` method will have 0-based node and hyperedge IDs.
-            - If ``False``, the negative samples will retain the original global node and hyperedge IDs from the input data.
-        max_retry: Maximum number of rejected sampling attempts allowed per requested negative hyperedge before failing.
-            If ``num_negative_samples`` is ``N``, the total maximum number of attempts will be ``N * max_retry``.
+            - If ``True``, the negative samples returned by the ``sample`` method
+                will have 0-based node and hyperedge IDs.
+            - If ``False``, the negative samples will retain the original global node and
+                hyperedge IDs from the input data.
+        max_retry: Maximum number of rejected sampling attempts allowed per requested
+            negative hyperedge before failing.
+            If ``num_negative_samples`` is ``N``, the total maximum number of attempts
+            will be ``N * max_retry``.
 
     Raises:
         ValueError: If any numeric argument is not positive.
@@ -341,10 +366,12 @@ def __init__(
     def sample(self, hdata: HData, seed: int | None = None) -> HData:
         """
         Generate negative hyperedges by randomly sampling unique node IDs.
-        Node IDs are sampled from the same node space as the input data, and the new negative hyperedge IDs
-        start from the original number of hyperedges in the input data to avoid ID conflicts.
-        The resulting negative samples are returned as a new `HData` object with remapped 0-based node and hyperedge IDs, if ``self.return_0based_negatives == True``.
-        Otherwise, the negative samples retain their original global node and hyperedge IDs from the input data.
+        Node IDs are sampled from the same node space as the input data, and the new negative
+        hyperedge IDs start from the original number of hyperedges in the input data to
+        avoid ID conflicts. The resulting negative samples are returned as a new `HData` object
+        with remapped 0-based node and hyperedge IDs, if ``self.return_0based_negatives == True``.
+        Otherwise, the negative samples retain their original global node and hyperedge IDs
+        from the input data.
 
         Examples:
             With ``self.return_0based_negatives = True``:
@@ -383,7 +410,8 @@ def sample(self, hdata: HData, seed: int | None = None) -> HData:
         """
         if self.num_nodes_per_sample > hdata.num_nodes:
             raise ValueError(
-                f"Asked to create samples with {self.num_nodes_per_sample} nodes, but only {hdata.num_nodes} nodes are available."
+                f"Asked to create samples with {self.num_nodes_per_sample} nodes,"
+                f" but only {hdata.num_nodes} nodes are available."
             )
 
         device = hdata.device
@@ -439,7 +467,8 @@ def sample(self, hdata: HData, seed: int | None = None) -> HData:
             hyperedge_attr_enricher=self.hyperedge_attr_enricher,
             negative_hyperedge_index=negative_hyperedge_index,
         )
-        # Default to the random attributes if no enricher is provided and the input data has hyperedge attributes
+        # Default to the random attributes if no enricher is provided and the input
+        # data has hyperedge attributes
         if negative_hyperedge_attr is None:
             negative_hyperedge_attr = self._new_hyperedge_attr(
                 sampled_hyperedge_attrs=sampled_hyperedge_attrs, hyperedge_attr=hdata.hyperedge_attr
@@ -471,12 +500,15 @@ def __sample_loop(
 
         Args:
             hdata: The input hypergraph data used as the node and hyperedge ID source.
-            positive_hyperedges_signatures: Existing positive hyperedge signatures that must not be sampled as negatives.
+            positive_hyperedges_signatures: Existing positive hyperedge signatures that
+                must not be sampled as negatives.
             seed: Optional random seed for reproducible sampling.
 
         Returns:
-            samples: A tuple containing sampled hyperedge index tensors, sampled hyperedge attribute
-            tensors, sampled node IDs, and the first negative hyperedge ID.
+            sampled_hyperedge_indexes: Sampled hyperedge index tensors.
+            sampled_hyperedge_attrs: Sampled hyperedge attribute tensors.
+            sampled_negative_node_ids: Sampled negative node IDs.
+            new_hyperedge_id_offset: First negative hyperedge ID.
 
         Raises:
             ValueError: If the sampler cannot produce the requested number of unique negative
@@ -500,7 +532,8 @@ def __sample_loop(
             attempts += 1
 
             # Sample with multinomial without replacement to ensure unique node ids
-            # and assign each node id equal probability of being selected by setting all of them to 1
+            # and assign each node id equal probability of being
+            # selected by setting all of them to 1
             # Example: num_nodes_per_sample=3, max_node_id=5
             #          -> possible output: [2, 0, 4]
             equal_probabilities = torch.ones(
@@ -612,14 +645,18 @@ class CliqueNegativeSampler(SameNodeSpaceNegativeSampler):
         num_nodes_per_sample: Number of nodes per negative hyperedge. Must be at least 2.
         hyperedge_attr_enricher: Optional enricher to generate attributes for sampled negatives.
         hyperedge_weights_enricher: Optional enricher to generate weights for sampled negatives.
-        return_0based_negatives: If ``True``, returned negative node and hyperedge IDs are rebased to 0-based IDs.
-        max_candidates: Optional upper bound for full-size clique candidates enumerated during search
+        return_0based_negatives: If ``True``, returned negative node and hyperedge IDs
+            are rebased to 0-based IDs.
+        max_candidates: Optional upper bound for full-size clique candidates enumerated
+            during search.
             If ``None``, it means no explicit cap. The limit counts every full-size clique candidate
-            encountered before positive-hyperedge filtering, so positive hyperedges still consume the budget
-            because they still require search work. This is a safety guard for dense graphs where clique enumeration
-            can grow quickly. For example, ``max_candidates=10_000`` means the sampler stops if finding candidates
+            encountered before positive-hyperedge filtering, so positive hyperedges still consume
+            the budget because they still require search work. This is a safety guard for
+            dense graphs where clique enumeration can grow quickly. For example,
+            ``max_candidates=10_000`` means the sampler stops if finding candidates
             requires enumerating more than 10,000 cliques of size ``num_nodes_per_sample``.
-            It does not control how many negatives are returned, as that is controlled by ``num_negative_samples``.
+            It does not control how many negatives are returned, as that is controlled
+            by ``num_negative_samples``.
 
     Raises:
         ValueError: If numeric arguments are invalid.
@@ -638,7 +675,8 @@ def __init__(
             raise ValueError(f"num_negative_samples must be positive, got {num_negative_samples}.")
         if num_nodes_per_sample < 2:
             raise ValueError(
-                f"num_nodes_per_sample must be at least 2 for clique negative sampling, got {num_nodes_per_sample}."
+                f"num_nodes_per_sample must be at least 2 for clique "
+                f"negative sampling, got {num_nodes_per_sample}."
             )
         if max_candidates is not None and max_candidates <= 0:
             raise ValueError(
@@ -670,7 +708,8 @@ def sample(self, hdata: HData, seed: int | None = None) -> HData:
         """
         if self.num_nodes_per_sample > hdata.num_nodes:
             raise ValueError(
-                f"Asked to create samples with {self.num_nodes_per_sample} nodes, but only {hdata.num_nodes} nodes are available."
+                f"Asked to create samples with {self.num_nodes_per_sample} nodes, "
+                f"but only {hdata.num_nodes} nodes are available."
             )
         device = hdata.device
 
@@ -774,7 +813,8 @@ def __expand_clique_candidates(
             prefix: Current partial clique, represented as sorted node IDs.
             candidates: Node IDs that may extend ``prefix`` while preserving clique structure.
             adjacency_list: Clique-expanded graph adjacency list.
-            positive_hyperedge_signatures: Positive hyperedge node signatures that must not be returned as negatives.
+            positive_hyperedge_signatures: Positive hyperedge node signatures that must not
+                be returned as negatives.
             valid_candidates: Output list mutated in place with valid negative clique candidates.
             enumerated_candidates: Number of full-size clique candidates visited so far.
 
@@ -787,7 +827,8 @@ def __expand_clique_candidates(
         if len(prefix) == self.num_nodes_per_sample:  # Found a full-size clique candidate
             if self.max_candidates is not None and enumerated_candidates >= self.max_candidates:
                 raise ValueError(
-                    f"Clique negative candidate enumeration exceeded max_candidates={self.max_candidates}."
+                    f"Clique negative candidate enumeration exceeded "
+                    f"max_candidates={self.max_candidates}."
                 )
             enumerated_candidates += 1
 
@@ -832,7 +873,8 @@ def __find_valid_clique_candidates(
 
         Args:
             adjacency_list: Clique-expanded graph adjacency list.
-            positive_hyperedge_signatures: Positive hyperedge node signatures with the requested sample size.
+            positive_hyperedge_signatures: Positive hyperedge node signatures with
+                the requested sample size.
 
         Returns:
             candidates: Clique node signatures that are not positive hyperedges.
@@ -881,8 +923,10 @@ def __sample_loop(
             seed: Optional seed for reproducible candidate shuffling and random attributes.
 
         Returns:
-            samples: A tuple containing sampled hyperedge index tensors, sampled hyperedge
-            attribute tensors, sampled node IDs, and the first negative hyperedge ID.
+            sampled_hyperedge_indexes: sampled hyperedge index tensors.
+            sampled_hyperedge_attrs: sampled hyperedge attribute tensors.
+            sampled_negative_node_ids: sampled negative node IDs.
+            new_hyperedge_id_offset: first negative hyperedge ID.
         """
         device = hdata.device
         generator = create_seeded_torch_generator(device=device, seed=seed)
@@ -892,7 +936,8 @@ def __sample_loop(
         #                               (0, 2, 3),   # index 1
         #                               (1, 2, 3)],  # index 2
         #          -> shuffled_clique_candidate_indexes = [2, 0, 1]
-        #          -> sampled_clique_candidate_indexes = [2, 0] if num_negative_samples=2 as we only need 2 samples
+        #          -> sampled_clique_candidate_indexes = [2, 0] if num_negative_samples=2
+        #             as we only need 2 samples
         #          -> sampled_clique_candidates = [(1, 2, 3),  # index 2 in clique_candidates
         #                                          (0, 1, 3)]  # index 0 in clique_candidates
         num_valid_clique_candidates = len(clique_candidates)
diff --git a/hyperbench/data/negative_sampling_scheduler.py b/hyperbench/data/negative_sampling_scheduler.py
index 68c0fa8d..a404d601 100644
--- a/hyperbench/data/negative_sampling_scheduler.py
+++ b/hyperbench/data/negative_sampling_scheduler.py
@@ -13,15 +13,20 @@
 class NegativeSamplingScheduler:
     """
     Manages when to perform negative sampling during training based on a specified schedule.
-    This class allows for flexible scheduling of negative sampling, enabling it to be performed at different frequencies (e.g., every epoch, every N epochs, or only at the first epoch).
-        The scheduler maintains a cache of the most recently sampled negatives, which can be reused across epochs if the schedule does not require resampling. This helps to optimize training
-        by avoiding unnecessary sampling when the schedule dictates that negatives should only be generated at certain intervals.
+
+    This class allows for flexible scheduling of negative sampling, enabling it to be performed at
+    different frequencies (e.g., every epoch, every N epochs, or only at the first epoch). The
+    scheduler maintains a cache of the most recently sampled negatives, which can be reused across
+    epochs if the schedule does not require resampling. This helps to optimize training by avoiding
+    unnecessary sampling when the schedule dictates that negatives should only be generated at
+    certain intervals.
 
     Args:
         negative_sampler: An instance of a ``NegativeSampler`` that defines how to sample negatives.
         negative_sampling_schedule: Literal string specifying the schedule for sampling negatives.
         negative_sampling_every_n: An integer specifying the interval for sampling negatives
-            when the schedule is set to ``"every_n_epochs"``. This parameter is ignored for other schedules.
+            when the schedule is set to ``"every_n_epochs"``. This parameter is ignored
+            for other schedules.
     """
 
     def __init__(
@@ -38,7 +43,9 @@ def __init__(
 
     @property
     def config(self) -> dict[str, Any]:
-        """Returns the configuration of the negative sampling scheduler as a dictionary."""
+        """
+        Returns the configuration of the negative sampling scheduler as a dictionary.
+        """
         return {
             "negative_sampler": self.negative_sampler,
             "negative_sampling_schedule": self.negative_sampling_schedule,
@@ -50,10 +57,12 @@ def should_sample(self, epoch: int) -> bool:
         Whether to resample negatives for the current epoch.
 
         Args:
-            epoch: The current epoch number, used to determine if sampling should occur based on the schedule.
+            epoch: The current epoch number, used to determine if sampling should occur based
+                on the schedule.
 
         Returns:
-            should_sample: True if negatives should be resampled for the current epoch, False otherwise.
+            should_sample: True if negatives should be resampled for the current epoch,
+                False otherwise.
         """
         if epoch < 0:
             raise ValueError(f"Epoch must be non-negative, got {epoch}.")
@@ -62,7 +71,8 @@ def should_sample(self, epoch: int) -> bool:
             case "every_n_epochs":
                 if self.negative_sampling_every_n <= 0:
                     raise ValueError(
-                        f"negative_sampling_every_n must be positive, got {self.negative_sampling_every_n}."
+                        f"negative_sampling_every_n must be positive, "
+                        f"got {self.negative_sampling_every_n}."
                     )
                 return epoch % self.negative_sampling_every_n == 0
             case "first_epoch":
@@ -80,7 +90,8 @@ def sample(self, batch: HData, epoch: int) -> HData:
 
         Args:
             batch: The current batch of data for which to sample negatives.
-            epoch: The current epoch number, used to determine if sampling should occur based on the schedule.
+            epoch: The current epoch number, used to determine if sampling should occur
+                based on the schedule.
 
         Returns:
             negatives: A batch of negative samples, either freshly sampled or from cache.
diff --git a/hyperbench/data/sampler.py b/hyperbench/data/sampler.py
index 0cb2178e..5d44d9b8 100644
--- a/hyperbench/data/sampler.py
+++ b/hyperbench/data/sampler.py
@@ -48,7 +48,8 @@ def _normalize_index(self, index: int | list[int], size: int) -> list[int]:
             ids: List of IDs to sample.
 
         Raises:
-            ValueError: If the provided index is invalid (e.g., empty list or list length exceeds number of sampleable items).
+            ValueError: If the provided index is invalid (e.g., empty list or list length exceeds
+                number of sampleable items).
             TypeError: If the index is not an integer or a list of integers.
         """
         if isinstance(index, list):
@@ -56,7 +57,8 @@ def _normalize_index(self, index: int | list[int], size: int) -> list[int]:
                 raise ValueError("Index list cannot be empty.")
             if len(index) > size:
                 raise ValueError(
-                    f"Index list length ({len(index)}) cannot exceed the number of sampleable items ({size})."
+                    f"Index list length ({len(index)}) cannot exceed the number of "
+                    f"sampleable items ({size})."
                 )
             for id in index:
                 if not isinstance(id, int) or isinstance(id, bool):
@@ -75,14 +77,16 @@ def _sample_hyperedge_index(
         sampled_hyperedge_ids: Tensor,
     ) -> Tensor:
         """
-        Sample the hyperedge index to keep only incidences belonging to the specified sampled hyperedge IDs.
+        Sample the hyperedge index to keep only incidences belonging to the specified sampled
+        hyperedge IDs.
 
         Args:
             hyperedge_index: The original hyperedge index tensor of shape ``[2, num_incidences]``.
             sampled_hyperedge_ids: A tensor containing the IDs of hyperedges to sample.
 
         Returns:
-            hyperedge_index: A new hyperedge index tensor containing only the incidences of the sampled hyperedges.
+            hyperedge_index: A new hyperedge index tensor containing only the incidences of the
+                sampled hyperedges.
         """
         hyperedge_ids = hyperedge_index[1]
 
@@ -120,27 +124,30 @@ def _validate_bounds(self, ids: list[int], size: int, label: str) -> None:
 class HyperedgeSampler(BaseSampler):
     def sample(self, index: int | list[int], hdata: HData) -> HData:
         """
-        Sample hyperedges by their IDs and return the sub-hypergraph containing only those hyperedges and their incident nodes.
+        Sample hyperedges by their IDs and return the sub-hypergraph containing only those
+        hyperedges and their incident nodes.
 
         Examples:
-        >>> hyperedge_index = [[0, 0, 1, 2, 3, 4],
-        ...                    [0, 0, 0, 1, 2, 2]]
-        >>> hdata = HData.from_hyperedge_index(hyperedge_index)
-        >>> strategy = HyperedgeSampler()
-        >>> sampled_hdata = strategy.sample([0, 2], hdata)
-        >>> sampled_hdata.hyperedge_index
-        >>> tensor([[0, 0, 1, 3, 4],
-        ...         [0, 0, 0, 2, 2]])
+            >>> hyperedge_index = [[0, 0, 1, 2, 3, 4],
+            ...                    [0, 0, 0, 1, 2, 2]]
+            >>> hdata = HData.from_hyperedge_index(hyperedge_index)
+            >>> strategy = HyperedgeSampler()
+            >>> sampled_hdata = strategy.sample([0, 2], hdata)
+            >>> sampled_hdata.hyperedge_index
+            tensor([[0, 0, 1, 3, 4],
+                    [0, 0, 0, 2, 2]])
 
         Args:
             index: An integer or a list of integers representing hyperedge IDs to sample.
             hdata: The original HData to sample from.
 
         Returns:
-            hdata: An HData instance containing only the sampled hyperedges and their incident nodes.
+            hdata: An HData instance containing only the sampled hyperedges and
+                their incident nodes.
 
         Raises:
-            ValueError: If the provided index is invalid (e.g., empty list or list length exceeds number of hyperedges).
+            ValueError: If the provided index is invalid (e.g., empty list or list length exceeds
+                number of hyperedges).
             IndexError: If any hyperedge ID is out of bounds.
         """
         ids = self._normalize_index(index, self.len(hdata))
@@ -182,27 +189,30 @@ def len(self, hdata: HData) -> int:
 class NodeSampler(BaseSampler):
     def sample(self, index: int | list[int], hdata: HData) -> HData:
         """
-        Sample nodes by their IDs and return the sub-hypergraph containing only those nodes and their incident hyperedges.
+        Sample nodes by their IDs and return the sub-hypergraph containing only those nodes and
+        their incident hyperedges.
 
         Examples:
-        >>> hyperedge_index = [[0, 0, 1, 2, 3, 4],
-        ...                    [0, 0, 0, 1, 2, 2]]
-        >>> hdata = HData.from_hyperedge_index(hyperedge_index)
-        >>> strategy = NodeSampler()
-        >>> sampled_hdata = strategy.sample([0, 3], hdata)
-        >>> sampled_hdata.hyperedge_index
-        >>> tensor([[0, 0, 1, 3, 4],
-        ...         [0, 0, 0, 2, 2]])
+            >>> hyperedge_index = [[0, 0, 1, 2, 3, 4],
+            ...                    [0, 0, 0, 1, 2, 2]]
+            >>> hdata = HData.from_hyperedge_index(hyperedge_index)
+            >>> strategy = NodeSampler()
+            >>> sampled_hdata = strategy.sample([0, 3], hdata)
+            >>> sampled_hdata.hyperedge_index
+            tensor([[0, 0, 1, 3, 4],
+                    [0, 0, 0, 2, 2]])
 
         Args:
             index: An integer or a list of integers representing node IDs to sample.
             hdata: The original HData to sample from.
 
         Returns:
-            hdata: An HData instance containing only the sampled nodes and their incident hyperedges.
+            hdata: An HData instance containing only the sampled nodes and their
+                incident hyperedges.
 
         Raises:
-            ValueError: If the provided index is invalid (e.g., empty list or list length exceeds number of nodes).
+            ValueError: If the provided index is invalid (e.g., empty list or list length exceeds
+                number of nodes).
             IndexError: If any node ID is out of bounds.
         """
         ids = self._normalize_index(index, self.len(hdata))
@@ -220,7 +230,8 @@ def sample(self, index: int | list[int], hdata: HData) -> HData:
         sampled_nodes_mask = torch.isin(node_ids, sampled_node_ids)
 
         # Get unique hyperedges that have at least one sampled node
-        # Example: hyperedge_ids = [0, 0, 0, 1, 2, 2], sampled_nodes_mask = [True, True, False, False, True, False]
+        # Example: hyperedge_ids = [0, 0, 0, 1, 2, 2],
+        #  sampled_nodes_mask = [True, True, False, False, True, False]
         #          -> sampled_hyperedge_ids = [0, 2] as they connect to sampled nodes
         sampled_hyperedge_ids = hyperedge_ids[sampled_nodes_mask].unique()
 
diff --git a/hyperbench/data/splitter.py b/hyperbench/data/splitter.py
index c7616359..81addc02 100644
--- a/hyperbench/data/splitter.py
+++ b/hyperbench/data/splitter.py
@@ -38,7 +38,8 @@ def split(self, to_split: _ToSplitType, **kwargs: Any) -> _SplitResultType:
 
         Args:
             to_split: The object to split.
-            **kwargs: Additional keyword arguments that may be required by specific splitter implementations.
+            **kwargs: Additional keyword arguments that may be required by specific splitter
+                implementations.
 
         Returns:
             The result of splitting the input object.
@@ -51,17 +52,7 @@ class DefaultDatasetSplitter(Splitter["Dataset", tuple[list["Dataset"], list[flo
     Split a dataset by hyperedges and materialize dataset partitions.
 
     Args:
-        ratios: List of floats summing to ``1.0``.
         node_space_setting: Whether to preserve full or local node spaces.
-        cover_all_nodes_in_train_split: Whether transductive splits should move
-            hyperedges into the first split until all nodes are incident to at
-            least one selected training hyperedge.
-            train_split_idx: The index of the split to treat as the train split. Defaults to ``0``,
-                so the first split is the train split that gets the full node space in the
-                transductive setting and is optionally rebalanced to cover all nodes.
-                This is used only when ``node_space_setting=="transductive"`` and ``cover_all_nodes_in_train_split==True``,
-                to determine which split should be rebalanced to cover all nodes.
-                For the 'inductive' setting, splits are always returned based on the provided ratios.
         shuffle: Whether to shuffle hyperedges before splitting.
         seed: Optional random seed for reproducibility.
     """
@@ -84,11 +75,25 @@ def split(self, to_split: Dataset, **kwargs: Any) -> tuple[list[Dataset], list[f
 
         Args:
             to_split: The `Dataset` to split.
-            ratios: Desired split ratios, used for initial split construction and
-                as a reference during rebalancing. Expected as a keyword argument.
+            kwargs:
+                ratios: Desired split ratios, used for initial split construction and
+                    as a reference during rebalancing. Expected as a keyword argument.
+                    List of floats summing to ``1.0``.
+                cover_all_nodes_in_train_split: Whether transductive splits should move
+                    hyperedges into the first split until all nodes are incident to at
+                    least one selected training hyperedge.
+                train_split_idx: The index of the split to treat as the train split.
+                    Defaults to ``0``, so the first split is the train split that gets the full
+                    node space in the transductive setting and is optionally rebalanced to cover
+                    all nodes. This is used only when ``node_space_setting=="transductive"``
+                    and ``cover_all_nodes_in_train_split==True``,
+                    to determine which split should be rebalanced to cover all nodes.
+                    For the 'inductive' setting, splits are always returned based on the
+                    provided ratios.
 
         Returns:
-            datasets_and_ratios: Split datasets and final hyperedge-count ratios.
+            split_datasets: The list of split datasets.
+            final_ratios: The list of final hyperedge-count ratios.
 
         Raises:
             ValueError: If ratios do not sum to ``1.0``, a final split has zero
@@ -149,7 +154,8 @@ def __validate_train_split_idx(self, train_split_idx: int, ratios: list[float])
         if self.node_space_setting != "transductive" and train_split_idx != 0:
             raise ValueError(
                 f"'train_split_idx' is only relevant when 'node_space_setting' is 'transductive', "
-                f"got 'node_space_setting={self.node_space_setting}' and 'train_split_idx={train_split_idx}'."
+                f"got 'node_space_setting={self.node_space_setting}' and"
+                f" 'train_split_idx={train_split_idx}'."
                 "For the 'inductive' setting, splits are returned based on the provided ratios."
             )
         validate_is_between("train_split_idx", train_split_idx, 0, len(ratios) - 1)
@@ -176,7 +182,9 @@ def split(self, to_split: HData, **kwargs: Any) -> HData:
 
         Args:
             to_split: The original `HData` containing the full hypergraph.
-            split_hyperedge_ids: The hyperedge IDs that should be included in the split, expected as a keyword argument.
+            kwargs:
+                split_hyperedge_ids: The hyperedge IDs that should be included in the split,
+                    expected as a keyword argument.
 
         Returns:
             hdata: The splitted instance with remapped node and hyperedge IDs.
@@ -282,7 +290,8 @@ def ensure_split_covers_all_nodes(
             ratios: The final ratios of hyperedges in each split after rebalancing.
 
         Raises:
-            ValueError: If one or more nodes do not appear in any hyperedge of the source hypergraph.
+            ValueError: If one or more nodes do not appear in any hyperedge of
+                the source hypergraph.
         """
         validate_is_non_empty("hyperedge_ids_by_split", hyperedge_ids_by_split)
         validate_is_between("split_idx", split_idx, 0, len(hyperedge_ids_by_split) - 1)
@@ -356,7 +365,8 @@ def get_hyperedge_ids_permutation(self, shuffle: bool | None, seed: int | None)
         Returns:
             hyperedge_ids_permutation: Ordered or shuffled hyperedge IDs on the HData device.
         """
-        # Shuffle hyperedge IDs if shuffle is requested, otherwise keep original order for deterministic splits
+        # Shuffle hyperedge IDs if shuffle is requested, otherwise keep original order
+        # for deterministic splits
         if shuffle:
             generator = create_seeded_torch_generator(device=self.device, seed=seed)
             random_hyperedge_ids_permutation = torch.randperm(
@@ -405,8 +415,9 @@ def split(self, to_split: Tensor, **kwargs: Any) -> tuple[list[Tensor], list[flo
 
         Args:
             to_split: Hyperedge IDs to partition.
-            ratios: Desired split ratios, used for initial split construction and
-                as a reference during rebalancing. Expected as a keyword argument.
+            kwargs:
+                ratios: Desired split ratios, used for initial split construction and
+                    as a reference during rebalancing. Expected as a keyword argument.
 
         Returns:
             hyperedge_ids_by_split: The updated hyperedge IDs for each split.
diff --git a/hyperbench/data/supported_datasets.py b/hyperbench/data/supported_datasets.py
index ed5ed954..c6615e8c 100644
--- a/hyperbench/data/supported_datasets.py
+++ b/hyperbench/data/supported_datasets.py
@@ -8,12 +8,15 @@
 class _PreloadedDataset(Dataset):
     """
     Base class for datasets that use default loading.
-    Subclasses should specify the ``DATASET_NAME`` class variable.
-    The dataset will be saved on disk after the first load.
+
+    Subclasses should specify the ``DATASET_NAME`` class variable. The dataset will be saved on
+    disk after the first load.
 
     Args:
-        hdata: Optional HData object. If ``None``, the dataset will be loaded using the ``DATASET_NAME``.
-        sampling_strategy: The sampling strategy to use for this dataset. Default is ``SamplingStrategy.HYPEREDGE``.
+        hdata: Optional HData object. If ``None``, the dataset will be loaded using
+            the ``DATASET_NAME``.
+        sampling_strategy: The sampling strategy to use for this dataset.
+            Default is ``SamplingStrategy.HYPEREDGE``.
     """
 
     DATASET_NAME: ClassVar[str] = ""
@@ -72,8 +75,9 @@ def __validate(self) -> None:
 
 
 def list_datasets() -> list[str]:
-    """Return supported preloaded dataset names in deterministic order."""
-
+    """
+    Return supported preloaded dataset names in deterministic order.
+    """
     return sorted(_PreloadedDataset._registry)
 
 
diff --git a/hyperbench/hlp/common.py b/hyperbench/hlp/common.py
index 870a222e..2e47eb25 100644
--- a/hyperbench/hlp/common.py
+++ b/hyperbench/hlp/common.py
@@ -14,14 +14,17 @@ class HlpModule(L.LightningModule):
     A LightningModule for HLP models with optional negative sampling.
 
     Args:
-        encoder: Optional encoder module. Defaults to ``None`` as not all HLP model will use an encoder.
+        encoder: Optional encoder module. Defaults to ``None`` as not
+            all HLP models will use an encoder.
         decoder: Decoder module to use to predict whether hyperedges are positive or negative.
         loss_fn: Loss function.
         metrics: Optional ``MetricCollection`` of torchmetrics to compute during evaluation.
             Cloned per stage (train, val, test) for independent state accumulation.
         negative_sampler: Optional negative sampler. If ``None``, no negative sampling is performed.
-        negative_sampling_schedule: When to perform negative sampling during training. Defaults to ``"every_epoch"``.
-        negative_sampling_every_n: If using ``"every_n_epochs"`` schedule, how many epochs between negative sampling runs. Defaults to ``1``.
+        negative_sampling_schedule: When to perform negative sampling during training.
+            Defaults to ``"every_epoch"``.
+        negative_sampling_every_n: If using ``"every_n_epochs"`` schedule, how many epochs between
+            negative sampling runs. Defaults to ``1``.
     """
 
     def __init__(
@@ -97,7 +100,8 @@ def _compute_metrics(
 
         Uses class-based torchmetrics with proper multi-batch accumulation:
         1. ``update()`` accumulates predictions/targets across batches.
-        2. Passing the MetricCollection to ``self.log_dict()`` tells Lightning to call ``compute()`` at epoch end and ``reset()`` automatically.
+        2. Passing the MetricCollection to ``self.log_dict()`` tells Lightning to call
+            ``compute()`` at epoch end and ``reset()`` automatically.
 
         Args:
             scores: The predicted scores (logits) from the model.
@@ -121,7 +125,7 @@ def _compute_metrics(
             stage_metrics,
             prog_bar=True,
             on_step=False,
-            on_epoch=True,  # Compute and log metrics at epoch end, not per step, for proper accumulation
+            on_epoch=True,  # Compute and log metrics at epoch end for proper accumulation
             batch_size=batch_size,
         )
 
@@ -133,7 +137,8 @@ def _get_stage_metrics(self, stage: Stage) -> MetricCollection | None:
             stage: The current stage (train/val/test) for which to get metrics.
 
         Returns:
-            metrics: The metric collection corresponding to the given stage, or ``None`` if no metrics are configured.
+            metrics: The metric collection corresponding to the given stage, or ``None``
+                if no metrics are configured.
         """
         match stage:
             case Stage.TRAIN:
@@ -146,7 +151,9 @@ def _get_stage_metrics(self, stage: Stage) -> MetricCollection | None:
                 raise ValueError(f"Unrecognized stage: {stage}")
 
     def _should_sample_negatives(self) -> bool:
-        """Whether to resample negatives for the current epoch."""
+        """
+        Whether to resample negatives for the current epoch.
+        """
         if self.__negative_sampling_scheduler is None:
             raise ValueError(
                 "Asked to check negative sampling schedule but no negative sampler is configured."
diff --git a/hyperbench/hlp/common_neighbors_hlp.py b/hyperbench/hlp/common_neighbors_hlp.py
index 0ff7cda3..781fd5fb 100644
--- a/hyperbench/hlp/common_neighbors_hlp.py
+++ b/hyperbench/hlp/common_neighbors_hlp.py
@@ -55,7 +55,9 @@ def forward(self, hyperedge_index: Tensor) -> Tensor:
         return self.decoder(hyperedge_index, self.node_to_neighbors)
 
     def on_fit_start(self) -> None:
-        """Warn users if they are running unnecessary training epochs."""
+        """
+        Warn users if they are running unnecessary training epochs.
+        """
         if self.trainer.max_epochs is None or self.trainer.max_epochs > 0:
             warnings.warn(
                 f"{self.__class__.__name__} is a non-trainable heuristic model. "
@@ -86,7 +88,8 @@ def __step(self, batch: HData, stage: Stage) -> Tensor:
 
         Args:
             batch: `HData` object containing the hypergraph.
-            stage: The current stage of evaluation (e.g., ``Stage.TRAIN``, ``Stage.VAL``, ``Stage.TEST``).
+            stage: The current stage of evaluation
+                (e.g., ``Stage.TRAIN``, ``Stage.VAL``, ``Stage.TEST``).
 
         Returns:
             loss: The computed loss.
diff --git a/hyperbench/hlp/gcn_hlp.py b/hyperbench/hlp/gcn_hlp.py
index f5b7ac62..51ba1d66 100644
--- a/hyperbench/hlp/gcn_hlp.py
+++ b/hyperbench/hlp/gcn_hlp.py
@@ -14,7 +14,7 @@ class GCNEncoderConfig(TypedDict):
     """
     Configuration for the GCN encoder in GCNHlpModule.
 
-    Args:
+    Attributes:
         in_channels: Number of input features per node.
         out_channels: Number of output features (embedding size) per node.
         hidden_channels: Number of hidden units in the intermediate GCN layers.
@@ -25,9 +25,11 @@ class GCNEncoderConfig(TypedDict):
         add_self_loops: Whether to add self-loops before convolution. Defaults to ``True``.
         normalize: Whether to normalize the adjacency matrix in ``GCNConv``. Defaults to ``True``.
         cached: Whether to cache the normalized graph in ``GCNConv``. Defaults to ``False``.
-        graph_reduction_strategy: Strategy for reducing the hypergraph to a graph. Defaults to ``"clique_expansion"``
-        num_nodes: Total number of nodes in the hypergraph. This is useful when setting is transductive
-            but train dataset may not contain all hyperedges where some nodes appear, to ensure consistent encoding across splits.
+        graph_reduction_strategy: Strategy for reducing the hypergraph to a graph.
+            Defaults to ``"clique_expansion"``.
+        num_nodes: Total number of nodes in the hypergraph. This is useful when the setting is
+            transductive but the train dataset may not contain all hyperedges where some nodes
+            appear, to ensure consistent encoding across splits.
         activation_fn: Activation function to use after each hidden layer. Defaults to ``nn.ReLU``.
         activation_fn_kwargs: Keyword arguments for the activation function. Defaults to empty dict.
     """
diff --git a/hyperbench/hlp/hgnn_hlp.py b/hyperbench/hlp/hgnn_hlp.py
index fdb9f63a..b70b9add 100644
--- a/hyperbench/hlp/hgnn_hlp.py
+++ b/hyperbench/hlp/hgnn_hlp.py
@@ -14,7 +14,7 @@ class HGNNEncoderConfig(TypedDict):
     """
     Configuration for the HGNN encoder in HGNNHlpModule.
 
-    Args:
+    Attributes:
         in_channels: Number of input features per node.
         hidden_channels: Number of hidden units in the intermediate HGNN layer.
         out_channels: Number of output features (embedding size) per node.
@@ -83,33 +83,34 @@ def forward(self, x: Tensor, hyperedge_index: Tensor) -> Tensor:
         Run the full HGNN-based hyperedge link prediction pipeline.
 
         The pipeline has three stages:
-        1. Encode: HGNN applies two rounds of ``D_n^{-1/2} H D_e^{-1} H^T D_n^{-1/2}``
-           smoothing to propagate information through the hypergraph topology (nodes ->
-           hyperedges -> nodes). The output is a structure-aware node embedding matrix of
-           shape ``(num_nodes, out_channels)``.
-        2. Aggregate: For each hyperedge being scored, pool the embeddings of its member
-           nodes using the configured strategy (mean/max/min/sum). This produces a hyperedge
-           embedding that summarizes the collective representation of the hyperedge's nodes.
-           Shape: ``(num_hyperedges, out_channels)``.
-        3. Decode: A single linear layer (SLP) projects each hyperedge embedding to a
-           scalar score representing the likelihood that the hyperedge is a real (positive)
-           hyperedge. Shape: ``(num_hyperedges,)``.
+            1. Encode: HGNN applies two rounds of ``D_n^{-1/2} H D_e^{-1} H^T D_n^{-1/2}``
+            smoothing to propagate information through the hypergraph topology (nodes ->
+            hyperedges -> nodes). The output is a structure-aware node embedding matrix of
+            shape ``(num_nodes, out_channels)``.
+            2. Aggregate: For each hyperedge being scored, pool the embeddings of its member
+            nodes using the configured strategy (mean/max/min/sum). This produces a hyperedge
+            embedding that summarizes the collective representation of the hyperedge's nodes.
+            Shape: ``(num_hyperedges, out_channels)``.
+            3. Decode: A single linear layer (SLP) projects each hyperedge embedding to a
+            scalar score representing the likelihood that the hyperedge is a real (positive)
+            hyperedge. Shape: ``(num_hyperedges,)``.
 
         Examples:
-            Given 5 nodes with 8 features and 2 hyperedges::
+            Given 5 nodes with 8 features and 2 hyperedges:
 
                 >>> x.shape  # (5, 8) - all nodes in the hypergraph
                 >>> hyperedge_index = [[0, 1, 2, 3, 4],  # node IDs
                 ...                    [0, 0, 0, 1, 1]]  # hyperedge IDs
 
             The forward pass:
-                1. HGNN encodes all 5 nodes using the hypergraph Laplacian.
-                   ``node_embeddings.shape = (5, out_channels)``
-                2. Aggregate per hyperedge:
-                   - hyperedge 0: pool(emb[0], emb[1], emb[2])
-                   - hyperedge 1: pool(emb[3], emb[4])
-                   ``hyperedge_embeddings.shape = (2, out_channels)``
-                3. Decode: one scalar per hyperedge -> ``scores.shape = (2,)``
+
+                >>> HGNN encodes all 5 nodes using the hypergraph Laplacian.
+                ...   ``node_embeddings.shape = (5, out_channels)``
+                >>> Aggregate per hyperedge:
+                ...   - hyperedge 0: pool(emb[0], emb[1], emb[2])
+                ...   - hyperedge 1: pool(emb[3], emb[4])
+                ...   ``hyperedge_embeddings.shape = (2, out_channels)``
+                >>> Decode: one scalar per hyperedge -> ``scores.shape = (2,)``
 
         Args:
             x: Node feature matrix of shape ``(num_nodes, in_channels)``.
diff --git a/hyperbench/hlp/hgnnp_hlp.py b/hyperbench/hlp/hgnnp_hlp.py
index 1769a41a..c97fe9e5 100644
--- a/hyperbench/hlp/hgnnp_hlp.py
+++ b/hyperbench/hlp/hgnnp_hlp.py
@@ -14,7 +14,7 @@ class HGNNPEncoderConfig(TypedDict):
     """
     Configuration for the HGNN+ encoder in HGNNPHlpModule.
 
-    Args:
+    Attributes:
         in_channels: Number of input features per node.
         hidden_channels: Number of hidden units in the intermediate HGNN+ layer.
         out_channels: Number of output features (embedding size) per node.
@@ -83,15 +83,15 @@ def forward(self, x: Tensor, hyperedge_index: Tensor) -> Tensor:
         Run the full HGNN+-based hyperedge link prediction pipeline.
 
         The pipeline has three stages:
-        1. Encode: HGNN+ applies two rounds of ``D_v^{-1} H D_e^{-1} H^T``
-           smoothing to propagate information through the hypergraph topology with
-           two-stage mean aggregation. The output is a structure-aware node
-           embedding matrix of shape ``(num_nodes, out_channels)``.
-        2. Aggregate: For each hyperedge being scored, pool the embeddings of its member
-           nodes using the configured strategy (mean/max/min/sum). This produces a hyperedge
-           embedding of shape ``(num_hyperedges, out_channels)``.
-        3. Decode: A single linear layer projects each hyperedge embedding to a
-           scalar score. Shape: ``(num_hyperedges,)``.
+            1. Encode: HGNN+ applies two rounds of ``D_v^{-1} H D_e^{-1} H^T``
+            smoothing to propagate information through the hypergraph topology with
+            two-stage mean aggregation. The output is a structure-aware node
+            embedding matrix of shape ``(num_nodes, out_channels)``.
+            2. Aggregate: For each hyperedge being scored, pool the embeddings of its member
+            nodes using the configured strategy (mean/max/min/sum). This produces a hyperedge
+            embedding of shape ``(num_hyperedges, out_channels)``.
+            3. Decode: A single linear layer projects each hyperedge embedding to a
+            scalar score. Shape: ``(num_hyperedges,)``.
 
         Args:
             x: Node feature matrix of shape ``(num_nodes, in_channels)``.
diff --git a/hyperbench/hlp/hnhn_hlp.py b/hyperbench/hlp/hnhn_hlp.py
index f4836d4c..e9f02148 100644
--- a/hyperbench/hlp/hnhn_hlp.py
+++ b/hyperbench/hlp/hnhn_hlp.py
@@ -14,7 +14,7 @@ class HNHNEncoderConfig(TypedDict):
     """
     Configuration for the HNHN encoder in HNHNHlpModule.
 
-    Args:
+    Attributes:
         in_channels: Number of input features per node.
         hidden_channels: Number of hidden units in the intermediate HNHN layer.
         out_channels: Number of output features (embedding size) per node.
diff --git a/hyperbench/hlp/hypergcn_hlp.py b/hyperbench/hlp/hypergcn_hlp.py
index 3981f4a6..7f3abf34 100644
--- a/hyperbench/hlp/hypergcn_hlp.py
+++ b/hyperbench/hlp/hypergcn_hlp.py
@@ -14,16 +14,19 @@ class HyperGCNEncoderConfig(TypedDict):
     """
     Configuration for the HyperGCN encoder in HyperGCNHlpModule.
 
-    Args:
+    Attributes:
         in_channels: Number of input features per node.
         hidden_channels: Number of hidden units in the intermediate HyperGCN layer.
         out_channels: Number of output features (embedding size) per node.
         bias: Whether to include bias terms. Defaults to ``True``.
         use_batch_normalization: Whether to use batch normalization. Defaults to ``False``.
         drop_rate: Dropout rate. Defaults to ``0.5``.
-        use_mediator: Whether to use mediator nodes for hyperedge-to-edge conversion. Defaults to ``False``.
-        fast: Whether to cache the graph structure after first computation. Defaults to ``True``.
-        seed: Optional random seed for the random reduction of hyperedges to edges. Defaults to ``None``.
+        use_mediator: Whether to use mediator nodes for hyperedge-to-edge conversion.
+            Defaults to ``False``.
+        fast: Whether to cache the graph structure after first computation.
+            Defaults to ``True``.
+        seed: Optional random seed for the random reduction of hyperedges to edges.
+            Defaults to ``None``.
     """
 
     in_channels: int
@@ -99,19 +102,20 @@ def forward(self, x: Tensor, hyperedge_index: Tensor) -> Tensor:
             3. Decode: A linear layer scores each hyperedge embedding.
 
         Examples:
-            Given 5 nodes with 3 features and 2 hyperedges::
+            Given 5 nodes with 3 features and 2 hyperedges:
 
                 >>> x.shape  # (5, 3) — all nodes in the hypergraph
                 >>> hyperedge_index = [[0, 1, 2, 3, 4],  # node IDs (global)
                 ...                    [0, 0, 0, 1, 1]]  # hyperedge IDs
 
             The forward pass:
-                1. HyperGCN encodes all 5 nodes using the full graph Laplacian.
-                   ``node_embeddings.shape = (5, out_channels)``
-                2. Aggregate per hyperedge:
-                   - hyperedge 0: pool(emb[0], emb[1], emb[2])
-                   - hyperedge 1: pool(emb[3], emb[4])
-                3. Decode: one scalar score per hyperedge → ``scores.shape = (2,)``
+
+                >>> HyperGCN encodes all 5 nodes using the full graph Laplacian.
+                ...   ``node_embeddings.shape = (5, out_channels)``
+                >>> Aggregate per hyperedge:
+                ...   - hyperedge 0: pool(emb[0], emb[1], emb[2])
+                ...   - hyperedge 1: pool(emb[3], emb[4])
+                >>> Decode: one scalar score per hyperedge → ``scores.shape = (2,)``
 
         Args:
             x: Node feature matrix of shape ``(num_nodes, in_channels)``.
diff --git a/hyperbench/hlp/mlp_hlp.py b/hyperbench/hlp/mlp_hlp.py
index c25c7b4d..1bf5a7d2 100644
--- a/hyperbench/hlp/mlp_hlp.py
+++ b/hyperbench/hlp/mlp_hlp.py
@@ -14,17 +14,23 @@ class MlpEncoderConfig(TypedDict):
     """
     Configuration for the MLP encoder in MLPHlpModule.
 
-    Args:
+    Attributes:
         in_channels: Number of input features per node.
         out_channels: Number of output features (embedding size) per node.
         num_layers: Number of layers in the MLP encoder.
-        hidden_channels: Optional number of hidden units per layer. If ``None``, no hidden layers are used and the encoder is a simple linear layer.
-        activation_fn: Optional activation function class to use in the MLP encoder. If ``None``, no activation function is applied.
-        activation_fn_kwargs: Optional dictionary of keyword arguments to pass to the activation function constructor.
-        normalization_fn: Optional normalization function class to use in the MLP encoder. If ``None``, no normalization is applied.
-        normalization_fn_kwargs: Optional dictionary of keyword arguments to pass to the normalization function constructor.
+        hidden_channels: Optional number of hidden units per layer. If ``None``, no hidden layers
+            are used and the encoder is a simple linear layer.
+        activation_fn: Optional activation function class to use in the MLP encoder.
+            If ``None``, no activation function is applied.
+        activation_fn_kwargs: Optional dictionary of keyword arguments to pass to the activation
+            function constructor.
+        normalization_fn: Optional normalization function class to use in the MLP encoder.
+            If ``None``, no normalization is applied.
+        normalization_fn_kwargs: Optional dictionary of keyword arguments to pass to the
+            normalization function constructor.
         bias: Whether to include bias terms in the MLP layers. Defaults to ``True``.
-        drop_rate: Dropout rate to apply after each MLP layer (except the last one). Defaults to ``0.0`` (no dropout).
+        drop_rate: Dropout rate to apply after each MLP layer (except the last one).
+            Defaults to ``0.0`` (no dropout).
     """
 
     in_channels: int
@@ -76,7 +82,8 @@ def __init__(
             drop_rate=encoder_config.get("drop_rate", 0.0),
         )
 
-        # The decoder takes in the aggregated hyperedge embeddings of shape (num_hyperedges, encoder_config.out_channels)
+        # The decoder takes in the aggregated hyperedge embeddings of shape
+        # (num_hyperedges, encoder_config.out_channels)
         # and produces a score for each hyperedge of shape (num_hyperedges, 1).
         decoder = SLP(in_channels=encoder_config.get("out_channels", 1), out_channels=1)
 
@@ -107,14 +114,15 @@ def forward(self, x: Tensor, hyperedge_index: Tensor) -> Tensor:
                 ...                    [0, 0, 0, 1, 1]]   # hyperedge ids
 
             The forward pass:
-                1. Encoder maps each node to an embedding vector.
-                2. Aggregate embeddings by summing them per hyperedge:
-                    - hyperedge 0: emb[0] + emb[1] + emb[2]
-                    - hyperedge 1: emb[2] + emb[3]
-                3. Sums are divided by the number of nodes per hyperedge (mean pooling):
-                    - hyperedge 0: (emb[0] + emb[1] + emb[2]) / 3
-                    - hyperedge 1: (emb[2] + emb[3]) / 2
-                4. Decoder scores each hyperedge embedding, producing one scalar per hyperedge.
+
+                >>> Encoder maps each node to an embedding vector.
+                >>> Aggregate embeddings by summing them per hyperedge:
+                ...   - hyperedge 0: emb[0] + emb[1] + emb[2]
+                ...   - hyperedge 1: emb[2] + emb[3]
+                >>> Sums are divided by the number of nodes per hyperedge (mean pooling):
+                ...   - hyperedge 0: (emb[0] + emb[1] + emb[2]) / 3
+                ...   - hyperedge 1: (emb[2] + emb[3]) / 2
+                >>> Decoder scores each hyperedge embedding, producing one scalar per hyperedge.
 
         Args:
             x: Node feature matrix of shape ``(num_nodes, in_channels)``.
@@ -137,8 +145,10 @@ def forward(self, x: Tensor, hyperedge_index: Tensor) -> Tensor:
 
         # Aggregate: for each hyperedge, aggregate the embeddings of its member nodes.
         # Example::
-        # - hyperedge 0 contains node 0, 1, 2 -> aggregate([e00, e01], [e10, e11], [e20, e21]) -> [pooled_0, pooled_1]
-        # - hyperedge 1 contains node 2, 3 -> aggregate([e20, e21], [e30, e31]) -> [pooled_0, pooled_1]
+        # - hyperedge 0 contains node 0, 1, 2 -> aggregate([e00, e01], [e10, e11], [e20, e21])
+        #                                         -> [pooled_0, pooled_1]
+        # - hyperedge 1 contains node 2, 3 -> aggregate([e20, e21], [e30, e31])
+        #                                  -> [pooled_0, pooled_1]
         # shape: (num_hyperedges, out_channels)
         hyperedge_embeddings = HyperedgeAggregator(hyperedge_index, node_embeddings).pool(
             self.aggregation,
diff --git a/hyperbench/hlp/nhp_hlp.py b/hyperbench/hlp/nhp_hlp.py
index d6912d89..625d7c37 100644
--- a/hyperbench/hlp/nhp_hlp.py
+++ b/hyperbench/hlp/nhp_hlp.py
@@ -14,7 +14,7 @@ class NHPEncoderConfig(TypedDict):
     """
     Configuration for the NHP encoder/scorer to be used for hyperedge link prediction.
 
-    Args:
+    Attributes:
         in_channels: Number of input features per node.
         hidden_channels: Number of hidden channels for incidence embeddings. Defaults to ``512``.
         aggregation: Hyperedge scoring aggregation. ``"maxmin"`` uses the paper's
diff --git a/hyperbench/hlp/node2vec_common.py b/hyperbench/hlp/node2vec_common.py
index 3d9c9905..d1cba1dd 100644
--- a/hyperbench/hlp/node2vec_common.py
+++ b/hyperbench/hlp/node2vec_common.py
@@ -18,22 +18,29 @@ class Node2VecGCNHlpConfig(TypedDict):
     """
     Configuration for the GCN model.
 
-    Args:
+    Attributes:
         out_channels: Dimension of the output node embeddings from the GCN layers.
         hidden_channels: Dimension of the hidden node embeddings in the GCN layers.
         num_layers: Number of GCN layers. Must be at least 1. Defaults to ``2``.
-        drop_rate: Dropout rate applied after each GCN layer (except the last one). Defaults to ``0.0`` (no dropout).
+        drop_rate: Dropout rate applied after each GCN layer (except the last one).
+            Defaults to ``0.0`` (no dropout).
         bias: Whether to include a bias term in the GCN layers. Defaults to ``True``.
         improved: Whether to use the improved version of GCNConv. Defaults to ``False``.
         add_self_loops: Whether to add self-loops to the input graph. Defaults to ``True``.
-        normalize: Whether to symmetrically normalize the adjacency matrix in GCNConv. Defaults to ``True``.
+        normalize: Whether to symmetrically normalize the adjacency matrix in GCNConv.
+            Defaults to ``True``.
         cached: Whether to cache the normalized adjacency matrix in GCNConv.
-            Only applicable if the graph structure does not change between epochs. Defaults to ``False``.
-        graph_reduction_strategy: Strategy for reducing the hyperedge graph. Defaults to ``clique_expansion``.
-        num_nodes: Total number of nodes in the hypergraph. This is useful when setting is transductive
-            but train dataset may not contain all hyperedges where some nodes appear, to ensure consistent encoding across splits.
-        activation_fn: Activation function to use after each hidden layer. Defaults to ``nn.ReLU``.
-        activation_fn_kwargs: Keyword arguments for the activation function. Defaults to empty dict.
+            Only applicable if the graph structure does not change between epochs.
+            Defaults to ``False``.
+        graph_reduction_strategy: Strategy for reducing the hyperedge graph.
+            Defaults to ``clique_expansion``.
+        num_nodes: Total number of nodes in the hypergraph. This is useful when the setting is
+            transductive but the train dataset may not contain every hyperedge in which some
+            nodes appear, to ensure consistent encoding across splits.
+        activation_fn: Activation function to use after each hidden layer.
+            Defaults to ``nn.ReLU``.
+        activation_fn_kwargs: Keyword arguments for the activation function.
+            Defaults to empty dict.
     """
 
     out_channels: int
@@ -55,12 +62,15 @@ class Node2VecHlpConfig(TypedDict):
     """
     Configuration for the Node2Vec encoder.
 
-    Args:
+    Attributes:
         context_size: Skip-gram context size for Node2Vec.
-            For example, if ``context_size=2`` and ``walk_length=5``, then for a random walk ``[v0, v1, v2, v3, v4]``,
-            the context for ``v2`` would be ``[v0, v1, v3, v4]`` as we take neighbors within distance 2 in the walk.
+            For example, if ``context_size=2`` and ``walk_length=5``, then for a
+            random walk ``[v0, v1, v2, v3, v4]``,
+            the context for ``v2`` would be ``[v0, v1, v3, v4]`` as we take neighbors within
+            distance 2 in the walk.
             The pairs generated by skip-gram would be ``[(v2, v0), (v2, v1), (v2, v3), (v2, v4)]``.
-            Rule of thumb: Graphs with strong local structure (5-10), Graphs with communities/long-range patterns (10-20).
+            Rule of thumb: Graphs with strong local structure (5-10), Graphs with
+            communities/long-range patterns (10-20).
             Defaults to ``10``.
         walk_length: Length of each random walk.
         num_walks_per_node: Number of walks sampled per node.
@@ -76,14 +86,19 @@ class Node2VecHlpConfig(TypedDict):
             ``X`` negative pairs ``(u, v_neg)`` will be generated,
             where ``v_neg`` is a node sampled uniformly at random from all nodes in the graph.
             Defaults to ``1``, meaning one negative sample per positive pair.
-        num_nodes: Number of nodes in the stable node space. Defaults to the number of nodes in the ``hyperedge_index`` if not provided.
-        train_hyperedge_index: Training hypereddge index used to build the Node2Vec walk graph. Required in ``joint`` mode.
-        graph_reduction_strategy: Strategy for reducing the hyperedge graph. Defaults to ``clique_expansion``.
+        num_nodes: Number of nodes in the stable node space. Defaults to the number of nodes
+            in the ``hyperedge_index`` if not provided.
+        train_hyperedge_index: Training hyperedge index used to build the Node2Vec walk graph.
+            Required in ``joint`` mode.
+        graph_reduction_strategy: Strategy for reducing the hyperedge graph.
+            Defaults to ``clique_expansion``.
         random_walk_batch_size: Batch size used by the walk sampler in joint mode.
         node2vec_loss_weight: Weight applied to the Node2Vec walk loss in joint mode.
-            This is to decide how much the loss of Node2Vec contributes to the overall loss in joint training, relative to the HLP loss.
-             Defaults to ``1.0`` (equal weighting). Set to a higher value to prioritize learning good node embeddings,
-             or a lower value to prioritize the HLP loss. Ignored in precomputed mode.
+            This is to decide how much the loss of Node2Vec contributes to the overall loss in
+            joint training, relative to the HLP loss.
+            Defaults to ``1.0`` (equal weighting). Set to a higher value to prioritize learning
+            good node embeddings, or a lower value to prioritize the HLP loss.
+            Ignored in precomputed mode.
         sparse: Whether to use sparse gradients in the Node2Vec encoder. Defaults to ``False``.
     """
 
@@ -105,9 +120,9 @@ class Node2VecWalkLoaderState:
     """
     State object to hold the walk loader and its iterator for joint Node2Vec training.
 
-    Args:
-        walk_loader: The DataLoader that provides batches of random walks from the Node2Vec encoder during joint training.
-            Initialized lazily when first needed.
+    Attributes:
+        walk_loader: The DataLoader that provides batches of random walks from the Node2Vec encoder
+            during joint training. Initialized lazily when first needed.
         cached_walk_loader_iterator: An iterator over the walk_loader, cached to allow
             fetching the next batch of walks at each training step without reinitializing.
     """
diff --git a/hyperbench/hlp/node2vecgcn_hlp.py b/hyperbench/hlp/node2vecgcn_hlp.py
index 1cfe0554..94b700cd 100644
--- a/hyperbench/hlp/node2vecgcn_hlp.py
+++ b/hyperbench/hlp/node2vecgcn_hlp.py
@@ -28,10 +28,12 @@ class Node2VecGCNEncoderConfig(TypedDict):
     """
     Configuration for the Node2Vec encoder in ``Node2VecGCNHlpModule``.
 
-    Args:
-        mode: Whether to use precomputed node embeddings from ``x`` or train a Node2Vec encoder jointly inside the module.
+    Attributes:
+        mode: Whether to use precomputed node embeddings from ``x`` or train a Node2Vec encoder
+            jointly inside the module.
         num_features: Dimension of the node embeddings consumed by the decoder.
-        node2vec_config: Shared Node2Vec configuration used in joint mode, or metadata for validating precomputed embeddings.
+        node2vec_config: Shared Node2Vec configuration used in joint mode, or metadata for
+            validating precomputed embeddings.
         gcn_config: Configuration for the GCN layers.
     """
 
@@ -46,15 +48,16 @@ class Node2VecGCNHlpModule(HlpModule):
     A LightningModule for Node2Vec-based Hyperedge Link Prediction with GCN encoder.
 
     Supports two modes:
-    - ``precomputed``: use node embeddings already stored in ``batch.x``.
-    - ``joint``: train a Node2Vec encoder jointly with the GCN layers and hyperedge decoder.
+        - ``precomputed``: use node embeddings already stored in ``batch.x``.
+        - ``joint``: train a Node2Vec encoder jointly with the GCN layers and hyperedge decoder.
 
     Args:
         encoder_config: Configuration for the Node2Vec encoder and GCN layers.
         aggregation: Method to aggregate node embeddings per hyperedge.
         loss_fn: Loss function. Defaults to ``BCEWithLogitsLoss``.
         lr: Learning rate for the optimizer. Defaults to ``0.001``.
-        weight_decay: Weight decay (L2 regularization) for the optimizer. Defaults to ``0.0`` (no weight decay).
+        weight_decay: Weight decay (L2 regularization) for the optimizer.
+            Defaults to ``0.0`` (no weight decay).
         metrics: Optional dictionary of metric functions.
     """
 
diff --git a/hyperbench/hlp/node2vecslp_hlp.py b/hyperbench/hlp/node2vecslp_hlp.py
index 4999d052..355ced2f 100644
--- a/hyperbench/hlp/node2vecslp_hlp.py
+++ b/hyperbench/hlp/node2vecslp_hlp.py
@@ -25,10 +25,12 @@ class Node2VecSLPEncoderConfig(TypedDict):
     """
     Configuration for the Node2Vec encoder in ``Node2VecSLPHlpModule``.
 
-    Args:
-        mode: Whether to use precomputed node embeddings from ``x`` or train a Node2Vec encoder jointly inside the module.
+    Attributes:
+        mode: Whether to use precomputed node embeddings from ``x`` or train a Node2Vec encoder
+            jointly inside the module.
         num_features: Dimension of the node embeddings consumed by the decoder.
-        node2vec_config: Shared Node2Vec configuration used in joint mode, or metadata for validating precomputed embeddings.
+        node2vec_config: Shared Node2Vec configuration used in joint mode, or metadata for
+            validating precomputed embeddings.
     """
 
     mode: NotRequired[Node2VecMode]
@@ -41,15 +43,16 @@ class Node2VecSLPHlpModule(HlpModule):
     A LightningModule for Node2Vec-based Hyperedge Link Prediction.
 
     Supports two modes:
-    - ``precomputed``: use node embeddings already stored in ``batch.x``.
-    - ``joint``: train a Node2Vec encoder jointly with the hyperedge decoder.
+        - ``precomputed``: use node embeddings already stored in ``batch.x``.
+        - ``joint``: train a Node2Vec encoder jointly with the hyperedge decoder.
 
     Args:
         encoder_config: Configuration for the Node2Vec encoder.
         aggregation: Method to aggregate node embeddings per hyperedge.
         loss_fn: Loss function. Defaults to ``BCEWithLogitsLoss``.
         lr: Learning rate for the optimizer. Defaults to ``0.001``.
-        weight_decay: Weight decay (L2 regularization) for the optimizer. Defaults to ``0.0`` (no weight decay).
+        weight_decay: Weight decay (L2 regularization) for the optimizer.
+            Defaults to ``0.0`` (no weight decay).
         metrics: Optional dictionary of metric functions.
     """
 
diff --git a/hyperbench/hlp/villain_hlp.py b/hyperbench/hlp/villain_hlp.py
index 3e08c6f3..6edca6ba 100644
--- a/hyperbench/hlp/villain_hlp.py
+++ b/hyperbench/hlp/villain_hlp.py
@@ -14,7 +14,7 @@ class VilLainEncoderConfig(TypedDict):
     """
     Configuration for ``VilLainHlpModule``.
 
-    Args:
+    Attributes:
         num_nodes: Total number of trainable nodes.
         embedding_dim: Returned node and hyperedge embedding dimension. Defaults to ``128``.
         labels_per_subspace: Number of virtual labels per subspace. Defaults to ``2``.
@@ -42,7 +42,8 @@ class VilLainHlpModule(HlpModule):
     Args:
         encoder_config: Configuration for the VilLain encoder.
         embedding_mode: Whether to return node or hyperedge embeddings from the VilLain encoder.
-        aggregation: Aggregation method to pool node embeddings into hyperedge embeddings when ``embedding_mode="node"``.
+        aggregation: Aggregation method to pool node embeddings into hyperedge embeddings
+            when ``embedding_mode="node"``.
             Ignored when ``embedding_mode="hyperedge"``. Defaults to ``maxmin``.
         loss_fn: Loss function for the HLP task. Defaults to ``nn.BCEWithLogitsLoss()``.
         lr: Learning rate for the optimizer. Defaults to ``0.01``.
diff --git a/hyperbench/integration_tests/data/enricher_integration_test.py b/hyperbench/integration_tests/data/enricher_integration_test.py
index 34f42cd8..b7c24896 100644
--- a/hyperbench/integration_tests/data/enricher_integration_test.py
+++ b/hyperbench/integration_tests/data/enricher_integration_test.py
@@ -33,9 +33,10 @@
 # reasonable amount of time, we limit the number of nodes and hyperedges to 75000
 # for the enrichment tests. This allows us to test the functionality of the
 # enrichers without running into excessively long test times, while still providing
-#  a meaningful test of their behavior on reasonably sized datasets.
+# a meaningful test of their behavior on reasonably sized datasets.
 # With the threshold of 75000 nodes and hyperedges, we cover ~75% of the datasets.
-# The datasets.py in the scripts folder contains a function that calculates the node count cutoff to cover 75% of the datasets.
+# The datasets.py in the scripts folder contains a function that calculates the node count
+# cutoff to cover 75% of the datasets.
 
 
 @pytest.mark.flaky(reruns=3, reruns_delay=10, rerun_show_tracebacks=True)
diff --git a/hyperbench/models/gcn.py b/hyperbench/models/gcn.py
index e889e836..815fde6f 100644
--- a/hyperbench/models/gcn.py
+++ b/hyperbench/models/gcn.py
@@ -9,20 +9,25 @@ class GCNConfig(TypedDict):
     """
     Configuration for the GCN model.
 
-    Args:
+    Attributes:
         in_channels: Dimension of the input node embeddings to the GCN layers.
         out_channels: Dimension of the output node embeddings from the GCN layers.
         hidden_channels: Dimension of the hidden node embeddings in the GCN layers.
         num_layers: Number of GCN layers. Must be at least 1. Defaults to ``2``.
-        drop_rate: Dropout rate applied after each GCN layer (except the last one). Defaults to ``0.0`` (no dropout).
-        activation_fn: Activation function to use after each hidden layer. Defaults to ``nn.ReLU``.
-        activation_fn_kwargs: Keyword arguments for the activation function. Defaults to empty dict.
+        drop_rate: Dropout rate applied after each GCN layer (except the last one).
+            Defaults to ``0.0`` (no dropout).
+        activation_fn: Activation function to use after each hidden layer.
+            Defaults to ``nn.ReLU``.
+        activation_fn_kwargs: Keyword arguments for the activation function.
+            Defaults to empty dict.
         bias: Whether to include a bias term in the GCN layers. Defaults to ``True``.
         improved: Whether to use the improved version of GCNConv. Defaults to ``False``.
         add_self_loops: Whether to add self-loops to the input graph. Defaults to ``True``.
-        normalize: Whether to symmetrically normalize the adjacency matrix in GCNConv. Defaults to ``True``.
+        normalize: Whether to symmetrically normalize the adjacency matrix in GCNConv.
+            Defaults to ``True``.
         cached: Whether to cache the normalized adjacency matrix in GCNConv.
-            Only applicable if the graph structure does not change between epochs. Defaults to ``False``.
+            Only applicable if the graph structure does not change between epochs.
+            Defaults to ``False``.
     """
 
     in_channels: int
@@ -52,7 +57,8 @@ class GCN(nn.Module):
         drop_rate: Dropout rate applied after each GCN layer except the last one.
         bias: Whether to include a bias term in the GCN layers.
         activation_fn: Activation function to use after each hidden layer. Defaults to ``nn.ReLU``.
-        activation_fn_kwargs: Keyword arguments for the activation function. Defaults to empty dict.
+        activation_fn_kwargs: Keyword arguments for the activation function.
+            Defaults to empty dict.
         improved: Whether to use the improved version of ``GCNConv``.
         add_self_loops: Whether to add self-loops to the input graph.
         normalize: Whether to symmetrically normalize the adjacency matrix in ``GCNConv``.
@@ -122,7 +128,8 @@ def __build_layers(
         hidden_channels = hidden_channels if hidden_channels is not None else 0
         if num_layers > 1 and hidden_channels <= 0:
             raise ValueError(
-                f"Expected positive hidden_channels for GCN with multiple layers, got {hidden_channels}."
+                f"Expected positive hidden_channels for GCN with multiple layers, "
+                f"got {hidden_channels}."
             )
 
         common_kwargs: dict[str, bool] = {
diff --git a/hyperbench/models/hgnn.py b/hyperbench/models/hgnn.py
index 6d2255e2..21fd6b4e 100644
--- a/hyperbench/models/hgnn.py
+++ b/hyperbench/models/hgnn.py
@@ -9,15 +9,19 @@ class HGNN(nn.Module):
     Unlike HyperGCN (which approximates each hyperedge by selecting representative pairwise
     edges via random projection), HGNN preserves all higher-order relationships by passing
     messages through the full incidence structure: nodes -> hyperedges -> nodes.
-    - Proposed in `Hypergraph Neural Networks <https://arxiv.org/pdf/1809.09401>`_ paper (AAAI 2019).
-    - Reference implementation: `source <https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/models/hypergraphs/hgnn.html#HGNN>`_.
+
+    References:
+        - Proposed in [Hypergraph Neural Networks](https://arxiv.org/pdf/1809.09401) (AAAI 2019).
+        - Reference implementation: [Code](https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/models/hypergraphs/hgnn.html#HGNN).
 
     Args:
         in_channels: The number of input channels.
         hidden_channels: The number of hidden channels.
         num_classes: The number of output channels.
-        bias: If set to ``False``, the layer will not learn the bias parameter. Defaults to ``True``.
-        use_batch_normalization: If set to ``True``, layers will use batch normalization. Defaults to ``False``.
+        bias: If set to ``False``, the layer will not learn the bias parameter.
+            Defaults to ``True``.
+        use_batch_normalization: If set to ``True``, layers will use batch normalization.
+            Defaults to ``False``.
         drop_rate: Dropout ratio. Defaults to ``0.5``.
     """
 
diff --git a/hyperbench/models/hgnnp.py b/hyperbench/models/hgnnp.py
index d52332a5..7c73dce7 100644
--- a/hyperbench/models/hgnnp.py
+++ b/hyperbench/models/hgnnp.py
@@ -6,17 +6,21 @@ class HGNNP(nn.Module):
     """
     HGNN+ performs hypergraph convolution with two-stage mean aggregation using the
     incidence structure directly: nodes -> hyperedges -> nodes.
-    - Proposed in `HGNN+: General Hypergraph Neural Networks <https://ieeexplore.ieee.org/document/9795251>`_ paper (IEEE T-PAMI 2022).
-    - Reference implementation: `source <https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/models/hypergraphs/hgnnp.html#HGNNP>`_.
+
+    References:
+        - Proposed in [HGNN+: General Hypergraph Neural Networks](https://ieeexplore.ieee.org/document/9795251) paper (IEEE T-PAMI 2022).
+        - Reference implementation: [Code](https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/models/hypergraphs/hgnnp.html#HGNNP).
 
     Args:
         in_channels: The number of input channels.
         hidden_channels: The number of hidden channels.
         num_classes: The number of output channels.
-        bias: If set to ``False``, the layer will not learn the bias parameter. Defaults to ``True``.
-        use_batch_normalization: If set to ``True``, layers will use batch normalization. Defaults to ``False``.
+        bias: If set to ``False``, the layer will not learn the bias parameter.
+            Defaults to ``True``.
+        use_batch_normalization: If set to ``True``, layers will use batch normalization.
+            Defaults to ``False``.
         drop_rate: Dropout ratio. Defaults to ``0.5``.
-    """
+    """  # noqa: E501
 
     def __init__(
         self,
diff --git a/hyperbench/models/hnhn.py b/hyperbench/models/hnhn.py
index 350ae906..9ca8b5ea 100644
--- a/hyperbench/models/hnhn.py
+++ b/hyperbench/models/hnhn.py
@@ -5,19 +5,23 @@
 
 class HNHN(nn.Module):
     """
-    HNHN performs incidence-based hypergraph convolution with explicit hyperedge
-    embeddings between the node -> hyperedge -> node propagation steps.
-    - Proposed in `HNHN: Hypergraph Networks with Hyperedge Neurons <https://arxiv.org/abs/2006.12278>`_ paper.
-    - Reference implementation: `source <https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/models/hypergraphs/hnhn.html#HNHN>`_.
+    HNHN performs incidence-based hypergraph convolution with explicit hyperedge embeddings between
+    the node -> hyperedge -> node propagation steps.
+
+    References:
+        - Proposed in [HNHN: Hypergraph Networks with Hyperedge Neurons](https://arxiv.org/abs/2006.12278) paper.
+        - Reference implementation: [Code](https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/models/hypergraphs/hnhn.html#HNHN).
 
     Args:
         in_channels: The number of input channels.
         hidden_channels: The number of hidden channels.
         num_classes: The number of output channels.
-        bias: If set to ``False``, the layer will not learn the bias parameter. Defaults to ``True``.
-        use_batch_normalization: If set to ``True``, layers will use batch normalization. Defaults to ``False``.
+        bias: If set to ``False``, the layer will not learn the bias parameter.
+            Defaults to ``True``.
+        use_batch_normalization: If set to ``True``, layers will use batch normalization.
+            Defaults to ``False``.
         drop_rate: Dropout ratio. Defaults to ``0.5``.
-    """
+    """  # noqa: E501
 
     def __init__(
         self,
diff --git a/hyperbench/models/hypergcn.py b/hyperbench/models/hypergcn.py
index e6a4b9c9..e10d5d5d 100644
--- a/hyperbench/models/hypergcn.py
+++ b/hyperbench/models/hypergcn.py
@@ -5,24 +5,33 @@
 
 class HyperGCN(nn.Module):
     """
-    HyperGCN approximates each hyperedge of the hypergraph by a set of pairwise edges connecting the vertices of the hyperedge
-    and treats the learning problem as a graph learning problem on the approximation.
-    - Proposed in `HyperGCN: A New Method of Training Graph Convolutional Networks on Hypergraphs <https://dl.acm.org/doi/10.5555/3454287.3454422>`_ paper (NeurIPS 2019).
-    - Code of the paper: `source <https://github.com/malllabiisc/HyperGCN>`_.
-    - Reference implementation: `source <https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/models/hypergraphs/hypergcn.html#HyperGCN>`_.
+    HyperGCN approximates each hyperedge of the hypergraph by a set of pairwise edges connecting the
+    vertices of the hyperedge and treats the learning problem as a graph learning problem on the
+    approximation.
+
+    References:
+        - Proposed in [HyperGCN: A New Method of Training Graph Convolutional Networks on Hypergraphs](https://dl.acm.org/doi/10.5555/3454287.3454422) paper (NeurIPS 2019).
+        - Code of the paper: [source](https://github.com/malllabiisc/HyperGCN).
+        - Reference implementation: [Code](https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/models/hypergraphs/hypergcn.html#HyperGCN).
 
     Args:
         in_channels: The number of input channels.
         hidden_channels: The number of hidden channels.
-        num_classes: The number of classes of the classification task as HyperGCB is a node classification model.
-        bias: If set to ``False``, the layer will not learn the bias parameter. Defaults to ``True``.
-        use_batch_normalization: If set to ``True``, layers will use batch normalization. Defaults to ``False``.
+        num_classes: The number of classes of the classification task as HyperGCN is a
+            node classification model.
+        bias: If set to ``False``, the layer will not learn the bias parameter.
+            Defaults to ``True``.
+        use_batch_normalization: If set to ``True``, layers will use batch normalization.
+            Defaults to ``False``.
         drop_rate: Dropout ratio. Defaults to ``0.5``.
-        use_mediator: Whether to use mediator to transform the hyperedges to edges in the graph. Defaults to ``False``.
-        fast: If set to ``True``, the transformed graph structure will be computed once from the input hypergraph
-            and vertex features, and cached for future use. Defaults to ``True``.
-        seed: Optional random seed for the random reduction of hyperedges to edges. Defaults to ``None``.
-    """
+        use_mediator: Whether to use mediator to transform the hyperedges to edges in the graph.
+            Defaults to ``False``.
+        fast: If set to ``True``, the transformed graph structure will be computed once from
+            the input hypergraph and vertex features, and cached for future use.
+            Defaults to ``True``.
+        seed: Optional random seed for the random reduction of hyperedges to edges.
+            Defaults to ``None``.
+    """  # noqa: E501
 
     def __init__(
         self,
@@ -82,7 +91,8 @@ def forward(self, x: Tensor, hyperedge_index: Tensor) -> Tensor:
             return x
 
         # If the GCN Laplacian is cached, we need to check if the node feature size has changed
-        # with cached_gcn_laplacian_matrix.size(0) != x.size(0), this can happen, for example, due to:
+        # with cached_gcn_laplacian_matrix.size(0) != x.size(0), this can happen,
+        # for example, due to:
         # adding new negative samples or having validation/test sets with different node features
         should_not_use_cached_gcn_laplacian_matrix = (
             self.cached_gcn_laplacian_matrix is None  # Not cached yet
diff --git a/hyperbench/models/mlp.py b/hyperbench/models/mlp.py
index c27eb01a..0a191888 100644
--- a/hyperbench/models/mlp.py
+++ b/hyperbench/models/mlp.py
@@ -9,7 +9,8 @@
 
 class MLP(nn.Module):
     """
-    A simple multi-layer perceptron (MLP) with configurable number of layers, hidden channels, activation functions, normalization, and dropout.
+    A simple multi-layer perceptron (MLP) with configurable number of layers, hidden channels,
+    activation functions, normalization, and dropout.
 
     Examples:
         >>> mlp = MLP(in_channels=16, out_channels=1, hidden_channels=32, num_layers=3)
@@ -39,13 +40,19 @@ class MLP(nn.Module):
         in_channels: Number of input features.
         out_channels: Number of output features.
         hidden_channels: Number of hidden units in each hidden layer. Required if num_layers > 1.
-        num_layers: Total number of layers (including output layer). Must be at least 1. Defaults to 1.
+        num_layers: Total number of layers (including output layer). Must be at least 1.
+            Defaults to 1.
         activation_fn: Activation function to use after each hidden layer. Defaults to ``nn.ReLU``.
-        activation_fn_kwargs: Keyword arguments for the activation function. Defaults to empty dict.
-        normalization_fn: Normalization function to use after each hidden layer (before activation). If ``None``, no normalization is applied. Defaults to ``None``.
-        normalization_fn_kwargs: Keyword arguments for the normalization function. Defaults to empty dict.
+        activation_fn_kwargs: Keyword arguments for the activation function.
+            Defaults to empty dict.
+        normalization_fn: Normalization function to use after each
+            hidden layer (before activation).
+            If ``None``, no normalization is applied. Defaults to ``None``.
+        normalization_fn_kwargs: Keyword arguments for the normalization function.
+            Defaults to empty dict.
         bias: Whether to include bias terms in the linear layers. Defaults to ``True``.
-        drop_rate: Dropout rate to apply after each hidden layer (after activation). If 0.0, no dropout is applied. Defaults to 0.0.
+        drop_rate: Dropout rate to apply after each hidden layer (after activation). If 0.0, no
+            dropout is applied. Defaults to 0.0.
     """
 
     def __init__(
@@ -105,7 +112,8 @@ def __validate_num_layers(self, num_layers: int, hidden_channels: int | None) ->
 
 class SLP(MLP):
     """
-    A single-layer perceptron (SLP) which is a special case of MLP with exactly one layer and no hidden units.
+    A single-layer perceptron (SLP) which is a special case of MLP with exactly
+    one layer and no hidden units.
 
     Examples:
         >>> slp = SLP(in_channels=16, out_channels=1)
diff --git a/hyperbench/models/nhp.py b/hyperbench/models/nhp.py
index 4be38118..774127d5 100644
--- a/hyperbench/models/nhp.py
+++ b/hyperbench/models/nhp.py
@@ -9,11 +9,14 @@
 class NHP(nn.Module):
     """
     Neural Hyperlink Predictor (NHP) for undirected hyperedge link prediction.
-    - Proposed in `NHP: Neural Hypergraph Link Prediction <https://dl.acm.org/doi/10.1145/3340531.3411870>`_ paper (CIKM 2020).
-    - Reference implementation: `source <https://github.com/cyixiao/NHP-reproduce/>`_.
+
+    References:
+        - Proposed in [NHP: Neural Hypergraph Link Prediction](https://dl.acm.org/doi/10.1145/3340531.3411870) paper (CIKM 2020).
+        - Reference implementation: [Code](https://github.com/cyixiao/NHP-reproduce/).
 
     NHP scores each candidate hyperedge by building candidate-specific node embeddings.
-    A node that appears in multiple candidate hyperedges can receive a different incidence embedding in each one,
+    A node that appears in multiple candidate hyperedges can receive a different incidence
+    embedding in each one,
     because its update depends on the other nodes in that candidate hyperedge.
 
     Examples:
@@ -37,11 +40,13 @@ class NHP(nn.Module):
     Args:
         in_channels: Number of input features per node.
         hidden_channels: Number of hidden units in the node embeddings.
-        activation_fn: Activation function to use after the linear transformations. Defaults to ``nn.ReLU``.
+        activation_fn: Activation function to use after the linear transformations.
+            Defaults to ``nn.ReLU``.
         activation_fn_kwargs: Keyword arguments for the activation function. Defaults to empty dict.
-        aggregation: Method to aggregate the incidence embeddings into a hyperedge embedding. Must be either "maxmin" or "mean". Defaults to "maxmin".
+        aggregation: Method to aggregate the incidence embeddings into a hyperedge embedding.
+            Must be either "maxmin" or "mean". Defaults to "maxmin".
         bias: Whether to include bias terms in the linear layers. Defaults to ``True``.
-    """
+    """  # noqa: E501
 
     def __init__(
         self,
@@ -100,7 +105,8 @@ def forward(self, x: Tensor, hyperedge_index: Tensor) -> Tensor:
         #             shape: (num_incidences, in_channels)
         incidence_node_features = x[node_ids]
 
-        # Do one local message-passing step to sum original node features per hyperedge to get hyperedge features.
+        # Do one local message-passing step to sum original node features per hyperedge
+        # to get hyperedge features
         # that are aware of all nodes in the candidate hyperedge.
         # Example: hyperedge 0 contains nodes (0, 1)    -> [1, 0] + [0, 1] = [1, 1]
         #          hyperedge 1 contains nodes (1, 2, 3) -> [0, 1] + [1, 1] + [1, 0] = [2, 2]
@@ -146,7 +152,8 @@ def forward(self, x: Tensor, hyperedge_index: Tensor) -> Tensor:
         # shape (num_incidences, hidden_channels)
         selfloop_embeddings = self.self_loop(incidence_node_features)
 
-        # incidence_embeddings[0] = activation_fn(selfloop_embeddings[0] + neighbor_aware_hyperedge_embeddings[0])
+        # incidence_embeddings[0] =
+        # activation_fn(selfloop_embeddings[0] + neighbor_aware_hyperedge_embeddings[0])
         # is the embedding of the first incidence (i.e., node 0 in hyperedge 0)
         # after one local message-passing step inside that candidate hyperedge.
         incidence_embeddings = self.activation_fn(
@@ -155,7 +162,8 @@ def forward(self, x: Tensor, hyperedge_index: Tensor) -> Tensor:
 
         # Treat each incidence embedding as a separately aggregatable set of features.
         # This is required because incidence embeddings are not global node embeddings:
-        # node 1 may appear twice with two different embeddings as it participates in two different candidate hyperedges.
+        # node 1 may appear twice with two different embeddings as it participates in
+        # two different candidate hyperedges.
         # Example: incidence_ids = [0, 1, 2, 3, 4],
         #          hyperedge_ids = [0, 0, 1, 1, 1]
         #          -> incidence_hyperedge_index = [[0, 1, 2, 3, 4],
@@ -173,11 +181,13 @@ def forward(self, x: Tensor, hyperedge_index: Tensor) -> Tensor:
         #                                  [5, 6],  # features 2, node 1 in hyperedge 1
         #                                  [7, 8],  # features 3, node 2 in hyperedge 1
         #                                  [9, 10]] # features 4, node 3 in hyperedge 1
-        #          -> incidence_aggregator pools features (0, 1) for hyperedge 0 and features (2, 3, 4) for hyperedge 1
+        #          -> incidence_aggregator pools features (0, 1) for hyperedge 0 and
+        #                features (2, 3, 4) for hyperedge 1
         #          if aggregation == "maxmin":
-        #          -> hyperedge_embeddings = [[max(1, 3) - min(1, 3), max(2, 4) - min(2, 4)],                # hyperedge 0
-        #                                     [max(5, 7, 9) - min(5, 7, 9), max(6, 8, 10) - min(6, 8, 10)]]  # hyperedge 1
-        #                                    shape: (num_hyperedges, hidden_channels)
+        #          -> hyperedge_embeddings =
+        #               [[max(1, 3) - min(1, 3), max(2, 4) - min(2, 4)],               # hyperedge 0
+        #               [max(5, 7, 9) - min(5, 7, 9), max(6, 8, 10) - min(6, 8, 10)]]  # hyperedge 1
+        #               shape: (num_hyperedges, hidden_channels)
         #         if aggregation == "mean":
         #         -> hyperedge_embeddings = [[mean(1, 3), mean(2, 4)],         # hyperedge 0
         #                                    [mean(5, 7, 9), mean(6, 8, 10)]]  # hyperedge 1
diff --git a/hyperbench/models/node2vec.py b/hyperbench/models/node2vec.py
index b7d40d88..85a8ce96 100644
--- a/hyperbench/models/node2vec.py
+++ b/hyperbench/models/node2vec.py
@@ -14,29 +14,35 @@ class Node2Vec(nn.Module):
         edge_index: Edge index representing the graph structure. Size ``(2, num_edges)``.
         embedding_dim: Dimension of the node embeddings to learn.
         walk_length: Length of each random walk.
-        context_size: Window size for the skip-gram model (number of neighbors in the walk considered as context).
-            For example, if ``context_size=2`` and ``walk_length=5``, then for a random walk ``[v0, v1, v2, v3, v4]``,
-            the context for ``v2`` would be ``[v0, v1, v3, v4]`` as we take neighbors within distance 2 in the walk.
+        context_size: Window size for the skip-gram model (number of neighbors in the walk
+            considered as context).
+            For example, if ``context_size=2`` and ``walk_length=5``, then for
+            a random walk ``[v0, v1, v2, v3, v4]``,
+            the context for ``v2`` would be ``[v0, v1, v3, v4]`` as we take neighbors within
+            distance 2 in the walk.
             The pairs generated by skip-gram would be ``[(v2, v0), (v2, v1), (v2, v3), (v2, v4)]``.
-            Rule of thumb: Graphs with strong local structure (5-10), Graphs with communities/long-range patterns (10-20).
+            Rule of thumb: Graphs with strong local structure (5-10), Graphs with
+            communities/long-range patterns (10-20).
             Defaults to ``10``.
         num_walks_per_node: Number of random walks to start at each node.
         p: Return hyperparameter for Node2Vec. Default is ``1.0`` (unbiased).
             This controls the probability of stepping back to the node visited in the previous step.
-            Lower values of ``p`` make immediate backtracking more likely, which keeps walks closer to the
-            local neighborhood. Higher values of ``p`` discourage returning to the previous node, so walks
-            are less likely to bounce back and forth across the same edge.
+            Lower values of ``p`` make immediate backtracking more likely, which keeps walks closer
+            to the local neighborhood. Higher values of ``p`` discourage returning to the
+            previous node, so walks are less likely to bounce back and forth across the same edge.
         q: In-out hyperparameter for Node2Vec. Default is ``1.0`` (unbiased).
             This controls whether walks stay near the source node or explore further outward.
-            Lower values of ``q`` bias the walk toward outward exploration, behaving more like DFS and
-            emphasizing structural roles. Higher values of ``q`` bias the walk toward nearby nodes,
-            behaving more like BFS and emphasizing community structure and homophily.
+            Lower values of ``q`` bias the walk toward outward exploration, behaving more like DFS
+            and emphasizing structural roles. Higher values of ``q`` bias the walk toward
+            nearby nodes, behaving more like BFS and emphasizing community structure and homophily.
         num_negative_samples: Number of negative samples to use for training the skip-gram model.
-            If set to ``X``, then for each positive pair ``(u, v)`` generated from the random walks, ``X`` negative pairs ``(u, v_neg)`` will be generated,
-            where ``v_neg`` is a node sampled uniformly at random from all nodes in the graph.
+            If set to ``X``, then for each positive pair ``(u, v)`` generated from the random walks,
+            ``X`` negative pairs ``(u, v_neg)`` will be generated, where ``v_neg`` is a node
+            sampled uniformly at random from all nodes in the graph.
             Defaults to ``1``, meaning one negative sample per positive pair.
-        num_nodes: Total number of nodes in the graph. If not provided, it will be inferred from the hyperedge_index.
-            This is only needed if the hyperedge_index does not include all nodes (e.g., some isolated nodes are missing).
+        num_nodes: Total number of nodes in the graph. If not provided, it will be inferred from
+            the ``edge_index``. This is only needed if the ``edge_index`` does not include all
+            nodes (e.g., some isolated nodes are missing).
         sparse: Whether Node2Vec embeddings should use sparse gradients.
     """
 
@@ -91,33 +97,40 @@ class Node2VecConfig(TypedDict):
     """
     Configuration for the Node2Vec model.
 
-    Args:
+    Attributes:
         edge_index: Edge index representing the graph structure. Size ``(2, num_edges)``.
         embedding_dim: Dimension of the node embeddings to learn.
         walk_length: Length of each random walk.
-        context_size: Window size for the skip-gram model (number of neighbors in the walk considered as context).
-            For example, if ``context_size=2`` and ``walk_length=5``, then for a random walk ``[v0, v1, v2, v3, v4]``,
-            the context for ``v2`` would be ``[v0, v1, v3, v4]`` as we take neighbors within distance 2 in the walk.
+        context_size: Window size for the skip-gram model (number of neighbors in the walk
+            considered as context).
+            For example, if ``context_size=2`` and ``walk_length=5``, then for a
+            random walk ``[v0, v1, v2, v3, v4]``,
+            the context for ``v2`` would be ``[v0, v1, v3, v4]`` as we take neighbors within
+            distance 2 in the walk.
             The pairs generated by skip-gram would be ``[(v2, v0), (v2, v1), (v2, v3), (v2, v4)]``.
-            Rule of thumb: Graphs with strong local structure (5-10), Graphs with communities/long-range patterns (10-20).
+            Rule of thumb: Graphs with strong local structure (5-10), Graphs with
+            communities/long-range patterns (10-20).
             Defaults to ``10``.
         num_walks_per_node: Number of random walks to start at each node.
         p: Return hyperparameter for Node2Vec. Default is ``1.0`` (unbiased).
             This controls the probability of stepping back to the node visited in the previous step.
-            Lower values of ``p`` make immediate backtracking more likely, which keeps walks closer to the
-            local neighborhood. Higher values of ``p`` discourage returning to the previous node, so walks
-            are less likely to bounce back and forth across the same edge.
+            Lower values of ``p`` make immediate backtracking more likely, which keeps walks
+            closer to the local neighborhood. Higher values of ``p`` discourage returning to the
+            previous node, so walks are less likely to bounce back and forth across the same edge.
         q: In-out hyperparameter for Node2Vec. Default is ``1.0`` (unbiased).
             This controls whether walks stay near the source node or explore further outward.
-            Lower values of ``q`` bias the walk toward outward exploration, behaving more like DFS and
-            emphasizing structural roles. Higher values of ``q`` bias the walk toward nearby nodes,
-            behaving more like BFS and emphasizing community structure and homophily.
+            Lower values of ``q`` bias the walk toward outward exploration, behaving more like
+            DFS and emphasizing structural roles. Higher values of ``q`` bias the walk toward
+            nearby nodes, behaving more like BFS and emphasizing community structure and homophily.
         num_negative_samples: Number of negative samples to use for training the skip-gram model.
-            If set to ``X``, then for each positive pair ``(u, v)`` generated from the random walks, ``X`` negative pairs ``(u, v_neg)`` will be generated,
-            where ``v_neg`` is a node sampled uniformly at random from all nodes in the graph.
+            If set to ``X``, then for each positive pair ``(u, v)`` generated from the random walks,
+            ``X`` negative pairs ``(u, v_neg)`` will be generated, where ``v_neg`` is a node sampled
+            uniformly at random from all nodes in the graph.
             Defaults to ``1``, meaning one negative sample per positive pair.
-        num_nodes: Total number of nodes in the graph. If not provided, it will be inferred from the hyperedge_index.
-            This is only needed if the hyperedge_index does not include all nodes (e.g., some isolated nodes are missing).
+        num_nodes: Total number of nodes in the graph. If not provided, it will be inferred from
+            the ``edge_index``.
+            This is only needed if the ``edge_index`` does not include all nodes
+            (e.g., some isolated nodes are missing).
         sparse: Whether Node2Vec embeddings should use sparse gradients.
     """
 
diff --git a/hyperbench/models/villain.py b/hyperbench/models/villain.py
index 8ff27f59..721d0c82 100644
--- a/hyperbench/models/villain.py
+++ b/hyperbench/models/villain.py
@@ -11,14 +11,17 @@
 class VilLain(nn.Module):
     """
     VilLain learns node-specific virtual-label logits instead of consuming existing node features.
-    The model is transductive: rows in ``node_embedding`` correspond to the fixed global node space used during training.
-    - Proposed in `VilLain: Self-Supervised Learning on Homogeneous Hypergraphs without Features via Virtual Label Propagation <https://dl.acm.org/doi/pdf/10.1145/3589334.3645454>`_ paper (WWW 2024).
-    - Reference implementation: `source <https://github.com/geon0325/VilLain/>`_.
+    The model is transductive: rows in ``node_embedding`` correspond to the fixed global node space
+    used during training.
+
+    References:
+        - Proposed in [VilLain: Self-Supervised Learning on Homogeneous Hypergraphs without Features via Virtual Label Propagation](https://dl.acm.org/doi/pdf/10.1145/3589334.3645454) paper (WWW 2024).
+        - Reference implementation: [Code](https://github.com/geon0325/VilLain/).
 
     Each forward pass:
-    1. Samples differentiable virtual-label assignments with Gumbel-Softmax.
-    2. Propagates them over the incidence structure.
-    3. Returns averaged propagated node embeddings.
+        1. Samples differentiable virtual-label assignments with Gumbel-Softmax.
+        2. Propagates them over the incidence structure.
+        3. Returns averaged propagated node embeddings.
 
     Args:
         num_nodes: Total number of trainable nodes.
@@ -28,7 +31,7 @@ class VilLain(nn.Module):
         generation_steps: Propagation steps averaged for final embeddings. Defaults to ``100``.
         tau: Gumbel-Softmax temperature. Defaults to ``1.0``.
         eps: Numerical stability constant. Defaults to ``1e-10``.
-    """
+    """  # noqa: E501
 
     def __init__(
         self,
@@ -81,18 +84,27 @@ def forward(
     ) -> tuple[Tensor, VilLainLossParts]:
         """
         Compute the self-supervised VilLain objective.
-        Use ``hyperedge_embeddings`` or ``node_embeddings`` to generate final embeddings for inference after training.
+
+        Use ``hyperedge_embeddings`` or ``node_embeddings`` to generate final embeddings for
+        inference after training.
 
         Args:
             hyperedge_index: Incidence tensor of shape ``(2, num_incidences)``.
-            node_ids: Optional global node ids matching local node ids the embedding table in the transductive setting.
-                Use this when a batch has rebased local node ids but the learned logits live in the full transductive node table.
-                This is needed as the model keeps an internal embedding table with a row for every node in the global node space.
-            num_hyperedges: Optional explicit hyperedge count used during node-to-hyperedge pooling to preserve empty hyperedges.
+            node_ids: Optional global node ids matching local node ids to the embedding table in
+                the transductive setting.
+                Use this when a batch has rebased local node ids but the learned logits live in the
+                full transductive node table.
+                This is needed as the model keeps an internal embedding table with a row for every
+                node in the global node space.
+            num_hyperedges: Optional explicit hyperedge count used during node-to-hyperedge pooling
+                to preserve empty hyperedges.
                 If not provided, the hyperedge count is inferred from ``hyperedge_index``.
 
         Returns:
-            node_embeddings: Node embeddings of shape ``(num_local_nodes, embedding_dim)``.
+            total_loss: The combined loss scalar tensor to optimize.
+            loss_parts: A dictionary containing the individual loss components. It contains
+                ``local_loss`` and ``global_loss`` scalar tensors.
+
         """
         return self.loss(
             hyperedge_index=hyperedge_index,
@@ -111,14 +123,19 @@ def loss(
 
         Args:
             hyperedge_index: Incidence tensor of shape ``(2, num_incidences)``.
-            node_ids: Optional global node ids matching local node ids the embedding table in the transductive setting.
-                Use this when a batch has rebased local node ids but the learned logits live in the full transductive node table.
-                This is needed as the model keeps an internal embedding table with a row for every node in the global node space.
-            num_hyperedges: Optional explicit hyperedge count used during node-to-hyperedge pooling to preserve empty hyperedges.
+            node_ids: Optional global node ids matching local node ids to the embedding table in
+                the transductive setting. Use this when a batch has rebased local node ids but the
+                learned logits live in the full transductive node table.
+                This is needed as the model keeps an internal embedding table with a row for every
+                node in the global node space.
+            num_hyperedges: Optional explicit hyperedge count used during node-to-hyperedge pooling
+                to preserve empty hyperedges.
                 If not provided, the hyperedge count is inferred from ``hyperedge_index``.
 
         Returns:
-            loss: A tuple ``(total_loss, loss_parts)`` where ``loss_parts`` contains ``local_loss`` and ``global_loss`` scalar tensors.
+            total_loss: The combined loss scalar tensor to optimize.
+            loss_parts: A dictionary containing the individual loss components. It contains
+                ``local_loss`` and ``global_loss`` scalar tensors.
         """
         node_embeddings = self.__get_initial_virtual_node_features(node_ids=node_ids)
         actual_num_hyperedges = self.__num_hyperedges(hyperedge_index, num_hyperedges)
@@ -149,15 +166,20 @@ def hyperedge_embeddings(
     ) -> Tensor:
         """
         Generate hyperedge embeddings by averaging propagated hyperedge states.
-        Every generation step computes hyperedge states from the current node states, then updates node states for the next step.
+
+        Every generation step computes hyperedge states from the current node states, then updates
+        node states for the next step.
 
         Args:
             hyperedge_index: Incidence tensor of shape ``(2, num_incidences)``.
-            node_ids: Optional global node ids matching local node ids the embedding table in the transductive setting.
-                Use this when a batch has rebased local node ids but the learned logits live in the full transductive node table.
-                This is needed as the model keeps an internal embedding table with a row for every node in the global node space.
-            num_hyperedges: Optional explicit hyperedge count used during node-to-hyperedge pooling to preserve empty hyperedges.
-                If not provided, the hyperedge count is inferred from ``hyperedge_index``.
+            node_ids: Optional global node ids matching local node ids to the embedding table in
+                the transductive setting. Use this when a batch has rebased local node ids but the
+                learned logits live in the full transductive node table.
+                This is needed as the model keeps an internal embedding table with a row for every
+                node in the global node space.
+            num_hyperedges: Optional explicit hyperedge count used during node-to-hyperedge pooling
+                to preserve empty hyperedges. If not provided, the hyperedge count is inferred from
+                ``hyperedge_index``.
 
         Returns:
             hyperedge_embeddings: Hyperedge embeddings of shape ``(num_hyperedges, embedding_dim)``.
@@ -180,11 +202,14 @@ def node_embeddings(
 
         Args:
             hyperedge_index: Incidence tensor of shape ``(2, num_incidences)``.
-            node_ids: Optional global node ids matching local node ids the embedding table in the transductive setting.
-                Use this when a batch has rebased local node ids but the learned logits live in the full transductive node table.
-                This is needed as the model keeps an internal embedding table with a row for every node in the global node space.
-            num_hyperedges: Optional explicit hyperedge count used during node-to-hyperedge pooling to preserve empty hyperedges.
-                If not provided, the hyperedge count is inferred from ``hyperedge_index``.
+            node_ids: Optional global node ids matching local node ids to the embedding table in
+                the transductive setting. Use this when a batch has rebased local node ids but the
+                learned logits live in the full transductive node table.
+                This is needed as the model keeps an internal embedding table with a row for every
+                node in the global node space.
+            num_hyperedges: Optional explicit hyperedge count used during node-to-hyperedge pooling
+                to preserve empty hyperedges. If not provided, the hyperedge count is inferred from
+                ``hyperedge_index``.
 
         Returns:
             node_embeddings: Node embeddings of shape ``(num_local_nodes, embedding_dim)``.
@@ -197,7 +222,9 @@ def node_embeddings(
         )
 
     def reset_parameters(self) -> None:
-        """Initialize trainable virtual-label logits near zero."""
+        """
+        Initialize trainable virtual-label logits near zero.
+        """
         nn.init.normal_(self.node_embedding, mean=0.0, std=0.1)
 
     def __embeddings(
@@ -212,8 +239,10 @@ def __embeddings(
 
         Args:
             hyperedge_index: Incidence tensor of shape ``(2, num_incidences)``.
-            node_ids: Optional global node ids matching local node ids the embedding table in the transductive setting.
-            num_hyperedges: Optional explicit hyperedge count to preserve empty hyperedges during propagation.
+            node_ids: Optional global node ids matching local node ids to the embedding table
+                in the transductive setting.
+            num_hyperedges: Optional explicit hyperedge count to preserve empty hyperedges
+                during propagation.
             mode: Selects whether to accumulate propagated node states or hyperedge states.
 
         Returns:
@@ -237,16 +266,20 @@ def __embeddings(
                 )
 
                 # Suppose generation_steps = 100.
-                # Average 100 propagated embeddings for each node/hyperedge to get more stable final embeddings.
-                # Sum here and divide by generation_steps later to avoid storing all 100 embeddings in memory at once.
+                # Average 100 propagated embeddings for each node/hyperedge to get more
+                # stable final embeddings.
+                # Sum here and divide by generation_steps later to avoid storing all 100 embeddings
+                # in memory at once.
                 final_embeddings = final_embeddings + (
                     x if mode == "node" else hyperedge_embeddings
                 )
             final_embeddings = final_embeddings / self.generation_steps
 
-            # Example: final_embeddings.shape = (num_nodes/num_hyperedges, 8) with raw_embedding_dim=8
+            # Example: final_embeddings.shape = (num_nodes/num_hyperedges, 8)
+            #                   with raw_embedding_dim=8
             #          -> returned shape = (num_nodes/num_hyperedges, 4) with embedding_dim=4
-            #             as it takes the first 4 channels of the raw embedding as the final embedding.
+            #             as it takes the first 4 channels of the raw embedding
+            #               as the final embedding.
             return final_embeddings[:, : self.embedding_dim]
 
     def __get_initial_virtual_node_features(self, node_ids: Tensor | None = None) -> Tensor:
@@ -254,7 +287,8 @@ def __get_initial_virtual_node_features(self, node_ids: Tensor | None = None) ->
         Convert trainable node logits into flattened virtual-label probabilities.
 
         Args:
-            node_ids: Optional global node ids matching local node ids the embedding table in the transductive setting.
+            node_ids: Optional global node ids matching local node ids to the embedding table
+                in the transductive setting.
                 If ``None``, all node rows are used.
 
         Returns:
@@ -291,7 +325,8 @@ def __message_passing(
         num_hyperedges: int,
     ) -> tuple[Tensor, Tensor]:
         """
-        One round of message passing, where nodes send messages to hyperedges and then hyperedges send messages back to nodes.
+        One round of message passing, where nodes send messages to hyperedges and then hyperedges
+        send messages back to nodes.
 
         Args:
             x: Virtual node features of shape (num_nodes, raw_embedding_dim).
@@ -299,7 +334,9 @@ def __message_passing(
             num_hyperedges: Total number of hyperedges.
 
         Returns:
-            embeddings: The updated node and hyperedge embeddings after one round of message passing.
+            node_embeddings: The updated node embeddings after one round of message passing.
+            hyperedge_embeddings: The updated hyperedge embeddings after one round
+                of message passing.
         """
         hyperedge_embeddings = HyperedgeAggregator(
             hyperedge_index=hyperedge_index,
@@ -321,7 +358,9 @@ def __num_hyperedges(
         num_hyperedges: int | None,
     ) -> int:
         """
-        Return the explicit hyperedge count or infer it from the ``hyperedge_index``, if not provided.
+        Return the explicit hyperedge count or infer it from the ``hyperedge_index``, if not
+        provided.
+
         Explicit counts are required when empty hyperedges must remain in the hypergraph.
         """
         if num_hyperedges is not None:
diff --git a/hyperbench/nn/aggregator.py b/hyperbench/nn/aggregator.py
index 8be38081..446e7ca7 100644
--- a/hyperbench/nn/aggregator.py
+++ b/hyperbench/nn/aggregator.py
@@ -16,7 +16,8 @@ class HyperedgeAggregator:
         hyperedge_index: Hyperedge incidence in COO format of size ``(2, num_incidences)``.
         node_embeddings: Node embedding matrix of size ``(num_nodes, num_channels)``.
         num_hyperedges: Optional explicit hyperedge count.
-            When provided, the pooled output preserves empty hyperedges that do not appear in ``hyperedge_index``.
+            When provided, the pooled output preserves empty hyperedges that do not appear
+            in ``hyperedge_index``.
     """
 
     def __init__(
@@ -34,16 +35,24 @@ def pool(self, aggregation: Literal["maxmin", "max", "min", "mean", "mul", "sum"
         Aggregate node embeddings for each hyperedge.
 
         ``hyperedge_index`` is the COO encoding of the nonzero entries of ``H``,
-        so ``hyperedge_index[0, k] = v`` and ``hyperedge_index[1, k] = e`` means ``H[v, e] = 1`` for incidence ``k``.
+        so ``hyperedge_index[0, k] = v`` and ``hyperedge_index[1, k] = e`` means ``H[v, e] = 1``
+        for incidence ``k``.
 
         Let ``H`` be the binary incidence matrix of shape ``(num_nodes, num_hyperedges)``
         and let ``X`` be the node embedding matrix of shape ``(num_nodes, num_channels)``.
-        This method pools node features into hyperedge features using the incidence pattern in ``H``:
-        - ``aggregation="sum"`` computes the equivalent of the standard sparse matrix product ``H^T X``.
-        - ``aggregation="mean"`` computes ``D_e^{-1} H^T X``, where ``D_e[e, e] = sum_v H[v, e]`` is the hyperedge cardinality matrix.
-        - ``aggregation in {"max", "min", "mul"}`` uses the same sparsity pattern as ``H^T X``,
-          but replaces the summation over incident nodes with a channel-wise ``max``, ``min``, or product reduction.
-        - ``aggregation="maxmin"`` computes the channel-wise range ``max - min`` for each hyperedge.
+        This method pools node features into hyperedge features using the incidence pattern in
+        ``H``.
+
+        Aggregations:
+            - ``aggregation="sum"`` computes the equivalent of the standard
+                sparse matrix product ``H^T X``.
+            - ``aggregation="mean"`` computes ``D_e^{-1} H^T X``, where
+                ``D_e[e, e] = sum_v H[v, e]`` is the hyperedge cardinality matrix.
+            - ``aggregation in {"max", "min", "mul"}`` uses the same sparsity pattern as ``H^T X``,
+                but replaces the summation over incident nodes with a channel-wise ``max``, ``min``,
+                or product reduction.
+            - ``aggregation="maxmin"`` computes the channel-wise range ``max - min``
+                for each hyperedge.
 
         Examples:
             >>> hyperedge_index = [[0, 1, 2, 2, 3],
@@ -62,7 +71,8 @@ def pool(self, aggregation: Literal["maxmin", "max", "min", "mean", "mul", "sum"
             aggregation: Reduction applied across the nodes belonging to each hyperedge.
 
         Returns:
-            hyperedge_embeddings: A hyperedge embedding matrix of shape ``(num_hyperedges, num_channels)``.
+            hyperedge_embeddings: A hyperedge embedding matrix of
+                shape ``(num_hyperedges, num_channels)``.
         """
         # Gather the embeddings for each incidence.
         # A node appearing in multiple hyperedges is repeated, once per incidence.
@@ -121,7 +131,8 @@ class NodeAggregator:
     Args:
         hyperedge_index: Hyperedge incidence in COO format of size ``(2, num_incidences)``.
         hyperedge_embeddings: Hyperedge embedding matrix of size ``(num_hyperedges, num_channels)``.
-        num_nodes: Optional explicit node count. When provided, the pooled output preserves isolated nodes that do not appear in ``hyperedge_index``.
+        num_nodes: Optional explicit node count. When provided, the pooled output preserves
+            isolated nodes that do not appear in ``hyperedge_index``.
     """
 
     def __init__(
@@ -139,15 +150,22 @@ def pool(self, aggregation: Literal["maxmin", "max", "min", "mean", "mul", "sum"
         Aggregate hyperedge embeddings for each node.
 
         ``hyperedge_index`` is the COO encoding of the nonzero entries of ``H``,
-        so ``hyperedge_index[0, k] = v`` and ``hyperedge_index[1, k] = e`` means ``H[v, e] = 1`` for incidence ``k``.
+        so ``hyperedge_index[0, k] = v`` and ``hyperedge_index[1, k] = e`` means ``H[v, e] = 1``
+        for incidence ``k``.
 
         Let ``H`` be the incidence matrix of shape ``(num_nodes, num_hyperedges)``
         and let ``E`` be the hyperedge embedding matrix of shape ``(num_hyperedges, num_channels)``.
-        This method pools hyperedge features into node features using the incidence pattern in ``H``:
-        - ``aggregation="sum"`` computes the equivalent of the standard sparse matrix product ``H E``.
-        - ``aggregation="mean"`` computes ``D_v^{-1} H E``, where ``D_v[v, v] = sum_e H[v, e]`` is the node degree matrix.
-        - ``aggregation in {"max", "min", "mul"}`` uses the same sparsity pattern as ``H E``,
-          but replaces the summation over incident hyperedges with a channel-wise ``max``, ``min``, or product reduction.
+        This method pools hyperedge features into node features using the incidence pattern
+        in ``H``.
+
+        Aggregations:
+            - ``aggregation="sum"`` computes the equivalent of the standard
+                sparse matrix product ``H E``.
+            - ``aggregation="mean"`` computes ``D_v^{-1} H E``, where ``D_v[v, v] = sum_e H[v, e]``
+                is the node degree matrix.
+            - ``aggregation in {"max", "min", "mul"}`` uses the same sparsity pattern as ``H E``,
+                but replaces the summation over incident hyperedges with a channel-wise
+                ``max``, ``min``, or product reduction.
 
         Examples:
             >>> hyperedge_index = [[0, 1, 1, 2],
diff --git a/hyperbench/nn/conv.py b/hyperbench/nn/conv.py
index 6f415acd..992c9a5c 100644
--- a/hyperbench/nn/conv.py
+++ b/hyperbench/nn/conv.py
@@ -6,19 +6,25 @@
 
 class HyperGCNConv(nn.Module):
     """
-    The HyperGCNConv layer proposed in `HyperGCN: A New Method of Training Graph Convolutional Networks on Hypergraphs <https://dl.acm.org/doi/10.5555/3454287.3454422>`_ paper (NeurIPS 2019).
-    Reference implementation: `source <https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/nn/convs/hypergraphs/hypergcn_conv.html#HyperGCNConv>`_.
+    References:
+        - The HyperGCNConv layer proposed in [HyperGCN: A New Method of Training Graph Convolutional Networks on Hypergraphs](https://dl.acm.org/doi/10.5555/3454287.3454422) paper (NeurIPS 2019).
+        - Reference implementation: [Code](https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/nn/convs/hypergraphs/hypergcn_conv.html#HyperGCNConv).
 
     Args:
         in_channels: The number of input channels.
         out_channels: The number of output channels.
-        bias: If set to ``False``, the layer will not learn the bias parameter. Defaults to ``True``.
-        use_batch_normalization: If set to ``True``, the layer will use batch normalization. Defaults to ``False``.
+        bias: If set to ``False``, the layer will not learn the bias parameter.
+            Defaults to ``True``.
+        use_batch_normalization: If set to ``True``, the layer will use batch normalization.
+            Defaults to ``False``.
         drop_rate: If set to a positive number, the layer will use dropout. Defaults to ``0.5``.
-        use_mediator: Whether to use mediator to transform the hyperedges to edges in the graph. Defaults to ``False``.
-        is_last: If set to ``True``, the layer will not apply the final activation and dropout functions. Defaults to ``False``.
-        seed: Optional random seed for the random reduction of hyperedges to edges. Defaults to ``None``.
-    """
+        use_mediator: Whether to use mediator to transform the hyperedges to edges in the graph.
+            Defaults to ``False``.
+        is_last: If set to ``True``, the layer will not apply the final activation and
+            dropout functions. Defaults to ``False``.
+        seed: Optional random seed for the random reduction of hyperedges to edges.
+            Defaults to ``None``.
+    """  # noqa: E501
 
     def __init__(
         self,
@@ -39,7 +45,8 @@ def __init__(
         self.dropout = nn.Dropout(drop_rate)
 
         # θ is the learnable weight matrix (as in the HyperGCN paper),
-        # it projects node features from in_channels to out_channels and learns how to mix feature channels
+        # it projects node features from in_channels to out_channels and
+        # learns how to mix feature channels
         self.theta = nn.Linear(in_channels, out_channels, bias=bias)
 
         self.seed = seed
@@ -55,9 +62,12 @@ def forward(
 
         Args:
             x: Input node feature matrix. Size ``(num_nodes, in_channels)``.
-            hyperedge_index: Hyperedge indices representing the hypergraph structure. Size ``(2, num_hyperedges)``.
-            gcn_laplacian_matrix: Optional precomputed normalized GCN Laplacian matrix. Size ``(num_nodes, num_nodes)``. Defaults to ``None``.
-                If provided, it will be used directly for smoothing, so we can skip computing it from edge_index.
+            hyperedge_index: Hyperedge indices representing the hypergraph structure.
+                Size ``(2, num_hyperedges)``.
+            gcn_laplacian_matrix: Optional precomputed normalized GCN Laplacian matrix.
+                Size ``(num_nodes, num_nodes)``. Defaults to ``None``.
+                If provided, it will be used directly for smoothing, so we can skip computing
+                it from edge_index.
 
         Returns:
             x: The output node feature matrix. Size ``(num_nodes, out_channels)``.
@@ -94,12 +104,15 @@ def forward(
 
 class HGNNConv(nn.Module):
     """
-    The HGNNConv layer proposed in `Hypergraph Neural Networks <https://arxiv.org/pdf/1809.09401>`_ paper (AAAI 2019).
-    Reference implementation: `source <https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/nn/convs/hypergraphs/hgnn_conv.html#HGNNConv>`_.
+    References:
+        - The HGNNConv layer proposed in [Hypergraph Neural Networks](https://arxiv.org/pdf/1809.09401) paper (AAAI 2019).
+        - Reference implementation: [Code](https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/nn/convs/hypergraphs/hgnn_conv.html#HGNNConv).
 
-    Each layer performs: ``X' = sigma(L_HGNN X Theta)`` where ``L_HGNN = D_n^{-1/2} H D_e^{-1} H^T D_n^{-1/2}``
-    is the hypergraph Laplacian computed from the incidence matrix H. This smooths node features through
-    the hypergraph structure (nodes -> hyperedges -> nodes) without reducing to a pairwise graph.
+    Each layer performs: ``X' = sigma(L_HGNN X Theta)``
+    where ``L_HGNN = D_n^{-1/2} H D_e^{-1} H^T D_n^{-1/2}``
+    is the hypergraph Laplacian computed from the incidence matrix H.
+    This smooths node features through the hypergraph structure (nodes -> hyperedges -> nodes)
+    without reducing to a pairwise graph.
 
     Unlike ``HyperGCNConv``, which uses a GCN Laplacian on a graph reduced from the hypergraph,
     ``HGNNConv`` operates entirely in hypergraph space and preserves all higher-order relationships.
@@ -107,11 +120,14 @@ class HGNNConv(nn.Module):
     Args:
         in_channels: The number of input channels.
         out_channels: The number of output channels.
-        bias: If set to ``False``, the layer will not learn the bias parameter. Defaults to ``True``.
-        use_batch_normalization: If set to ``True``, the layer will use batch normalization. Defaults to ``False``.
+        bias: If set to ``False``, the layer will not learn the bias parameter.
+            Defaults to ``True``.
+        use_batch_normalization: If set to ``True``, the layer will use batch normalization.
+            Defaults to ``False``.
         drop_rate: If set to a positive number, the layer will use dropout. Defaults to ``0.5``.
-        is_last: If set to ``True``, the layer will not apply the final activation and dropout functions. Defaults to ``False``.
-    """
+        is_last: If set to ``True``, the layer will not apply the final activation and
+            dropout functions. Defaults to ``False``.
+    """  # noqa: E501
 
     def __init__(
         self,
@@ -166,8 +182,9 @@ def forward(self, x: Tensor, hyperedge_index: Tensor) -> Tensor:
 
 class HGNNPConv(nn.Module):
     """
-    The HGNNPConv layer proposed in `HGNN+: General Hypergraph Neural Networks <https://ieeexplore.ieee.org/document/9795251>`_ paper (IEEE T-PAMI 2022).
-    Reference implementation: `source <https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/nn/convs/hypergraphs/hgnnp_conv.html#HGNNPConv>`_.
+    References:
+        - The HGNNPConv layer proposed in [HGNN+: General Hypergraph Neural Networks](https://ieeexplore.ieee.org/document/9795251) paper (IEEE T-PAMI 2022).
+        - Reference implementation: [Code](https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/nn/convs/hypergraphs/hgnnp_conv.html#HGNNPConv).
 
     Each layer performs: ``X' = sigma(M_HGNN+ X Theta)`` where
     ``M_HGNN+ = D_v^{-1} H D_e^{-1} H^T`` is the HGNN+ smoothing matrix.
@@ -179,11 +196,14 @@ class HGNNPConv(nn.Module):
     Args:
         in_channels: The number of input channels.
         out_channels: The number of output channels.
-        bias: If set to ``False``, the layer will not learn the bias parameter. Defaults to ``True``.
-        use_batch_normalization: If set to ``True``, the layer will use batch normalization. Defaults to ``False``.
+        bias: If set to ``False``, the layer will not learn the bias parameter.
+            Defaults to ``True``.
+        use_batch_normalization: If set to ``True``, the layer will use batch normalization.
+            Defaults to ``False``.
         drop_rate: If set to a positive number, the layer will use dropout. Defaults to ``0.5``.
-        is_last: If set to ``True``, the layer will not apply the final activation and dropout functions. Defaults to ``False``.
-    """
+        is_last: If set to ``True``, the layer will not apply the final activation and dropout
+            functions. Defaults to ``False``.
+    """  # noqa: E501
 
     def __init__(
         self,
@@ -233,17 +253,21 @@ def forward(self, x: Tensor, hyperedge_index: Tensor) -> Tensor:
 
 class HNHNConv(nn.Module):
     """
-    The HNHNConv layer proposed in `HNHN: Hypergraph Networks with Hyperedge Neurons <https://arxiv.org/abs/2006.12278>`_ paper.
-    Reference implementation: `source <https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/nn/convs/hypergraphs/hnhn_conv.html#HNHNConv>`_.
+    References:
+        - The HNHNConv layer proposed in [HNHN: Hypergraph Networks with Hyperedge Neurons](https://arxiv.org/abs/2006.12278) paper.
+        - Reference implementation: [Code](https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/nn/convs/hypergraphs/hnhn_conv.html#HNHNConv).
 
     Args:
         in_channels: The number of input channels.
         out_channels: The number of output channels.
-        bias: If set to ``False``, the layer will not learn the bias parameter. Defaults to ``True``.
-        use_batch_normalization: If set to ``True``, the layer will use batch normalization. Defaults to ``False``.
+        bias: If set to ``False``, the layer will not learn the bias parameter.
+            Defaults to ``True``.
+        use_batch_normalization: If set to ``True``, the layer will use batch normalization.
+            Defaults to ``False``.
         drop_rate: If set to a positive number, the layer will use dropout. Defaults to ``0.5``.
-        is_last: If set to ``True``, the layer will not apply the final activation and dropout functions. Defaults to ``False``.
-    """
+        is_last: If set to ``True``, the layer will not apply the final activation and
+            dropout functions. Defaults to ``False``.
+    """  # noqa: E501
 
     __AGGREGATION: Literal["mean"] = "mean"
 
@@ -266,8 +290,8 @@ def __init__(
 
     def forward(self, x: Tensor, hyperedge_index: Tensor) -> Tensor:
         """
-        Apply one HNHN convolution layer using two learned projections around
-        node-to-hyperedge and hyperedge-to-node mean aggregation.
+        Apply one HNHN convolution layer using two learned projections around node-to-hyperedge and
+        hyperedge-to-node mean aggregation.
 
         Args:
             x: Input node feature matrix of size ``(num_nodes, in_channels)``.
diff --git a/hyperbench/nn/loss.py b/hyperbench/nn/loss.py
index 261be18f..adb84a84 100644
--- a/hyperbench/nn/loss.py
+++ b/hyperbench/nn/loss.py
@@ -24,7 +24,8 @@ def forward(self, logits: Tensor, labels: Tensor) -> Tensor:
 
         Args:
             logits: Logit scores for each candidate hyperedge, of shape ``(num_hyperedges,)``.
-            labels: Binary labels indicating positive (1) and negative (0) hyperedges, of shape ``(num_hyperedges,)``.
+            labels: Binary labels indicating positive (1) and negative (0) hyperedges, of shape
+                ``(num_hyperedges,)``.
 
         Returns:
             loss: Scalar loss value.
@@ -90,8 +91,10 @@ def local_loss(self, node_embeddings: Tensor, hyperedge_embeddings: Tensor) -> T
         to become confident within each virtual-label subspace.
 
         Args:
-            node_embeddings: Propagated node states of shape ``(num_nodes, num_subspaces * labels_per_subspace)``.
-            hyperedge_embeddings: Propagated hyperedge states with the same channel dimension as ``node_embeddings``.
+            node_embeddings: Propagated node states of shape
+                ``(num_nodes, num_subspaces * labels_per_subspace)``.
+            hyperedge_embeddings: Propagated hyperedge states with the same channel dimension
+                as ``node_embeddings``.
 
         Returns:
             loss: Scalar tensor containing node plus hyperedge entropy losses.
@@ -106,8 +109,10 @@ def global_loss(self, node_embeddings: Tensor, hyperedge_embeddings: Tensor) ->
         with a distinctiveness term that separates label columns inside each subspace.
 
         Args:
-            node_embeddings: Propagated node states of shape ``(num_nodes, num_subspaces * labels_per_subspace)``.
-            hyperedge_embeddings: Propagated hyperedge states with the same channel dimension as ``node_embeddings``.
+            node_embeddings: Propagated node states of shape
+                ``(num_nodes, num_subspaces * labels_per_subspace)``.
+            hyperedge_embeddings: Propagated hyperedge states with the same channel dimension
+                as ``node_embeddings``.
 
         Returns:
             loss: Scalar tensor containing node plus hyperedge global losses.
@@ -137,7 +142,8 @@ def entropy_loss(self, x: Tensor) -> Tensor:
         Compute mean entropy within each virtual-label subspace.
 
         Args:
-            x: Flattened virtual-label probabilities of shape ``(num_items, num_subspaces * labels_per_subspace)``.
+            x: Flattened virtual-label probabilities of shape
+                ``(num_items, num_subspaces * labels_per_subspace)``.
 
         Returns:
             loss: Scalar entropy loss.
@@ -151,7 +157,8 @@ def entropy_loss(self, x: Tensor) -> Tensor:
         #          virtual-label distribution in subspace 0.
         probs = x.view(-1, self.num_subspaces, self.labels_per_subspace)
 
-        # With this, we induce structurally close nodes (or hyperedges) to be assigned to the same label.
+        # With this, we induce structurally close nodes (or hyperedges)
+        # to be assigned to the same label.
         # Example: probs.shape = (num_nodes, 4, 2)
         #          -> entropy.shape = (num_nodes, 4), one entropy per item and subspace
         entropy = -(probs * torch.log(probs + self.eps)).sum(dim=2, dtype=torch.float)
@@ -161,11 +168,12 @@ def balance_loss(self, x: Tensor) -> Tensor:
         """
         Compute negative entropy of global virtual-label usage.
 
-        This term is minimized, so the negative sign makes optimization maximize entropy of average label usage
-        and reduces collapse to one virtual label.
+        This term is minimized, so the negative sign makes optimization maximize entropy
+        of average label usage and reduces collapse to one virtual label.
 
         Args:
-            x: Flattened virtual-label probabilities of shape ``(num_items, num_subspaces * labels_per_subspace)``.
+            x: Flattened virtual-label probabilities of shape
+                ``(num_items, num_subspaces * labels_per_subspace)``.
 
         Returns:
             loss: Scalar balance loss.
@@ -183,7 +191,8 @@ def balance_loss(self, x: Tensor) -> Tensor:
         mean_probs = probs.mean(dim=0, dtype=torch.float)
 
         # Negative entropy to maximize global label diversity and prevents collapse.
-        # Example: mean_probs[0] = [0.50, 0.50] has higher entropy than mean_probs[0] = [0.99, 0.01].
+        # Example: mean_probs[0] = [0.50, 0.50] has higher entropy
+        #                   than mean_probs[0] = [0.99, 0.01].
         entropy = -(mean_probs * torch.log(mean_probs + self.eps)).sum(dim=1, dtype=torch.float)
         return -entropy.mean(dtype=torch.float)
 
@@ -191,11 +200,14 @@ def distinctiveness_loss(self, x: Tensor) -> Tensor:
         """
         Penalize similar virtual-label columns inside each subspace.
 
-        For every subspace, this compares all label columns across items with cosine similarity and applies a diagonal classification objective.
-        The diagonal target encourages each label column to be most similar to itself and less similar to other labels.
+        For every subspace, this compares all label columns across items with cosine similarity
+        and applies a diagonal classification objective.
+        The diagonal target encourages each label column to be most similar to itself
+        and less similar to other labels.
 
         Args:
-            x: Flattened virtual-label probabilities of shape ``(num_items, num_subspaces * labels_per_subspace)``.
+            x: Flattened virtual-label probabilities of shape
+                ``(num_items, num_subspaces * labels_per_subspace)``.
 
         Returns:
             loss: Scalar distinctiveness loss.
@@ -222,22 +234,33 @@ def distinctiveness_loss(self, x: Tensor) -> Tensor:
         ).repeat_interleave(self.labels_per_subspace)
 
         # Compare every virtual-label column against every other column.
-        # Two different labels in the same subspace should not describe the same pattern of nodes/hyperedges.
+        # Two different labels in the same subspace should not describe
+        # the same pattern of nodes/hyperedges.
         # Example: with num_subspaces=4:
         #          probs[:, :, idx_i] and probs[:, :, idx_j] both have shape (4, 4, 4),
         #          where the last dimension enumerates the four ordered label pairs above
-        #          probs[:, :, idx_i] == [[[p00, p01, p00, p01],   # node/hyperedge 0's label probabilities for the four pairs
-        #                                  [p10, p11, p10, p11],   # node/hyperedge 1's label probabilities for the four pairs
-        #                                  [p20, p21, p20, p21],   # node/hyperedge 2's label probabilities for the four pairs
-        #                                  [p30, p31, p30, p31]],  # node/hyperedge 3's label probabilities for the four pairs
-        #                                 ...]
-        #          probs[:, :, idx_j] == [[[p00, p00, p01, p01],   # node/hyperedge 0's label probabilities for the four pairs
-        #                                  [p10, p10, p11, p11],   # node/hyperedge 1's label probabilities for the four pairs
-        #                                  [p20, p20, p21, p21],   # node/hyperedge 2's label probabilities for the four pairs
-        #                                  [p30, p30, p31, p31]],  # node/hyperedge 3's label probabilities for the four pairs
-        #                                 ...]
-        #          F.cosine_similarity(..., dim=0) compares each pair across the 4 items, producing shape (4, 4)
-        #          view(-1, 2, 2) restores one 2x2 similarity matrix per subspace, so shape becomes (4, 2, 2)
+        #          # node/hyperedge 0's label probabilities for the four pairs
+        #          probs[:, :, idx_i] == [[[p00, p01, p00, p01],
+        #                 # node/hyperedge 1's label probabilities for the four pairs
+        #                [p10, p11, p10, p11],
+        #                 # node/hyperedge 2's label probabilities for the four pairs
+        #                [p20, p21, p20, p21],
+        #                 # node/hyperedge 3's label probabilities for the four pairs
+        #                [p30, p31, p30, p31]],
+        #               ...]
+        #          # node/hyperedge 0's label probabilities for the four pairs
+        #          probs[:, :, idx_j] == [[[p00, p00, p01, p01],
+        #                 # node/hyperedge 1's label probabilities for the four pairs
+        #                [p10, p10, p11, p11],
+        #                 # node/hyperedge 2's label probabilities for the four pairs
+        #                [p20, p20, p21, p21],
+        #                 # node/hyperedge 3's label probabilities for the four pairs
+        #                [p30, p30, p31, p31]],
+        #               ...]
+        #          F.cosine_similarity(..., dim=0) compares each pair across the 4 items,
+        #           producing shape (4, 4)
+        #          view(-1, 2, 2) restores one 2x2 similarity matrix per subspace,
+        #            so shape becomes (4, 2, 2)
         similarity = F.cosine_similarity(
             probs[:, :, idx_i],
             probs[:, :, idx_j],
@@ -245,7 +268,8 @@ def distinctiveness_loss(self, x: Tensor) -> Tensor:
             eps=self.eps,
         ).view(-1, self.labels_per_subspace, self.labels_per_subspace)
 
-        # Turn each similarity row into a classification distribution and keep the diagonal self-match probabilities.
+        # Turn each similarity row into a classification distribution and keep the
+        # diagonal self-match probabilities.
         # Example: similarity[subspace 0].shape = (2, 2)
         #          - row 0 scores how label 0 matches labels [0, 1]
         #          - row 1 scores how label 1 matches labels [0, 1]
@@ -263,9 +287,11 @@ class VilLainLossParts(TypedDict):
     """
     Named VilLain self-supervised loss parts returned by ``VilLain.loss``.
 
-    Args:
-        local_loss: Sum of node and hyperedge local entropy losses over all training propagation steps.
-        global_loss: Sum of balance and distinctiveness losses over all training propagation steps.
+    Attributes:
+        local_loss: Sum of node and hyperedge local entropy losses over all training
+            propagation steps.
+        global_loss: Sum of balance and distinctiveness losses over all training
+            propagation steps.
     """
 
     local_loss: Tensor
diff --git a/hyperbench/nn/scorer.py b/hyperbench/nn/scorer.py
index d54809ae..e5f65392 100644
--- a/hyperbench/nn/scorer.py
+++ b/hyperbench/nn/scorer.py
@@ -73,10 +73,12 @@ def score_batch(
 
         Args:
             hyperedge_index: Tensor of shape ``(2, |E|)``.
-            node_to_neighbors: Optional precomputed node to neighborhood mapping. If None, it will be computed from ``hyperedge_index``.
+            node_to_neighbors: Optional precomputed node to neighborhood mapping. If None, it will
+                be computed from ``hyperedge_index``.
 
         Returns:
-            scores: A 1-D tensor of shape ``(num_hyperedges,)`` with the CN score for each hyperedge.
+            scores: A 1-D tensor of shape ``(num_hyperedges,)`` with the CN score
+                for each hyperedge.
         """
         if node_to_neighbors is None:
             node_to_neighbors = Hypergraph.from_hyperedge_index(hyperedge_index).neighbors_of_all()
diff --git a/hyperbench/tests/data/dataset_test.py b/hyperbench/tests/data/dataset_test.py
index 6074fd48..4f54d3e0 100644
--- a/hyperbench/tests/data/dataset_test.py
+++ b/hyperbench/tests/data/dataset_test.py
@@ -219,13 +219,13 @@ def test_getitem_index_list_empty(mock_hdata, strategy):
         pytest.param(
             SamplingStrategy.NODE,
             [0, 1, 2, 3, 4],
-            r"Index list length \(5\) cannot exceed the number of sampleable items \(4\)\.",
+            re.escape("Index list length (5) cannot exceed the number of sampleable items (4)."),
             id="node_strategy",
         ),
         pytest.param(
             SamplingStrategy.HYPEREDGE,
             [0, 1, 2],
-            r"Index list length \(3\) cannot exceed the number of sampleable items \(2\)\.",
+            re.escape("Index list length (3) cannot exceed the number of sampleable items (2)."),
             id="hyperedge_strategy",
         ),
     ],
@@ -244,12 +244,15 @@ def test_getitem_raises_when_index_list_larger_than_max(
     "strategy, index, expected_message",
     [
         pytest.param(
-            SamplingStrategy.NODE, 4, r"Node ID 4 is out of bounds \(0, 3\)\.", id="node_strategy"
+            SamplingStrategy.NODE,
+            4,
+            re.escape("Node ID 4 is out of bounds (0, 3)."),
+            id="node_strategy",
         ),
         pytest.param(
             SamplingStrategy.HYPEREDGE,
             2,
-            r"Hyperedge ID 2 is out of bounds \(0, 1\)\.",
+            re.escape("Hyperedge ID 2 is out of bounds (0, 1)."),
             id="hyperedge_strategy",
         ),
     ],
@@ -267,7 +270,8 @@ def test_getitem_raises_when_index_out_of_bounds(
 @pytest.mark.parametrize(
     "strategy, index, expected_shape, expected_num_hyperedges",
     [
-        # When node 1 is selected, we get hyperedge 0 with nodes 0 and 1 -> 2 incidences, 1 hyperedge
+        # When node 1 is selected, we get hyperedge 0 with nodes 0
+        # and 1 -> 2 incidences, 1 hyperedge
         pytest.param(SamplingStrategy.NODE, 1, (2, 1), 1, id="node_strategy"),
         # When hyperedge 0 is selected, we get nodes 0 and 1 -> 2 incidences, 1 hyperedge
         pytest.param(SamplingStrategy.HYPEREDGE, 0, (2, 1), 1, id="hyperedge_strategy"),
@@ -288,7 +292,8 @@ def test_getitem_single_index(
 @pytest.mark.parametrize(
     "strategy, index, expected_shape, expected_num_hyperedges",
     [
-        # When nodes (0, 2, 3) -> hyperedge 0 (nodes 0, 1) + hyperedge 1 (nodes 2, 3) -> 4 incidences, 2 hyperedges
+        # When nodes (0, 2, 3) -> hyperedge 0 (nodes 0, 1) + hyperedge 1 (nodes 2, 3)
+        # -> 4 incidences, 2 hyperedges
         pytest.param(SamplingStrategy.NODE, [0, 2, 3], (2, 4), 2, id="node_strategy"),
         # When hyperedge 0 (nodes 0, 1) + hyperedge 1 (nodes 2, 3) -> 4 incidences, 2 hyperedges
         pytest.param(SamplingStrategy.HYPEREDGE, [0, 1], (2, 4), 2, id="hyperedge_strategy"),
@@ -322,7 +327,8 @@ def test_getitem_with_hyperedge_attr(mock_hdata_with_hyperedge_attr, strategy):
     assert data.hyperedge_index.shape == (2, 2)
     assert data.num_hyperedges == 1
 
-    # Even though the original hypergraph has edge attributes, __getitem__ should return hyperedge_attr as None
+    # Even though the original hypergraph has edge attributes, __getitem__ should
+    # return hyperedge_attr as None
     # as the hyperedge attributes are handled by the loader's collate function during batching
     assert data.hyperedge_attr is None
 
@@ -362,7 +368,8 @@ def test_getitem_with_multiple_hyperedge_attr(
     data = dataset[index]
     assert data.num_hyperedges == 2
 
-    # Even though the original hypergraph has edge attributes, __getitem__ should return hyperedge_attr as None
+    # Even though the original hypergraph has edge attributes, __getitem__ should
+    # return hyperedge_attr as None
     # as the hyperedge attributes are handled by the loader's collate function during batching
     assert data.hyperedge_attr is None
 
@@ -383,7 +390,8 @@ def test_getitem_with_hyperedge_weights(mock_hdata_with_hyperedge_weights, strat
     assert data.hyperedge_index.shape == (2, 2)
     assert data.num_hyperedges == 1
 
-    # Even though the original hypergraph has edge attributes, __getitem__ should return hyperedge_weights as None
+    # Even though the original hypergraph has edge attributes, __getitem__ should
+    # return hyperedge_weights as None
     # as the hyperedge weights are handled by the loader's collate function during batching
     assert data.hyperedge_weights is None
 
@@ -1392,7 +1400,8 @@ def test_split_with_ratios_raises_when_train_split_idx_provided_but_not_transduc
             # 3/5 and 2/5 as we ensure splits don't get more then requested,
             # in this way, all later splits get at least what they requested,
             # except the last one that might get slightly more due to rounding.
-            # This effect is mitigated the more hyperedges we have, as the ratios get closer to the requested ones.
+            # This effect is mitigated the more hyperedges we have, as the ratios get closer to the
+            # requested ones.
             [0.6, 0.4],
             id="five_hyperedges_rounds_train_up",
         ),
@@ -1403,7 +1412,7 @@ def test_split_with_ratios_raises_when_train_split_idx_provided_but_not_transduc
                     torch.arange(
                         500,
                         dtype=torch.long,
-                    ),  # 500 hyperedges, 125 per node, so we can split exactly according to the ratios
+                    ),  # 500 hyperedges, 125 per node, so we can split according to the ratios
                 ]
             ),
             [375, 125],
@@ -1453,7 +1462,8 @@ def test_split_transductive_raises_when_node_is_missing_from_all_hyperedges():
     with pytest.raises(
         ValueError,
         match=re.escape(
-            "Cannot create a transductive first split covering all nodes because these node ids do not appear in any hyperedge: [3]."
+            "Cannot create a transductive first split covering all nodes because "
+            "these node ids do not appear in any hyperedge: [3]."
         ),
     ):
         dataset.split(
diff --git a/hyperbench/tests/data/enricher_test.py b/hyperbench/tests/data/enricher_test.py
index 8f3f0674..2af87de3 100644
--- a/hyperbench/tests/data/enricher_test.py
+++ b/hyperbench/tests/data/enricher_test.py
@@ -230,7 +230,9 @@ def test_node2vec_enricher_returns_zero_features_when_clique_has_no_non_selfloop
 
     with pytest.warns(
         UserWarning,
-        match="Clique expansion produced no non-self-loop edges. Returning zero node features.",
+        match=re.escape(
+            "Clique expansion produced no non-self-loop edges. Returning zero node features."
+        ),
     ):
         result = enricher.enrich(hyperedge_index)
 
diff --git a/hyperbench/tests/data/hif_test.py b/hyperbench/tests/data/hif_test.py
index 2ad70bc3..d57330f2 100644
--- a/hyperbench/tests/data/hif_test.py
+++ b/hyperbench/tests/data/hif_test.py
@@ -246,12 +246,19 @@ def test_load_from_url_raises_when_status_is_not_200():
     [
         pytest.param(
             "https://example.com/algebra.json.zst.zst",
-            r"Unsupported file format for URL 'https://example.com/algebra.json.zst.zst'\. Expected \.json or \.json\.zst",
+            re.escape(
+                "Unsupported file format for URL 'https://example.com/algebra.json.zst.zst'"
+                ". Expected .json or .json.zst"
+            ),
             id="json-zst-zst",
         ),
         pytest.param(
             "https://example.com/algebra.zst.json.zst",
-            r"URL 'https://example.com/algebra.zst.json.zst' has an unexpected filename format\. Expected at most one dot in the base filename before the extension \(e\.g\., dataset\.json or dataset\.json\.zst\)\.",
+            re.escape(
+                "URL 'https://example.com/algebra.zst.json.zst' has an unexpected filename "
+                "format. Expected at most one dot in the base filename before the extension (e.g., "
+                "dataset.json or dataset.json.zst)."
+            ),
             id="zst-json-zst",
         ),
     ],
@@ -567,7 +574,8 @@ def test_hifloader_falls_back_to_hf_hub_download_when_github_raw_download_fails(
         pytest.raises(
             ValueError,
             match=re.escape(
-                "Failed to download dataset 'algebra' from GitHub with status code 404 and no SHA provided for Hugging Face Hub fallback."
+                "Failed to download dataset 'algebra' from GitHub with "
+                "status code 404 and no SHA provided for Hugging Face Hub fallback."
             ),
         ),
     ):
@@ -586,7 +594,7 @@ def test_hifloader_from_url_raise_error_on_wrong_extension():
 
         with pytest.raises(
             ValueError,
-            match=r"Unsupported file format for URL 'https://example.com/algebra.txt'",
+            match=re.escape("Unsupported file format for URL 'https://example.com/algebra.txt'"),
         ):
             HIFLoader.load_from_url("https://example.com/algebra.txt")
 
@@ -821,7 +829,8 @@ def test_load_by_name_raises_warn_when_fail_to_cleanup_hf_cache(tmp_path, mock_h
         patch(
             "hyperbench.data.hif.shutil.rmtree",
             side_effect=FileNotFoundError(
-                f"[Errno 2] No such file or directory: '{tmp_path / 'hf_cache' / 'datasets--HypernetworkRG--algebra'}'"
+                f"[Errno 2] No such file or directory: '"
+                f"{tmp_path / 'hf_cache' / 'datasets--HypernetworkRG--algebra'}'"
             ),
         ),
         pytest.warns(UserWarning, match="Failed to clean up Hugging Face Hub cache"),
@@ -855,7 +864,9 @@ def test_load_by_name_raises_when_downloaded_hf_file_cannot_be_read(tmp_path):
         pytest.warns(UserWarning, match="GitHub raw download failed"),
         pytest.raises(
             ValueError,
-            match=r"Failed to read compressed JSON file 'downloaded\.json\.zst': missing file\.",
+            match=re.escape(
+                "Failed to read compressed JSON file 'downloaded.json.zst': missing file."
+            ),
         ),
         patch("hyperbench.data.hif.os.getenv", return_value=None),
     ):
@@ -897,7 +908,8 @@ def test_load_by_name_raises_when_downloaded_hf_content_cannot_be_written(tmp_pa
         pytest.warns(UserWarning, match="GitHub raw download failed"),
         pytest.raises(
             ValueError,
-            match=r"Failed to save downloaded dataset 'algebra' to disk at '.*algebra\.json\.zst': disk full\.",
+            match=r"Failed to save downloaded dataset 'algebra' to disk at '.*algebra\.json\.zst': "
+            r"disk full\.",
         ),
     ):
         HIFLoader.load_by_name("algebra", hf_sha=hf_sha, save_on_disk=True)
@@ -929,7 +941,7 @@ def test_load_by_name_raises_when_saving_downloaded_dataset_fails(tmp_path):
         ),
         pytest.raises(
             ValueError,
-            match=r"Failed to save downloaded 'algebra\.json\.zst'",
+            match=re.escape("Failed to save downloaded 'algebra.json.zst'"),
         ),
     ):
         HIFLoader.load_by_name("algebra", save_on_disk=True)
@@ -963,9 +975,9 @@ def test_hifloader_download_failure_when_hf_fallback_fails(tmp_path):
         pytest.warns(UserWarning, match="GitHub raw download failed"),
         pytest.raises(
             ValueError,
-            match=(
-                r"Failed to download dataset 'algebra' from GitHub and Hugging Face Hub\. "
-                r"GitHub error: 404 \| Hugging Face error: HFHub failed"
+            match=re.escape(
+                "Failed to download dataset 'algebra' from GitHub and Hugging Face Hub. "
+                "GitHub error: 404 | Hugging Face error: HFHub failed"
             ),
         ),
         patch("hyperbench.data.hif.os.getenv", return_value=None),
@@ -999,8 +1011,10 @@ def test_hifloader_download_failure_when_hf_token_is_invalid(tmp_path):
         pytest.raises(
             ValueError,
             match=(
-                r"Failed to download dataset 'algebra' from GitHub and Hugging Face Hub\. "
-                r"GitHub error: 404 \| Hugging Face error: HFHub failed"
+                re.escape(
+                    "Failed to download dataset 'algebra' from GitHub and Hugging Face Hub. "
+                    "GitHub error: 404 | Hugging Face error: HFHub failed"
+                )
             ),
         ),
         patch("hyperbench.data.hif.os.getenv", return_value="invalid_token"),
diff --git a/hyperbench/tests/data/negative_sampler_test.py b/hyperbench/tests/data/negative_sampler_test.py
index 48acdde2..9540fd2d 100644
--- a/hyperbench/tests/data/negative_sampler_test.py
+++ b/hyperbench/tests/data/negative_sampler_test.py
@@ -574,7 +574,8 @@ def test_clique_negative_sampler_fails_when_positive_clique_is_only_candidate():
 
     with pytest.raises(
         ValueError,
-        match="Asked to create 1 clique negative samples with 3 nodes each, but only 0 are available",
+        match="Asked to create 1 clique negative samples with 3 nodes each, "
+        "but only 0 are available",
     ):
         sampler.sample(hdata)
 
diff --git a/hyperbench/tests/data/splitter_test.py b/hyperbench/tests/data/splitter_test.py
index 84b6d7ff..fca637a3 100644
--- a/hyperbench/tests/data/splitter_test.py
+++ b/hyperbench/tests/data/splitter_test.py
@@ -187,7 +187,7 @@ def test_default_dataset_splitter_rebalances_first_split_to_cover_all_nodes():
     assert torch.equal(split_labels.sort().values, hdata.y)
 
 
-def test_default_dataset_splitter_returns_final_transductive_ratios_when_train_coverage_is_enabled():
+def test_default_dataset_splitter_returns_final_transductive_ratios_when_train_cov_is_enabled():
     hdata = HData(
         x=torch.arange(4, dtype=torch.float).unsqueeze(1),
         hyperedge_index=torch.tensor([[0, 1, 2, 3, 0], [0, 1, 2, 3, 4]], dtype=torch.long),
@@ -253,7 +253,8 @@ def test_default_dataset_splitter_raises_when_node_is_missing_from_all_hyperedge
     with pytest.raises(
         ValueError,
         match=re.escape(
-            "Cannot create a transductive first split covering all nodes because these node ids do not appear in any hyperedge: [3]."
+            "Cannot create a transductive first split covering all nodes because these "
+            "node ids do not appear in any hyperedge: [3]."
         ),
     ):
         DefaultDatasetSplitter().split(
@@ -306,7 +307,7 @@ def test_hyperedge_id_splitter_get_hyperedge_ids_permutation_is_deterministic_wi
     assert torch.equal(permutation_a.sort().values, torch.arange(5, dtype=torch.long))
 
 
-def test_hyperedge_id_splitter_split_uses_cumulative_floor_boundaries_and_last_split_absorbs_remainder(
+def test_hyperedge_id_splitter_split_cumulative_floor_boundaries_and_last_split_absorbs_remainder(
     mock_hdata_five_hyperedges,
 ):
     hyperedge_ids = torch.arange(5, dtype=torch.long)
@@ -357,7 +358,8 @@ def test_split_validates_ratio_values(
             # 3/5 and 2/5 as we ensure splits don't get more then requested,
             # in this way, all later splits get at least what they requested,
             # except the last one that might get slightly more due to rounding.
-            # This effect is mitigated the more hyperedges we have, as the ratios get closer to the requested ones.
+            # This effect is mitigated the more hyperedges we have, as the ratios get closer to the
+            # requested ones.
             [0.6, 0.4],
             id="five_hyperedges_rounds_train_up",
         ),
@@ -368,7 +370,7 @@ def test_split_validates_ratio_values(
                     torch.arange(
                         500,
                         dtype=torch.long,
-                    ),  # 500 hyperedges, 125 per node, so we can split exactly according to the ratios
+                    ),  # 500 hyperedges, 125 per node, so we can split according to the ratios
                 ]
             ),
             [375, 125],
@@ -398,7 +400,7 @@ def test_hyperedge_id_splitter_split_returns_expected_cumulative_ratios(
     assert final_ratios == pytest.approx(expected_final_ratios)
 
 
-def test_hyperedge_id_splitter_ensure_split_covers_all_nodes_moves_best_covering_hyperedge_into_first_split():
+def test_hyperedge_id_splitter_split_covers_all_nodes_moves_best_covering_he_in_first_split():
     x = torch.ones((4, 1), dtype=torch.float32)
     hyperedge_index = torch.tensor(
         [
@@ -459,7 +461,7 @@ def test_hyperedge_id_splitter_ensure_split_covers_all_nodes_rejects_invalid_spl
         )
 
 
-def test_hyperedge_id_splitter_ensure_split_covers_all_nodes_raises_when_node_is_missing_from_hypergraph():
+def test_hyperedge_id_splitter_ensure_split_covers_all_nodes_raises_when_node_is_missing():
     x = torch.ones((4, 1), dtype=torch.float32)
     hyperedge_index = torch.tensor([[0, 1, 2], [0, 0, 1]], dtype=torch.long)
     hdata = HData(x=x, hyperedge_index=hyperedge_index)
@@ -472,7 +474,8 @@ def test_hyperedge_id_splitter_ensure_split_covers_all_nodes_raises_when_node_is
     with pytest.raises(
         ValueError,
         match=re.escape(
-            "Cannot create a transductive first split covering all nodes because these node ids do not appear in any hyperedge: [3]."
+            "Cannot create a transductive first split covering all nodes because these "
+            "node ids do not appear in any hyperedge: [3]."
         ),
     ):
         splitter.ensure_split_covers_all_nodes(
diff --git a/hyperbench/tests/train/latex_logger_test.py b/hyperbench/tests/train/latex_logger_test.py
index 2ad6a5bf..aa302a42 100644
--- a/hyperbench/tests/train/latex_logger_test.py
+++ b/hyperbench/tests/train/latex_logger_test.py
@@ -254,7 +254,8 @@ def test_finalize_writes_comprehensive_overall_table_trail(tmp_path, mock_option
         \multicolumn{3}{c}{\textbf{Test Results}} \\
         \midrule
         Model & test\_auc & test\_loss \\
-        model\_a & \cellcolor[HTML]{59FF59}\underline{0.9123} & \cellcolor[HTML]{59FF59}\underline{0.1230} \\
+        model\_a & \cellcolor[HTML]{59FF59}\underline{0.9123} & """
+        r"""\cellcolor[HTML]{59FF59}\underline{0.1230} \\
         model\_b & \cellcolor[HTML]{FF5959}0.8821 & - \\
         \hline
         \addlinespace[3pt]
@@ -310,7 +311,8 @@ def test_finalize_writes_comprehensive_test_table_trail(tmp_path, mock_option_co
         \multicolumn{3}{c}{\textbf{Test Results}} \\
         \midrule
         Model & test\_auc & test\_loss \\
-        model\_a & \cellcolor[HTML]{59FF59}\underline{0.9123} & \cellcolor[HTML]{59FF59}\underline{0.1234} \\
+        model\_a & \cellcolor[HTML]{59FF59}\underline{0.9123} & """
+        r"""\cellcolor[HTML]{59FF59}\underline{0.1234} \\
         model\_b & \cellcolor[HTML]{FF5959}0.8821 & - \\
         \hline
         \end{tabular}
diff --git a/hyperbench/tests/types/graph_test.py b/hyperbench/tests/types/graph_test.py
index 4179f6f7..f629bac6 100644
--- a/hyperbench/tests/types/graph_test.py
+++ b/hyperbench/tests/types/graph_test.py
@@ -232,7 +232,8 @@ def test_to_edge_index_is_contiguous(mock_single_edge_graph):
 
     Examples:
         If edges = [[0, 1]], then edge_index = [[0], [1]] should be contiguous.
-        If edges = [[0, 1], [1, 2], [2, 3]], then edge_index = [[0, 1, 2], [1, 2, 3]] should be contiguous.
+        If edges = [[0, 1], [1, 2], [2, 3]], then edge_index = [[0, 1, 2], [1, 2, 3]]
+            should be contiguous.
     """
     edge_index = mock_single_edge_graph.to_edge_index()
     assert edge_index.is_contiguous()
@@ -281,7 +282,9 @@ def test_bidirectional_edges():
 
 
 def test_star_graph():
-    """Test star graph (all edges connected to central node)."""
+    """
+    Test star graph (all edges connected to central node).
+    """
     graph = Graph([[0, 1], [0, 2], [0, 3], [0, 4]])
     assert graph.num_nodes == 5
     assert graph.num_edges == 4
@@ -292,7 +295,9 @@ def test_star_graph():
 
 
 def test_cyclic_graph():
-    """Test cyclic graph (a closed loop)."""
+    """
+    Test cyclic graph (a closed loop).
+    """
     graph = Graph([[0, 1], [1, 2], [2, 3], [3, 0]])
     assert graph.num_nodes == 4
     assert graph.num_edges == 4
@@ -326,7 +331,9 @@ def test_smoothing_with_laplacian_output_shape_matches_x_shape(num_nodes, num_fe
 
 
 def test_smoothing_with_laplacian_with_identity_laplacian_returns_original_x():
-    """Smoothing with identity laplacian should return the original features."""
+    """
+    Smoothing with identity laplacian should return the original features.
+    """
     num_nodes = 3
     x = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=torch.float)
 
@@ -475,6 +482,7 @@ def test_smoothing_with_laplacian_drop_rate_stochastic():
 def test_smoothing_with_laplacian_influences_connected_nodes():
     """
     Features of connected nodes should be aggregated.
+
     For a connected graph with GCN normalization, smoothing should mix features from neighbors.
     """
     # Two connected nodes with distinct features
@@ -718,7 +726,9 @@ def test_get_sparse_adjacency_matrix_shape(edge_index, num_nodes):
 
 
 def test_get_sparse_adjacency_matrix_empty_edge_index():
-    """Empty edge_index produces all-zero adjacency matrix when converted to dense."""
+    """
+    Empty edge_index produces all-zero adjacency matrix when converted to dense.
+    """
     edge_index = torch.tensor([[], []], dtype=torch.long)
     adj_matrix = EdgeIndex(edge_index).get_sparse_adjacency_matrix(num_nodes=3)
     dense_adj_matrix = adj_matrix.to_dense()
@@ -898,7 +908,9 @@ def test_get_sparse_adjacency_matrix_ignores_stored_edge_weights_by_default():
     ],
 )
 def test_get_sparse_adjacency_matrix_isolated_nodes(edge_index, num_nodes, isolated_nodes):
-    """Nodes not in edge_index have zero rows and columns."""
+    """
+    Nodes not in edge_index have zero rows and columns.
+    """
     adj_matrix = EdgeIndex(edge_index).get_sparse_adjacency_matrix(num_nodes=num_nodes)
     dense_adj_matrix = adj_matrix.to_dense()
 
@@ -997,7 +1009,9 @@ def test_get_sparse_normalized_degree_matrix_isolated_nodes_are_zero():
 
 
 def test_get_sparse_normalized_degree_matrix_empty_edge_index():
-    """Empty edge_index produces all-zero matrix (all nodes isolated)."""
+    """
+    Empty edge_index produces all-zero matrix (all nodes isolated).
+    """
     edge_index = torch.tensor([[], []], dtype=torch.long)
 
     degree_matrix = EdgeIndex(edge_index).get_sparse_normalized_degree_matrix(num_nodes=3)
@@ -1144,7 +1158,8 @@ def test_get_sparse_normalized_laplacian_has_features_for_isolated_nodes():
     # isolated nodes are not in the edge_index
     edge_index = torch.tensor([[0], [1]], dtype=torch.long)
 
-    # we want all nodes in the gcn laplacian, so we specify num_nodes=4 to include nodes 2 and 3 which are isolated
+    # we want all nodes in the gcn laplacian, so we specify num_nodes=4 to include nodes 2 and 3
+    # which are isolated
     gcn_laplacian = EdgeIndex(edge_index).get_sparse_normalized_gcn_laplacian(num_nodes=4)
     dense_gcn_laplacian = gcn_laplacian.to_dense()
 
@@ -1328,7 +1343,9 @@ def test_get_sparse_normalized_laplacian_is_symmetric():
 
 
 def test_get_sparse_normalized_laplacian_diagonal_values():
-    """For a connected graph without self-loops, diagonal of the laplacian should be non-negative."""
+    """
+    For a connected graph without self-loops, diagonal of the laplacian should be non-negative.
+    """
     edge_index = EdgeIndex(torch.tensor([[0, 1], [1, 0]], dtype=torch.long))
     laplacian = edge_index.get_sparse_normalized_laplacian(num_nodes=2)
     dense_laplacian = laplacian.to_dense()
diff --git a/hyperbench/tests/types/hdata_test.py b/hyperbench/tests/types/hdata_test.py
index d3fdf5cd..99a71489 100644
--- a/hyperbench/tests/types/hdata_test.py
+++ b/hyperbench/tests/types/hdata_test.py
@@ -277,8 +277,8 @@ def test_init_hyperedge_attr_defaults_to_none():
                 "hyperedge_index": torch.tensor([[0, 1, 2], [0, 0, 0]], dtype=torch.long),
             },
             (
-                "'x' must have one feature row per node, or be 'torch.empty((0, 0))' if there are no "
-                "nodes. Got x.shape=(2, 2) but num_nodes=3."
+                "'x' must have one feature row per node, or be 'torch.empty((0, 0))' "
+                "if there are no nodes. Got x.shape=(2, 2) but num_nodes=3."
             ),
             id="x_rows_do_not_match_num_nodes",
         ),
@@ -648,7 +648,8 @@ def test_cat_same_node_space_raises_on_overlapping_hyperedge_ids():
     with pytest.raises(
         ValueError,
         match=re.escape(
-            "Overlapping hyperedge IDs found across instances. Ensure each instance uses distinct hyperedge IDs."
+            "Overlapping hyperedge IDs found across instances. Ensure each "
+            "instance uses distinct hyperedge IDs."
         ),
     ):
         HData.cat_same_node_space([hdata1, hdata2])
@@ -1443,7 +1444,8 @@ def test_enrich_node_features_from_raises_when_source_rows_do_not_match_global_n
     with pytest.raises(
         ValueError,
         match=re.escape(
-            "Expected 'hdata_with_features.x' rows to align with hdata_with_features.global_node_ids."
+            "Expected 'hdata_with_features.x' rows to align with "
+            "hdata_with_features.global_node_ids."
         ),
     ):
         target_hdata.enrich_node_features_from(source_hdata)
@@ -2053,7 +2055,8 @@ def test_remove_hyperedges_with_fewer_than_k_nodes(hyperedge_index, k, expected_
             id="disjoint_nodes_first_hyperedge_removed",
         ),
         pytest.param(
-            # Hyperedge 0: nodes {0, 2} -> 2 nodes (removed), hyperedge 1: nodes {1, 2, 3} -> 3 nodes (kept)
+            # Hyperedge 0: nodes {0, 2} -> 2 nodes (removed), hyperedge 1: nodes {1, 2, 3}
+            #                                                            -> 3 nodes (kept)
             # Node 2 is shared, so it survives because hyperedge 1 is kept
             # Node 0 is the only node removed as it is only in the removed hyperedge 0
             torch.tensor([[0, 2, 1, 2, 3], [0, 0, 1, 1, 1]], dtype=torch.long),
@@ -2082,7 +2085,8 @@ def test_remove_hyperedges_with_fewer_than_k_nodes_subsets_x(hyperedge_index, k,
             id="disjoint_nodes_first_hyperedge_removed",
         ),
         pytest.param(
-            # Hyperedge 0: nodes {0, 2} -> 2 nodes (removed). hyperedge 1: nodes {1, 2, 3} -> 3 nodes (kept)
+            # Hyperedge 0: nodes {0, 2} -> 2 nodes (removed). hyperedge 1: nodes {1, 2, 3}
+            #                                                            -> 3 nodes (kept)
             # Node 2 is shared, so y for hyperedge 1 must survive
             torch.tensor([[0, 2, 1, 2, 3], [0, 0, 1, 1, 1]], dtype=torch.long),
             3,
@@ -2112,7 +2116,8 @@ def test_remove_hyperedges_with_fewer_than_k_nodes_subsets_y(hyperedge_index, k,
             id="disjoint_nodes_first_hyperedge_removed",
         ),
         pytest.param(
-            # Hyperedge 0: nodes {0, 2} -> 2 nodes (removed), hyperedge 1: nodes {1, 2, 3} -> 3 nodes (kept)
+            # Hyperedge 0: nodes {0, 2} -> 2 nodes (removed), hyperedge 1: nodes {1, 2, 3}
+            #                                                            -> 3 nodes (kept)
             # Node 2 is shared, so attr for hyperedge 1 must survive
             torch.tensor([[0, 2, 1, 2, 3], [0, 0, 1, 1, 1]], dtype=torch.long),
             3,
@@ -2165,7 +2170,7 @@ def test_remove_hyperedges_with_fewer_than_k_nodes_subsets_global_node_ids_when_
     assert torch.equal(result.global_node_ids, torch.tensor([30, 40, 50], dtype=torch.long))
 
 
-def test_remove_hyperedges_with_fewer_than_k_nodes_does_not_subset_global_node_ids_when_preserve_false():
+def test_remove_hyperedges_with_fewer_than_k_nodes_not_subset_global_node_ids_when_preserve_false():
     x = torch.randn(5, 2, dtype=torch.float)
     hyperedge_index = torch.tensor([[0, 1, 2, 3, 4], [0, 0, 1, 1, 1]], dtype=torch.long)
     hdata = HData(x=x, hyperedge_index=hyperedge_index)
diff --git a/hyperbench/tests/types/hypergraph_test.py b/hyperbench/tests/types/hypergraph_test.py
index d5a3e0c1..cbf484af 100644
--- a/hyperbench/tests/types/hypergraph_test.py
+++ b/hyperbench/tests/types/hypergraph_test.py
@@ -11,7 +11,9 @@
 
 @pytest.fixture(autouse=True)
 def seed():
-    """Fix random seed for deterministic projections."""
+    """
+    Fix random seed for deterministic projections.
+    """
     torch.manual_seed(42)
 
 
diff --git a/hyperbench/tests/utils/nn_utils_test.py b/hyperbench/tests/utils/nn_utils_test.py
index 21694862..fafea6e7 100644
--- a/hyperbench/tests/utils/nn_utils_test.py
+++ b/hyperbench/tests/utils/nn_utils_test.py
@@ -81,14 +81,16 @@ def test_maxmin_scatter_respects_explicit_dim_size():
     # Example:
     # - index[1] == 0 means src[1] = [3, 1] contributes to output row 0.
     # - index[2] == 2 means src[2] = [-2, 7] contributes to output row 2.
-    # Missing group ids indicate that those groups receive no source rows, so group 1 and group 3 are empty.
+    # Missing group ids indicate that those groups receive no source rows, so group 1
+    # and group 3 are empty.
     index = torch.tensor([0, 0, 2], dtype=torch.long)
 
     # dim_size=4 forces four output rows even though max(index) would only imply three rows.
     result = maxmin_scatter(src=src, index=index, dim=0, dim_size=4)
 
     # Group 0 receives [1, 4] and [3, 1], so its range is [2, 3].
-    # Group 2 receives only row [-2, 7], so max(-2) - min(-2) and max(7) - min(7) are both 0 and the range is [0, 0].
+    # Group 2 receives only row [-2, 7], so max(-2) - min(-2) and max(7) - min(7) are both 0 and
+    # the range is [0, 0].
     # Empty groups 1 and 3 follow torch_geometric.scatter's neutral empty output,
     # so max and min both become [0, 0], and max - min is also [0, 0].
     expected = torch.tensor(
diff --git a/hyperbench/tests/utils/sparse_utils_test.py b/hyperbench/tests/utils/sparse_utils_test.py
index f3adfe22..77f399d0 100644
--- a/hyperbench/tests/utils/sparse_utils_test.py
+++ b/hyperbench/tests/utils/sparse_utils_test.py
@@ -16,7 +16,6 @@ def mock_values():
 
 
 def test_dropout_zero_probability(mock_indices, mock_values):
-    """Test that zero dropout probability returns the original sparse tensor."""
     sparse_tensor = torch.sparse_coo_tensor(
         mock_indices, mock_values, (3, 3), dtype=mock_values.dtype
     )
@@ -28,7 +27,6 @@ def test_dropout_zero_probability(mock_indices, mock_values):
 
 
 def test_dropout_full_probability(mock_indices, mock_values):
-    """Test that full dropout probability (1.0) drops all elements."""
     sparse_tensor = torch.sparse_coo_tensor(
         mock_indices, mock_values, (3, 3), dtype=mock_values.dtype
     )
@@ -43,7 +41,6 @@ def test_dropout_full_probability(mock_indices, mock_values):
 
 @pytest.mark.parametrize("invalid_prob", [-0.5, 1.5])
 def test_dropout_invalid_probability_out_of_range(mock_indices, mock_values, invalid_prob):
-    """Test that dropout probability below 0 raises ValueError."""
     sparse_tensor = torch.sparse_coo_tensor(
         mock_indices, mock_values, (2, 2), dtype=mock_values.dtype
     )
@@ -56,7 +53,6 @@ def test_dropout_invalid_probability_out_of_range(mock_indices, mock_values, inv
 
 
 def test_dropout_preserves_indices():
-    """Test that dropout preserves the sparsity pattern (indices) unchanged."""
     indices = torch.tensor([[0, 1, 2, 0], [0, 1, 2, 2]], dtype=torch.long)
     values = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float)
     sparse_tensor = torch.sparse_coo_tensor(indices, values, (3, 3), dtype=values.dtype)
@@ -68,7 +64,6 @@ def test_dropout_preserves_indices():
 
 
 def test_dropout_preserves_shape():
-    """Test that dropout preserves the tensor shape."""
     shape = (5, 10)  # Shape of the tensor if it were dense
     indices = torch.tensor([[0, 2, 4], [1, 5, 9]], dtype=torch.long)
     values = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float)
@@ -80,7 +75,6 @@ def test_dropout_preserves_shape():
 
 
 def test_dropout_preserves_dtype():
-    """Test that dropout preserves the tensor dtype."""
     indices = torch.tensor([[0, 1], [0, 1]], dtype=torch.long)
     values = torch.tensor([1.0, 2.0], dtype=torch.float32)
     sparse_tensor = torch.sparse_coo_tensor(indices, values, (2, 2), dtype=torch.float32)
@@ -91,7 +85,6 @@ def test_dropout_preserves_dtype():
 
 
 def test_dropout_with_fill_value_zero(mock_indices):
-    """Test dropout with fill_value=0.0 (default behavior)."""
     values = torch.tensor([5.0, 10.0, 15.0], dtype=torch.float)
     sparse_tensor = torch.sparse_coo_tensor(mock_indices, values, (3, 3), dtype=values.dtype)
 
@@ -105,7 +98,6 @@ def test_dropout_with_fill_value_zero(mock_indices):
 
 
 def test_dropout_with_nonzero_fill_value(mock_indices):
-    """Test dropout with a non-zero fill_value."""
     values = torch.tensor([5.0, 10.0, 15.0], dtype=torch.float)
     sparse_tensor = torch.sparse_coo_tensor(mock_indices, values, (3, 3), dtype=values.dtype)
     fill_value = 99.0
@@ -120,7 +112,6 @@ def test_dropout_with_nonzero_fill_value(mock_indices):
 
 
 def test_dropout_with_negative_values():
-    """Test dropout with negative values in the sparse tensor."""
     indices = torch.tensor([[0, 1, 2], [0, 1, 2]], dtype=torch.long)
     values = torch.tensor([-1.0, -5.0, -10.0], dtype=torch.float)
     sparse_tensor = torch.sparse_coo_tensor(indices, values, (3, 3), dtype=values.dtype)
@@ -133,7 +124,6 @@ def test_dropout_with_negative_values():
 
 
 def test_dropout_preserves_cpu_device():
-    """Test that dropout preserves the device."""
     device = torch.device("cpu")
 
     indices = torch.tensor([[0, 1], [0, 1]], device=device, dtype=torch.long)
@@ -149,7 +139,6 @@ def test_dropout_preserves_cpu_device():
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Cuda not available")
 def test_dropout_preserves_cuda_device():
-    """Test that dropout preserves the device."""
     device = torch.device("cuda")
 
     indices = torch.tensor([[0, 1], [0, 1]], device=device, dtype=torch.long)
@@ -165,7 +154,6 @@ def test_dropout_preserves_cuda_device():
 
 @pytest.mark.skipif(not torch.mps.is_available(), reason="MPS not available")
 def test_dropout_preserves_mps_device():
-    """Test that dropout preserves the device."""
     device = torch.device("mps")
 
     indices = torch.tensor([[0, 1], [0, 1]], device=device, dtype=torch.long)
@@ -180,7 +168,6 @@ def test_dropout_preserves_mps_device():
 
 
 def test_dropout_fill_value_with_full_dropout():
-    """Test that fill_value is applied correctly when dropout is 1.0."""
     indices = torch.tensor([[0, 1], [0, 1]], dtype=torch.long)
     values = torch.tensor([1.0, 2.0], dtype=torch.float)
     sparse_tensor = torch.sparse_coo_tensor(indices, values, (2, 2), dtype=values.dtype)
@@ -195,7 +182,6 @@ def test_dropout_fill_value_with_full_dropout():
 
 
 def test_dropout_with_unsorted_indices():
-    """Test that dropout handles unsorted indices correctly."""
     # Create a sparse tensor with unsorted/duplicate indices
     indices = torch.tensor([[2, 0, 1, 0], [2, 0, 1, 0]], dtype=torch.long)
     values = torch.tensor([3.0, 1.0, 2.0, 4.0], dtype=torch.float)
@@ -214,7 +200,6 @@ def test_dropout_with_unsorted_indices():
 
 
 def test_dropout_single_element():
-    """Test dropout on a sparse tensor with a single element."""
     indices = torch.tensor([[0], [0]], dtype=torch.long)
     values = torch.tensor([42.0], dtype=torch.float)
     sparse_tensor = torch.sparse_coo_tensor(indices, values, (1, 1), dtype=values.dtype)
@@ -229,7 +214,6 @@ def test_dropout_single_element():
 
 
 def test_dropout_large_sparse_matrix():
-    """Test dropout on a large sparse matrix."""
     size = 1000
     num_nonzero_elements = 500
     rows = torch.randint(0, size, (num_nonzero_elements,), dtype=torch.long)
@@ -245,7 +229,6 @@ def test_dropout_large_sparse_matrix():
 
 
 def test_dropout_returns_new_tensor(mock_indices, mock_values):
-    """Test that dropout returns a new tensor, not a reference to the original."""
     sparse_tensor = torch.sparse_coo_tensor(
         mock_indices, mock_values, (2, 2), dtype=mock_values.dtype
     )
@@ -257,7 +240,6 @@ def test_dropout_returns_new_tensor(mock_indices, mock_values):
 
 
 def test_dropout_statistical_property_moderate_rate():
-    """Test that dropout approximately respects the expected keep probability."""
     # Create a larger sparse tensor for statistical testing
     num_elements = 1000
 
diff --git a/hyperbench/train/latex_logger.py b/hyperbench/train/latex_logger.py
index e08126b2..990cd474 100644
--- a/hyperbench/train/latex_logger.py
+++ b/hyperbench/train/latex_logger.py
@@ -69,7 +69,7 @@ class LaTexTableConfig(TypedDict):
     """
     Configuration for the LaTex table logger.
 
-    Args:
+    Attributes:
         table_caption: Caption for the LaTex table.
         sort_by: Per-column sorting criteria ("asc" or "des").
         border: Whether to include borders in the LaTex table.
@@ -81,8 +81,6 @@ class LaTexTableConfig(TypedDict):
 
 
 class LaTexTableLogger(Logger):
-    # TODO: settings has to be configurable in Trainer
-
     """A Lightning Logger that accumulates metrics and writes a LaTex comparison table.
 
     Multiple instances (one per model) share a class-level store keyed by experiment_name.
diff --git a/hyperbench/train/markdown_logger.py b/hyperbench/train/markdown_logger.py
index 6c2e7afb..84bc084e 100644
--- a/hyperbench/train/markdown_logger.py
+++ b/hyperbench/train/markdown_logger.py
@@ -145,11 +145,14 @@ def __split_results(
         - "train*" --> train_results
         - "val*" --> val_results
         - anything else (e.g., "epoch") --> ignored
+        Models with no metrics in a category are excluded from that category's dict.
 
         Returns:
             results: Tuple of (test_results, train_results, val_results), where each is a dict
-            mapping model names to their respective metric dicts. Models with no metrics
-            in a category are excluded from that category's dict.
+            mapping model names to their respective metric dicts.
+            test_results: Dict mapping model names to their test metric dicts.
+            train_results: Dict mapping model names to their train metric dicts.
+            val_results: Dict mapping model names to their val metric dicts.
         """
         store = self.__shared_stores.get(self.__experiment_name, {})
         test_results: dict[str, dict[str, float]] = {}
@@ -183,7 +186,6 @@ def clear(self, experiment_name: str) -> None:
 
         Args:
             experiment_name: The experiment name whose data should be cleared.
-
         """
         self.__shared_stores.pop(experiment_name, None)
 
@@ -220,8 +222,6 @@ def __build_comparison_table(
 
         Returns:
             table: Markdown table string. Returns an empty string if ``results`` is empty.
-
-
         """
         if not results:
             return ""
diff --git a/hyperbench/train/trainer.py b/hyperbench/train/trainer.py
index 626b7c2a..55bf8db3 100644
--- a/hyperbench/train/trainer.py
+++ b/hyperbench/train/trainer.py
@@ -28,18 +28,21 @@ class MultiModelTrainer:
     A trainer class to handle training multiple models with individual trainers.
 
     Args:
-        model_configs: A list of ModelConfig objects, each containing a model and its associated trainer (if any).
+        model_configs: A list of ModelConfig objects, each containing a model and its
+            associated trainer (if any).
 
         experiment_name: Name for this experiment run's log directory. When ``None`` (default),
-            auto-increments as ``experiment_0``, ``experiment_1``, etc. under the log root directory.
-            Only used when ``logger`` is not provided.
+            auto-increments as ``experiment_0``, ``experiment_1``, etc. under
+            the log root directory. Only used when ``logger`` is not provided.
 
-        accelerator: Supports passing different accelerator types ("cpu", "gpu", "tpu", "hpu", "mps", "auto")
+        accelerator: Supports passing different accelerator types
+            ("cpu", "gpu", "tpu", "hpu", "mps", "auto")
             as well as custom accelerator instances.
 
-        devices: The devices to use. Can be set to a positive number (int or str), a sequence of device indices
-            (list or str), the value ``-1`` to indicate all available devices should be used, or ``"auto"`` for
-            automatic selection based on the chosen accelerator. Defaults to ``"auto"``.
+        devices: The devices to use. Can be set to a positive number (int or str), a
+            sequence of device indices (list or str), the value ``-1`` to indicate all available
+            devices should be used, or ``"auto"`` for automatic selection based on the chosen
+            accelerator. Defaults to ``"auto"``.
 
         strategy: Supports different training strategies with aliases as well custom strategies.
             Defaults to ``"auto"``.
@@ -47,8 +50,9 @@ class MultiModelTrainer:
         num_nodes: Number of GPU nodes for distributed training.
             Defaults to ``1``.
 
-        precision: Double precision (64, '64' or '64-true'), full precision (32, '32' or '32-true'),
-            16bit mixed precision (16, '16', '16-mixed') or bfloat16 mixed precision ('bf16', 'bf16-mixed').
+        precision: Double precision (64, '64' or '64-true'),
+            full precision (32, '32' or '32-true'), 16bit mixed precision (16, '16', '16-mixed') or
+            bfloat16 mixed precision ('bf16', 'bf16-mixed').
             Can be used on CPU, GPU, TPUs, or HPUs.
             Defaults to ``'32-true'``.
 
@@ -58,27 +62,28 @@ class MultiModelTrainer:
 
         min_epochs: Force training for at least these many epochs. Disabled by default (None).
 
-        max_steps: Stop training after this number of steps. Disabled by default (-1). If ``max_steps = -1``
-            and ``max_epochs = None``, will default to ``max_epochs = 1000``. To enable infinite training, set
-            ``max_epochs`` to ``-1``.
-
-        min_steps: Force training for at least these number of steps. Disabled by default (``None``).
-
-        check_val_every_n_epoch: Perform a validation loop after every `N` training epochs. If ``None``,
-            validation will be done solely based on the number of training batches, requiring ``val_check_interval``
-            to be an integer value. When used together with a time-based ``val_check_interval`` and
-            ``check_val_every_n_epoch`` > 1, validation is aligned to epoch multiples: if the interval elapses
-            before the next multiple-N epoch, validation runs at the start of that epoch (after the first batch)
-            and the timer resets; if it elapses during a multiple-N epoch, validation runs after the current batch.
-            For ``None`` or ``1`` cases, the time-based behavior of ``val_check_interval`` applies without
-            additional alignment.
-            Defaults to ``1``.
-
-        logger: Logger (or iterable collection of loggers) for experiment tracking. A ``True`` value uses
-            the default ``TensorBoardLogger`` if it is installed, otherwise ``CSVLogger``.
-            ``False`` will disable logging. If multiple loggers are provided, local files
-            (checkpoints, profiler traces, etc.) are saved in the ``log_dir`` of the first logger.
-            Defaults to ``True``.
+        max_steps: Stop training after this number of steps. Disabled by default (-1).
+            If ``max_steps = -1`` and ``max_epochs = None``, will default to ``max_epochs = 1000``.
+            To enable infinite training, set ``max_epochs`` to ``-1``.
+
+        min_steps: Force training for at least this number of steps.
+            Disabled by default (``None``).
+
+        check_val_every_n_epoch: Perform a validation loop after every `N` training epochs.
+            If ``None``, validation will be done solely based on the number of training batches,
+            requiring ``val_check_interval`` to be an integer value. When used together with a
+            time-based ``val_check_interval`` and ``check_val_every_n_epoch`` > 1, validation is
+            aligned to epoch multiples: if the interval elapses before the next multiple-N epoch,
+            validation runs at the start of that epoch (after the first batch) and the timer resets;
+            if it elapses during a multiple-N epoch, validation runs after the current batch.
+            For ``None`` or ``1`` cases, the time-based behavior of ``val_check_interval``
+            applies without additional alignment. Defaults to ``1``.
+
+        logger: Logger (or iterable collection of loggers) for experiment tracking. A ``True``
+            value uses the default ``TensorBoardLogger`` if it is installed,
+            otherwise ``CSVLogger``. ``False`` will disable logging. If multiple loggers are
+            provided, local files (checkpoints, profiler traces, etc.) are saved in the ``log_dir``
+            of the first logger. Defaults to ``True``.
 
         default_root_dir: Default path for logs and weights when no logger/ckpt_callback passed.
             Defaults to ``os.getcwd()``.
@@ -98,8 +103,8 @@ class MultiModelTrainer:
             Defaults to ``False``.
 
         enable_checkpointing: If ``True``, enable checkpointing.
-            It will configure a default ModelCheckpoint callback if there is no user-defined ModelCheckpoint in
-            :paramref:`~hyperbench.train.MultiModelTrainer.callbacks`.
+            It will configure a default ModelCheckpoint callback if there is no user-defined
+            ModelCheckpoint in :paramref:`~hyperbench.train.MultiModelTrainer.callbacks`.
             Defaults to ``True``.
 
         enable_progress_bar: Whether to enable the progress bar by default.
@@ -119,10 +124,11 @@ class MultiModelTrainer:
 
         auto_start_tensorboard: When ``True`` and tensorboard is installed, automatically starts
             a TensorBoard server pointing at the experiment log directory.
-            Using this option requires that TensorBoard is installed in the environment and moves control
-            of the TensorBoard server lifecycle to the trainer, which will automatically terminate the server
-            when the trainer is finalized (e.g., at the end of a `with` block or when the object is garbage collected).
-            Enable `auto_wait` to keep the server alive after training completes so you can inspect results before the trainer is finalized.
+            Using this option requires that TensorBoard is installed in the environment and moves
+            control of the TensorBoard server lifecycle to the trainer, which will automatically
+            terminate the server when the trainer is finalized (e.g., at the end of a `with` block
+            or when the object is garbage collected). Enable `auto_wait` to keep the server alive
+            after training completes so you can inspect results before the trainer is finalized.
             Defaults to ``False``.
 
         tensorboard_port: Port for the auto-launched TensorBoard server.
@@ -272,7 +278,8 @@ def fit_all(
             if not config.is_trainable:
                 if verbose:
                     print(
-                        f"Skipping training for model {config.full_model_name()} [{i + 1}/{len(self.model_configs)} models] (is_trainable=False)"
+                        f"Skipping training for model {config.full_model_name()} "
+                        f"[{i + 1}/{len(self.model_configs)} models] (is_trainable=False)"
                     )
                 continue
 
@@ -335,7 +342,8 @@ def test_all(
                 verbose=verbose_loop,
             )
 
-            # In Lightning, test() returns a list of dicts, one per dataloader, but we use a single dataloader
+            # In Lightning, test() returns a list of dicts, one per dataloader,
+            # but we use a single dataloader
             test_results[config.full_model_name()] = (
                 trainer_test_results[0] if len(trainer_test_results) > 0 else {}
             )
@@ -352,6 +360,7 @@ def finalize(self) -> None:
     def wait(self) -> None:
         """
         Wait until the user presses Enter, keeping process alive.
+
         If no process is running, this method does nothing.
         """
         # For now, we only use this for waiting on TensorBoard, but this can be extended
@@ -373,7 +382,8 @@ def __auto_start_tensorboard_if_enabled(self) -> None:
             else:
                 warnings.warn(
                     "TensorBoard is not available. "
-                    "Install it with `pip install hyperbench[tensorboard]` or `pip install tensorboard`"
+                    "Install it with `pip install hyperbench[tensorboard]` or "
+                    "`pip install tensorboard` "
                     "to enable auto-start.",
                     category=UserWarning,
                     stacklevel=2,
diff --git a/hyperbench/types/graph.py b/hyperbench/types/graph.py
index c8943bac..d8f31ccf 100644
--- a/hyperbench/types/graph.py
+++ b/hyperbench/types/graph.py
@@ -12,8 +12,10 @@ class Graph:
     A simple graph data structure using edge list representation.
 
     Args:
-        edges: A list of edges, where each edge is represented as a list of two integers (source_node, destination_node).
-        edge_weights: Optional list of edge weights corresponding to each edge in ``edges``. If provided, must have the same length as ``edges``.
+        edges: A list of edges, where each edge is represented as a list of two integers
+            (source_node, destination_node).
+        edge_weights: Optional list of edge weights corresponding to each edge in ``edges``.
+            If provided, must have the same length as ``edges``.
     """
 
     def __init__(self, edges: list[list[int]], edge_weights: list[float] | None = None):
@@ -23,19 +25,25 @@ def __init__(self, edges: list[list[int]], edge_weights: list[float] | None = No
 
     @property
     def edge_weights(self) -> list[float] | None:
-        """Return the edge weights, if present."""
+        """
+        Return the edge weights, if present.
+        """
         return self.__edge_weights
 
     @property
     def edge_weights_tensor(self) -> Tensor:
-        """Return the edge weights as a tensor, if present."""
+        """
+        Return the edge weights as a tensor, if present.
+        """
         if self.__edge_weights is not None:
             return torch.tensor(self.__edge_weights, dtype=torch.float)
         return torch.empty(0, dtype=torch.float)
 
     @property
     def num_nodes(self) -> int:
-        """Return the number of nodes in the graph."""
+        """
+        Return the number of nodes in the graph.
+        """
         nodes = set()
         for edge in self.edges:
             nodes.update(edge)
@@ -43,7 +51,9 @@ def num_nodes(self) -> int:
 
     @property
     def num_edges(self) -> int:
-        """Return the number of edges in the graph."""
+        """
+        Return the number of edges in the graph.
+        """
         return len(self.edges)
 
     def remove_selfloops(self) -> Graph:
@@ -120,7 +130,8 @@ def smoothing_with_laplacian_matrix(
         Args:
             x: Node feature matrix. Size ``(num_nodes, C)``.
             laplacian_matrix: The Laplacian matrix. Size ``(num_nodes, num_nodes)``.
-            drop_rate: Randomly dropout the connections in the Laplacian with probability ``drop_rate``. Defaults to ``0.0``.
+            drop_rate: Randomly dropout the connections in the Laplacian with probability
+                ``drop_rate``. Defaults to ``0.0``.
 
         Returns:
             x: The smoothed feature matrix. Size ``(num_nodes, C)``.
@@ -135,7 +146,8 @@ def smoothing_with_laplacian_matrix(
 class EdgeIndex:
     """
     A wrapper for edge index representation of a graph.
-    Edge index is a tensor of shape ``(2, num_edges)`` where the first row contains source node indices
+    Edge index is a tensor of shape ``(2, num_edges)`` where the first row contains source
+    node indices
     and the second row contains destination node indices for each edge.
 
     Examples:
@@ -161,24 +173,32 @@ def __init__(
 
     @property
     def item(self) -> Tensor:
-        """Return the edge index tensor."""
+        """
+        Return the edge index tensor.
+        """
         return self.__edge_index
 
     @property
     def edge_weights(self) -> Tensor | None:
-        """Return the edge weight tensor, if present."""
+        """
+        Return the edge weight tensor, if present.
+        """
         return self.__edge_weights
 
     @property
     def max_node_id(self) -> int:
-        """Return the maximum node ID in the edge index."""
+        """
+        Return the maximum node ID in the edge index.
+        """
         if self.__edge_index.size(1) < 1:
             return -1
         return int(self.__edge_index.max())
 
     @property
     def num_edges(self) -> int:
-        """Return the number of edges in the graph."""
+        """
+        Return the number of edges in the graph.
+        """
         if self.__edge_index.size(1) < 1:
             return 0
         # Number of edges is the number of columns in edge_index, which is dim=1,
@@ -187,7 +207,9 @@ def num_edges(self) -> int:
 
     @property
     def num_nodes(self) -> int:
-        """Return the number of nodes in the graph."""
+        """
+        Return the number of nodes in the graph.
+        """
         if self.__edge_index.size(1) < 1:
             return 0
         unique_nodes = torch.unique(self.__edge_index)
@@ -218,10 +240,14 @@ def add_selfloops(
             ...                              [1, 0, 3, 0, 1, 2, 3, 4, 5]]
 
         Args:
-            num_nodes: Total number of nodes. When provided, self-loops are added for nodes ``0`` to ``num_nodes - 1``. When ``None``, defaults to ``self.num_nodes``.
-                This parameter is important when ``edge_index`` does not contain all nodes (e.g., some nodes are isolated and have no edges or have been removed),
-                as it ensures that the resulting Laplacian matrix has the correct size and includes all nodes. For instance, for self-loops.
-            with_duplicate_removal: Whether to remove duplicate edges after adding self-loops. Defaults to ``True``.
+            num_nodes: Total number of nodes. When provided, self-loops are added for nodes ``0``
+                to ``num_nodes - 1``. When ``None``, defaults to ``self.num_nodes``.
+                This parameter is important when ``edge_index`` does not contain all nodes
+                (e.g., some nodes are isolated and have no edges or have been removed),
+                as it ensures that the resulting Laplacian matrix has the correct size and includes
+                all nodes. For instance, for self-loops.
+            with_duplicate_removal: Whether to remove duplicate edges after adding self-loops.
+                Defaults to ``True``.
 
         Returns:
             edge_index: This `EdgeIndex` instance with self-loops added.
@@ -302,7 +328,8 @@ def get_sparse_adjacency_matrix(
         Args:
             num_nodes: The number of nodes in the graph.
                 If ``None``, it will be inferred from ``self.num_nodes``.
-                Note that the node indices in ``edge_index`` are assumed to be in the range [0, num_nodes-1].
+                Note that the node indices in ``edge_index`` are assumed to be in the
+                range [0, num_nodes-1].
             use_edge_weights: Whether to use edge weights if they are present.
                 If ``False``, all edges will have weight 1. Defaults to ``False``.
 
@@ -373,7 +400,8 @@ def get_sparse_identity_matrix(self, num_nodes: int | None = None) -> Tensor:
         # Example: num_nodes = 3
         #          -> identity_indices = [[0, 1, 2],
         #                                 [0, 1, 2]]
-        #             we use repeat(2, 1) as I is a matrix NxN, so we need indices for both rows and columns
+        #             we use repeat(2, 1) as I is a matrix NxN, so we need indices
+        #             for both rows and columns
         #          -> values = [1, 1, 1]
         #                   0  1  2
         #          -> I = [[1, 0, 0], 0
@@ -404,11 +432,14 @@ def get_sparse_normalized_degree_matrix(
         Args:
             num_nodes: The number of nodes in the graph.
                 If ``None``, it will be inferred from ``self.num_nodes``.
-                Note that the node indices in ``edge_index`` are assumed to be in the range [0, num_nodes-1].
-            use_edge_weights: If ``True``, use the edge weights from ``self.edge_weights``. If ``False``, all edges use weight 1.
+                Note that the node indices in ``edge_index`` are assumed to be in
+                the range [0, num_nodes-1].
+            use_edge_weights: If ``True``, use the edge weights from ``self.edge_weights``.
+                If ``False``, all edges use weight 1.
 
         Returns:
-            degree_matrix: The sparse normalized degree matrix D^-1/2 of shape ``(num_nodes, num_nodes)``.
+            degree_matrix: The sparse normalized degree matrix D^-1/2 of
+                shape ``(num_nodes, num_nodes)``.
         """
         num_nodes = self.num_nodes if num_nodes is None else num_nodes
         self.__validate_num_nodes(num_nodes)
@@ -458,7 +489,7 @@ def get_sparse_normalized_laplacian(
         num_nodes: int | None = None,
     ) -> Tensor:
         """
-        Compute the sparse symmetric normalized Laplacian matrix: L = I - D^{-1/2} A D^{-1/2}.
+        Compute the sparse symmetric normalized Laplacian matrix: `L = I - D^{-1/2} A D^{-1/2}`.
 
         Unlike ``get_sparse_normalized_gcn_laplacian``, this method does not add self-loops
         and computes the standard Laplacian (not the GCN propagation matrix).
@@ -468,7 +499,8 @@ def get_sparse_normalized_laplacian(
                 it will be inferred from ``self.num_nodes``.
 
         Returns:
-            laplacian: The sparse symmetric normalized Laplacian matrix of shape ``(num_nodes, num_nodes)``.
+            laplacian: The sparse symmetric normalized Laplacian
+                matrix of shape ``(num_nodes, num_nodes)``.
         """
         num_nodes = self.num_nodes if num_nodes is None else num_nodes
         self.__validate_num_nodes(num_nodes)
@@ -496,19 +528,25 @@ def get_sparse_normalized_gcn_laplacian(
         """
         Compute the sparse Laplacian matrix from a graph edge index.
 
-        The GCN Laplacian is defined as: L_GCN = D_hat^-1/2 * A_hat * D_hat^-1/2,
-        where A_hat = A + I (adjacency with self-loops) and D_hat is the degree matrix of A_hat.
+        The GCN Laplacian is defined as: `L_GCN = D_hat^-1/2 * A_hat * D_hat^-1/2`,
+        where `A_hat = A + I` (adjacency with self-loops) and `D_hat` is the degree matrix
+        of `A_hat`.
 
         Args:
             num_nodes: The number of nodes in the graph. If ``None``,
                 it will be inferred from ``self.num_nodes``.
-                Note that the node indices in ``edge_index`` are assumed to be in the range [0, num_nodes-1].
-                This parameter is important when ``edge_index`` does not contain all nodes (e.g., some nodes are isolated and have no edges or have been removed),
-                as it ensures that the resulting Laplacian matrix has the correct size and includes all nodes. For instance, for self-loops.
-            use_edge_weights: If ``True``, use the edge weights from ``self.edge_weights``. If ``False``, all edges use weight 1.
+                Note that the node indices in ``edge_index`` are assumed to be
+                in the range [0, num_nodes-1].
+                This parameter is important when ``edge_index`` does not contain all nodes
+                (e.g., some nodes are isolated and have no edges or have been removed),
+                as it ensures that the resulting Laplacian matrix has the correct size and
+                includes all nodes. For instance, for self-loops.
+            use_edge_weights: If ``True``, use the edge weights from ``self.edge_weights``.
+                If ``False``, all edges use weight 1.
 
         Returns:
-            laplacian: The sparse symmetrically normalized Laplacian matrix of shape ``(num_nodes, num_nodes)``.
+            laplacian: The sparse symmetrically normalized Laplacian matrix of
+                shape ``(num_nodes, num_nodes)``.
         """
         num_nodes = self.num_nodes if num_nodes is None else num_nodes
         self.__validate_num_nodes(num_nodes)
@@ -530,7 +568,9 @@ def get_sparse_normalized_gcn_laplacian(
         return normalized_laplacian_matrix.coalesce()
 
     def remove_selfloops(self) -> EdgeIndex:
-        """Remove self-loops from the edge index."""
+        """
+        Remove self-loops from the edge index.
+        """
         # Example: edge_index = [[0, 1, 2, 3],
         #                        [1, 1, 3, 2]], shape (2, |E| = 4)
         #          -> keep_mask = [True, False, True, True]
@@ -545,15 +585,19 @@ def remove_selfloops(self) -> EdgeIndex:
 
     def remove_duplicate_edges(self, num_nodes: int | None = None) -> EdgeIndex:
         """
-        Remove duplicate edges from the edge index. Keeps the tensor contiguous in memory.
-
-        Args:
-            num_nodes: The number of nodes in the graph. If ``None``, it will be inferred from ``self.num_nodes``.
-                This parameter is important when ``edge_index`` does not contain all nodes (e.g., some nodes are isolated and have no edges or have been removed),
-                as it ensures that the resulting Laplacian matrix has the correct size and includes all nodes. For instance, for self-loops.
-
-        Returns:
-            edge_index: This `EdgeIndex` instance with duplicate edges removed.
+        Remove duplicate edges from the edge index.
+
+        Keeps the tensor contiguous in memory.
+
+        Args:
+            num_nodes: The number of nodes in the graph. If ``None``, it will be inferred from
+                ``self.num_nodes``. This parameter is important when ``edge_index`` does not
+                contain all nodes (e.g., some nodes are isolated and have no edges or have been
+                removed), as it ensures that the resulting Laplacian matrix has the correct size
+                and includes all nodes. For instance, for self-loops.
+
+        Returns:
+            edge_index: This `EdgeIndex` instance with duplicate edges removed.
         """
         num_nodes = self.num_nodes if num_nodes is None else num_nodes
         self.__validate_num_nodes(num_nodes)
@@ -609,9 +653,12 @@ def to_undirected(
 
         Args:
             with_selfloops: Whether to add self-loops to each node. Defaults to ``False``.
-            num_nodes: Total number of nodes. Propagated to ``add_selfloops`` when ``with_selfloops`` is ``True``.
-                This parameter is useful when ``edge_index`` does not contain all nodes (e.g., some nodes are isolated and have no edges or have been removed),
-                as it ensures that the resulting Laplacian matrix has the correct size and includes all nodes. For instance, for self-loops.
+            num_nodes: Total number of nodes. Propagated to ``add_selfloops`` when
+                ``with_selfloops`` is ``True``.
+                This parameter is useful when ``edge_index`` does not contain all nodes
+                (e.g., some nodes are isolated and have no edges or have been removed),
+                as it ensures that the resulting Laplacian matrix has the correct size and
+                includes all nodes. For instance, for self-loops.
 
         Returns:
             edge_index: This `EdgeIndex` instance converted to undirected.
@@ -636,17 +683,22 @@ def to_undirected(
         # Example: encoded_edge_ids          = [1, 4, 11],
         #          reversed_encoded_edge_ids = [4, 1, 14]
         #          -> missing_reverse_mask = [False, False, True]
-        #             because 4 and 1 are in both, it means edges (0,1) and (1,0) are already present,
-        #             but 14 is only in reversed_encoded_edge_ids, which means edge (3,2) is missing
-        #             and this is because the mask points to the missing reversee edges that are missing
+        #             because 4 and 1 are in both, it means edges (0,1) and (1,0)
+        #             are already present,
+        #             but 14 is only in reversed_encoded_edge_ids, which means
+        #             edge (3,2) is missing
+        #             and this is because the mask points to the edges whose
+        #             reverse edges are missing
         missing_mask = torch.logical_not(torch.isin(reversed_encoded_edge_ids, encoded_edge_ids))
 
-        # Keep all original sources and append the destination of each edge whose reverse is missing.
+        # Keep all original sources and append the destination of each edge
+        # whose reverse is missing.
         # Example: orig_src = [0, 1, 2], orig_dest[missing_mask] = [3]
         #          -> src = [0, 1, 2, 3]
         src = torch.cat([orig_src, orig_dest[missing_mask]])
 
-        # Keep all original destinations and append the source of each edge whose reverse is missing.
+        # Keep all original destinations and append the source of each edge
+        # whose reverse is missing.
         # Example: orig_dest = [1, 0, 3], orig_src[missing_mask] = [2]
         #          -> dest = [1, 0, 3, 2]
         #          -> final undirected edges: [(0,1), (1,0), (2,3), (3,2)]
@@ -678,8 +730,10 @@ def to_undirected(
 
         if with_selfloops:
             # Don't remove duplicate edges when adding self-loops, as we need to remove them
-            # even if with_selfloops is False, to ensure that the edge index is clean and doesn't contain duplicate edges.
-            # In this way, we don't do the duplicate edge removal twice, which would be redundant and inefficient
+            # even if with_selfloops is False, to ensure that the edge index is clean
+            # and doesn't contain duplicate edges.
+            # In this way, we don't do the duplicate edge removal twice, which would be
+            # redundant and inefficient
             self.add_selfloops(num_nodes=num_nodes, with_duplicate_removal=False)
 
         self.remove_duplicate_edges(num_nodes=num_nodes)
@@ -692,13 +746,15 @@ def __validate_edge_weights(self, edge_weights: Tensor | None) -> None:
 
         if edge_weights.dim() != 1:
             raise ValueError(
-                f"'edge_weights' must be a 1D tensor. Got {edge_weights.dim()}D tensor with shape {edge_weights.shape}."
+                f"'edge_weights' must be a 1D tensor. Got "
+                f"{edge_weights.dim()}D tensor with shape {edge_weights.shape}."
             )
 
         if edge_weights.size(0) != self.__edge_index.size(1):
             raise ValueError(
-                "'edge_weights' must have the same number of entries as edges in the 'edge_index'. "
-                f"Got {edge_weights.size(0)} edge weights but {self.__edge_index.size(1)} edges."
+                f"'edge_weights' must have the same number of entries as edges in "
+                f"the 'edge_index'. Got {edge_weights.size(0)} edge weights but "
+                f"{self.__edge_index.size(1)} edge columns."
             )
 
     def __validate_num_nodes(self, num_nodes: int) -> None:
diff --git a/hyperbench/types/hdata.py b/hyperbench/types/hdata.py
index 85df8c8c..b7441c46 100644
--- a/hyperbench/types/hdata.py
+++ b/hyperbench/types/hdata.py
@@ -44,12 +44,15 @@ class HData:
         ...                                 [0, 1, 2, 3, 4]]) # hyperedge IDs
         >>> data = HData(x=x, hyperedge_index=hyperedge_index)
 
-    Args:
+    Attributes:
         x: Node feature matrix of shape ``[num_nodes, num_features]``.
         hyperedge_index: Hyperedge connectivity in COO format of shape ``[2, num_incidences]``,
-            where ``hyperedge_index[0]`` contains node IDs and ``hyperedge_index[1]`` contains hyperedge IDs.
-        hyperedge_weights: Optional tensor of shape ``[num_hyperedges]`` containing weights for each hyperedge.
-        hyperedge_attr: Hyperedge feature matrix of shape ``[num_hyperedges, num_hyperedge_features]``.
+            where ``hyperedge_index[0]`` contains node IDs and ``hyperedge_index[1]``
+            contains hyperedge IDs.
+        hyperedge_weights: Optional tensor of shape ``[num_hyperedges]`` containing weights
+            for each hyperedge.
+        hyperedge_attr: Hyperedge feature matrix of
+            shape ``[num_hyperedges, num_hyperedge_features]``.
             Features associated with each hyperedge (e.g., weights, timestamps, types).
         num_nodes: Number of nodes in the hypergraph.
             If ``None``, inferred as ``x.size(0)``.
@@ -58,9 +61,11 @@ class HData:
         y: Labels for hyperedges, of shape ``[num_hyperedges]``.
             Used for supervised learning tasks. For unsupervised tasks, this can be ignored.
             Default is a tensor of ones, indicating all hyperedges are positive examples.
-        global_node_ids: Optional stable node IDs of shape ``[num_nodes]`` matching the row order of ``x``.
-            Use this to preserve access to the canonical node space when ``hyperedge_index`` is rebased locally.
-            If ``None``, defaults to ``torch.arange(num_nodes)``, assuming that these are the global node IDs in the same order as the rows of ``x``.
+        global_node_ids: Optional stable node IDs of shape ``[num_nodes]`` matching the
+            row order of ``x``. Use this to preserve access to the canonical node space
+            when ``hyperedge_index`` is rebased locally.
+            If ``None``, defaults to ``torch.arange(num_nodes)``, assuming that these are the
+            global node IDs in the same order as the rows of ``x``.
     """
 
     def __init__(
@@ -115,6 +120,13 @@ def __init__(
         self.device = self.get_device_if_all_consistent()
 
     def __repr__(self) -> str:
+        hyperedge_weights_shape = (
+            self.hyperedge_weights.shape if self.hyperedge_weights is not None else None
+        )
+        hyperedge_attr_shape = (
+            self.hyperedge_attr.shape if self.hyperedge_attr is not None else None
+        )
+
         return (
             f"{self.__class__.__name__}(\n"
             f"    num_nodes={self.num_nodes},\n"
@@ -122,8 +134,8 @@ def __repr__(self) -> str:
             f"    x_shape={self.x.shape},\n"
             f"    global_node_ids_shape={self.global_node_ids.shape},\n"
             f"    hyperedge_index_shape={self.hyperedge_index.shape},\n"
-            f"    hyperedge_weights_shape={self.hyperedge_weights.shape if self.hyperedge_weights is not None else None},\n"
-            f"    hyperedge_attr_shape={self.hyperedge_attr.shape if self.hyperedge_attr is not None else None},\n"
+            f"    hyperedge_weights_shape={hyperedge_weights_shape},\n"
+            f"    hyperedge_attr_shape={hyperedge_attr_shape},\n"
             f"    y_shape={self.y.shape if self.y is not None else None}\n"
             f"    device={self.device}\n"
             f")"
@@ -137,21 +149,29 @@ def cat_same_node_space(
         global_node_ids: Tensor | None = None,
     ) -> HData:
         """
-        Concatenate `HData` instances that share the same node space, meaning nodes with the same ID in different instances are the same node.
-        This is useful when combining positive and negative hyperedges that reference the same set of nodes.
+        Concatenate `HData` instances that share the same node space, meaning nodes with
+        the same ID in different instances are the same node.
+        This is useful when combining positive and negative hyperedges that reference
+        the same set of nodes.
 
         Notes:
-            - ``x`` is derived from the instance with the largest number of nodes, if not provided explicitly.
+            - ``x`` is derived from the instance with the largest number of nodes,
+                if not provided explicitly.
                 If there are conflicting features for the same node ID across instances,
                 the features from the instance with the largest number of nodes will be used.
-                If ``global_node_ids`` is provided explicitly, ``x`` must also be provided to ensure consistency.
+                If ``global_node_ids`` is provided explicitly, ``x`` must also be provided
+                to ensure consistency.
             - ``hyperedge_index`` is the concatenation of all input hyperedge indices.
             - ``hyperedge_weights`` is the concatenation of all input hyperedge weights, if present.
-                If some instances have hyperedge weights and others do not, the resulting ``hyperedge_weights`` will be set to ``None``.
+                If some instances have hyperedge weights and others do not, the resulting
+                ``hyperedge_weights`` will be set to ``None``.
             - ``hyperedge_attr`` is the concatenation of all input hyperedge attributes, if present.
-                If some instances have hyperedge attributes and others do not, the resulting ``hyperedge_attr`` will be set to ``None``.
-            - ``global_node_ids`` is derived from the instance with the largest number of nodes, if not provided explicitly.
-                If ``x`` is provided explicitly, ``global_node_ids`` must be provided explicitly as well to ensure consistency.
+                If some instances have hyperedge attributes and others do not, the resulting
+                ``hyperedge_attr`` will be set to ``None``.
+            - ``global_node_ids`` is derived from the instance with the largest number of nodes,
+                if not provided explicitly.
+                If ``x`` is provided explicitly, ``global_node_ids`` must be provided explicitly
+                as well to ensure consistency.
             - ``y`` is the concatenation of all input labels.
 
         Examples:
@@ -165,21 +185,29 @@ def cat_same_node_space(
         Args:
             hdatas: One or more `HData` instances sharing the same node space.
             x: Optional node feature matrix to use for the resulting `HData`.
-                If ``None``, the node features from the instance with the largest number of nodes will be used.
-                If ``global_node_ids`` is provided explicitly, ``x`` must also be provided to ensure consistency.
+                If ``None``, the node features from the instance with the largest number of
+                nodes will be used.
+                If ``global_node_ids`` is provided explicitly, ``x`` must also be provided
+                to ensure consistency.
             global_node_ids: Optional global node IDs for the resulting `HData`.
-                If ``None``, the global node IDs from the instance with the largest number of nodes will be used.
-                If ``x`` is provided explicitly, ``global_node_ids`` must also be provided to ensure consistency.
-                If ``x`` is provided and there is no need for ``global_node_ids`` to preserve access to the canonical node space,
-                it is recommended to use arbitrary global node IDs that are consistent with the feature rows of ``x``.
+                If ``None``, the global node IDs from the instance with the largest number of
+                nodes will be used.
+                If ``x`` is provided explicitly, ``global_node_ids`` must also be provided
+                to ensure consistency.
+                If ``x`` is provided and there is no need for ``global_node_ids`` to preserve
+                access to the canonical node space,
+                it is recommended to use arbitrary global node IDs that are consistent with
+                the feature rows of ``x``.
                 For example, ``global_node_ids=torch.arange(x.size(0))``).
 
         Returns:
             hdata: A new `HData` with shared nodes and concatenated hyperedges.
 
         Raises:
-            ValueError: If no HData instances are provided, if there are overlapping hyperedge IDs across instances,
-                or if ``x`` and ``global_node_ids`` are not both provided when one of them is provided.
+            ValueError: If no HData instances are provided, if there are overlapping
+                hyperedge IDs across instances,
+                or if ``x`` and ``global_node_ids`` are not both provided when one of
+                them is provided.
         """
         cls.__validate_can_perform_cat_same_node_space(hdatas, x, global_node_ids)
 
@@ -253,7 +281,8 @@ def empty(cls) -> HData:
     @classmethod
     def from_hyperedge_index(cls, hyperedge_index: Tensor) -> HData:
         """
-        Build an `HData` from a given hyperedge index, with empty node features and hyperedge attributes.
+        Build an `HData` from a given hyperedge index, with empty node features and
+        hyperedge attributes.
 
         - Node features are initialized as an empty tensor of shape ``[0, 0]``.
         - Hyperedge attributes are set to ``None``.
@@ -270,10 +299,12 @@ def from_hyperedge_index(cls, hyperedge_index: Tensor) -> HData:
             >>> hyperedge_weights = None
 
         Args:
-            hyperedge_index: Tensor of shape ``[2, num_incidences]`` representing the hypergraph connectivity.
+            hyperedge_index: Tensor of shape ``[2, num_incidences]`` representing
+                the hypergraph connectivity.
 
         Returns:
-            hdata: An `HData` instance with the given hyperedge index and default values for other attributes.
+            hdata: An `HData` instance with the given hyperedge index and default values
+                for other attributes.
         """
         return cls(
             x=empty_nodefeatures(),
@@ -297,7 +328,10 @@ def split(
 
         Examples:
             Transductive split (default) preserving the full node space:
-            >>> split_hdata = HData.split(hdata, torch.tensor([1]), node_space_setting="transductive")
+            >>> split_hdata = HData.split(
+            ...    hdata,
+            ...    torch.tensor([1]),
+            ...    node_space_setting="transductive")
             >>> split_hdata.x.shape[0] == hdata.x.shape[0]
             >>> split_hdata.hyperedge_index
             ... # node IDs stay in the original row space, hyperedge IDs are rebased
@@ -310,8 +344,10 @@ def split(
         Args:
             hdata: The original `HData` containing the full hypergraph.
             split_hyperedge_ids: Tensor of hyperedge IDs to include in this split.
-                It is assumed that the provided hyperedge IDs are valid and exist in ``hdata.hyperedge_index[1]``.
-                It is mandatory to provide this argument unless a custom ``splitter`` is provided that owns split materialization.
+                It is assumed that the provided hyperedge IDs are valid and exist
+                in ``hdata.hyperedge_index[1]``.
+                It is mandatory to provide this argument unless a custom ``splitter`` is provided
+                that owns split materialization.
             node_space_setting: Whether to preserve the full node space in the splits.
                 ``transductive`` (default) ensures all node features are present in the split,
                 while ``inductive`` allows splits to have disjoint node spaces.
@@ -348,7 +384,8 @@ def enrich_node_features(
         Enrich node features using the provided node feature enricher.
 
         Args:
-            enricher: An instance of NodeEnricher to generate structural node features from hypergraph topology.
+            enricher: An instance of NodeEnricher to generate structural node features
+                from hypergraph topology.
             enrichment_mode: How to combine generated features with existing ``hdata.x``.
                 ``concatenate`` appends new features as additional columns.
                 ``replace`` substitutes ``hdata.x`` entirely.
@@ -384,7 +421,8 @@ def enrich_node_features_from(
         Copy node features from another `HData` by aligning features by ``global_node_ids``.
 
         Examples:
-            Transductive enrichment (default) expecting the same node space in both source and target:
+            Transductive enrichment (default) expecting the same node space in both
+            source and target:
             >>> target = target.enrich_node_features_from(source, node_space_setting="transductive")
 
             Inductive with a scalar fill value:
@@ -405,8 +443,10 @@ def enrich_node_features_from(
             hdata_with_features: Source `HData` providing node features.
             node_space_setting: The setting for the node space, determining how nodes are handled.
                 If ``"transductive"``, every target node is expected to exist in the source.
-                If ``"inductive"``, the target dataset may have a different node space, and missing nodes are filled using ``fill_value``.
-            fill_value: Scalar or vector used to fill missing node features when ``node_space_setting`` is not transductive.
+                If ``"inductive"``, the target dataset may have a different node space, and missing
+                nodes are filled using ``fill_value``.
+            fill_value: Scalar or vector used to fill missing node features when
+                ``node_space_setting`` is not transductive.
 
         Returns:
             hdata: A new `HData` with node features copied from ``hdata_with_features``.
@@ -414,20 +454,24 @@ def enrich_node_features_from(
         Raises:
             ValueError: If either instance lacks ``global_node_ids``, if the source feature rows
                 do not align with the source node IDs, if ``fill_value`` is used with
-                ``node_space_setting="transductive"``, or if ``fill_value`` is missing or malformed when ``node_space_setting="inductive"``.
+                ``node_space_setting="transductive"``, or if ``fill_value`` is missing or
+                malformed when ``node_space_setting="inductive"``.
         """
         source_global_node_ids = hdata_with_features.global_node_ids
         source_x = hdata_with_features.x
         if source_x.size(0) != source_global_node_ids.size(0):
             raise ValueError(
-                "Expected 'hdata_with_features.x' rows to align with hdata_with_features.global_node_ids."
+                "Expected 'hdata_with_features.x' rows to align with "
+                "hdata_with_features.global_node_ids."
             )
         self.__validate_node_space_setting(node_space_setting, fill_value)
 
         target_global_node_ids = self.global_node_ids.detach().cpu().tolist()
 
-        # We need the index of the features for each node in the source, as we will use the index to track back
-        # to the node feautures after we match the global node id in the target to the one that is in the source
+        # We need the index of the features for each node in the source,
+        # as we will use the index to track back to the node features
+        # after we match the global node ID in the target to the one
+        # that is in the source.
         source_feature_idx_by_global_node_id = {
             int(global_node_id): feature_idx
             for feature_idx, global_node_id in enumerate(
@@ -448,8 +492,10 @@ def enrich_node_features_from(
             source_feature_idx = source_feature_idx_by_global_node_id.get(int(global_node_id))
             if source_feature_idx is None:
                 # Example: global_node_id = 30 is not present in the source
-                #          -> strict transductive mode records it as missing and then raises an error
-                #          -> non-transductive mode fills the features with fill_value and continues enriching the other nodes
+                #          -> strict transductive mode records it as
+                #             missing and then raises an error
+                #          -> non-transductive mode fills the features with
+                #             fill_value and continues enriching the other nodes
                 if is_transductive_setting(node_space_setting):
                     missing_global_node_ids.append(
                         int(global_node_id)
@@ -460,7 +506,8 @@ def enrich_node_features_from(
                     )  # fill missing node features with fill_value and
                 continue
 
-            # Match the global node IDs in the target to the corresponding feature indices in the source
+            # Match the global node IDs in the target to the corresponding
+            # feature indices in the source
             # Example: source_global_node_ids = [10, 20, 30], source_x has shape (3, num_features)
             #          target_global_node_ids = [10, 30]
             #          -> source_feature_idx_by_global_node_id = {10: 0, 20: 1, 30: 2}
@@ -494,8 +541,10 @@ def enrich_hyperedge_weights(
         Enrich hyperedge weights using the provided hyperedge weight enricher.
 
         Args:
-            enricher: An instance of HyperedgeEnricher to generate hyperedge weights from hypergraph topology.
-            enrichment_mode: How to combine generated weights with existing ``hdata.hyperedge_weights``.
+            enricher: An instance of HyperedgeEnricher to generate hyperedge weights from
+                hypergraph topology.
+            enrichment_mode: How to combine generated weights with
+                existing ``hdata.hyperedge_weights``.
                 ``concatenate`` appends new weights to the existing 1D tensor.
                 ``replace`` substitutes ``hdata.hyperedge_weights`` entirely.
                 Defaults to ``replace`` if not provided.
@@ -536,8 +585,10 @@ def enrich_hyperedge_attr(
         Enrich hyperedge features using the provided hyperedge feature enricher.
 
         Args:
-            enricher: An instance of HyperedgeEnricher to generate structural hyperedge features from hypergraph topology.
-            enrichment_mode: How to combine generated features with existing ``hdata.hyperedge_attr``.
+            enricher: An instance of HyperedgeEnricher to generate structural hyperedge
+                features from hypergraph topology.
+            enrichment_mode: How to combine generated features with
+                existing ``hdata.hyperedge_attr``.
                 ``concatenate`` appends new features as additional columns.
                 ``replace`` substitutes ``hdata.hyperedge_attr`` entirely.
                 Defaults to ``replace`` if not provided.
@@ -569,6 +620,7 @@ def enrich_hyperedge_attr(
     def get_device_if_all_consistent(self) -> torch.device:
         """
         Check that all tensors are on the same device and return that device.
+
         If there are no tensors or if they are on different devices, return CPU.
 
         Returns:
@@ -604,10 +656,12 @@ def remove_hyperedges_with_fewer_than_k_nodes(
 
         Args:
             k: The minimum number of nodes a hyperedge must have to be retained.
-            preserve_global_node_ids: Whether to preserve the global node IDs after removing hyperedges. Defaults to ``False``.
-                If ``False``, the global node IDs will be reindexed to be contiguous after removing hyperedges.
-                If ``True``, the global node IDs will be preserved, which may cause some models to raise
-                as they may expect contiguous global node IDs.
+            preserve_global_node_ids: Whether to preserve the global node IDs after
+                removing hyperedges. Defaults to ``False``.
+                If ``False``, the global node IDs will be reindexed to be contiguous after
+                removing hyperedges.
+                If ``True``, the global node IDs will be preserved, which may cause some models
+                to raise as they may expect contiguous global node IDs.
         """
         validate_is_positive("k", k)
 
@@ -649,8 +703,10 @@ def shuffle(self, seed: int | None = None) -> HData:
         """
         Return a new `HData` instance with hyperedge IDs randomly reassigned.
 
-        Each hyperedge keeps its original set of nodes, but is assigned a new ID via a random permutation.
-        ``y`` and ``hyperedge_attr`` are reordered to match, so that ``y[new_id]`` still corresponds to the correct hyperedge.
+        Each hyperedge keeps its original set of nodes, but is assigned a new ID
+        via a random permutation.
+        ``y`` and ``hyperedge_attr`` are reordered to match, so that ``y[new_id]``
+        still corresponds to the correct hyperedge.
         Same for ``hyperedge_attr[new_id]`` if hyperedge attributes are present.
 
         Examples:
@@ -665,10 +721,12 @@ def shuffle(self, seed: int | None = None) -> HData:
             >>> shuffled_hdata.y  # labels are permuted to match new hyperedge IDs, e.g., [0, 1]
 
         Args:
-            seed: Optional random seed for reproducibility. If ``None``, the shuffle will be non-deterministic.
+            seed: Optional random seed for reproducibility. If ``None``, the shuffle
+                will be non-deterministic.
 
         Returns:
-            hdata: A new `HData` instance with hyperedge IDs, ``y``, and ``hyperedge_attr`` permuted.
+            hdata: A new `HData` instance with hyperedge IDs, ``y``, and
+                ``hyperedge_attr`` permuted.
         """
         generator = create_seeded_torch_generator(device=self.device, seed=seed)
         permutation = torch.randperm(
@@ -680,8 +738,10 @@ def shuffle(self, seed: int | None = None) -> HData:
 
         # permutation[new_id] = old_id, so y[permutation] puts old labels into new slots
         # inverse_permutation[old_id] = new_id, used to remap hyperedge IDs in incidences
-        # Example: permutation = [1, 2, 0] means new_id 0 gets old_id 1, new_id 1 gets old_id 2, new_id 2 gets old_id 0
-        #          -> inverse_permutation = [2, 0, 1] means old_id 0 gets new_id 2, old_id 1 gets new_id 0, old_id 2 gets new_id 1
+        # Example: permutation = [1, 2, 0] means new_id 0 gets old_id 1,
+        #          new_id 1 gets old_id 2, new_id 2 gets old_id 0
+        #          -> inverse_permutation = [2, 0, 1] means old_id 0 gets new_id 2,
+        #             old_id 1 gets new_id 0, old_id 2 gets new_id 1
         inverse_permutation = torch.empty_like(
             permutation,
             dtype=permutation.dtype,
@@ -697,14 +757,17 @@ def shuffle(self, seed: int | None = None) -> HData:
 
         # Example: hyperedge_index = [[0, 1, 2, 3, 4],
         #                             [0, 0, 1, 1, 2]],
-        #          inverse_permutation = [2, 0, 1] (new_id 0 -> old_id 2, new_id 1 -> old_id 0, new_id 2 -> old_id 1)
+        #          inverse_permutation = [2, 0, 1]
+        #          (old_id 0 -> new_id 2, old_id 1 -> new_id 0, old_id 2 -> new_id 1)
         #          -> new_hyperedge_index = [[0, 1, 2, 3, 4],
         #                                    [2, 2, 0, 0, 1]]
         old_hyperedge_ids = self.hyperedge_index[1]
         new_hyperedge_index[1] = inverse_permutation[old_hyperedge_ids]
 
         # Example: hyperedge_attr = [attr_0, attr_1, attr_2], permutation = [1, 2, 0]
-        #          -> new_hyperedge_attr = [attr_1  (attr of old_id 1), attr_2 (attr of old_id 2), attr_0 (attr of old_id 0)]
+        #          -> new_hyperedge_attr = [attr_1 (attr of old_id 1),
+        #                                   attr_2 (attr of old_id 2),
+        #                                   attr_0 (attr of old_id 0)]
         new_hyperedge_attr = (
             self.hyperedge_attr[permutation] if self.hyperedge_attr is not None else None
         )
@@ -752,7 +815,8 @@ def to(self, device: torch.device | str, non_blocking: bool = False) -> HData:
 
         Args:
             device: The target device (e.g., 'cpu', 'cuda:0').
-            non_blocking: If ``True`` and the source and destination devices are both CUDA, the copy will be non-blocking.
+            non_blocking: If ``True`` and the source and destination devices are both CUDA,
+                the copy will be non-blocking.
 
         Returns:
             hdata: The `HData` instance with all tensors moved to the specified device.
@@ -782,7 +846,8 @@ def with_y_to(self, value: float) -> HData:
             value: The value to set for all entries in the y attribute.
 
         Returns:
-            hdata: A new `HData` instance with the same attributes except for y, which is set to a tensor of the given value.
+            hdata: A new `HData` instance with the same attributes except for y,
+                which is set to a tensor of the given value.
         """
         return self.__class__(
             x=self.x.clone(),
@@ -800,7 +865,8 @@ def with_y_ones(self) -> HData:
         Return a copy of this instance with a y attribute of all ones.
 
         Returns:
-            hdata: A new `HData` instance with the same attributes except for y, which is set to a tensor of ones.
+            hdata: A new `HData` instance with the same attributes except for y, which is
+                set to a tensor of ones.
         """
         return self.with_y_to(1.0)
 
@@ -809,36 +875,45 @@ def with_y_zeros(self) -> HData:
         Return a copy of this instance with a y attribute of all zeros.
 
         Returns:
-            hdata: A new `HData` instance with the same attributes except for y, which is set to a tensor of zeros.
+            hdata: A new `HData` instance with the same attributes except for y, which
+                is set to a tensor of zeros.
         """
         return self.with_y_to(0.0)
 
     def stats(self) -> dict[str, Any]:
         """
         Compute statistics for the hypergraph data.
-        The fields returned in the dictionary include:
-        - ``shape_x``: The shape of the node feature matrix ``x``.
-        - ``shape_hyperedge_weights``: The shape of the hyperedge weights tensor, or ``None`` if hyperedge weights are not present.
-        - ``shape_hyperedge_attr``: The shape of the hyperedge attribute matrix, or ``None`` if hyperedge attributes are not present.
-        - ``num_nodes``: The number of nodes in the hypergraph.
-        - ``num_hyperedges``: The number of hyperedges in the hypergraph.
-        - ``avg_degree_node_raw``: The average degree of nodes, calculated as the mean number of hyperedges each node belongs to.
-        - ``avg_degree_node``: The floored node average degree.
-        - ``avg_degree_hyperedge_raw``: The average size of hyperedges, calculated as the mean number of nodes each hyperedge contains.
-        - ``avg_degree_hyperedge``: The floored hyperedge average size.
-        - ``node_degree_max``: The maximum degree of any node in the hypergraph.
-        - ``hyperedge_degree_max``: The maximum size of any hyperedge in the hypergraph.
-        - ``node_degree_median``: The median degree of nodes in the hypergraph.
-        - ``hyperedge_degree_median``: The median size of hyperedges in the hypergraph.
-        - ``distribution_node_degree``: A list where the value at index ``i`` represents the count of nodes with degree ``i``.
-        - ``distribution_hyperedge_size``: A list where the value at index ``i`` represents the count of hyperedges with size ``i``.
-        - ``distribution_node_degree_hist``: A dictionary where the keys are node degrees and the values are the count of nodes with that degree.
-        - ``distribution_hyperedge_size_hist``: A dictionary where the keys are hyperedge sizes and the values are the count of hyperedges with that size.
+
+        Fields:
+            - ``shape_x``: The shape of the node feature matrix ``x``.
+            - ``shape_hyperedge_weights``: The shape of the hyperedge weights tensor, or
+                ``None`` if hyperedge weights are not present.
+            - ``shape_hyperedge_attr``: The shape of the hyperedge attribute matrix, or ``None``
+                if hyperedge attributes are not present.
+            - ``num_nodes``: The number of nodes in the hypergraph.
+            - ``num_hyperedges``: The number of hyperedges in the hypergraph.
+            - ``avg_degree_node_raw``: The average degree of nodes, calculated as the mean
+                number of hyperedges each node belongs to.
+            - ``avg_degree_node``: The floored node average degree.
+            - ``avg_degree_hyperedge_raw``: The average size of hyperedges, calculated as
+                the mean number of nodes each hyperedge contains.
+            - ``avg_degree_hyperedge``: The floored hyperedge average size.
+            - ``node_degree_max``: The maximum degree of any node in the hypergraph.
+            - ``hyperedge_degree_max``: The maximum size of any hyperedge in the hypergraph.
+            - ``node_degree_median``: The median degree of nodes in the hypergraph.
+            - ``hyperedge_degree_median``: The median size of hyperedges in the hypergraph.
+            - ``distribution_node_degree``: A list where the value at index ``i`` represents
+                the count of nodes with degree ``i``.
+            - ``distribution_hyperedge_size``: A list where the value at index ``i`` represents
+                the count of hyperedges with size ``i``.
+            - ``distribution_node_degree_hist``: A dictionary where the keys are node degrees and
+                the values are the count of nodes with that degree.
+            - ``distribution_hyperedge_size_hist``: A dictionary where the keys are hyperedge
+                sizes and the values are the count of hyperedges with that size.
 
         Returns:
             stats: A dictionary containing various statistics about the hypergraph.
         """
-
         node_ids = self.hyperedge_index[0]
         hyperedge_ids = self.hyperedge_index[1]
 
@@ -937,7 +1012,8 @@ def __validate_can_perform_cat_same_node_space(
         unique_joint_hyperedge_ids = joint_hyperedge_ids.unique()
         if unique_joint_hyperedge_ids.size(0) != joint_hyperedge_ids.size(0):
             raise ValueError(
-                "Overlapping hyperedge IDs found across instances. Ensure each instance uses distinct hyperedge IDs."
+                "Overlapping hyperedge IDs found across instances. Ensure each "
+                "instance uses distinct hyperedge IDs."
             )
 
     def __to_fill_features(
@@ -961,7 +1037,8 @@ def __to_fill_features(
 
         # This can happen when fill_value is:
         # - A scalar tensor, e.g., tensor(0.0), which should be broadcasted to all features
-        # - A list with a single value, e.g., [0.0], which should also be broadcasted to all features
+        # - A list with a single value, e.g., [0.0], which should
+        #   also be broadcasted to all features
         if fill_features.numel() == 1:
             fill_features = fill_features.repeat(num_features)
 
@@ -985,7 +1062,8 @@ def __validate_enrichment_mode(self, enrichment_mode: EnrichmentMode | None) ->
             return
 
         raise ValueError(
-            f"'enrichment_mode' must be one of 'replace', 'concatenate', or None, got {enrichment_mode!r}."
+            f"'enrichment_mode' must be one of 'replace', 'concatenate', "
+            f"or None, got {enrichment_mode!r}."
         )
 
     def __validate_hyperedge_attr(self) -> None:
@@ -995,12 +1073,14 @@ def __validate_hyperedge_attr(self) -> None:
         validate_floating_tensor_dtype("hyperedge_attr", self.hyperedge_attr)
         if self.hyperedge_attr.dim() != 2:
             raise ValueError(
-                f"'hyperedge_attr' must be a 2D tensor, got shape {tuple(self.hyperedge_attr.shape)}."
+                f"'hyperedge_attr' must be a 2D tensor, got shape "
+                f"{tuple(self.hyperedge_attr.shape)}."
             )
         if self.hyperedge_attr.size(0) != self.num_hyperedges:
             raise ValueError(
-                f"'hyperedge_attr' must have one row per hyperedge. "
-                f"Got size={self.hyperedge_attr.size(0)} but num_hyperedges={self.num_hyperedges}."
+                "'hyperedge_attr' must have one row per hyperedge. "
+                f"Got size={self.hyperedge_attr.size(0)} but "
+                f"num_hyperedges={self.num_hyperedges}."
             )
 
     def __validate_hyperedge_index(self) -> None:
@@ -1031,19 +1111,22 @@ def __validate_hyperedge_weights(self) -> None:
 
         if self.hyperedge_weights.dim() != 1:
             raise ValueError(
-                f"'hyperedge_weights' must be a 1D tensor, got shape {tuple(self.hyperedge_weights.shape)}."
+                f"'hyperedge_weights' must be a 1D tensor, "
+                f"got shape {tuple(self.hyperedge_weights.shape)}."
             )
         if self.hyperedge_weights.size(0) != self.num_hyperedges:
             raise ValueError(
                 f"'hyperedge_weights' must have one entry per hyperedge. "
-                f"Got size={self.hyperedge_weights.size(0)} but num_hyperedges={self.num_hyperedges}."
+                f"Got size={self.hyperedge_weights.size(0)} but "
+                f"num_hyperedges={self.num_hyperedges}."
             )
 
     def __validate_global_node_ids(self) -> None:
         validate_long_tensor_dtype("global_node_ids", self.global_node_ids)
         if self.global_node_ids.dim() != 1:
             raise ValueError(
-                f"'global_node_ids' must be a 1D tensor, got shape {tuple(self.global_node_ids.shape)}."
+                f"'global_node_ids' must be a 1D tensor, got "
+                f"shape {tuple(self.global_node_ids.shape)}."
             )
         if self.global_node_ids.size(0) != self.num_nodes:
             raise ValueError(
@@ -1064,7 +1147,8 @@ def __validate_labels(self) -> None:
     def __validate_x(self) -> None:
         if self.x.size(0) not in (0, self.num_nodes):
             raise ValueError(
-                f"'x' must have one feature row per node, or be 'torch.empty((0, 0))' if there are no nodes. "
+                f"'x' must have one feature row per node, or be 'torch.empty((0, 0))' "
+                f"if there are no nodes. "
                 f"Got x.shape={tuple(self.x.shape)} but num_nodes={self.num_nodes}."
             )
 
diff --git a/hyperbench/types/hypergraph.py b/hyperbench/types/hypergraph.py
index b953e712..6fba83f1 100644
--- a/hyperbench/types/hypergraph.py
+++ b/hyperbench/types/hypergraph.py
@@ -21,15 +21,19 @@
 
 class HIFHypergraph:
     """
-    A hypergraph data structure that supports directed/undirected hyperedges
-    with incidence-based representation.
+    A hypergraph data structure that supports directed/undirected hyperedges with incidence-based
+    representation.
 
     Args:
-        network_type: The type of hypergraph, which can be "asc" (or "directed") for directed hyperedges, or "undirected" for undirected hyperedges.
+        network_type: The type of hypergraph, which can be "asc" (or "directed") for
+            directed hyperedges, or "undirected" for undirected hyperedges.
         metadata: Optional dictionary of metadata about the hypergraph.
-        incidences: A list of incidences, where each incidence is a dictionary with keys "node" and "edge" representing the relationship between a node and a hyperedge.
-        nodes: A list of node dictionaries, where each dictionary contains information about a node (e.g., id, features).
-        hyperedges: A list of edge dictionaries, where each dictionary contains information about a hyperedge (e.g., id, features).
+        incidences: A list of incidences, where each incidence is a dictionary with keys "node"
+            and "edge" representing the relationship between a node and a hyperedge.
+        nodes: A list of node dictionaries, where each dictionary contains information about
+            a node (e.g., id, features).
+        hyperedges: A list of edge dictionaries, where each dictionary contains information
+            about a hyperedge (e.g., id, features).
     """
 
     def __init__(
@@ -83,37 +87,47 @@ def from_hif(cls, data: dict[str, Any]) -> HIFHypergraph:
 
     @property
     def num_nodes(self) -> int:
-        """Return the number of nodes in the hypergraph."""
+        """
+        Return the number of nodes in the hypergraph.
+        """
         return len(self.nodes)
 
     @property
     def num_hyperedges(self) -> int:
-        """Return the number of hyperedges in the hypergraph."""
+        """
+        Return the number of hyperedges in the hypergraph.
+        """
         return len(self.hyperedges)
 
     def stats(self) -> dict[str, Any]:
         """
         Compute statistics for the HIFhypergraph.
-        The fields returned in the dictionary include:
-        - ``num_nodes``: The number of nodes in the hypergraph.
-        - ``num_hyperedges``: The number of hyperedges in the hypergraph.
-        - ``avg_degree_node_raw``: The average degree of nodes, calculated as the mean number of hyperedges each node belongs to.
-        - ``avg_degree_node``: The floored node average degree.
-        - ``avg_degree_hyperedge_raw``: The average size of hyperedges, calculated as the mean number of nodes each hyperedge contains.
-        - ``avg_degree_hyperedge``: The floored hyperedge average size.
-        - ``node_degree_max``: The maximum degree of any node in the hypergraph.
-        - ``hyperedge_degree_max``: The maximum size of any hyperedge in the hypergraph.
-        - ``node_degree_median``: The median degree of nodes in the hypergraph.
-        - ``hyperedge_degree_median``: The median size of hyperedges in the hypergraph.
-        - ``distribution_node_degree``: A list where the value at index ``i`` represents the count of nodes with degree ``i``.
-        - ``distribution_hyperedge_size``: A list where the value at index ``i`` represents the count of hyperedges with size ``i``.
-        - ``distribution_node_degree_hist``: A dictionary where the keys are node degrees and the values are the count of nodes with that degree.
-        - ``distribution_hyperedge_size_hist``: A dictionary where the keys are hyperedge sizes and the values are the count of hyperedges with that size.
+
+        Fields:
+            - ``num_nodes``: The number of nodes in the hypergraph.
+            - ``num_hyperedges``: The number of hyperedges in the hypergraph.
+            - ``avg_degree_node_raw``: The average degree of nodes, calculated as the mean
+                number of hyperedges each node belongs to.
+            - ``avg_degree_node``: The floored node average degree.
+            - ``avg_degree_hyperedge_raw``: The average size of hyperedges, calculated as the
+                mean number of nodes each hyperedge contains.
+            - ``avg_degree_hyperedge``: The floored hyperedge average size.
+            - ``node_degree_max``: The maximum degree of any node in the hypergraph.
+            - ``hyperedge_degree_max``: The maximum size of any hyperedge in the hypergraph.
+            - ``node_degree_median``: The median degree of nodes in the hypergraph.
+            - ``hyperedge_degree_median``: The median size of hyperedges in the hypergraph.
+            - ``distribution_node_degree``: A list where the value at index ``i`` represents
+                the count of nodes with degree ``i``.
+            - ``distribution_hyperedge_size``: A list where the value at index ``i`` represents
+                the count of hyperedges with size ``i``.
+            - ``distribution_node_degree_hist``: A dictionary where the keys are node degrees
+                and the values are the count of nodes with that degree.
+            - ``distribution_hyperedge_size_hist``: A dictionary where the keys are hyperedge
+                sizes and the values are the count of hyperedges with that size.
 
         Returns:
             stats: A dictionary containing various statistics about the hypergraph.
         """
-
         node_degree: dict[Any, int] = {}
         hyperedge_size: dict[Any, int] = {}
 
@@ -206,7 +220,9 @@ def __init__(self, hyperedges: list[list[int]]):
 
     @property
     def num_nodes(self) -> int:
-        """Return the number of nodes in the hypergraph."""
+        """
+        Return the number of nodes in the hypergraph.
+        """
         nodes = set()
         for edge in self.hyperedges:
             nodes.update(edge)
@@ -214,7 +230,9 @@ def num_nodes(self) -> int:
 
     @property
     def num_hyperedges(self) -> int:
-        """Return the number of hyperedges in the hypergraph."""
+        """
+        Return the number of hyperedges in the hypergraph.
+        """
         return len(self.hyperedges)
 
     def neighbors_of(self, node: int) -> Neighborhood:
@@ -261,7 +279,9 @@ def neighbors_of_all(self) -> dict[int, Neighborhood]:
         return node_to_neighbors
 
     def stats(self) -> dict[str, Any]:
-        """Return basic statistics about the hypergraph."""
+        """
+        Return basic statistics about the hypergraph.
+        """
         node_degree: dict[int, int] = {}
         distribution_hyperedge_size: list[int] = []
         total_incidences = 0
@@ -339,7 +359,8 @@ def from_hyperedge_index(cls, hyperedge_index: Tensor) -> Hypergraph:
         Create a Hypergraph from a hyperedge index representation.
 
         Args:
-            hyperedge_index: Tensor of shape (2, |E|) representing hyperedges, where each column is (node, hyperedge).
+            hyperedge_index: Tensor of shape (2, |E|) representing hyperedges, where each
+                column is (node, hyperedge).
 
         Returns:
             hypergraph: Hypergraph instance
@@ -363,12 +384,14 @@ def smoothing_with_matrix(
     ) -> Tensor:
         """
         Return the feature matrix smoothed with a smoothing matrix.
+
         Computes ``M @ X`` where ``M`` is the smoothing matrix and ``X`` is the node feature matrix.
 
         Args:
             x: Node feature matrix. Size ``(num_nodes, C)``.
             matrix: The smoothing matrix. Size ``(num_nodes, num_nodes)``.
-            drop_rate: Randomly dropout the connections in the smoothing matrix with probability ``drop_rate``. Defaults to ``0.0``.
+            drop_rate: Randomly drop out the connections in the smoothing matrix with
+                probability ``drop_rate``. Defaults to ``0.0``.
 
         Returns:
             x: The smoothed feature matrix. Size ``(num_nodes, C)``.
@@ -381,9 +404,10 @@ def smoothing_with_matrix(
 class HyperedgeIndex:
     """
     A wrapper for hyperedge index representation.
-    Hyperedge index is a tensor of shape ``(2, num_incidences)`` that encodes the relationships between nodes and hyperedges.
-    Each column in the tensor represents an incidence between a node and a hyperedge, with the first row containing node indices
-    and the second row containing corresponding hyperedge indices.
+    Hyperedge index is a tensor of shape ``(2, num_incidences)`` that encodes the relationships
+    between nodes and hyperedges.
+    Each column in the tensor represents an incidence between a node and a hyperedge, with the
+    first row containing node indices and the second row containing corresponding hyperedge indices.
 
     Examples:
         >>> hyperedge_index = [[0, 1, 2, 0],
@@ -397,7 +421,8 @@ class HyperedgeIndex:
         The number of hyperedges is 2 (hyperedges 0 and 1).
 
     Args:
-        hyperedge_index: A tensor of shape ``(2, num_incidences)`` representing hyperedges, where each column is (node, hyperedge).
+        hyperedge_index: A tensor of shape ``(2, num_incidences)`` representing hyperedges,
+            where each column is (node, hyperedge).
     """
 
     def __init__(self, hyperedge_index: Tensor):
@@ -405,32 +430,44 @@ def __init__(self, hyperedge_index: Tensor):
 
     @property
     def all_node_ids(self) -> Tensor:
-        """Return the tensor of all node IDs in the hyperedge index."""
+        """
+        Return the tensor of all node IDs in the hyperedge index.
+        """
         return self.__hyperedge_index[0]
 
     @property
     def all_hyperedge_ids(self) -> Tensor:
-        """Return the tensor of all hyperedge IDs in the hyperedge index."""
+        """
+        Return the tensor of all hyperedge IDs in the hyperedge index.
+        """
         return self.__hyperedge_index[1]
 
     @property
     def item(self) -> Tensor:
-        """Return the hyperedge index tensor."""
+        """
+        Return the hyperedge index tensor.
+        """
         return self.__hyperedge_index
 
     @property
     def node_ids(self) -> Tensor:
-        """Return the sorted unique node IDs from the hyperedge index."""
+        """
+        Return the sorted unique node IDs from the hyperedge index.
+        """
         return self.__hyperedge_index[0].unique(sorted=True)
 
     @property
     def hyperedge_ids(self) -> Tensor:
-        """Return the sorted unique hyperedge IDs from the hyperedge index."""
+        """
+        Return the sorted unique hyperedge IDs from the hyperedge index.
+        """
         return self.__hyperedge_index[1].unique(sorted=True)
 
     @property
     def num_hyperedges(self) -> int:
-        """Return the number of hyperedges in the hypergraph."""
+        """
+        Return the number of hyperedges in the hypergraph.
+        """
         if self.num_incidences < 1:
             return 0
 
@@ -439,7 +476,9 @@ def num_hyperedges(self) -> int:
 
     @property
     def num_nodes(self) -> int:
-        """Return the number of nodes in the hypergraph."""
+        """
+        Return the number of nodes in the hypergraph.
+        """
         if self.num_incidences < 1:
             return 0
 
@@ -448,7 +487,10 @@ def num_nodes(self) -> int:
 
     @property
     def num_incidences(self) -> int:
-        """Return the number of incidences in the hypergraph, which is the number of columns in the hyperedge index."""
+        """
+        Return the number of incidences in the hypergraph, which is the number of columns in the
+        hyperedge index.
+        """
         return self.__hyperedge_index.size(1)
 
     def nodes_in(self, hyperedge_id: int) -> list[int]:
@@ -466,13 +508,15 @@ def nodes_in(self, hyperedge_id: int) -> list[int]:
 
     def num_nodes_if_isolated_exist(self, num_nodes: int) -> int:
         """
-        Return the number of nodes in the hypergraph, accounting for isolated nodes that may not appear in the hyperedge index.
+        Return the number of nodes in the hypergraph, accounting for isolated nodes that may not
+        appear in the hyperedge index.
 
         Args:
             num_nodes: The total number of nodes in the hypergraph, including isolated nodes.
 
         Returns:
-            num_nodes: The number of nodes in the hypergraph, which is the maximum of the number of unique nodes in the hyperedge index and the provided ``num_nodes``.
+            num_nodes: The number of nodes in the hypergraph, which is the maximum of the number of
+                unique nodes in the hyperedge index and the provided ``num_nodes``.
         """
         return max(self.num_nodes, num_nodes)
 
@@ -488,7 +532,8 @@ def get_clique_expansion_adjacency_list(self, num_nodes: int | None = None) -> l
                 If ``None``, inferred from the unique node IDs in ``hyperedge_index``.
 
         Returns:
-            adjacency: A list where ``adjacency[node_id]`` is the set of nodes adjacent to ``node_id``.
+            adjacency: A list where ``adjacency[node_id]`` is the set of
+                nodes adjacent to ``node_id``.
         """
         num_nodes = num_nodes if num_nodes is not None else self.num_nodes
         self.__validate_num_nodes(num_nodes)
@@ -500,7 +545,8 @@ def get_clique_expansion_adjacency_list(self, num_nodes: int | None = None) -> l
                 self.all_node_ids[self.all_hyperedge_ids == hyperedge_id].unique().tolist()
             )
 
-            # Clique expansion: every pair of nodes in the same hyperedge becomes an undirected graph edge
+            # Clique expansion: every pair of nodes in the same hyperedge
+            # becomes an undirected graph edge
             # Example: hyperedge [0, 1, 2] adds (0, 1), (0, 2), and (1, 2):
             #          -> adjacency[0] = {1, 2}
             #          -> adjacency[1] = {0, 2}
@@ -527,7 +573,8 @@ def get_sparse_incidence_matrix(
             num_hyperedges: Total number of hyperedges. If ``None``, inferred from hyperedge index.
 
         Returns:
-            incidence_matrix: The sparse incidence matrix H of shape ``(num_nodes, num_hyperedges)``.
+            incidence_matrix: The sparse incidence matrix H of
+                shape ``(num_nodes, num_hyperedges)``.
 
         Raises:
             ValueError: If the provided dimensions cannot contain the raw node or hyperedge IDs.
@@ -559,7 +606,8 @@ def get_sparse_normalized_node_degree_matrix(
         Compute a sparse diagonal node degree matrix from row-sums of the incidence matrix.
 
         Args:
-            incidence_matrix: The sparse incidence matrix H of shape ``(num_nodes, num_hyperedges)``.
+            incidence_matrix: The sparse incidence matrix H of
+                shape ``(num_nodes, num_hyperedges)``.
             power: Exponent applied to node degrees before placing them on the diagonal.
             num_nodes: Total number of nodes. If ``None``, inferred from hyperedge index.
 
@@ -600,16 +648,18 @@ def get_sparse_rownormalized_node_degree_matrix(
         num_nodes: int | None = None,
     ) -> Tensor:
         """
-        Compute the sparse normalized node degree matrix D_n^-1.
+        Compute the sparse normalized node degree matrix `D_n^-1`.
+
         The node degree ``d_n[i]`` is the number of hyperedges containing node ``i``
         (i.e., the row-sum of the incidence matrix H).
 
         Args:
-            incidence_matrix: The sparse incidence matrix H of shape ``(num_nodes, num_hyperedges)``.
+            incidence_matrix: The sparse incidence matrix H of
+                shape ``(num_nodes, num_hyperedges)``.
             num_nodes: Total number of nodes. If ``None``, inferred from hyperedge index.
 
         Returns:
-            degree_matrix: The sparse diagonal matrix D_n^-1 of shape ``(num_nodes, num_nodes)``.
+            degree_matrix: The sparse diagonal matrix `D_n^-1` of shape ``(num_nodes, num_nodes)``.
         """
         # Example: hyperedge_index = [[0, 1, 2, 0],
         #                             [0, 0, 0, 1]]
@@ -632,16 +682,19 @@ def get_sparse_symnormalized_node_degree_matrix(
         num_nodes: int | None = None,
     ) -> Tensor:
         """
-        Compute the sparse normalized node degree matrix D_n^-1/2.
+        Compute the sparse normalized node degree matrix `D_n^-1/2`.
+
         The node degree ``d_n[i]`` is the number of hyperedges containing node ``i``
         (i.e., the row-sum of the incidence matrix H).
 
         Args:
-            incidence_matrix: The sparse incidence matrix H of shape ``(num_nodes, num_hyperedges)``.
+            incidence_matrix: The sparse incidence matrix H of
+                shape ``(num_nodes, num_hyperedges)``.
             num_nodes: Total number of nodes. If ``None``, inferred from hyperedge index.
 
         Returns:
-            degree_matrix: The sparse diagonal matrix D_n^-1/2 of shape ``(num_nodes, num_nodes)``.
+            degree_matrix: The sparse diagonal matrix `D_n^-1/2`
+                of shape ``(num_nodes, num_nodes)``.
         """
         # Example: hyperedge_index = [[0, 1, 2, 0],
         #                             [0, 0, 0, 1]]
@@ -664,17 +717,19 @@ def get_sparse_normalized_hyperedge_degree_matrix(
         num_hyperedges: int | None = None,
     ) -> Tensor:
         """
-        Compute the sparse normalized hyperedge degree matrix D_e^-1.
+        Compute the sparse normalized hyperedge degree matrix `D_e^-1`.
 
         The hyperedge degree ``d_e[j]`` is the number of nodes in hyperedge ``j``
         (i.e., the column-sum of the incidence matrix H).
 
         Args:
-            incidence_matrix: The sparse incidence matrix H of shape ``(num_nodes, num_hyperedges)``.
+            incidence_matrix: The sparse incidence matrix H of
+                shape ``(num_nodes, num_hyperedges)``.
             num_hyperedges: Total number of hyperedges. If ``None``, inferred from hyperedge index.
 
         Returns:
-            degree_matrix: The sparse diagonal matrix D_e^-1 of shape ``(num_hyperedges, num_hyperedges)``.
+            degree_matrix: The sparse diagonal matrix `D_e^-1` of
+                shape ``(num_hyperedges, num_hyperedges)``.
         """
         num_hyperedges = (
             num_hyperedges if num_hyperedges is not None else int(incidence_matrix.size(1))
@@ -735,8 +790,8 @@ def get_sparse_hgnn_smoothing_matrix(
 
         where:
             - H is the incidence matrix of shape ``(num_nodes, num_hyperedges)``
-            - D_n^-1/2 is the normalized node degree matrix
-            - D_e^-1 is the inverse hyperedge degree matrix (with W = I)
+            - `D_n^-1/2` is the normalized node degree matrix
+            - `D_e^-1` is the inverse hyperedge degree matrix (with W = I)
 
         Args:
             num_nodes: Total number of nodes. If ``None``, inferred from hyperedge index.
@@ -817,7 +872,7 @@ def get_sparse_hgnnp_smoothing_matrix(
         )
         return smoothing_matrix.coalesce()
 
-    def reduce(self, strategy: Literal["clique_expansion"], **kwargs) -> Tensor:
+    def reduce(self, strategy: Literal["clique_expansion"], **kwargs: Any) -> Tensor:
         """
         Reduce the hypergraph to a graph represented by edge index using the specified strategy.
 
@@ -843,12 +898,14 @@ def reduce_to_edge_index_on_clique_expansion(
         num_hyperedges: int | None = None,
     ) -> Tensor:
         """
-        Construct a graph from a hypergraph via clique expansion using ``H @ H^T``, where ``H`` is the incidence matrix of the hypergraph.
+        Construct a graph from a hypergraph via clique expansion using ``H @ H^T``,
+        where ``H`` is the incidence matrix of the hypergraph.
         In clique expansion, each hyperedge is replaced by a clique connecting all its member nodes.
 
         For each hyperedge, all pairs of member nodes become edges in the resulting graph.
         This is computed efficiently using the incidence matrix: ``A = H @ H^T``, where ``H`` is
-        the sparse incidence matrix of shape ``[num_nodes, num_hyperedges]`` and ``A`` is the adjacency matrix of the clique-expanded graph.
+        the sparse incidence matrix of shape ``[num_nodes, num_hyperedges]`` and ``A`` is
+        the adjacency matrix of the clique-expanded graph.
 
         Args:
             num_nodes: Total number of nodes. If ``None``, inferred from hyperedge index.
@@ -897,23 +954,26 @@ def reduce_to_edge_index_on_random_direction(
         seed: int | None = None,
     ) -> tuple[Tensor, Tensor | None]:
         """
-        Construct a graph from a hypergraph with methods proposed in `HyperGCN: A New Method of Training Graph Convolutional Networks on Hypergraphs <https://arxiv.org/pdf/1809.02589.pdf>`_ paper.
-        Reference implementation: `source <https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/structure/graphs/graph.html#Graph.from_hypergraph_hypergcn>`_.
+        References:
+            - Construct a graph from a hypergraph with methods proposed in the [HyperGCN: A New Method of Training Graph Convolutional Networks on Hypergraphs](https://arxiv.org/pdf/1809.02589.pdf) paper.
+            - Reference implementation: [source](https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/structure/graphs/graph.html#Graph.from_hypergraph_hypergcn).
 
         Args:
             x: Node feature matrix. Size ``(num_nodes, C)``.
-            with_mediators: Whether to use mediator to transform the hyperedges to edges in the graph. Defaults to ``False``.
+            with_mediators: Whether to use mediators to transform the hyperedges to edges in the
+                graph. Defaults to ``False``.
             remove_selfloops: Whether to remove self-loops. Defaults to ``True``.
-            return_weights: Whether to return the DHG-style reduced-edge weights alongside the edge index. Defaults to ``False``.
+            return_weights: Whether to return the DHG-style reduced-edge weights alongside the
+                edge index. Defaults to ``False``.
 
         Returns:
-            reduced_graph: A tuple ``(edge_index, edge_weights)`` where:
-            - ``edge_index`` has size ``(2, |num_edges|)``.
-            - ``edge_weights`` has size ``(|num_edges|,)`` when ``return_weights=True``, otherwise ``None``.
+            edge_index: The edge index of the reduced graph. Size ``(2, |num_edges|)``.
+            edge_weights: The edge weights of the reduced graph. Size ``(|num_edges|,)`` when
+                ``return_weights=True``, otherwise ``None``.
 
         Raises:
             ValueError: If any hyperedge contains fewer than 2 nodes.
-        """
+        """  # noqa: E501
         device = x.device
         generator = create_seeded_torch_generator(device, seed)
 
@@ -923,7 +983,8 @@ def reduce_to_edge_index_on_random_direction(
         graph_edge_weights: list[float] = []
 
         # Random direction (feature_dim, 1) for projecting nodes in each hyperedge
-        # Geometrically, we are choosing a random line through the origin in ℝᵈ, where ᵈ = feature_dim
+        # Geometrically, we are choosing a random line through the origin
+        # in ℝᵈ, where ᵈ = feature_dim
         random_direction = torch.rand(
             size=(x.shape[1], 1),
             dtype=x.dtype,
@@ -936,13 +997,15 @@ def reduce_to_edge_index_on_random_direction(
             if num_nodes_in_edge < 2:
                 raise ValueError("The number of vertices in an hyperedge must be >= 2.")
 
-            # projections (num_nodes_in_edge,) contains a scalar value for each node in the hyperedge,
+            # projections (num_nodes_in_edge,) contains a scalar value for
+            # each node in the hyperedge,
             # indicating its projection on the random vector 'random_direction'.
             # Key idea: If two points are very far apart in ℝᵈ, there is a high probability
             # that a random projection will still separate them
             projections = torch.matmul(x[edge], random_direction).squeeze()
 
-            # The indices of the nodes that the farthest apart in the direction of 'random_direction'
+            # The indices of the nodes that are the farthest apart in the
+            # direction of 'random_direction'
             node_max_proj_idx = torch.argmax(projections)
             node_min_proj_idx = torch.argmin(projections)
 
@@ -971,7 +1034,11 @@ def reduce_to_edge_index_on_random_direction(
         )
 
     def remove_duplicate_edges(self) -> HyperedgeIndex:
-        """Remove duplicate edges from the hyperedge index. Keeps the tensor contiguous in memory."""
+        """
+        Remove duplicate edges from the hyperedge index.
+
+        Keeps the tensor contiguous in memory.
+        """
         # Example: hyperedge_index = [[0, 1, 2, 2, 0, 3, 2],
         #                             [3, 4, 4, 3, 4, 3, 3]], shape (2, 7)
         #          -> after torch.unique(..., dim=1):
@@ -997,7 +1064,8 @@ def remove_hyperedges_with_fewer_than_k_nodes(self, k: int) -> HyperedgeIndex:
             >>> k = 3
             >>> unique_hyperedge_ids: [0, 1, 2]
             ... # inverse -> idx_to_hyperedge_id, counts -> num_nodes_per_hyperedge
-            ... inverse           = [0, 0, 1, 1, 2, 1]  # (index into unique_hyperedge_ids per column)
+            ... # (index into unique_hyperedge_ids per column)
+            ... inverse           = [0, 0, 1, 1, 2, 1]
             ... counts            = [2, 3, 1]
             >>> # counts[inverse] is equivalent to:
             ... # for i, inv in enumerate(inverse): keep_mask[i] = counts[inv]
@@ -1012,7 +1080,8 @@ def remove_hyperedges_with_fewer_than_k_nodes(self, k: int) -> HyperedgeIndex:
             k: The minimum number of nodes a hyperedge must contain to be kept.
 
         Returns:
-            hyperedge_index: A new `HyperedgeIndex` instance with hyperedges containing fewer than k nodes.
+            hyperedge_index: A new `HyperedgeIndex` instance without the hyperedges
+                containing fewer than k nodes.
         """
         validate_is_positive("k", k)
 
@@ -1031,16 +1100,22 @@ def to_0based(
         hyperedge_ids_to_rebase: Tensor | None = None,
     ) -> HyperedgeIndex:
         """
-        Convert hyperedge index to the 0-based format by rebasing node IDs to the range ``[0, num_nodes-1]`` and hyperedge IDs ``[0, num_hyperedges-1]``.
+        Convert hyperedge index to the 0-based format by rebasing node IDs to the range
+        ``[0, num_nodes-1]`` and hyperedge IDs to the range ``[0, num_hyperedges-1]``.
 
         Args:
-            node_ids_to_rebase: Tensor of shape ``(num_nodes,)`` containing the original node IDs that need to be rebased to 0-based format.
-                If ``None``, all node IDs in the hyperedge index will be rebased to 0-based format based on their unique sorted order.
-            hyperedge_ids_to_rebase: Tensor of shape ``(num_hyperedges,)`` containing the original hyperedge IDs that need to be rebased to 0-based format.
-                If ``None``, all hyperedge IDs in the hyperedge index will be rebased to 0-based format based on their unique sorted order.
+            node_ids_to_rebase: Tensor of shape ``(num_nodes,)`` containing the original node IDs
+                that need to be rebased to 0-based format.
+                If ``None``, all node IDs in the hyperedge index will be rebased to 0-based format
+                based on their unique sorted order.
+            hyperedge_ids_to_rebase: Tensor of shape ``(num_hyperedges,)`` containing the original
+                hyperedge IDs that need to be rebased to 0-based format.
+                If ``None``, all hyperedge IDs in the hyperedge index will be rebased to
+                0-based format based on their unique sorted order.
 
         Returns:
-            hyperedge_index: A new `HyperedgeIndex` instance with the hyperedge index converted to 0-based format.
+            hyperedge_index: A new `HyperedgeIndex` instance with the hyperedge index
+                converted to 0-based format.
         """
         # Example: hyperedge_index after sorting: [[0, 0, 1, 2, 3, 4],
         #                                          [3, 4, 4, 3, 4, 3]]
diff --git a/hyperbench/utils/data_utils.py b/hyperbench/utils/data_utils.py
index 7eb2a87b..1262d66b 100644
--- a/hyperbench/utils/data_utils.py
+++ b/hyperbench/utils/data_utils.py
@@ -65,8 +65,8 @@ def validate_is_between(
 ) -> None:
     if min_value > max_value:
         raise ValueError(
-            f"Invalid bounds for {name!r}: 'min_value' ({min_value}) "
-            f"cannot be greater than 'max_value' ({max_value})."
+            f"Invalid bounds for {name!r}: 'min_value' ({min_value}) cannot "
+            f"be greater than 'max_value' ({max_value})."
         )
     if not math.isfinite(value) or value < min_value or value > max_value:
         raise ValueError(
@@ -109,7 +109,8 @@ def validate_ratios(ratios: list[int | float]) -> None:
     # Allow small imprecision in sum of ratios, but raise error if it's significant
     # Example: ratios = [0.8, 0.1, 0.1] -> sum = 1.0 (valid)
     #          ratios = [0.8, 0.1, 0.05] -> sum = 0.95 (invalid, raises ValueError)
-    #          ratios = [0.8, 0.1, 0.1, 0.0000001] -> sum = 1.0000001 (valid, allows small imprecision)
+    #          ratios = [0.8, 0.1, 0.1, 0.0000001] -> sum = 1.0000001
+    #          (valid, allows small imprecision)
     ratio_sum = float(sum(ratios))
     if abs(ratio_sum - 1.0) > 1e-6:
         raise ValueError(f"'ratios' must sum to 1.0, got {ratio_sum}.")
diff --git a/hyperbench/utils/nn_utils.py b/hyperbench/utils/nn_utils.py
index efebc73a..c68dd9b6 100644
--- a/hyperbench/utils/nn_utils.py
+++ b/hyperbench/utils/nn_utils.py
@@ -35,7 +35,8 @@ def maxmin_scatter(
     dim_size: int | None = None,
 ) -> Tensor:
     """
-    Performs a scatter reduction that computes the channel-wise range (max - min) for each index group.
+    Performs a scatter reduction that computes the channel-wise range (max - min) for each
+    index group.
 
     Args:
         src: The source tensor containing the values to scatter.
diff --git a/hyperbench/utils/node_utils.py b/hyperbench/utils/node_utils.py
index 91fd2e47..b037c67c 100644
--- a/hyperbench/utils/node_utils.py
+++ b/hyperbench/utils/node_utils.py
@@ -33,7 +33,8 @@ def validate_node_space_setting(node_space_setting: NodeSpaceSetting) -> None:
     Validate that the node space setting is one of the supported values.
 
     Args:
-        node_space_setting: The node space setting to validate, which should be either "inductive" or "transductive".
+        node_space_setting: The node space setting to validate, which should be either "inductive"
+            or "transductive".
 
     Raises:
         ValueError: If the node space setting is not one of the supported values.
@@ -42,5 +43,6 @@ def validate_node_space_setting(node_space_setting: NodeSpaceSetting) -> None:
         return
 
     raise ValueError(
-        f"'node_space_setting' must be one of 'transductive' or 'inductive', got {node_space_setting!r}."
+        f"'node_space_setting' must be one of 'transductive' or 'inductive', "
+        f"got {node_space_setting!r}."
     )
diff --git a/hyperbench/utils/sparse_utils.py b/hyperbench/utils/sparse_utils.py
index 87010041..01b7dba6 100644
--- a/hyperbench/utils/sparse_utils.py
+++ b/hyperbench/utils/sparse_utils.py
@@ -8,7 +8,8 @@ def sparse_dropout(
     dropout_prob: float,
     fill_value: float = 0.0,
 ) -> Tensor:
-    """Dropout function for sparse matrix.
+    """
+    Dropout function for sparse matrix.
 
     Returns a new sparse matrix with the same shape as the input sparse matrix,
     but with some elements dropped out.
@@ -19,7 +20,8 @@ def sparse_dropout(
         fill_value: The fill value for dropped elements. Defaults to ``0.0``.
 
     Returns:
-        matrix: A new sparse matrix with the same shape as the input sparse matrix, but with some elements dropped out.
+        matrix: A new sparse matrix with the same shape as the input sparse matrix,
+            but with some elements dropped out.
     """
     device = sparse_tensor.device
 
@@ -41,9 +43,10 @@ def sparse_dropout(
 
     # Generate a binary mask matching the shape of values for elements to keep
     # 'torch.bernoulli()' samples 1 with probability keep_prob and 0 with probability dropout_prob
-    # Example: values = [0.5, 1.2, 3.4], keep_prob = 0.8
-    #          -> keep_mask might be [1, 0, 1], meaning we keep the 1st and 3rd elements, drop the 2nd
-    keep_mask = torch.bernoulli(torch.full_like(values, keep_prob, dtype=values.dtype)).to(device)
+    # Example:
+    #   values = [0.5, 1.2, 3.4], keep_prob = 0.8
+    #   -> keep_mask might be [1, 0, 1], meaning we keep the 1st and 3rd elements, drop the 2nd
+    keep_mask = torch.bernoulli(torch.full_like(values, keep_prob)).to(device)
 
     if fill_value == 0.0:
         # If fill_value is 0, just zero out the dropped elements,
@@ -52,7 +55,8 @@ def sparse_dropout(
         #          -> new_values = [0.5*1, 1.2*0, 3.4*1] = [0.5, 0.0, 3.4]
         new_values = values * keep_mask
     else:
-        # If fill_value is non-zero, we must fill the dropped elements with the specified fill_value instead of zero
+        # If fill_value is non-zero, we must fill the dropped elements with the
+        # specified fill_value instead of zero
         # 'torch.logical_not(keep_mask)' identifies dropped elements where mask is 0 and
         # Example: values = [0.5, 1.2, 3.4], keep_mask = [1, 0, 1], fill_value = 9.9
         #          -> values_to_fill_mask = [0, 1, 0]
diff --git a/pyproject.toml b/pyproject.toml
index 71f8866d..44e0c85c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ readme = "README.md"
 requires-python = ">=3.10"
 authors = [
     { name = "Tiziano Citro", email = "tcitro@unisa.it" },
-    { name = "Daniele De Vinco", email = "ddevinco@unisa.it"}
+    { name = "Daniele De Vinco", email = "ddevinco@unisa.it" },
 ]
 dependencies = [
     "fastjsonschema>=2.21.2,<3.0.0",
@@ -64,7 +64,7 @@ dev = [
 ]
 docs = [
     "mkdocstrings[python]>=1.0.4,<2.0.0",
-    "zensical>=0.0.43,<1.0.0",
+    "zensical>=0.0.44,<1.0.0",
 ]
 test = [
     "pytest>=9.0.3,<10.0.0",
@@ -78,16 +78,9 @@ where = ["."]
 include = ["hyperbench"]
 
 [tool.pytest.ini_options]
-addopts = [
-    "--color=yes",
-    "--verbose",
-    "--tb=short",
-    "--strict-markers",
-]
+addopts = ["--color=yes", "--verbose", "--tb=short", "--strict-markers"]
 testpaths = ["hyperbench/tests", "hyperbench/integration_tests"]
-markers = [
-    "integration: tests that use real workflows",
-]
+markers = ["integration: tests that use real workflows"]
 filterwarnings = [
     "ignore:.*torch.jit.script.*deprecated.*",
     "ignore:.*torch.jit.script.*is not supported in Python 3.14.*",
@@ -108,7 +101,7 @@ omit = [
     "hyperbench/tests/*",
     "hyperbench/nn/*",
     "hyperbench/models/*",
-    "hyperbench/hlp/*"
+    "hyperbench/hlp/*",
 ]
 
 [tool.coverage.report]
@@ -131,29 +124,28 @@ line-length = 100
 
 [tool.ruff.lint]
 select = [
-    "B",     # flake8-bugbear: bugs / bad practices (https://docs.astral.sh/ruff/rules/#flake8-bugbear-b)
-    "C4",    # flake8-comprehensions (https://docs.astral.sh/ruff/rules/#flake8-comprehensions-c4)
-    "E",     # pycodestyle (https://docs.astral.sh/ruff/rules/#pycodestyle-e-w)
-    "F",     # pyflakes: undefined names, unused imports, etc. (https://docs.astral.sh/ruff/rules/#pyflakes-f)
-    "FURB",  # refurb: modern/simpler code improvements (https://docs.astral.sh/ruff/rules/#refurb-furb)
-    "I",     # isort (https://docs.astral.sh/ruff/rules/#isort-i)
-    "N",     # pep8-naming (https://docs.astral.sh/ruff/rules/#pep8-naming-n)
-    "PERF",  # perflint: performance suggestions (https://docs.astral.sh/ruff/rules/#perflint-perf)
-    "PL",    # pylint-inspired rules (https://docs.astral.sh/ruff/rules/#pylint-pl)
-    "Q",     # flake8-quotes: string quote normalization (https://docs.astral.sh/ruff/rules/#flake8-quotes-q)
-    "RET",   # flake8-return: return statement simplifications (https://docs.astral.sh/ruff/rules/#flake8-return-ret)
-    "RSE",   # flake8-raise: raise statement improvements (https://docs.astral.sh/ruff/rules/#flake8-raise-rse)
-    "RUF",   # Ruff-specific rules (https://docs.astral.sh/ruff/rules/#ruff-specific-rules-ruf)
-    "SIM",   # flake8-simplify: simplify code (https://docs.astral.sh/ruff/rules/#flake8-simplify-sim)
-    "T10",   # flake8-debugger (https://docs.astral.sh/ruff/rules/#flake8-debugger-t10)
-    "UP",    # pyupgrade: modern Python syntax (https://docs.astral.sh/ruff/rules/#pyupgrade-up)
+    "B",    # flake8-bugbear: bugs / bad practices (https://docs.astral.sh/ruff/rules/#flake8-bugbear-b)
+    "C4",   # flake8-comprehensions (https://docs.astral.sh/ruff/rules/#flake8-comprehensions-c4)
+    "E",    # pycodestyle (https://docs.astral.sh/ruff/rules/#pycodestyle-e-w)
+    "F",    # pyflakes: undefined names, unused imports, etc. (https://docs.astral.sh/ruff/rules/#pyflakes-f)
+    "FURB", # refurb: modern/simpler code improvements (https://docs.astral.sh/ruff/rules/#refurb-furb)
+    "I",    # isort (https://docs.astral.sh/ruff/rules/#isort-i)
+    "N",    # pep8-naming (https://docs.astral.sh/ruff/rules/#pep8-naming-n)
+    "PERF", # perflint: performance suggestions (https://docs.astral.sh/ruff/rules/#perflint-perf)
+    "PL",   # pylint-inspired rules (https://docs.astral.sh/ruff/rules/#pylint-pl)
+    "Q",    # flake8-quotes: string quote normalization (https://docs.astral.sh/ruff/rules/#flake8-quotes-q)
+    "RET",  # flake8-return: return statement simplifications (https://docs.astral.sh/ruff/rules/#flake8-return-ret)
+    "RSE",  # flake8-raise: raise statement improvements (https://docs.astral.sh/ruff/rules/#flake8-raise-rse)
+    "RUF",  # Ruff-specific rules (https://docs.astral.sh/ruff/rules/#ruff-specific-rules-ruf)
+    "SIM",  # flake8-simplify: simplify code (https://docs.astral.sh/ruff/rules/#flake8-simplify-sim)
+    "T10",  # flake8-debugger (https://docs.astral.sh/ruff/rules/#flake8-debugger-t10)
+    "UP",   # pyupgrade: modern Python syntax (https://docs.astral.sh/ruff/rules/#pyupgrade-up)
 ]
 ignore = [
-  "E501",     # line too long (https://docs.astral.sh/ruff/rules/line-too-long)
-  "I001",     # unsorted imports (https://docs.astral.sh/ruff/rules/unsorted-imports)
-  "N812",     # lowercase imported as non lowercase (https://docs.astral.sh/ruff/rules/lowercase-imported-as-non-lowercase)
-  "PLC0415",  # import outside top-level (https://docs.astral.sh/ruff/rules/import-outside-top-level)
-  "PLR0913",  # too many arguments (https://docs.astral.sh/ruff/rules/too-many-arguments)
-  "PLR2004",  # magic numbers (https://docs.astral.sh/ruff/rules/magic-value-comparison)
-  "RET504",   # unnecessary assignment (https://docs.astral.sh/ruff/rules/unnecessary-assign)
+    "I001",    # unsorted imports (https://docs.astral.sh/ruff/rules/unsorted-imports)
+    "N812",    # lowercase imported as non lowercase (https://docs.astral.sh/ruff/rules/lowercase-imported-as-non-lowercase)
+    "PLC0415", # import outside top-level (https://docs.astral.sh/ruff/rules/import-outside-top-level)
+    "PLR0913", # too many arguments (https://docs.astral.sh/ruff/rules/too-many-arguments)
+    "PLR2004", # magic numbers (https://docs.astral.sh/ruff/rules/magic-value-comparison)
+    "RET504",  # unnecessary assignment (https://docs.astral.sh/ruff/rules/unnecessary-assign)
 ]
diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
index fcebed1f..24f589f2 100644
--- a/scripts/validate_docstrings.py
+++ b/scripts/validate_docstrings.py
@@ -93,7 +93,7 @@ def validate_docstrings(
 
 def format_issues(issues: Sequence[DocstringIssue]) -> str:
     if not issues:
-        return "No docstring issues found."
+        return "\033[1;32mAll checks passed!\033[0m"
 
     lines = ["Docstring issues:"]
     for issue in issues:
diff --git a/zensical.toml b/zensical.toml
index fd2c78c7..b9a88519 100644
--- a/zensical.toml
+++ b/zensical.toml
@@ -1,6 +1,7 @@
 [project]
 docs_dir = "docs"
 site_dir = "docs/site"
+watch = ["hyperbench"]
 site_name = "©Hyperbench Documentation"
 site_description = "Documentation for Hyperbench"
 site_author = "Hypernetwork Research Group"