|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +"""HDBSCAN-based outlier detection. |
| 3 | +""" |
| 4 | +# Author: Chao Gao <gaoc96@qq.com> |
| 5 | +# License: BSD 2 clause |
| 6 | + |
| 7 | +import numpy as np |
| 8 | +from sklearn.neighbors import NearestNeighbors |
| 9 | +from sklearn.utils.validation import check_array |
| 10 | +from sklearn.utils.validation import check_is_fitted |
| 11 | + |
| 12 | +from .base import BaseDetector |
| 13 | + |
| 14 | + |
class HDBSCAN(BaseDetector):
    """Wrapper of scikit-learn HDBSCAN for outlier detection.

    HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications
    with Noise) performs density-based clustering and identifies outliers as
    points that do not belong to any cluster (noise) or have weak membership
    in their assigned cluster. The anomaly score is computed as
    ``1 - probabilities_``, where ``probabilities_`` represents each point's
    cluster membership strength. See :cite:`campello2013density` for details.

    For new data prediction, the outlier scores are approximated using
    k-nearest neighbor interpolation from the training data scores.

    Parameters
    ----------
    min_cluster_size : int, optional (default=5)
        The minimum number of samples in a group for that group to be
        considered a cluster.

    min_samples : int or None, optional (default=None)
        The number of samples in a neighborhood for a point to be
        considered a core point. If None, defaults to min_cluster_size.

    metric : str or callable, optional (default='euclidean')
        The metric to use when calculating distance between instances in a
        feature array.

    alpha : float, optional (default=1.0)
        A distance scaling parameter as used in robust single linkage.

    algorithm : str, optional (default='auto')
        Exactly which algorithm to use for computing core distances;
        By default this is set to ``'auto'`` which attempts to use a
        ``KDTree`` if possible, otherwise it uses a ``BallTree``.
        ``'brute'``, ``'kd_tree'`` and ``'ball_tree'`` may also be
        specified explicitly.

    leaf_size : int, optional (default=40)
        Leaf size for trees responsible for fast nearest neighbor queries
        during clustering.

    n_jobs : int or None, optional (default=1)
        Number of parallel jobs to run, both for the underlying HDBSCAN
        clustering and for the nearest-neighbor search in
        :meth:`decision_function` (KNN interpolation on training scores).
        ``-1`` means using all processors.

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Used when fitting to define the
        threshold on the decision function.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data (1 - membership probability).
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.

    cluster_labels_ : numpy array of shape (n_samples,)
        Cluster labels for each point in the training data. Noisy samples
        are given the label -1.
    """

    def __init__(self, min_cluster_size=5, min_samples=None,
                 metric='euclidean', alpha=1.0, algorithm='auto',
                 leaf_size=40, n_jobs=1, contamination=0.1):
        super(HDBSCAN, self).__init__(contamination=contamination)
        self.min_cluster_size = min_cluster_size
        self.min_samples = min_samples
        self.metric = metric
        self.alpha = alpha
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        X = check_array(X)
        self._set_n_classes(y)

        # sklearn.cluster.HDBSCAN was added in scikit-learn 1.3; import
        # lazily so the module stays importable on older versions.
        try:
            from sklearn.cluster import HDBSCAN as sklearn_HDBSCAN
        except Exception as e:
            raise ImportError(
                "HDBSCAN requires scikit-learn with sklearn.cluster.HDBSCAN. "
                "Please upgrade scikit-learn."
            ) from e

        # Note: cluster centers are intentionally not stored
        # (store_centers) because scoring only needs probabilities_.
        self.detector_ = sklearn_HDBSCAN(
            min_cluster_size=self.min_cluster_size,
            min_samples=self.min_samples,
            metric=self.metric,
            alpha=self.alpha,
            algorithm=self.algorithm,
            leaf_size=self.leaf_size,
            n_jobs=self.n_jobs,
        )
        self.detector_.fit(X)

        self.cluster_labels_ = self.detector_.labels_

        # Use 1 - membership probability as outlier scores.
        # Noise points (label=-1) have probability 0, so score 1.0.
        self.decision_scores_ = 1.0 - self.detector_.probabilities_
        self._process_decision_scores()

        # Build a KNN model on training data for scoring new samples;
        # cap n_neighbors so tiny training sets do not raise.
        self.X_train_ = X
        self.tree_ = NearestNeighbors(
            n_neighbors=min(self.min_cluster_size, X.shape[0]),
            metric=self.metric,
            n_jobs=self.n_jobs,
        )
        self.tree_.fit(X)

        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        For new data, anomaly scores are approximated by the weighted
        average of the k nearest neighbors' outlier scores in the training
        data.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        # Also require tree_, since it is what this method actually uses.
        check_is_fitted(self, ['decision_scores_', 'threshold_',
                               'labels_', 'tree_'])
        X = check_array(X)

        # Find k nearest neighbors in training data.
        dist, ind = self.tree_.kneighbors(X)

        # Weight by inverse distance; closer neighbors have more influence.
        # Add small epsilon to avoid division by zero for exact matches.
        weights = 1.0 / (dist + 1e-10)
        weights = weights / weights.sum(axis=1, keepdims=True)

        # Weighted average of training outlier scores; result of the
        # axis-1 reduction is already 1-D of shape (n_samples,).
        neighbor_scores = self.decision_scores_[ind]
        return np.sum(weights * neighbor_scores, axis=1)
0 commit comments