|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +"""HDBSCAN-based outlier detection. |
| 3 | +""" |
| 4 | +# Author: Chao Gao <gaoc96@qq.com> |
| 5 | +# License: BSD 2 clause |
| 6 | + |
| 7 | +import numpy as np |
| 8 | +from sklearn.neighbors import NearestNeighbors |
| 9 | +from sklearn.utils.validation import check_array |
| 10 | +from sklearn.utils.validation import check_is_fitted |
| 11 | + |
| 12 | +from .base import BaseDetector |
| 13 | + |
| 14 | + |
class HDBSCAN(BaseDetector):
    """Wrapper of scikit-learn HDBSCAN for outlier detection.

    HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications
    with Noise) performs density-based clustering and identifies outliers as
    points that do not belong to any cluster (noise) or have weak membership
    in their assigned cluster. The anomaly score is computed as
    ``1 - probabilities_``, where ``probabilities_`` represents each point's
    cluster membership strength. See :cite:`campello2013density` for details.

    For new data prediction, the outlier scores are approximated using
    k-nearest neighbor interpolation from the training data scores.

    Parameters
    ----------
    min_cluster_size : int, optional (default=5)
        The minimum number of samples in a group for that group to be
        considered a cluster.

    min_samples : int or None, optional (default=None)
        The number of samples in a neighborhood for a point to be
        considered a core point. If None, defaults to min_cluster_size.

    metric : str or callable, optional (default='euclidean')
        The metric to use when calculating distance between instances in a
        feature array.

    alpha : float, optional (default=1.0)
        A distance scaling parameter as used in robust single linkage.

    algorithm : str, optional (default='auto')
        Exactly which algorithm to use for computing core distances;
        By default this is set to ``'auto'`` which attempts to use a
        ``KDTree`` if possible, otherwise it uses a ``BallTree``.
        ``'brute'``, ``'kd_tree'`` and ``'ball_tree'`` may also be
        specified explicitly.

    leaf_size : int, optional (default=40)
        Leaf size for trees responsible for fast nearest neighbor queries
        during clustering.

    n_jobs : int or None, optional (default=1)
        Number of parallel jobs to run, both for the underlying HDBSCAN
        clustering and for the nearest-neighbor search in
        :meth:`decision_function` (KNN interpolation on training scores).
        ``-1`` means using all processors.

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Used when fitting to define the
        threshold on the decision function.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data (1 - membership probability).
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.

    cluster_labels_ : numpy array of shape (n_samples,)
        Cluster labels for each point in the training data. Noisy samples
        are given the label -1.
    """

    def __init__(self, min_cluster_size=5, min_samples=None,
                 metric='euclidean', alpha=1.0, algorithm='auto',
                 leaf_size=40, n_jobs=1, contamination=0.1):
        super(HDBSCAN, self).__init__(contamination=contamination)
        self.min_cluster_size = min_cluster_size
        self.min_samples = min_samples
        self.metric = metric
        self.alpha = alpha
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        X = check_array(X)
        self._set_n_classes(y)

        # sklearn.cluster.HDBSCAN was added in scikit-learn 1.3; import
        # lazily so the module stays importable on older versions.
        try:
            from sklearn.cluster import HDBSCAN as sklearn_HDBSCAN
        except Exception as e:
            raise ImportError(
                "HDBSCAN requires scikit-learn with sklearn.cluster.HDBSCAN. "
                "Please upgrade scikit-learn."
            ) from e

        # Note: cluster centers are intentionally not stored
        # (store_centers) because scoring only needs probabilities_.
        self.detector_ = sklearn_HDBSCAN(
            min_cluster_size=self.min_cluster_size,
            min_samples=self.min_samples,
            metric=self.metric,
            alpha=self.alpha,
            algorithm=self.algorithm,
            leaf_size=self.leaf_size,
            n_jobs=self.n_jobs,
        )
        self.detector_.fit(X)

        self.cluster_labels_ = self.detector_.labels_

        # Use 1 - membership probability as outlier scores.
        # Noise points (label=-1) have probability 0, so score 1.0.
        self.decision_scores_ = 1.0 - self.detector_.probabilities_
        self._process_decision_scores()

        # Build a KNN model on training data for scoring new samples;
        # cap n_neighbors so tiny training sets do not raise.
        self.X_train_ = X
        self.tree_ = NearestNeighbors(
            n_neighbors=min(self.min_cluster_size, X.shape[0]),
            metric=self.metric,
            n_jobs=self.n_jobs,
        )
        self.tree_.fit(X)

        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        For new data, anomaly scores are approximated by the weighted
        average of the k nearest neighbors' outlier scores in the training
        data.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        # Also require tree_, since it is what this method actually uses.
        check_is_fitted(self, ['decision_scores_', 'threshold_',
                               'labels_', 'tree_'])
        X = check_array(X)

        # Find k nearest neighbors in training data.
        dist, ind = self.tree_.kneighbors(X)

        # Weight by inverse distance; closer neighbors have more influence.
        # Add small epsilon to avoid division by zero for exact matches.
        weights = 1.0 / (dist + 1e-10)
        weights = weights / weights.sum(axis=1, keepdims=True)

        # Weighted average of training outlier scores; result of the
        # axis-1 reduction is already 1-D of shape (n_samples,).
        neighbor_scores = self.decision_scores_[ind]
        return np.sum(weights * neighbor_scores, axis=1)
0 commit comments