scverse
diff --git a/‎README.md‎
Lines changed: 14 additions & 14 deletions b/‎README.md‎
Lines changed: 14 additions & 14 deletions
diff --git a/‎snapatac2-core/Cargo.toml‎
Lines changed: 9 additions & 1 deletion b/‎snapatac2-core/Cargo.toml‎
Lines changed: 9 additions & 1 deletion
diff --git a/‎snapatac2-core/benches/benchmark.rs‎
Lines changed: 84 additions & 0 deletions b/‎snapatac2-core/benches/benchmark.rs‎
Lines changed: 84 additions & 0 deletions
diff --git a/‎snapatac2-core/src/embedding.rs‎
Lines changed: 114 additions & 19 deletions b/‎snapatac2-core/src/embedding.rs‎
Lines changed: 114 additions & 19 deletions
diff --git a/‎snapatac2-python/src/embedding.rs‎
Lines changed: 2 additions & 2 deletions b/‎snapatac2-python/src/embedding.rs‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎snapatac2-python/tests/test_idf/Cargo.toml‎
Lines changed: 0 additions & 13 deletions b/‎snapatac2-python/tests/test_idf/Cargo.toml‎
Lines changed: 0 additions & 13 deletions
diff --git a/‎snapatac2-python/tests/test_idf/README.md‎
Lines changed: 0 additions & 53 deletions b/‎snapatac2-python/tests/test_idf/README.md‎
Lines changed: 0 additions & 53 deletions
@@ -19,20 +19,6 @@ SnapATAC2 is a flexible, versatile, and scalable single-cell omics analysis fram
 - Seamless integration with other single-cell analysis packages such as Scanpy.
 - Implementation of fully backed AnnData.
 
-[//]: # (numfocus-fiscal-sponsor-attribution)
-
-SnapATAC2 is part of the scverse® project ([website](https://scverse.org), [governance](https://scverse.org/about/roles)) and is fiscally sponsored by [NumFOCUS](https://numfocus.org/).
-If you like scverse® and want to support our mission, please consider making a tax-deductible [donation](https://numfocus.org/donate-to-scverse) to help the project pay for developer time, professional services, travel, workshops, and a variety of other needs.
-
-<div align="center">
-<a href="https://numfocus.org/project/scverse">
-  <img
-    src="https://raw.githubusercontent.com/numfocus/templates/master/images/numfocus-logo.png"
-    width="200"
-  >
-</a>
-</div>
-
 Documentation
 -------------
 
@@ -46,3 +32,17 @@ How to cite
 Zhang, K., Zemke, N. R., Armand, E. J. & Ren, B. (2024).
 A fast, scalable and versatile tool for analysis of single-cell omics data.
 Nature Methods, 1–11. https://doi.org/10.1038/s41592-023-02139-9
+
+[//]: # (numfocus-fiscal-sponsor-attribution)
+
+SnapATAC2 is part of the scverse® project ([website](https://scverse.org), [governance](https://scverse.org/about/roles)) and is fiscally sponsored by [NumFOCUS](https://numfocus.org/).
+If you like scverse® and want to support our mission, please consider making a tax-deductible [donation](https://numfocus.org/donate-to-scverse) to help the project pay for developer time, professional services, travel, workshops, and a variety of other needs.
+
+<div align="center">
+<a href="https://numfocus.org/project/scverse">
+  <img
+    src="https://raw.githubusercontent.com/numfocus/templates/master/images/numfocus-logo.png"
+    width="200"
+  >
+</a>
+</div>
@@ -38,4 +38,12 @@ statrs = "0.18"
 smallvec = "1.13"
 sanitize-filename = "0.5"
 tempfile = "3.3"
-zstd = { version = "0.13", features = ["zstdmt"] }
+zstd = { version = "0.13", features = ["zstdmt"] }
+
+[dev-dependencies]
+criterion = { version = "0.5", features = ["html_reports"] }
+rand = "0.8.5"
+
+[[bench]]
+name = "benchmark"
+harness = false
@@ -0,0 +1,84 @@
+use criterion::{criterion_group, criterion_main, Criterion};
+use nalgebra_sparse::CsrMatrix;
+use rand::{rngs::StdRng, Rng, SeedableRng};
+use snapatac2_core::embedding::{InverseDocumentFrequency, idf_from_chunks_parallel};
+
+/// Build a `rows` x `cols` CSR matrix whose stored entries are all `1.0`.
+///
+/// For every (row, column) position one `f64` is drawn from `rng`; the position
+/// is stored when the draw is below `density`. Draws happen row by row, column
+/// by column, so a seeded `rng` yields a reproducible matrix.
+///
+/// Panics if `CsrMatrix::try_from_csr_data` rejects the generated data.
+fn random_csr_matrix(rows: usize, cols: usize, density: f64, rng: &mut StdRng) -> CsrMatrix<f64> {
+    let mut offsets = Vec::with_capacity(rows + 1);
+    offsets.push(0);
+    let mut col_idx: Vec<usize> = Vec::new();
+
+    for _ in 0..rows {
+        // One draw per column, in increasing column order.
+        col_idx.extend((0..cols).filter(|_| rng.gen::<f64>() < density));
+        offsets.push(col_idx.len());
+    }
+
+    // Every stored entry has the value 1.0.
+    let values = vec![1.0; col_idx.len()];
+    CsrMatrix::try_from_csr_data(rows, cols, offsets, col_idx, values).unwrap()
+}
+
+/// Build a `rows` x `cols` CSR matrix in which every position is stored with
+/// the value `1.0`, so all columns have the same count (edge case test).
+///
+/// Panics if `CsrMatrix::try_from_csr_data` rejects the generated data.
+fn uniform_csr_matrix(rows: usize, cols: usize) -> CsrMatrix<f64> {
+    // Row `r` starts at offset `r * cols` because every row holds `cols` entries.
+    let offsets: Vec<usize> = (0..=rows).map(|r| r * cols).collect();
+    // Each row lists all column indices in increasing order.
+    let col_idx: Vec<usize> = (0..rows).flat_map(|_| 0..cols).collect();
+    let values = vec![1.0; col_idx.len()];
+
+    CsrMatrix::try_from_csr_data(rows, cols, offsets, col_idx, values).unwrap()
+}
+
+/// Build a `rows` x `cols` CSR matrix in which only the first `cols / 2`
+/// columns of every row are stored (value `1.0`); the remaining columns have
+/// zero counts (edge case test).
+///
+/// Panics if `CsrMatrix::try_from_csr_data` rejects the generated data.
+fn sparse_csr_matrix(rows: usize, cols: usize) -> CsrMatrix<f64> {
+    let filled = cols / 2;
+    // Row `r` starts at offset `r * filled` because every row holds `filled` entries.
+    let offsets: Vec<usize> = (0..=rows).map(|r| r * filled).collect();
+    // Each row lists the first half of the column indices in increasing order.
+    let col_idx: Vec<usize> = (0..rows).flat_map(|_| 0..filled).collect();
+    let values = vec![1.0; col_idx.len()];
+
+    CsrMatrix::try_from_csr_data(rows, cols, offsets, col_idx, values).unwrap()
+}
+
+/// Benchmark the sequential `idf()` implementation against
+/// `idf_from_chunks_parallel` on square random matrices of increasing size.
+///
+/// Both implementations consume their input, so each iteration needs a fresh
+/// copy of the matrix. The copy is made in the `iter_batched` setup closure so
+/// that the clone is excluded from the measured time.
+fn bench_idf(c: &mut Criterion) {
+    let mut group = c.benchmark_group("IDF");
+    group.sample_size(20);
+
+    // Fixed seed keeps the generated matrices identical across runs.
+    let rng = &mut StdRng::seed_from_u64(42);
+
+    for n in [1000usize, 3000, 10000] {
+        let csr = random_csr_matrix(n, n, 0.5, rng);
+
+        group.bench_with_input(format!("Normal ({} x {})", n, n), &csr, |b, csr| {
+            b.iter_batched(
+                || csr.clone(),
+                |mat| std::iter::once(mat).idf(),
+                criterion::BatchSize::LargeInput,
+            )
+        });
+
+        group.bench_with_input(format!("Parallel ({} x {})", n, n), &csr, |b, csr| {
+            b.iter_batched(
+                || csr.clone(),
+                |mat| idf_from_chunks_parallel(std::iter::once(mat)),
+                criterion::BatchSize::LargeInput,
+            )
+        });
+    }
+    group.finish();
+}
+
+
+// Register `bench_idf` in the `benches` group and hand that group to
+// Criterion's entry-point macro (the bench target sets `harness = false`).
+criterion_group!(benches, bench_idf);
+criterion_main!(benches);
@@ -1,22 +1,117 @@
 use nalgebra_sparse::CsrMatrix;
+use itertools::Itertools;
+use rayon::iter::{ParallelBridge, ParallelIterator};
 
-/// The IDF transformation.
-pub fn idf_l2(csr: &mut CsrMatrix<f64>) {
-    let n = csr.ncols();
-    let mut idf = vec![0.0; n];
-    csr.row_iter().for_each(|row|
-        row.col_indices().into_iter().zip(row.values().into_iter()).for_each(|(i, v)|
-            idf[*i] += v
-        )
-    );
-    idf.iter_mut().for_each(|c| *c = (n as f64 / (1.0 + *c)).ln());
-    csr.row_iter_mut().for_each(|mut row| {
-        let (cols, values) = row.cols_and_values_mut();
-        let s = cols.into_iter().zip(values.into_iter())
-            .map(|(i, v)| {
-                *v *= idf[*i];
-                *v * *v
-            }).sum::<f64>().sqrt();
-        values.iter_mut().for_each(|v| *v /= s);
-    });
+/// Inverse document frequency (IDF) weights of a document-term collection.
+pub trait InverseDocumentFrequency {
+    /// Compute the inverse document frequency (IDF) of every term.
+    ///
+    /// The input matrix is expected to be in CSR format, where each row
+    /// represents a document and each column represents a term. The result
+    /// holds one weight per column.
+    ///
+    /// The IDF is computed as `ln(N / df)`, where `N` is the total number of
+    /// documents (rows) and `df` is the document frequency of the term, i.e.
+    /// the number of entries stored in its column.
+    ///
+    /// Special cases:
+    /// - If a term appears in all documents, its IDF is set to `ln(N / (N - 1))`.
+    /// - If a term does not appear in any document, its IDF is set to
+    ///   `ln(N / 1)` to avoid division by zero.
+    /// - If all terms have the same document frequency, every weight is `1.0`.
+    fn idf(self) -> Vec<f64>;
+}
+
+/*
+impl InverseDocumentFrequency for &CsrMatrix<f64> {
+    fn idf(self) -> Vec<f64> {
+        let mut idf = vec![0.0; self.ncols()];
+        // Compute document frequency for each term
+        self.col_indices().iter().for_each(|i| idf[*i] += 1.0);
+        let n = self.nrows() as f64;
+        if idf.iter().all_equal() {
+            vec![1.0; idf.len()]
+        } else {
+            idf.iter_mut().for_each(|x| {
+                if *x == 0.0 {
+                    *x = 1.0;
+                } else if *x == n {
+                    *x = n - 1.0;
+                }
+                *x = (n / *x).ln()
+            });
+            idf
+        }
+    }
+}
+    */
+
+impl<I: Iterator<Item = CsrMatrix<f64>>> InverseDocumentFrequency for I {
+    /// Compute IDF weights over a stream of CSR chunks.
+    ///
+    /// Rows are documents and columns are terms. The document frequency of a
+    /// term is the number of entries stored in its column, summed over all
+    /// chunks; `N` is the total number of rows over all chunks. The number of
+    /// terms is taken from the first chunk.
+    ///
+    /// Returns an empty vector when the iterator yields no chunks.
+    fn idf(self) -> Vec<f64> {
+        let mut chunks = self.peekable();
+        // An empty stream has no terms: return an empty result instead of
+        // panicking on a missing first chunk.
+        let ncols = match chunks.peek() {
+            Some(first) => first.ncols(),
+            None => return Vec::new(),
+        };
+
+        let mut idf = vec![0.0; ncols];
+        let mut n = 0.0;
+        for mat in chunks {
+            // Each stored entry contributes one occurrence to its column.
+            for &j in mat.col_indices() {
+                idf[j] += 1.0;
+            }
+            n += mat.nrows() as f64;
+        }
+
+        if idf.iter().all_equal() {
+            // Identical frequencies carry no information; use unit weights.
+            vec![1.0; idf.len()]
+        } else {
+            for x in idf.iter_mut() {
+                if *x == 0.0 {
+                    // Unseen term: use df = 1 to avoid dividing by zero.
+                    *x = 1.0;
+                } else if *x == n {
+                    // Term present in every document: use df = N - 1 so the
+                    // weight is not exactly zero.
+                    *x = n - 1.0;
+                }
+                *x = (n / *x).ln();
+            }
+            idf
+        }
+    }
+}
+
+
+/// Compute IDF weights over a stream of CSR chunks, counting the document
+/// frequencies of each chunk in parallel with rayon.
+///
+/// Rows are documents and columns are terms. The document frequency of a term
+/// is the number of entries stored in its column, summed over all chunks; `N`
+/// is the total number of rows over all chunks. The weight of a term is
+/// `ln(N / df)`, with `df` replaced by `1` when it is `0` and by `N - 1` when
+/// it equals `N`. If all terms have the same document frequency, every weight
+/// is `1.0`.
+///
+/// The number of terms is taken from the first chunk; chunks are expected to
+/// share that column count (not checked here). Returns an empty vector when
+/// `input` yields no chunks.
+pub fn idf_from_chunks_parallel<I>(input: I) -> Vec<f64>
+where
+    I: IntoIterator<Item = CsrMatrix<f64>>,
+{
+    let mut counts: Option<Vec<f64>> = None;
+    let mut n = 0.0;
+    for mat in input {
+        let ncols = mat.ncols();
+        // `fold` keeps one accumulator per rayon work unit instead of
+        // allocating an `ncols`-long vector for every row; `reduce` then
+        // merges the per-unit accumulators.
+        let chunk_counts: Vec<f64> = mat
+            .row_iter()
+            .par_bridge()
+            .fold(
+                || vec![0.0; ncols],
+                |mut acc, row| {
+                    for &j in row.col_indices() {
+                        acc[j] += 1.0;
+                    }
+                    acc
+                },
+            )
+            .reduce(
+                || vec![0.0; ncols],
+                |mut a, b| {
+                    for (x, y) in a.iter_mut().zip(b) {
+                        *x += y;
+                    }
+                    a
+                },
+            );
+        // The running totals are sized by the first chunk.
+        let totals = counts.get_or_insert_with(|| vec![0.0; ncols]);
+        for (x, y) in totals.iter_mut().zip(chunk_counts) {
+            *x += y;
+        }
+        n += mat.nrows() as f64;
+    }
+    let idf = counts.unwrap_or_default();
+    if idf.iter().all_equal() {
+        // Identical frequencies carry no information; use unit weights.
+        vec![1.0; idf.len()]
+    } else {
+        idf.into_iter()
+            .map(|mut x| {
+                if x == 0.0 {
+                    // Unseen term: use df = 1 to avoid dividing by zero.
+                    x = 1.0;
+                } else if x == n {
+                    // Term present in every document: use df = N - 1.
+                    x = n - 1.0;
+                }
+                (n / x).ln()
+            })
+            .collect()
+    }
 }
@@ -18,7 +18,7 @@ use numpy::{array::PyArrayMethods, PyArray1, PyArray2, PyReadonlyArray1, PyReado
 use pyanndata::data::PyArrayData;
 use pyo3::prelude::*;
 use rand::SeedableRng;
-use rayon::prelude::{ParallelBridge, ParallelIterator};
+use rayon::{iter::IntoParallelRefIterator, prelude::{ParallelBridge, ParallelIterator}};
 use std::ops::Deref;
 
 #[pyfunction]
@@ -293,8 +293,8 @@ where
     let mut idf = vec![0.0; iter.peek().unwrap().ncols()];
     let mut n = 0.0;
     iter.for_each(|mat| {
-        mat.col_indices().iter().for_each(|i| idf[*i] += 1.0);
         n += mat.nrows() as f64;
+        mat.col_indices().iter().for_each(|i| idf[*i] += 1.0);
     });
     if idf.iter().all_equal() {
         vec![1.0; idf.len()]