Skip to content

Commit 92951cb

Browse files
committed
add some benchmarks
1 parent ab079fc commit 92951cb

File tree

8 files changed

+223
-345
lines changed

8 files changed

+223
-345
lines changed

README.md

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,20 +19,6 @@ SnapATAC2 is a flexible, versatile, and scalable single-cell omics analysis fram
1919
- Seamless integration with other single-cell analysis packages such as Scanpy.
2020
- Implementation of fully backed AnnData.
2121

22-
[//]: # (numfocus-fiscal-sponsor-attribution)
23-
24-
SnapATAC2 is part of the scverse® project ([website](https://scverse.org), [governance](https://scverse.org/about/roles)) and is fiscally sponsored by [NumFOCUS](https://numfocus.org/).
25-
If you like scverse® and want to support our mission, please consider making a tax-deductible [donation](https://numfocus.org/donate-to-scverse) to help the project pay for developer time, professional services, travel, workshops, and a variety of other needs.
26-
27-
<div align="center">
28-
<a href="https://numfocus.org/project/scverse">
29-
<img
30-
src="https://raw.githubusercontent.com/numfocus/templates/master/images/numfocus-logo.png"
31-
width="200"
32-
>
33-
</a>
34-
</div>
35-
3622
Documentation
3723
-------------
3824

@@ -46,3 +32,17 @@ How to cite
4632
Zhang, K., Zemke, N. R., Armand, E. J. & Ren, B. (2024).
4733
A fast, scalable and versatile tool for analysis of single-cell omics data.
4834
Nature Methods, 1–11. https://doi.org/10.1038/s41592-023-02139-9
35+
36+
[//]: # (numfocus-fiscal-sponsor-attribution)
37+
38+
SnapATAC2 is part of the scverse® project ([website](https://scverse.org), [governance](https://scverse.org/about/roles)) and is fiscally sponsored by [NumFOCUS](https://numfocus.org/).
39+
If you like scverse® and want to support our mission, please consider making a tax-deductible [donation](https://numfocus.org/donate-to-scverse) to help the project pay for developer time, professional services, travel, workshops, and a variety of other needs.
40+
41+
<div align="center">
42+
<a href="https://numfocus.org/project/scverse">
43+
<img
44+
src="https://raw.githubusercontent.com/numfocus/templates/master/images/numfocus-logo.png"
45+
width="200"
46+
>
47+
</a>
48+
</div>

snapatac2-core/Cargo.toml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,12 @@ statrs = "0.18"
3838
smallvec = "1.13"
3939
sanitize-filename = "0.5"
4040
tempfile = "3.3"
41-
zstd = { version = "0.13", features = ["zstdmt"] }
41+
zstd = { version = "0.13", features = ["zstdmt"] }
42+
43+
[dev-dependencies]
44+
criterion = { version = "0.5", features = ["html_reports"] }
45+
rand = "0.8.5"
46+
47+
[[bench]]
48+
name = "benchmark"
49+
harness = false
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
use criterion::{criterion_group, criterion_main, Criterion};
2+
use nalgebra_sparse::CsrMatrix;
3+
use rand::{rngs::StdRng, Rng, SeedableRng};
4+
use snapatac2_core::embedding::{InverseDocumentFrequency, idf_from_chunks_parallel};
5+
6+
/// Generate a random binary CSR matrix with given shape and density
7+
fn random_csr_matrix(rows: usize, cols: usize, density: f64, rng: &mut StdRng) -> CsrMatrix<f64> {
8+
let mut indptr = Vec::with_capacity(rows + 1);
9+
let mut indices = Vec::new();
10+
let mut data = Vec::new();
11+
12+
indptr.push(0);
13+
for _ in 0..rows {
14+
let mut row_indices = Vec::new();
15+
for j in 0..cols {
16+
if rng.gen::<f64>() < density {
17+
row_indices.push(j);
18+
}
19+
}
20+
indices.extend(&row_indices);
21+
data.extend(std::iter::repeat(1.0).take(row_indices.len()));
22+
indptr.push(indices.len());
23+
}
24+
25+
CsrMatrix::try_from_csr_data(rows, cols, indptr, indices, data).unwrap()
26+
}
27+
28+
/// Create a matrix with all columns having the same count (edge case test)
29+
fn uniform_csr_matrix(rows: usize, cols: usize) -> CsrMatrix<f64> {
30+
let mut indptr = Vec::with_capacity(rows + 1);
31+
let mut indices = Vec::new();
32+
let mut data = Vec::new();
33+
34+
indptr.push(0);
35+
for _ in 0..rows {
36+
// Each row has all columns filled
37+
indices.extend(0..cols);
38+
data.extend(std::iter::repeat(1.0).take(cols));
39+
indptr.push(indices.len());
40+
}
41+
42+
CsrMatrix::try_from_csr_data(rows, cols, indptr, indices, data).unwrap()
43+
}
44+
45+
/// Create a matrix with some columns having zero counts (edge case test)
46+
fn sparse_csr_matrix(rows: usize, cols: usize) -> CsrMatrix<f64> {
47+
let mut indptr = Vec::with_capacity(rows + 1);
48+
let mut indices = Vec::new();
49+
let mut data = Vec::new();
50+
51+
indptr.push(0);
52+
for _ in 0..rows {
53+
// Only fill first half of columns
54+
indices.extend(0..(cols/2));
55+
data.extend(std::iter::repeat(1.0).take(cols/2));
56+
indptr.push(indices.len());
57+
}
58+
59+
CsrMatrix::try_from_csr_data(rows, cols, indptr, indices, data).unwrap()
60+
}
61+
62+
fn bench_idf(c: &mut Criterion) {
63+
let mut group = c.benchmark_group("IDF");
64+
group.sample_size(20);
65+
66+
let rng = &mut StdRng::seed_from_u64(42);
67+
68+
for n in [1000usize, 3000, 10000].into_iter() {
69+
let csr = random_csr_matrix(n, n, 0.5, rng);
70+
71+
group.bench_with_input(format!("Noraml ({} x {})", n, n), &csr, |b, csr|
72+
b.iter(|| std::iter::once(csr.clone()).idf())
73+
);
74+
75+
group.bench_with_input(format!("Parallel ({} x {})", n, n), &csr, |b, csr|
76+
b.iter(|| idf_from_chunks_parallel(std::iter::once(csr.clone())))
77+
);
78+
}
79+
group.finish();
80+
}
81+
82+
83+
criterion_group!(benches, bench_idf);
84+
criterion_main!(benches);

snapatac2-core/src/embedding.rs

Lines changed: 114 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,117 @@
11
use nalgebra_sparse::CsrMatrix;
2+
use itertools::Itertools;
3+
use rayon::iter::{ParallelBridge, ParallelIterator};
24

3-
/// The IDF transformation.
4-
pub fn idf_l2(csr: &mut CsrMatrix<f64>) {
5-
let n = csr.ncols();
6-
let mut idf = vec![0.0; n];
7-
csr.row_iter().for_each(|row|
8-
row.col_indices().into_iter().zip(row.values().into_iter()).for_each(|(i, v)|
9-
idf[*i] += v
10-
)
11-
);
12-
idf.iter_mut().for_each(|c| *c = (n as f64 / (1.0 + *c)).ln());
13-
csr.row_iter_mut().for_each(|mut row| {
14-
let (cols, values) = row.cols_and_values_mut();
15-
let s = cols.into_iter().zip(values.into_iter())
16-
.map(|(i, v)| {
17-
*v *= idf[*i];
18-
*v * *v
19-
}).sum::<f64>().sqrt();
20-
values.iter_mut().for_each(|v| *v /= s);
21-
});
5+
pub trait InverseDocumentFrequency {
6+
/// Compute inverse document frequency (IDF) for a given sparse matrix.
7+
/// The input matrix is expected to be in CSR format,
8+
/// where each column represents a document and each row represents a term.
9+
/// The IDF is computed as `log(N / df)`, where `N` is the total number of documents
10+
/// and `df` is the document frequency of the term.
11+
/// If a term appears in all documents, its IDF is set to log(N / (N - 1)).
12+
/// If a term does not appear in any document, its IDF is set to log(N / 1) to
13+
/// avoid division by zero.
14+
fn idf(self) -> Vec<f64>;
15+
}
16+
17+
/*
18+
impl InverseDocumentFrequency for &CsrMatrix<f64> {
19+
fn idf(self) -> Vec<f64> {
20+
let mut idf = vec![0.0; self.ncols()];
21+
// Compute document frequency for each term
22+
self.col_indices().iter().for_each(|i| idf[*i] += 1.0);
23+
let n = self.nrows() as f64;
24+
if idf.iter().all_equal() {
25+
vec![1.0; idf.len()]
26+
} else {
27+
idf.iter_mut().for_each(|x| {
28+
if *x == 0.0 {
29+
*x = 1.0;
30+
} else if *x == n {
31+
*x = n - 1.0;
32+
}
33+
*x = (n / *x).ln()
34+
});
35+
idf
36+
}
37+
}
38+
}
39+
*/
40+
41+
impl<I: Iterator<Item = CsrMatrix<f64>>> InverseDocumentFrequency for I {
42+
fn idf(self) -> Vec<f64> {
43+
let mut iter = self.peekable();
44+
let mut idf = vec![0.0; iter.peek().unwrap().ncols()];
45+
let mut n = 0.0;
46+
iter.for_each(|mat| {
47+
mat.col_indices().iter().for_each(|i| idf[*i] += 1.0);
48+
n += mat.nrows() as f64;
49+
});
50+
if idf.iter().all_equal() {
51+
vec![1.0; idf.len()]
52+
} else {
53+
idf.iter_mut().for_each(|x| {
54+
if *x == 0.0 {
55+
*x = 1.0;
56+
} else if *x == n {
57+
*x = n - 1.0;
58+
}
59+
*x = (n / *x).ln()
60+
});
61+
idf
62+
}
63+
}
64+
}
65+
66+
67+
// idf_from_chunks that parallelizes the counting step
68+
pub fn idf_from_chunks_parallel<I>(input: I) -> Vec<f64>
69+
where
70+
I: IntoIterator<Item = CsrMatrix<f64>>,
71+
{
72+
let mut idf: Option<Vec<f64>> = None;
73+
let mut n = 0.0;
74+
for mat in input {
75+
let ncols = mat.ncols();
76+
if idf.is_none() {
77+
idf = Some(vec![0.0; ncols]);
78+
}
79+
let local: Vec<f64> = mat
80+
.row_iter()
81+
.par_bridge()
82+
.map(|row| {
83+
let mut local = vec![0.0; ncols];
84+
for i in row.col_indices() {
85+
local[*i] += 1.0;
86+
}
87+
local
88+
})
89+
.reduce(|| vec![0.0; ncols], |mut a, b| {
90+
for (x, y) in a.iter_mut().zip(b) {
91+
*x += y;
92+
}
93+
a
94+
});
95+
if let Some(ref mut idf_vec) = idf {
96+
for (x, y) in idf_vec.iter_mut().zip(local) {
97+
*x += y;
98+
}
99+
}
100+
n += mat.nrows() as f64;
101+
}
102+
let idf = idf.unwrap_or_default();
103+
if idf.iter().all_equal() {
104+
vec![1.0; idf.len()]
105+
} else {
106+
idf.into_iter()
107+
.map(|mut x| {
108+
if x == 0.0 {
109+
x = 1.0;
110+
} else if x == n {
111+
x = n - 1.0;
112+
}
113+
(n / x).ln()
114+
})
115+
.collect()
116+
}
22117
}

snapatac2-python/src/embedding.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use numpy::{array::PyArrayMethods, PyArray1, PyArray2, PyReadonlyArray1, PyReado
1818
use pyanndata::data::PyArrayData;
1919
use pyo3::prelude::*;
2020
use rand::SeedableRng;
21-
use rayon::prelude::{ParallelBridge, ParallelIterator};
21+
use rayon::{iter::IntoParallelRefIterator, prelude::{ParallelBridge, ParallelIterator}};
2222
use std::ops::Deref;
2323

2424
#[pyfunction]
@@ -293,8 +293,8 @@ where
293293
let mut idf = vec![0.0; iter.peek().unwrap().ncols()];
294294
let mut n = 0.0;
295295
iter.for_each(|mat| {
296-
mat.col_indices().iter().for_each(|i| idf[*i] += 1.0);
297296
n += mat.nrows() as f64;
297+
mat.col_indices().iter().for_each(|i| idf[*i] += 1.0);
298298
});
299299
if idf.iter().all_equal() {
300300
vec![1.0; idf.len()]

snapatac2-python/tests/test_idf/Cargo.toml

Lines changed: 0 additions & 13 deletions
This file was deleted.

snapatac2-python/tests/test_idf/README.md

Lines changed: 0 additions & 53 deletions
This file was deleted.

0 commit comments

Comments
 (0)