Skip to content

Commit 77114a4

Browse files
committed
Merge remote-tracking branch 'origin/main' into ivf-flat-search-udf
2 parents 3f09b32 + ee1d282 commit 77114a4

File tree

8 files changed

+292
-58
lines changed

8 files changed

+292
-58
lines changed

README.md

Lines changed: 1 addition & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -77,40 +77,7 @@ cuVS is built on top of the RAPIDS RAFT library of high performance machine lear
7777

7878
## Installing cuVS
7979

80-
cuVS comes with pre-built packages that can be installed through [conda](https://conda.io/projects/conda/en/latest/user-guide/getting-started.html#managing-python) and [pip](https://pip.pypa.io/en/stable/). Different packages are available for the different languages supported by cuVS:
81-
82-
| Python | C/C++ |
83-
|--------|-----------|
84-
| `cuvs` | `libcuvs` |
85-
86-
### Stable release
87-
88-
It is recommended to use [mamba](https://conda.github.io/conda-libmamba-solver/user-guide/) to install the desired packages. The following command will install the Python package. You can substitute `cuvs` for any of the packages in the table above:
89-
90-
```bash
91-
conda install -c rapidsai -c conda-forge cuvs
92-
```
93-
94-
The cuVS Python package can also be installed through [pip](https://docs.rapids.ai/install#pip>).
95-
96-
```bash
97-
# CUDA 13
98-
pip install cuvs-cu13 --extra-index-url=https://pypi.nvidia.com
99-
100-
# CUDA 12
101-
pip install cuvs-cu12 --extra-index-url=https://pypi.nvidia.com
102-
```
103-
104-
### Nightlies
105-
If installing a version that has not yet been released, the `rapidsai` channel can be replaced with `rapidsai-nightly`:
106-
107-
```bash
108-
# CUDA 13
109-
conda install -c rapidsai-nightly -c conda-forge cuvs=26.04 cuda-version=13.1
110-
111-
# CUDA 12
112-
conda install -c rapidsai-nightly -c conda-forge cuvs=26.04 cuda-version=12.9
113-
```
80+
cuVS comes with pre-built packages that can be installed through [conda](https://conda.io/projects/conda/en/latest/user-guide/getting-started.html#managing-python), [pip](https://pip.pypa.io/en/stable/), or [tarball](https://developer.nvidia.com/cuvs-downloads). Different packages are available for the different languages supported by cuVS.
11481

11582
> [!NOTE]
11683
> If compiled binary size is a concern, please note that the cuVS builds for CUDA 13 are roughly half the size of CUDA 12 builds. This is a result of improved compression rates in the newer supported CUDA drivers. We will be adopting the newer drivers for CUDA 12 builds in Spring of 2026, which will ultimately bring them down to roughly the size of the CUDA 13 builds. In the meantime, the NVIDIA cuVS team is continuing to shave down the binary sizes for all supported CUDA versions. If binary size is an issue for you, please consider linking to cuVS statically either by building from source or using pre-built `libcuvs-static` conda package.

cpp/src/neighbors/detail/epsilon_neighborhood.cuh

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

@@ -27,6 +27,10 @@ struct EpsUnexpL2SqNeighborhood : public BaseClass {
2727

2828
DataT acc[P::AccRowsPerTh][P::AccColsPerTh];
2929

30+
size_t n_blocks_y;
31+
size_t block_x;
32+
size_t block_y;
33+
3034
public:
3135
DI EpsUnexpL2SqNeighborhood(bool* _adj,
3236
IdxT* _vd,
@@ -36,9 +40,17 @@ struct EpsUnexpL2SqNeighborhood : public BaseClass {
3640
IdxT _n,
3741
IdxT _k,
3842
DataT _eps,
39-
char* _smem)
40-
: BaseClass(_x, _y, _m, _n, _k, _smem), adj(_adj), eps(_eps), vd(_vd), smem(_smem)
43+
char* _smem,
44+
size_t _n_blocks_y)
45+
: BaseClass(_x, _y, _m, _n, _k, _smem),
46+
adj(_adj),
47+
eps(_eps),
48+
vd(_vd),
49+
smem(_smem),
50+
n_blocks_y(_n_blocks_y)
4151
{
52+
block_x = static_cast<size_t>(blockIdx.x) / n_blocks_y;
53+
block_y = static_cast<size_t>(blockIdx.x) % n_blocks_y;
4254
}
4355

4456
DI void run()
@@ -51,7 +63,7 @@ struct EpsUnexpL2SqNeighborhood : public BaseClass {
5163
private:
5264
DI void prolog()
5365
{
54-
this->ldgXY(IdxT(blockIdx.x) * P::Mblk, IdxT(blockIdx.y) * P::Nblk, 0);
66+
this->ldgXY(block_x * P::Mblk, block_y * P::Nblk, 0);
5567
#pragma unroll
5668
for (int i = 0; i < P::AccRowsPerTh; ++i) {
5769
#pragma unroll
@@ -67,7 +79,7 @@ struct EpsUnexpL2SqNeighborhood : public BaseClass {
6779
DI void loop()
6880
{
6981
for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) {
70-
this->ldgXY(IdxT(blockIdx.x) * P::Mblk, IdxT(blockIdx.y) * P::Nblk, kidx);
82+
this->ldgXY(block_x * P::Mblk, block_y * P::Nblk, kidx);
7183
accumulate(); // on the previous k-block
7284
this->stsXY();
7385
__syncthreads();
@@ -79,8 +91,8 @@ struct EpsUnexpL2SqNeighborhood : public BaseClass {
7991

8092
DI void epilog()
8193
{
82-
IdxT startx = blockIdx.x * P::Mblk + this->accrowid;
83-
IdxT starty = blockIdx.y * P::Nblk + this->acccolid;
94+
IdxT startx = block_x * P::Mblk + this->accrowid;
95+
IdxT starty = block_y * P::Nblk + this->acccolid;
8496
auto lid = raft::laneId();
8597
IdxT sums[P::AccRowsPerTh];
8698
#pragma unroll
@@ -126,7 +138,7 @@ struct EpsUnexpL2SqNeighborhood : public BaseClass {
126138
__syncthreads(); // so that we can safely reuse smem
127139
int gid = this->accrowid;
128140
int lid = this->acccolid;
129-
auto cidx = IdxT(blockIdx.x) * P::Mblk + gid;
141+
auto cidx = block_x * P::Mblk + gid;
130142
IdxT totalSum = 0;
131143
// update the individual vertex degrees
132144
#pragma unroll
@@ -157,11 +169,18 @@ struct EpsUnexpL2SqNeighborhood : public BaseClass {
157169
}; // struct EpsUnexpL2SqNeighborhood
158170

159171
template <typename DataT, typename IdxT, typename Policy>
160-
__launch_bounds__(Policy::Nthreads, 2) RAFT_KERNEL epsUnexpL2SqNeighKernel(
161-
bool* adj, IdxT* vd, const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k, DataT eps)
172+
__launch_bounds__(Policy::Nthreads, 2) RAFT_KERNEL epsUnexpL2SqNeighKernel(bool* adj,
173+
IdxT* vd,
174+
const DataT* x,
175+
const DataT* y,
176+
IdxT m,
177+
IdxT n,
178+
IdxT k,
179+
DataT eps,
180+
size_t n_blocks_y)
162181
{
163182
extern __shared__ char smem[];
164-
EpsUnexpL2SqNeighborhood<DataT, IdxT, Policy> obj(adj, vd, x, y, m, n, k, eps, smem);
183+
EpsUnexpL2SqNeighborhood<DataT, IdxT, Policy> obj(adj, vd, x, y, m, n, k, eps, smem, n_blocks_y);
165184
obj.run();
166185
}
167186

@@ -177,10 +196,12 @@ void epsUnexpL2SqNeighImpl(bool* adj,
177196
cudaStream_t stream)
178197
{
179198
typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy Policy;
180-
dim3 grid(raft::ceildiv<int>(m, Policy::Mblk), raft::ceildiv<int>(n, Policy::Nblk));
199+
size_t n_blocks_x = raft::ceildiv<size_t>(m, Policy::Mblk);
200+
size_t n_blocks_y = raft::ceildiv<size_t>(n, Policy::Nblk);
201+
dim3 grid(n_blocks_x * n_blocks_y);
181202
dim3 blk(Policy::Nthreads);
182203
epsUnexpL2SqNeighKernel<DataT, IdxT, Policy>
183-
<<<grid, blk, Policy::SmemSize, stream>>>(adj, vd, x, y, m, n, k, eps);
204+
<<<grid, blk, Policy::SmemSize, stream>>>(adj, vd, x, y, m, n, k, eps, n_blocks_y);
184205
RAFT_CUDA_TRY(cudaGetLastError());
185206
}
186207

cpp/tests/neighbors/epsilon_neighborhood.cu

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

@@ -10,6 +10,7 @@
1010
#include <raft/core/device_mdspan.hpp>
1111
#include <raft/core/host_mdspan.hpp>
1212
#include <raft/core/resource/cuda_stream.hpp>
13+
#include <raft/matrix/init.cuh>
1314
#include <raft/random/make_blobs.cuh>
1415
#include <raft/sparse/convert/csr.cuh>
1516
#include <raft/util/cudart_utils.hpp>
@@ -419,4 +420,41 @@ TEST_P(EpsNeighRbcTestFI, SparseRbcMaxK)
419420

420421
INSTANTIATE_TEST_CASE_P(EpsNeighTests, EpsNeighRbcTestFI, ::testing::ValuesIn(inputsfi_rbc));
421422

423+
TEST(EpsNeighborhood, LargeNDimension)
424+
{
425+
// n just past the grid.y=65535 limit for Nblk=16
426+
int64_t m = 1, n = 65536 * 16 + 1, k = 4;
427+
float eps = 1e10f; // large enough that everything is a neighbor
428+
429+
raft::resources handle;
430+
auto x = raft::make_device_matrix<float, int64_t>(handle, m, k);
431+
auto y = raft::make_device_matrix<float, int64_t>(handle, n, k);
432+
auto adj = raft::make_device_matrix<bool, int64_t>(handle, m, n);
433+
auto vd = raft::make_device_vector<int64_t, int64_t>(handle, m + 1);
434+
435+
// fill x, y with zeros (every pair has distance 0 < eps)
436+
raft::matrix::fill(handle, x.view(), 0.0f);
437+
raft::matrix::fill(handle, y.view(), 0.0f);
438+
439+
cuvs::neighbors::epsilon_neighborhood::compute(handle,
440+
raft::make_const_mdspan(x.view()),
441+
raft::make_const_mdspan(y.view()),
442+
adj.view(),
443+
vd.view(),
444+
eps,
445+
cuvs::distance::DistanceType::L2Unexpanded);
446+
447+
// Verify: with distance=0 and huge eps, every entry in adj should be true
448+
// and vd[0] should equal n
449+
auto adj_expected = raft::make_device_matrix<bool, int64_t>(handle, m, n);
450+
raft::matrix::fill(handle, adj_expected.view(), true);
451+
auto stream = raft::resource::get_cuda_stream(handle);
452+
ASSERT_TRUE(cuvs::devArrMatch(
453+
adj_expected.data_handle(), adj.data_handle(), m * n, cuvs::Compare<bool>(), stream));
454+
455+
int64_t expected_vd0 = n;
456+
ASSERT_TRUE(
457+
cuvs::devArrMatch(&expected_vd0, vd.data_handle(), 1, cuvs::Compare<int64_t>(), stream));
458+
}
459+
422460
}; // namespace cuvs::neighbors::epsilon_neighborhood

docs/source/build.rst

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ The cuVS software development kit provides APIs for C, C++, Python, and Rust lan
99

1010
* `Python through Pip`_
1111

12+
* `Tarball`_
13+
1214
- `Build from source`_
1315

1416
* `Prerequisites`_
@@ -77,12 +79,42 @@ The cuVS Python package can also be `installed through pip <https://docs.rapids.
7779
7880
Note: these packages statically link the C and C++ libraries so the `libcuvs` and `libcuvs_c` shared libraries won't be readily available to use in your code.
7981

82+
Tarball
83+
^^^^^^^
84+
85+
Install Dependencies
86+
~~~~~~~~~~~~~~~~~~~~
87+
88+
1. `NCCL <https://docs.nvidia.com/deeplearning/nccl/install-guide/index.html>`_
89+
2. `libopenmp`
90+
3. CUDA Toolkit Runtime 12.2+
91+
4. Ampere architecture or better (compute capability >= 8.0)
92+
93+
Download & Extract
94+
~~~~~~~~~~~~~~~~~~
95+
96+
Download the pre-built tarball for your CPU architecture and CUDA version from
97+
`https://developer.nvidia.com/cuvs-downloads <https://developer.nvidia.com/cuvs-downloads>`_
98+
99+
Untar the tarball into a directory.
100+
101+
.. code-block:: bash
102+
103+
tar -xzvf libcuvs-linux-sbsa-26.02.00.189485_cuda12-archive.tar.xz -C /path/to/folder
104+
105+
106+
Add cuVS to your system library load path. This should be done in the appropriate profile configuration (e.g. `.bashrc`, `.bash_profile`) to maintain the setting across sessions.
107+
108+
.. code-block:: bash
109+
110+
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/folder
111+
112+
80113
Build from source
81114
-----------------
82115

83116
The core cuVS source code is written in C++ and wrapped through a C API. The C API is wrapped around the C++ APIs and the other supported languages are built around the C API.
84117

85-
86118
Prerequisites
87119
^^^^^^^^^^^^^
88120

rust/cuvs/src/brute_force.rs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55
//! Brute Force KNN
@@ -62,7 +62,7 @@ impl Index {
6262
/// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors
6363
/// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors
6464
pub fn search(
65-
self,
65+
&self,
6666
res: &Resources,
6767
queries: &ManagedTensor,
6868
neighbors: &ManagedTensor,
@@ -89,7 +89,7 @@ impl Index {
8989
impl Drop for Index {
9090
fn drop(&mut self) {
9191
if let Err(e) = check_cuvs(unsafe { ffi::cuvsBruteForceIndexDestroy(self.0) }) {
92-
write!(stderr(), "failed to call cagraIndexDestroy {:?}", e)
92+
write!(stderr(), "failed to call bruteForceIndexDestroy {:?}", e)
9393
.expect("failed to write to stderr");
9494
}
9595
}
@@ -172,4 +172,11 @@ mod tests {
172172
fn test_l2() {
173173
test_bfknn(DistanceType::L2Expanded);
174174
}
175+
176+
// NOTE: brute_force multiple-search test is omitted here because the C++
177+
// brute_force::index stores a non-owning view into the dataset. Building
178+
// from device data via `build()` drops the ManagedTensor after the call,
179+
// leaving a dangling pointer. A follow-up PR will add dataset lifetime
180+
// enforcement (DatasetOwnership<'a>) to make this safe.
181+
// See: https://github.com/rapidsai/cuvs/issues/1838
175182
}

rust/cuvs/src/cagra/index.rs

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

@@ -59,7 +59,7 @@ impl Index {
5959
/// * `neighbors` - Matrix in device memory that receives the indices of the nearest neighbors
6060
/// * `distances` - Matrix in device memory that receives the distances of the nearest neighbors
6161
pub fn search(
62-
self,
62+
&self,
6363
res: &Resources,
6464
params: &SearchParams,
6565
queries: &ManagedTensor,
@@ -167,4 +167,59 @@ mod tests {
167167
.set_compression(CompressionParams::new().unwrap());
168168
test_cagra(build_params);
169169
}
170+
171+
/// Test that an index can be searched multiple times without rebuilding.
172+
/// This validates that search() takes &self instead of self.
173+
#[test]
174+
fn test_cagra_multiple_searches() {
175+
let res = Resources::new().unwrap();
176+
let build_params = IndexParams::new().unwrap();
177+
178+
// Create a random dataset
179+
let n_datapoints = 256;
180+
let n_features = 16;
181+
let dataset =
182+
ndarray::Array::<f32, _>::random((n_datapoints, n_features), Uniform::new(0., 1.0));
183+
184+
// Build the index once
185+
let index =
186+
Index::build(&res, &build_params, &dataset).expect("failed to create cagra index");
187+
188+
let search_params = SearchParams::new().unwrap();
189+
let k = 5;
190+
191+
// Perform multiple searches on the same index
192+
for search_iter in 0..3 {
193+
let n_queries = 4;
194+
let queries = dataset.slice(s![0..n_queries, ..]);
195+
let queries = ManagedTensor::from(&queries).to_device(&res).unwrap();
196+
197+
let mut neighbors_host = ndarray::Array::<u32, _>::zeros((n_queries, k));
198+
let neighbors = ManagedTensor::from(&neighbors_host)
199+
.to_device(&res)
200+
.unwrap();
201+
202+
let mut distances_host = ndarray::Array::<f32, _>::zeros((n_queries, k));
203+
let distances = ManagedTensor::from(&distances_host)
204+
.to_device(&res)
205+
.unwrap();
206+
207+
// This should work on every iteration because search() takes &self
208+
index
209+
.search(&res, &search_params, &queries, &neighbors, &distances)
210+
.expect(&format!("search iteration {} failed", search_iter));
211+
212+
// Copy back to host memory
213+
distances.to_host(&res, &mut distances_host).unwrap();
214+
neighbors.to_host(&res, &mut neighbors_host).unwrap();
215+
216+
// Verify results are consistent across searches
217+
assert_eq!(
218+
neighbors_host[[0, 0]],
219+
0,
220+
"iteration {}: first query should find itself",
221+
search_iter
222+
);
223+
}
224+
}
170225
}

0 commit comments

Comments
 (0)