Skip to content

Commit f10086f

Browse files
committed
robust hash insertion
1 parent cfcafa9 commit f10086f

File tree

2 files changed

+207
-67
lines changed

2 files changed

+207
-67
lines changed

tests/nn/test_sparse_generative_features.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
offsets_from_batch_index,
1111
)
1212
from warpconvnet.geometry.coords.ops.serialization import POINT_ORDERING, encode
13+
from warpconvnet.geometry.coords.ops.expand import expand_coords
1314
from warpconvnet.geometry.coords.ops.stride import stride_coords
1415
from warpconvnet.geometry.types.voxels import Voxels
1516

@@ -309,3 +310,131 @@ def test_generate_output_coords_transposed_generative(toy_voxels):
309310
# Kernel map should have valid structure
310311
assert kernel_map is not None
311312
assert len(kernel_map) > 0
313+
314+
315+
def test_large_scale_expand_uniqueness():
    """Ensure expand_coords does not produce duplicates on large input."""
    torch.manual_seed(0)
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")
    device = torch.device("cuda")

    # Build a large, dense, batch-indexed coordinate set: each row is
    # [batch_index, x, y, z] with coordinates drawn uniformly at random.
    total_points = 1000000
    coords_range = 60

    num_batches = 4
    points_per_batch = total_points // num_batches
    per_batch_rows = []
    for batch_idx in range(num_batches):
        # Random spatial coordinates for this batch
        spatial = torch.randint(
            -coords_range, coords_range, (points_per_batch, 3), device=device, dtype=torch.int32
        )
        batch_col = torch.full((points_per_batch, 1), batch_idx, device=device, dtype=torch.int32)
        per_batch_rows.append(torch.cat([batch_col, spatial], dim=1))

    batch_indexed_coords = torch.cat(per_batch_rows, dim=0)
    # Deduplicate the input so output uniqueness is a meaningful property.
    batch_indexed_coords = torch.unique(batch_indexed_coords, dim=0)

    kernel_size = (3, 3, 3)
    dilation = (1, 1, 1)

    out_coords, out_offsets = expand_coords(
        batch_indexed_coords, kernel_size=kernel_size, kernel_dilation=dilation
    )

    # The expanded output must contain no repeated coordinate rows.
    unique_out, counts = torch.unique(out_coords, dim=0, return_counts=True)
    if unique_out.shape[0] != out_coords.shape[0]:
        num_duplicates = out_coords.shape[0] - unique_out.shape[0]
        duplicate_examples = unique_out[counts > 1]
        max_dups = counts.max().item()
        pytest.fail(
            f"Found {num_duplicates} duplicate coordinates in expanded output. "
            f"Total: {out_coords.shape[0]}, Unique: {unique_out.shape[0]}. "
            f"Max duplicates for a single coord: {max_dups}. "
            f"Example duplicates: {duplicate_examples[:5]}"
        )
361+
362+
363+
def test_large_scale_transposed_generative_duplicates():
    """
    Reproduce duplicate coordinates issue with SpatiallySparseConv
    configured as transposed=True, generative=True, stride=(2,2,2).
    """
    torch.manual_seed(0)
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")
    device = torch.device("cuda")

    # Build a large, dense coordinate set.
    # N needs to be large enough to cause hash collisions or stress the table resizing
    N = 1000000
    coords_range = 50  # Very dense

    num_batches = 1  # Single batch to focus on collisions within one set
    rows = []
    for batch_idx in range(num_batches):
        spatial = torch.randint(
            -coords_range, coords_range, (N // num_batches, 3), device=device, dtype=torch.int32
        )
        # Ensure uniqueness within batch for valid input
        spatial = torch.unique(spatial, dim=0)
        batch_col = torch.full((spatial.shape[0], 1), batch_idx, device=device, dtype=torch.int32)
        rows.append(torch.cat([batch_col, spatial], dim=1))

    batch_indexed_coords = torch.cat(rows, dim=0)

    # Features are required for the forward pass even though only the
    # output coordinates are under test.
    features = torch.randn(batch_indexed_coords.shape[0], 16, device=device)

    # Split back into per-batch lists for the Voxels constructor.
    coords_per_batch = []
    feats_per_batch = []
    for batch_idx in range(num_batches):
        in_batch = batch_indexed_coords[:, 0] == batch_idx
        coords_per_batch.append(batch_indexed_coords[in_batch, 1:])
        feats_per_batch.append(features[in_batch])

    voxels = Voxels(
        batched_coordinates=coords_per_batch, batched_features=feats_per_batch, device=device
    )

    # The problematic configuration: transposed + generative + stride 2.
    in_channels = 16
    out_channels = 16
    kernel_size = (3, 3, 3)
    stride = (2, 2, 2)

    conv = SpatiallySparseConv(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        stride=stride,
        generative=True,
        transposed=True,
    ).to(device)

    out_voxels = conv(voxels)

    # Output coordinates must be unique.
    out_coords = out_voxels.batch_indexed_coordinates
    unique_out, counts = torch.unique(out_coords, dim=0, return_counts=True)

    if unique_out.shape[0] != out_coords.shape[0]:
        num_duplicates = out_coords.shape[0] - unique_out.shape[0]
        max_dups = counts.max().item()
        duplicate_examples = unique_out[counts > 1][:5]

        pytest.fail(
            f"Found {num_duplicates} duplicate coordinates in output.\n"
            f"Total output coords: {out_coords.shape[0]}\n"
            f"Unique output coords: {unique_out.shape[0]}\n"
            f"Max duplicates for a single coord: {max_dups}\n"
            f"Examples: {duplicate_examples}"
        )

warpconvnet/csrc/hashmap_kernels.cu

Lines changed: 78 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,8 @@ struct MurmurHash {
120120
// --- Vector Comparison ---
121121
// a, b: pointers to the start of the vectors
122122
// dim: dimension of the vectors
123-
__device__ inline bool vec_equal(const int* a, const int* b, int dim) {
123+
template <typename T>
124+
__device__ inline bool vec_equal(const T* a, const int* b, int dim) {
124125
for (int i = 0; i < dim; ++i) {
125126
if (a[i] != b[i]) {
126127
return false;
@@ -142,63 +143,93 @@ __device__ inline void set_expand_status(int* status_ptr, ExpandStatus new_statu
142143
atomicCAS(status_ptr, kExpandStatusSuccess, static_cast<int>(new_status));
143144
}
144145

146+
// --- Helper for finding or claiming a slot ---
// Linearly probes the open-addressing table for `key`. Outcomes:
//   * claims an empty slot via atomicCAS and returns its index; the caller
//     owns the slot and must publish the value field (*found_existing false);
//   * the key is already present: returns -1 with *found_existing == true;
//   * the probe wraps around without success: returns -1 with
//     *found_existing == false (table full).
// table_kvs layout: two ints per slot — [key marker, vector index]. A key
// marker of -1 means "empty"; a negative vector index means "claimed but
// value not yet published" (see insert_candidate_if_absent).
template <typename HashFuncT>
__device__ inline int claim_slot_or_find(int* table_kvs,
                                         const int* vector_keys,
                                         const int* key,
                                         int key_dim,
                                         int capacity,
                                         bool* found_existing) {
  int slot = HashFuncT::hash(key, key_dim, capacity);
  int initial_slot = slot;
  int attempts = 0;
  *found_existing = false;

  while (attempts < capacity) {
    int* slot_address = &table_kvs[slot * 2];
    // Atomically claim the slot if its key marker is still -1 (empty).
    int prev = atomicCAS(slot_address, -1, slot);

    if (prev == -1) {
      return slot;  // Successfully claimed
    }

    // Slot occupied (or reserved by another thread just now)
    // Use volatile to ensure we read the latest value from memory
    volatile int* slot_value_ptr = &table_kvs[slot * 2 + 1];
    int vector_index = *slot_value_ptr;

    if (vector_index < 0) {
      // Another thread is writing to this slot. Retry without advancing.
      // NOTE(review): this spin assumes the claimant always publishes the
      // index or rolls the slot back; confirm no code path leaves a slot
      // permanently reserved, or this loop never terminates.
      continue;
    }

    // Published slot: compare the stored key against ours.
    const volatile int* existing_key = &vector_keys[vector_index * key_dim];
    if (vec_equal(existing_key, key, key_dim)) {
      *found_existing = true;
      return -1;  // Key found
    }

    // Collision with different key
    slot = (slot + 1) % capacity;
    if (slot == initial_slot) {
      break;
    }
    attempts++;
  }
  return -1;  // Table full or not found (and couldn't claim)
}
192+
193+
// Inserts candidate_key into the hash table and appends it to vector_keys
// unless an equal key is already present. On table exhaustion or vector
// overflow, records the failure via set_expand_status instead of inserting.
// table_kvs layout: two ints per slot — [key marker, vector index]; the
// vector index stays negative until the key payload is fully written.
template <typename HashFuncT>
__device__ inline void insert_candidate_if_absent(int* table_kvs,
                                                  int* vector_keys,
                                                  const int* candidate_key,
                                                  int key_dim,
                                                  int table_capacity,
                                                  int vector_capacity,
                                                  int* num_entries_ptr,
                                                  int* status_ptr) {
  bool found = false;
  int slot = claim_slot_or_find<HashFuncT>(
      table_kvs, vector_keys, candidate_key, key_dim, table_capacity, &found);

  if (found) {
    return;  // Already present
  }

  if (slot == -1) {
    // Probe wrapped around without claiming a slot: table full.
    set_expand_status(status_ptr, kExpandStatusTableFull);
    return;
  }

  // Slot reserved; allocate a row in vector_keys for the new key.
  int new_index = atomicAdd(num_entries_ptr, 1);
  if (new_index >= vector_capacity) {
    // Roll back the reservation and flag overflow.
    // Ordering is critical: clear the value field FIRST, fence, then release
    // the key marker. Releasing the key marker first would let another thread
    // re-claim this slot and publish a valid index which we would then
    // clobber with -1, corrupting the table (duplicate/lost entries).
    table_kvs[slot * 2 + 1] = -1;
    __threadfence();
    atomicExch(&table_kvs[slot * 2], -1);
    set_expand_status(status_ptr, kExpandStatusVectorOverflow);
    return;
  }

  // Copy the key payload before publishing its index.
  int* dst = &vector_keys[new_index * key_dim];
  for (int d = 0; d < key_dim; ++d) {
    dst[d] = candidate_key[d];
  }

  // Fence so the key bytes are globally visible before the index that points
  // at them; concurrent probers spin while the value field is negative.
  __threadfence();
  table_kvs[slot * 2 + 1] = new_index;
}
203234

204235
// --- Device Function for Hash Table Search ---
@@ -271,40 +302,20 @@ __global__ void insert_kernel_templated(
271302
}
272303

273304
const int* key_to_insert = &vector_keys[idx * key_dim];
274-
// Use the templated hash function directly
275-
int slot = HashFuncT::hash(key_to_insert, key_dim, table_capacity);
276-
int initial_slot = slot;
277-
int attempts = 0;
278-
279-
while (attempts < table_capacity) {
280-
int* slot_address = &table_kvs[slot * 2];
281-
// Store the *original index* (idx) in the compare field, not the slot.
282-
// This prevents overwriting if two different keys hash to the same slot initially.
283-
// We are essentially using the first element of the pair to *reserve* the slot
284-
// via atomicCAS, and the second to store the value (original index).
285-
// We store the actual index idx+1 temporarily to distinguish from initial -1.
286-
// Let's refine this: Store 'slot' in compare field as originally, seems simpler.
287-
int prev = atomicCAS(slot_address, -1, slot); // Try to claim the slot marker
288-
289-
if (prev == -1) {
290-
// Slot claimed successfully, now store the actual value index
291-
table_kvs[slot * 2 + 1] = idx;
292-
// Optional: store the actual hash value in table_kvs[slot*2 + 0] = slot;
293-
// Already done by atomicCAS if successful.
294-
return;
295-
}
305+
bool found = false;
306+
int slot = claim_slot_or_find<HashFuncT>(
307+
table_kvs, vector_keys, key_to_insert, key_dim, table_capacity, &found);
296308

297-
// Collision or slot already claimed
298-
slot = (slot + 1) % table_capacity;
309+
if (found) {
310+
return; // Already present (deduplication)
311+
}
299312

300-
if (slot == initial_slot) {
301-
// Table is full or couldn't find an empty slot after full circle
302-
// Consider adding a mechanism to signal failure if needed.
303-
return;
304-
}
305-
attempts++;
313+
if (slot != -1) {
314+
// Claimed successfully, store the index
315+
table_kvs[slot * 2 + 1] = idx;
306316
}
307-
// Exceeded attempts (should only happen if table is pathologically full)
317+
// If slot == -1 and !found, table is full (fail silently as per original logic, or could add
318+
// error handling)
308319
}
309320

310321
// --- Templated Search Kernel ---

0 commit comments

Comments
 (0)