Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions include/fbgemm/UtilsAvx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
// flags.

#include <cstdint>
#include <memory>
#include <string>

namespace fbgemm {
Expand Down Expand Up @@ -89,4 +90,22 @@ fbgemmAlignedAlloc(size_t align, size_t size, bool raiseException = false);
*/
FBGEMM_API void fbgemmAlignedFree(void* p);

/**
 * @brief RAII deleter for memory obtained from fbgemmAlignedAlloc.
 *
 * Intended as the deleter type of a std::unique_ptr so that aligned
 * allocations are released automatically when the owner goes out of scope.
 */
struct AlignedFreeDeleter {
  // noexcept: std::unique_ptr's destructor is noexcept, so a throwing deleter
  // would call std::terminate anyway; fbgemmAlignedFree is presumed
  // non-throwing (C-style free of an aligned block).
  void operator()(void* p) const noexcept {
    fbgemmAlignedFree(p);
  }
};

// Owning pointer to a T[] allocated via fbgemmAlignedAlloc; releases the
// buffer through fbgemmAlignedFree (AlignedFreeDeleter) on destruction.
template <typename T>
using aligned_unique_ptr = std::unique_ptr<T[], AlignedFreeDeleter>;

/**
 * @brief Allocates an array of `count` elements of T aligned to `align` bytes
 *        and wraps it in an RAII owner.
 *
 * @param align alignment in bytes, forwarded to fbgemmAlignedAlloc.
 * @param count number of elements of type T to allocate.
 * @return aligned_unique_ptr<T> owning the buffer. Holds nullptr if the
 *         allocation fails: fbgemmAlignedAlloc is called with its default
 *         raiseException = false behavior, so no exception is thrown here.
 *
 * NOTE(review): the elements are presumably NOT value-initialized
 * (malloc-style allocation) — callers should write before reading, exactly as
 * with the raw fbgemmAlignedAlloc calls this helper replaces.
 */
template <typename T>
[[nodiscard]] aligned_unique_ptr<T> makeAlignedUniquePtr(
    size_t align,
    size_t count) {
  return aligned_unique_ptr<T>(
      static_cast<T*>(fbgemmAlignedAlloc(align, count * sizeof(T))));
}

} // namespace fbgemm
13 changes: 6 additions & 7 deletions src/FbgemmI8Depthwise2DAvx2-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,9 @@ static ALWAYS_INLINE void depthwise_2d_(
int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1;
const std::int8_t* Bp = B.PackedMat();

int32_t* row_offsets = static_cast<int32_t*>(
fbgemmAlignedAlloc(64, (IC + 31) / 32 * 32 * sizeof(int32_t)));
auto row_offsets_owner =
makeAlignedUniquePtr<int32_t>(64, (IC + 31) / 32 * 32);
int32_t* row_offsets = row_offsets_owner.get();

int64_t n_begin = 0, n_end = 0, h_begin = 0, h_end = 0, w_begin = 0,
w_end = 0;
Expand Down Expand Up @@ -487,8 +488,6 @@ static ALWAYS_INLINE void depthwise_2d_(
}
}
} // for each n

fbgemmAlignedFree(row_offsets);
}

// Dispatch A_SYMMETRIC and B_SYMMETRIC
Expand Down Expand Up @@ -518,8 +517,9 @@ static void depthwise_2d_(
const float* act_times_w_scale,
int thread_id,
int num_threads) {
int32_t* C_int32_temp = static_cast<int32_t*>(
fbgemmAlignedAlloc(64, (OC + 31) / 32 * 32 * sizeof(int32_t)));
auto C_int32_temp_owner =
makeAlignedUniquePtr<int32_t>(64, (OC + 31) / 32 * 32);
int32_t* C_int32_temp = C_int32_temp_owner.get();
if (A_zero_point == 0 || col_offsets == nullptr) {
if (Q_GRAN == QuantizationGranularity::TENSOR && B_zero_point[0] == 0) {
depthwise_2d_<
Expand Down Expand Up @@ -637,7 +637,6 @@ static void depthwise_2d_(
num_threads);
}
}
fbgemmAlignedFree(C_int32_temp);
}

// Dispatch HAS_BIAS
Expand Down
12 changes: 6 additions & 6 deletions src/FbgemmI8Depthwise3DAvx2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,9 @@ static ALWAYS_INLINE void depthwise_3d_same_pad_(
int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1;
const int8_t* Bp = B.PackedMat();

int32_t* row_offsets = static_cast<int32_t*>(
fbgemmAlignedAlloc(64, (IC + 31) / 32 * 32 * sizeof(int32_t)));
auto row_offsets_owner =
makeAlignedUniquePtr<int32_t>(64, (IC + 31) / 32 * 32);
int32_t* row_offsets = row_offsets_owner.get();

int64_t n_begin = 0, n_end = 0, t_begin = 0, t_end = 0, h_begin = 0,
h_end = 0;
Expand Down Expand Up @@ -779,7 +780,6 @@ static ALWAYS_INLINE void depthwise_3d_same_pad_(
} // h
} // t
} // for each n
fbgemmAlignedFree(row_offsets);
}

// Dispatch A_SYMMETRIC and B_SYMMETRIC
Expand All @@ -802,8 +802,9 @@ static void depthwise_3d_same_pad_(
const float* act_times_w_scale,
int thread_id,
int num_threads) {
int32_t* C_int32_temp = static_cast<int32_t*>(
fbgemmAlignedAlloc(64, (conv_p.OC + 31) / 32 * 32 * sizeof(int32_t)));
auto C_int32_temp_owner =
makeAlignedUniquePtr<int32_t>(64, (conv_p.OC + 31) / 32 * 32);
int32_t* C_int32_temp = C_int32_temp_owner.get();
if (A_zero_point == 0 || col_offsets == nullptr) {
if (Q_GRAN == QuantizationGranularity::TENSOR && B_zero_point[0] == 0) {
depthwise_3d_same_pad_<
Expand Down Expand Up @@ -893,7 +894,6 @@ static void depthwise_3d_same_pad_(
num_threads);
}
}
fbgemmAlignedFree(C_int32_temp);
}

// Dispatch HAS_BIAS
Expand Down
26 changes: 8 additions & 18 deletions src/FbgemmI8Spmdm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@ void CompressedSparseColumn::SpMDM(
// If NNZ/K is small, it's not worth doing transpose so we just use this
// scalar loop.
#ifdef _MSC_VER
int32_t* C_temp = static_cast<int32_t*>(
fbgemmAlignedAlloc(64, block.row_size * sizeof(int32_t)));
auto C_temp_owner = makeAlignedUniquePtr<int32_t>(64, block.row_size);
int32_t* C_temp = C_temp_owner.get();
#else
int32_t C_temp[block.row_size];
#endif
Expand Down Expand Up @@ -141,9 +141,6 @@ void CompressedSparseColumn::SpMDM(
}
} // for each column of B
}
#ifdef _MSC_VER
fbgemmAlignedFree(C_temp);
#endif
return;
}

Expand All @@ -165,10 +162,10 @@ void CompressedSparseColumn::SpMDM(
// dynamically allocated memory for MSVC even though dynamically allocated
// memory works for all compilers.
#ifdef _MSC_VER
uint8_t* A_buffer =
static_cast<uint8_t*>(fbgemmAlignedAlloc(64, K * 32 * sizeof(uint8_t)));
int32_t* C_buffer =
static_cast<int32_t*>(fbgemmAlignedAlloc(64, N * 32 * sizeof(int32_t)));
auto A_buffer_owner = makeAlignedUniquePtr<uint8_t>(64, K * 32);
auto C_buffer_owner = makeAlignedUniquePtr<int32_t>(64, N * 32);
uint8_t* A_buffer = A_buffer_owner.get();
int32_t* C_buffer = C_buffer_owner.get();
#else
alignas(64) uint8_t A_buffer[K * 32];
alignas(64) int32_t C_buffer[N * 32];
Expand All @@ -180,8 +177,8 @@ void CompressedSparseColumn::SpMDM(
// Transpose 32 x K submatrix of A
if (i_end - i1 < 32) {
#ifdef _MSC_VER
uint8_t* A_temp_buffer = static_cast<uint8_t*>(
fbgemmAlignedAlloc(64, K * 32 * sizeof(uint8_t)));
auto A_temp_buffer_owner = makeAlignedUniquePtr<uint8_t>(64, K * 32);
uint8_t* A_temp_buffer = A_temp_buffer_owner.get();
#else
alignas(64) uint8_t A_temp_buffer[K * 32];
#endif
Expand All @@ -200,9 +197,6 @@ void CompressedSparseColumn::SpMDM(
for (int i2 = (i_end - i1) / 8 * 8; i2 < 32; i2 += 8) {
transpose_8rows(K, A_temp_buffer + i2 * K, K, A_buffer + i2, 32);
}
#ifdef _MSC_VER
fbgemmAlignedFree(A_temp_buffer);
#endif
} else {
for (int i2 = 0; i2 < 32; i2 += 8) {
transpose_8rows(K, A + (i1 + i2) * lda, lda, A_buffer + i2, 32);
Expand Down Expand Up @@ -280,10 +274,6 @@ void CompressedSparseColumn::SpMDM(
spmdm_run_time += (dt);
t_start = std::chrono::high_resolution_clock::now();
#endif
#ifdef _MSC_VER
fbgemmAlignedFree(A_buffer);
fbgemmAlignedFree(C_buffer);
#endif

#endif // __aarch64__
}
Expand Down
10 changes: 5 additions & 5 deletions src/PackAWithQuantRowOffset.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "./OptimizedKernelsAvx2.h" // @manual
#include "fbgemm/Fbgemm.h"
#include "fbgemm/QuantUtils.h"
#include "fbgemm/UtilsAvx2.h"

namespace fbgemm {

Expand Down Expand Up @@ -147,10 +148,12 @@ void PackAWithQuantRowOffset<T, accT>::pack(
(block.col_start % (this->numCols() / this->numGroups())) != 0;
int32_t* row_offset_buf = getRowOffsetBuffer();

aligned_unique_ptr<float> smat_transposed_owner;
float* smat_transposed = nullptr;
if (tr) {
smat_transposed = static_cast<float*>(fbgemmAlignedAlloc(
64, block.row_size * block.col_size * sizeof(float)));
smat_transposed_owner =
makeAlignedUniquePtr<float>(64, block.row_size * block.col_size);
smat_transposed = smat_transposed_owner.get();
transpose_simd(
block.col_size,
block.row_size,
Expand Down Expand Up @@ -197,9 +200,6 @@ void PackAWithQuantRowOffset<T, accT>::pack(
out[i * BaseType::blockColSize() + j] = 0;
}
}
if (smat_transposed) {
fbgemmAlignedFree(smat_transposed);
}
}

template <typename T, typename accT>
Expand Down
23 changes: 11 additions & 12 deletions src/PackDepthwiseConvMatrixAvx2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
: OC_(OC), kernel_prod_(kernel_prod) {
// The input is in OC T R S layout.
// Transpose the input matrix to make packing faster.
int8_t* smat_transposed = static_cast<int8_t*>(
fbgemmAlignedAlloc(64, OC * kernel_prod * sizeof(int8_t)));
auto smat_transposed_owner =
makeAlignedUniquePtr<int8_t>(64, OC * kernel_prod);
int8_t* smat_transposed = smat_transposed_owner.get();
for (int i = 0; i < kernel_prod; ++i) {
for (int j = 0; j < OC; ++j) {
smat_transposed[i * OC + j] = smat[i + j * kernel_prod];
Expand Down Expand Up @@ -93,12 +94,14 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
// (28, 8), (28, 9), (28, 10), zero, ..., (31, 8), (31, 9), (31, 10), zero

// Allocate buffers
auto b_v = static_cast<__m256i*>(
fbgemmAlignedAlloc(64, kernel_prod * sizeof(__m256i)));
auto b_interleaved_epi16 = static_cast<__m256i*>(
fbgemmAlignedAlloc(64, kernel_prod_aligned * sizeof(__m256i)));
auto b_interleaved_epi32 = static_cast<__m256i*>(
fbgemmAlignedAlloc(64, kernel_prod_aligned * sizeof(__m256i)));
auto b_v_owner = makeAlignedUniquePtr<__m256i>(64, kernel_prod);
auto b_v = b_v_owner.get();
auto b_interleaved_epi16_owner =
makeAlignedUniquePtr<__m256i>(64, kernel_prod_aligned);
auto b_interleaved_epi16 = b_interleaved_epi16_owner.get();
auto b_interleaved_epi32_owner =
makeAlignedUniquePtr<__m256i>(64, kernel_prod_aligned);
auto b_interleaved_epi32 = b_interleaved_epi32_owner.get();
for (int k1 = 0; k1 < OC; k1 += 32) {
int remainder = OC - k1;
if (remainder < 32) {
Expand Down Expand Up @@ -154,10 +157,6 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
b_interleaved_epi32[i]);
}
}
fbgemmAlignedFree(b_v);
fbgemmAlignedFree(b_interleaved_epi16);
fbgemmAlignedFree(b_interleaved_epi32);
fbgemmAlignedFree(smat_transposed);
}

int PackedDepthWiseConvMatrix::addr(int r, int c) {
Expand Down
12 changes: 6 additions & 6 deletions src/PackWeightsForDirectConv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -257,10 +257,12 @@ void fbgemmDirectConv(
fn = codeObj.getOrCreateDirectConvTrans<inst_set_t::avx2>(
true, conv_p.stride[1], conv_p.K[1]);

int32_t* inSum = static_cast<int32_t*>(fbgemmAlignedAlloc(
64, conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * sizeof(int32_t)));
int32_t* rowSum = static_cast<int32_t*>(fbgemmAlignedAlloc(
64, conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * sizeof(int32_t)));
auto inSum_owner = makeAlignedUniquePtr<int32_t>(
64, conv_p.IN_DIM[0] * conv_p.IN_DIM[1]);
int32_t* inSum = inSum_owner.get();
auto rowSum_owner = makeAlignedUniquePtr<int32_t>(
64, conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1]);
int32_t* rowSum = rowSum_owner.get();

directConvRowSum(conv_p, Aint8, inSum, rowSum);
int kernel_dim = conv_p.K[0] * conv_p.K[1];
Expand Down Expand Up @@ -450,8 +452,6 @@ void fbgemmDirectConv(
}
}
}
fbgemmAlignedFree(inSum);
fbgemmAlignedFree(rowSum);
} // transposed conv
else { // non-transposed conv
assert(false && "non-transposed direct conv not integrated yet.");
Expand Down
20 changes: 9 additions & 11 deletions src/QuantUtilsAvx2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "fbgemm/FbgemmConvert.h"
#include "fbgemm/FloatConversion.h"
#include "fbgemm/Types.h"
#include "fbgemm/UtilsAvx2.h"

namespace fbgemm {

Expand Down Expand Up @@ -1599,12 +1600,14 @@ void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfAvx2(
(input_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE +
2 * sizeof(std::uint16_t);

aligned_unique_ptr<float> input_row_float_for_fp16_owner;
float* input_row_float_for_fp16 = nullptr;
float min_max_row_float_for_fp16[kRowwiseMinMaxNumCols];
const auto is_valid_rowwise_min_max = (rowwise_min_max != nullptr);
if constexpr (std::is_same_v<InputType, float16>) {
input_row_float_for_fp16 = static_cast<float*>(
fbgemmAlignedAlloc(64, input_columns * sizeof(float)));
input_row_float_for_fp16_owner =
makeAlignedUniquePtr<float>(64, input_columns);
input_row_float_for_fp16 = input_row_float_for_fp16_owner.get();
}

for (size_t row = 0; row < input_rows; ++row) {
Expand Down Expand Up @@ -1794,10 +1797,6 @@ void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfAvx2(
}
}
} // for each row

if constexpr (std::is_same_v<InputType, float16>) {
fbgemmAlignedFree(input_row_float_for_fp16);
}
}

template <typename InputType>
Expand All @@ -1824,12 +1823,14 @@ void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatAvx2(
_mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00);

const int64_t output_columns = input_columns + 2 * sizeof(float);
aligned_unique_ptr<float> input_row_float_for_fp16_owner;
float* input_row_float_for_fp16 = nullptr;
float min_max_row_float_for_fp16[kRowwiseMinMaxNumCols];
const auto is_valid_rowwise_min_max = (rowwise_min_max != nullptr);
if constexpr (std::is_same_v<InputType, float16>) {
input_row_float_for_fp16 = static_cast<float*>(
fbgemmAlignedAlloc(64, input_columns * sizeof(float)));
input_row_float_for_fp16_owner =
makeAlignedUniquePtr<float>(64, input_columns);
input_row_float_for_fp16 = input_row_float_for_fp16_owner.get();
}
for (size_t row = 0; row < input_rows; ++row) {
const InputType* input_row = input + row * input_columns;
Expand Down Expand Up @@ -1957,9 +1958,6 @@ void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatAvx2(
std::lrintf((input_row_float[col] - minimum_element) * inverse_scale);
}
} // for each row
if constexpr (std::is_same_v<InputType, float16>) {
fbgemmAlignedFree(input_row_float_for_fp16);
}
}

template <typename OutputType, int BIT_RATE>
Expand Down
14 changes: 6 additions & 8 deletions src/Utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include "fbgemm/UtilsAvx2.h"

#ifdef _OPENMP
#include <omp.h>
Expand Down Expand Up @@ -675,10 +676,11 @@ std::pair<K*, V*> radix_sort_parallel(
#ifdef _MSC_VER
const size_t array_size = static_cast<size_t>(RDX_HIST_SIZE) * maxthreads;
// fixes MSVC error C2131
auto* const histogram = static_cast<int64_t*>(
fbgemm::fbgemmAlignedAlloc(64, array_size * sizeof(int64_t)));
auto* const histogram_ps = static_cast<int64_t*>(
fbgemm::fbgemmAlignedAlloc(64, array_size * sizeof(int64_t)));
auto histogram_owner = fbgemm::makeAlignedUniquePtr<int64_t>(64, array_size);
auto* const histogram = histogram_owner.get();
auto histogram_ps_owner =
fbgemm::makeAlignedUniquePtr<int64_t>(64, array_size);
auto* const histogram_ps = histogram_ps_owner.get();

#else
alignas(64) int64_t histogram[RDX_HIST_SIZE * maxthreads];
Expand Down Expand Up @@ -719,10 +721,6 @@ std::pair<K*, V*> radix_sort_parallel(
}
}
}
#ifdef _MSC_VER
fbgemm::fbgemmAlignedFree(histogram);
fbgemm::fbgemmAlignedFree(histogram_ps);
#endif
return (
num_passes % 2 == 0 ? std::pair{inp_key_buf, inp_value_buf}
: std::pair{tmp_key_buf, tmp_value_buf});
Expand Down
Loading