Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions include/fbgemm/UtilsAvx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
// flags.

#include <cstdint>
#include <memory>
#include <string>

namespace fbgemm {
Expand Down Expand Up @@ -89,4 +90,22 @@ fbgemmAlignedAlloc(size_t align, size_t size, bool raiseException = false);
*/
FBGEMM_API void fbgemmAlignedFree(void* p);

/**
 * @brief RAII deleter for memory obtained from fbgemmAlignedAlloc.
 *
 * Intended as the deleter type of a std::unique_ptr so that aligned
 * allocations are released automatically when the owner goes out of scope.
 */
struct AlignedFreeDeleter {
  // noexcept: std::unique_ptr's destructor is noexcept, so a throwing deleter
  // would call std::terminate anyway; fbgemmAlignedFree is presumed
  // non-throwing (C-style free of an aligned block).
  void operator()(void* p) const noexcept {
    fbgemmAlignedFree(p);
  }
};

// Owning pointer to a T[] allocated via fbgemmAlignedAlloc; releases the
// buffer through fbgemmAlignedFree (AlignedFreeDeleter) on destruction.
template <typename T>
using aligned_unique_ptr = std::unique_ptr<T[], AlignedFreeDeleter>;

/**
 * @brief Allocates an array of `count` elements of T aligned to `align` bytes
 *        and wraps it in an RAII owner.
 *
 * @param align alignment in bytes, forwarded to fbgemmAlignedAlloc.
 * @param count number of elements of type T to allocate.
 * @return aligned_unique_ptr<T> owning the buffer. Holds nullptr if the
 *         allocation fails: fbgemmAlignedAlloc is called with its default
 *         raiseException = false behavior, so no exception is thrown here.
 *
 * NOTE(review): the elements are presumably NOT value-initialized
 * (malloc-style allocation) — callers should write before reading, exactly as
 * with the raw fbgemmAlignedAlloc calls this helper replaces.
 */
template <typename T>
[[nodiscard]] aligned_unique_ptr<T> makeAlignedUniquePtr(
    size_t align,
    size_t count) {
  return aligned_unique_ptr<T>(
      static_cast<T*>(fbgemmAlignedAlloc(align, count * sizeof(T))));
}

} // namespace fbgemm
13 changes: 6 additions & 7 deletions src/FbgemmI8Depthwise2DAvx2-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,9 @@ static ALWAYS_INLINE void depthwise_2d_(
int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1;
const std::int8_t* Bp = B.PackedMat();

int32_t* row_offsets = static_cast<int32_t*>(
fbgemmAlignedAlloc(64, (IC + 31) / 32 * 32 * sizeof(int32_t)));
auto row_offsets_owner =
makeAlignedUniquePtr<int32_t>(64, (IC + 31) / 32 * 32);
int32_t* row_offsets = row_offsets_owner.get();

int64_t n_begin = 0, n_end = 0, h_begin = 0, h_end = 0, w_begin = 0,
w_end = 0;
Expand Down Expand Up @@ -487,8 +488,6 @@ static ALWAYS_INLINE void depthwise_2d_(
}
}
} // for each n

fbgemmAlignedFree(row_offsets);
}

// Dispatch A_SYMMETRIC and B_SYMMETRIC
Expand Down Expand Up @@ -518,8 +517,9 @@ static void depthwise_2d_(
const float* act_times_w_scale,
int thread_id,
int num_threads) {
int32_t* C_int32_temp = static_cast<int32_t*>(
fbgemmAlignedAlloc(64, (OC + 31) / 32 * 32 * sizeof(int32_t)));
auto C_int32_temp_owner =
makeAlignedUniquePtr<int32_t>(64, (OC + 31) / 32 * 32);
int32_t* C_int32_temp = C_int32_temp_owner.get();
if (A_zero_point == 0 || col_offsets == nullptr) {
if (Q_GRAN == QuantizationGranularity::TENSOR && B_zero_point[0] == 0) {
depthwise_2d_<
Expand Down Expand Up @@ -637,7 +637,6 @@ static void depthwise_2d_(
num_threads);
}
}
fbgemmAlignedFree(C_int32_temp);
}

// Dispatch HAS_BIAS
Expand Down
12 changes: 6 additions & 6 deletions src/FbgemmI8Depthwise3DAvx2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,9 @@ static ALWAYS_INLINE void depthwise_3d_same_pad_(
int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1;
const int8_t* Bp = B.PackedMat();

int32_t* row_offsets = static_cast<int32_t*>(
fbgemmAlignedAlloc(64, (IC + 31) / 32 * 32 * sizeof(int32_t)));
auto row_offsets_owner =
makeAlignedUniquePtr<int32_t>(64, (IC + 31) / 32 * 32);
int32_t* row_offsets = row_offsets_owner.get();

int64_t n_begin = 0, n_end = 0, t_begin = 0, t_end = 0, h_begin = 0,
h_end = 0;
Expand Down Expand Up @@ -779,7 +780,6 @@ static ALWAYS_INLINE void depthwise_3d_same_pad_(
} // h
} // t
} // for each n
fbgemmAlignedFree(row_offsets);
}

// Dispatch A_SYMMETRIC and B_SYMMETRIC
Expand All @@ -802,8 +802,9 @@ static void depthwise_3d_same_pad_(
const float* act_times_w_scale,
int thread_id,
int num_threads) {
int32_t* C_int32_temp = static_cast<int32_t*>(
fbgemmAlignedAlloc(64, (conv_p.OC + 31) / 32 * 32 * sizeof(int32_t)));
auto C_int32_temp_owner =
makeAlignedUniquePtr<int32_t>(64, (conv_p.OC + 31) / 32 * 32);
int32_t* C_int32_temp = C_int32_temp_owner.get();
if (A_zero_point == 0 || col_offsets == nullptr) {
if (Q_GRAN == QuantizationGranularity::TENSOR && B_zero_point[0] == 0) {
depthwise_3d_same_pad_<
Expand Down Expand Up @@ -893,7 +894,6 @@ static void depthwise_3d_same_pad_(
num_threads);
}
}
fbgemmAlignedFree(C_int32_temp);
}

// Dispatch HAS_BIAS
Expand Down
26 changes: 8 additions & 18 deletions src/FbgemmI8Spmdm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@ void CompressedSparseColumn::SpMDM(
// If NNZ/K is small, it's not worth doing transpose so we just use this
// scalar loop.
#ifdef _MSC_VER
int32_t* C_temp = static_cast<int32_t*>(
fbgemmAlignedAlloc(64, block.row_size * sizeof(int32_t)));
auto C_temp_owner = makeAlignedUniquePtr<int32_t>(64, block.row_size);
int32_t* C_temp = C_temp_owner.get();
#else
int32_t C_temp[block.row_size];
#endif
Expand Down Expand Up @@ -141,9 +141,6 @@ void CompressedSparseColumn::SpMDM(
}
} // for each column of B
}
#ifdef _MSC_VER
fbgemmAlignedFree(C_temp);
#endif
return;
}

Expand All @@ -165,10 +162,10 @@ void CompressedSparseColumn::SpMDM(
// dynamically allocated memory for MSVC even though dynamically allocated
// memory works for all compilers.
#ifdef _MSC_VER
uint8_t* A_buffer =
static_cast<uint8_t*>(fbgemmAlignedAlloc(64, K * 32 * sizeof(uint8_t)));
int32_t* C_buffer =
static_cast<int32_t*>(fbgemmAlignedAlloc(64, N * 32 * sizeof(int32_t)));
auto A_buffer_owner = makeAlignedUniquePtr<uint8_t>(64, K * 32);
auto C_buffer_owner = makeAlignedUniquePtr<int32_t>(64, N * 32);
uint8_t* A_buffer = A_buffer_owner.get();
int32_t* C_buffer = C_buffer_owner.get();
#else
alignas(64) uint8_t A_buffer[K * 32];
alignas(64) int32_t C_buffer[N * 32];
Expand All @@ -180,8 +177,8 @@ void CompressedSparseColumn::SpMDM(
// Transpose 32 x K submatrix of A
if (i_end - i1 < 32) {
#ifdef _MSC_VER
uint8_t* A_temp_buffer = static_cast<uint8_t*>(
fbgemmAlignedAlloc(64, K * 32 * sizeof(uint8_t)));
auto A_temp_buffer_owner = makeAlignedUniquePtr<uint8_t>(64, K * 32);
uint8_t* A_temp_buffer = A_temp_buffer_owner.get();
#else
alignas(64) uint8_t A_temp_buffer[K * 32];
#endif
Expand All @@ -200,9 +197,6 @@ void CompressedSparseColumn::SpMDM(
for (int i2 = (i_end - i1) / 8 * 8; i2 < 32; i2 += 8) {
transpose_8rows(K, A_temp_buffer + i2 * K, K, A_buffer + i2, 32);
}
#ifdef _MSC_VER
fbgemmAlignedFree(A_temp_buffer);
#endif
} else {
for (int i2 = 0; i2 < 32; i2 += 8) {
transpose_8rows(K, A + (i1 + i2) * lda, lda, A_buffer + i2, 32);
Expand Down Expand Up @@ -280,10 +274,6 @@ void CompressedSparseColumn::SpMDM(
spmdm_run_time += (dt);
t_start = std::chrono::high_resolution_clock::now();
#endif
#ifdef _MSC_VER
fbgemmAlignedFree(A_buffer);
fbgemmAlignedFree(C_buffer);
#endif

#endif // __aarch64__
}
Expand Down
10 changes: 5 additions & 5 deletions src/PackAWithQuantRowOffset.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "./OptimizedKernelsAvx2.h" // @manual
#include "fbgemm/Fbgemm.h"
#include "fbgemm/QuantUtils.h"
#include "fbgemm/UtilsAvx2.h"

namespace fbgemm {

Expand Down Expand Up @@ -147,10 +148,12 @@ void PackAWithQuantRowOffset<T, accT>::pack(
(block.col_start % (this->numCols() / this->numGroups())) != 0;
int32_t* row_offset_buf = getRowOffsetBuffer();

aligned_unique_ptr<float> smat_transposed_owner;
float* smat_transposed = nullptr;
if (tr) {
smat_transposed = static_cast<float*>(fbgemmAlignedAlloc(
64, block.row_size * block.col_size * sizeof(float)));
smat_transposed_owner =
makeAlignedUniquePtr<float>(64, block.row_size * block.col_size);
smat_transposed = smat_transposed_owner.get();
transpose_simd(
block.col_size,
block.row_size,
Expand Down Expand Up @@ -197,9 +200,6 @@ void PackAWithQuantRowOffset<T, accT>::pack(
out[i * BaseType::blockColSize() + j] = 0;
}
}
if (smat_transposed) {
fbgemmAlignedFree(smat_transposed);
}
}

template <typename T, typename accT>
Expand Down
23 changes: 11 additions & 12 deletions src/PackDepthwiseConvMatrixAvx2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
: OC_(OC), kernel_prod_(kernel_prod) {
// The input is in OC T R S layout.
// Transpose the input matrix to make packing faster.
int8_t* smat_transposed = static_cast<int8_t*>(
fbgemmAlignedAlloc(64, OC * kernel_prod * sizeof(int8_t)));
auto smat_transposed_owner =
makeAlignedUniquePtr<int8_t>(64, OC * kernel_prod);
int8_t* smat_transposed = smat_transposed_owner.get();
for (int i = 0; i < kernel_prod; ++i) {
for (int j = 0; j < OC; ++j) {
smat_transposed[i * OC + j] = smat[i + j * kernel_prod];
Expand Down Expand Up @@ -93,12 +94,14 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
// (28, 8), (28, 9), (28, 10), zero, ..., (31, 8), (31, 9), (31, 10), zero

// Allocate buffers
auto b_v = static_cast<__m256i*>(
fbgemmAlignedAlloc(64, kernel_prod * sizeof(__m256i)));
auto b_interleaved_epi16 = static_cast<__m256i*>(
fbgemmAlignedAlloc(64, kernel_prod_aligned * sizeof(__m256i)));
auto b_interleaved_epi32 = static_cast<__m256i*>(
fbgemmAlignedAlloc(64, kernel_prod_aligned * sizeof(__m256i)));
auto b_v_owner = makeAlignedUniquePtr<__m256i>(64, kernel_prod);
auto b_v = b_v_owner.get();
auto b_interleaved_epi16_owner =
makeAlignedUniquePtr<__m256i>(64, kernel_prod_aligned);
auto b_interleaved_epi16 = b_interleaved_epi16_owner.get();
auto b_interleaved_epi32_owner =
makeAlignedUniquePtr<__m256i>(64, kernel_prod_aligned);
auto b_interleaved_epi32 = b_interleaved_epi32_owner.get();
for (int k1 = 0; k1 < OC; k1 += 32) {
int remainder = OC - k1;
if (remainder < 32) {
Expand Down Expand Up @@ -154,10 +157,6 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
b_interleaved_epi32[i]);
}
}
fbgemmAlignedFree(b_v);
fbgemmAlignedFree(b_interleaved_epi16);
fbgemmAlignedFree(b_interleaved_epi32);
fbgemmAlignedFree(smat_transposed);
}

int PackedDepthWiseConvMatrix::addr(int r, int c) {
Expand Down
12 changes: 6 additions & 6 deletions src/PackWeightsForDirectConv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -257,10 +257,12 @@ void fbgemmDirectConv(
fn = codeObj.getOrCreateDirectConvTrans<inst_set_t::avx2>(
true, conv_p.stride[1], conv_p.K[1]);

int32_t* inSum = static_cast<int32_t*>(fbgemmAlignedAlloc(
64, conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * sizeof(int32_t)));
int32_t* rowSum = static_cast<int32_t*>(fbgemmAlignedAlloc(
64, conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * sizeof(int32_t)));
auto inSum_owner = makeAlignedUniquePtr<int32_t>(
64, conv_p.IN_DIM[0] * conv_p.IN_DIM[1]);
int32_t* inSum = inSum_owner.get();
auto rowSum_owner = makeAlignedUniquePtr<int32_t>(
64, conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1]);
int32_t* rowSum = rowSum_owner.get();

directConvRowSum(conv_p, Aint8, inSum, rowSum);
int kernel_dim = conv_p.K[0] * conv_p.K[1];
Expand Down Expand Up @@ -450,8 +452,6 @@ void fbgemmDirectConv(
}
}
}
fbgemmAlignedFree(inSum);
fbgemmAlignedFree(rowSum);
} // transposed conv
else { // non-transposed conv
assert(false && "non-transposed direct conv not integrated yet.");
Expand Down
20 changes: 9 additions & 11 deletions src/QuantUtilsAvx2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "fbgemm/FbgemmConvert.h"
#include "fbgemm/FloatConversion.h"
#include "fbgemm/Types.h"
#include "fbgemm/UtilsAvx2.h"

namespace fbgemm {

Expand Down Expand Up @@ -1599,12 +1600,14 @@ void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfAvx2(
(input_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE +
2 * sizeof(std::uint16_t);

aligned_unique_ptr<float> input_row_float_for_fp16_owner;
float* input_row_float_for_fp16 = nullptr;
float min_max_row_float_for_fp16[kRowwiseMinMaxNumCols];
const auto is_valid_rowwise_min_max = (rowwise_min_max != nullptr);
if constexpr (std::is_same_v<InputType, float16>) {
input_row_float_for_fp16 = static_cast<float*>(
fbgemmAlignedAlloc(64, input_columns * sizeof(float)));
input_row_float_for_fp16_owner =
makeAlignedUniquePtr<float>(64, input_columns);
input_row_float_for_fp16 = input_row_float_for_fp16_owner.get();
}

for (size_t row = 0; row < input_rows; ++row) {
Expand Down Expand Up @@ -1794,10 +1797,6 @@ void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfAvx2(
}
}
} // for each row

if constexpr (std::is_same_v<InputType, float16>) {
fbgemmAlignedFree(input_row_float_for_fp16);
}
}

template <typename InputType>
Expand All @@ -1824,12 +1823,14 @@ void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatAvx2(
_mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00);

const int64_t output_columns = input_columns + 2 * sizeof(float);
aligned_unique_ptr<float> input_row_float_for_fp16_owner;
float* input_row_float_for_fp16 = nullptr;
float min_max_row_float_for_fp16[kRowwiseMinMaxNumCols];
const auto is_valid_rowwise_min_max = (rowwise_min_max != nullptr);
if constexpr (std::is_same_v<InputType, float16>) {
input_row_float_for_fp16 = static_cast<float*>(
fbgemmAlignedAlloc(64, input_columns * sizeof(float)));
input_row_float_for_fp16_owner =
makeAlignedUniquePtr<float>(64, input_columns);
input_row_float_for_fp16 = input_row_float_for_fp16_owner.get();
}
for (size_t row = 0; row < input_rows; ++row) {
const InputType* input_row = input + row * input_columns;
Expand Down Expand Up @@ -1957,9 +1958,6 @@ void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatAvx2(
std::lrintf((input_row_float[col] - minimum_element) * inverse_scale);
}
} // for each row
if constexpr (std::is_same_v<InputType, float16>) {
fbgemmAlignedFree(input_row_float_for_fp16);
}
}

template <typename OutputType, int BIT_RATE>
Expand Down
14 changes: 6 additions & 8 deletions src/Utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include "fbgemm/UtilsAvx2.h"

#ifdef _OPENMP
#include <omp.h>
Expand Down Expand Up @@ -675,10 +676,11 @@ std::pair<K*, V*> radix_sort_parallel(
#ifdef _MSC_VER
const size_t array_size = static_cast<size_t>(RDX_HIST_SIZE) * maxthreads;
// fixes MSVC error C2131
auto* const histogram = static_cast<int64_t*>(
fbgemm::fbgemmAlignedAlloc(64, array_size * sizeof(int64_t)));
auto* const histogram_ps = static_cast<int64_t*>(
fbgemm::fbgemmAlignedAlloc(64, array_size * sizeof(int64_t)));
auto histogram_owner = fbgemm::makeAlignedUniquePtr<int64_t>(64, array_size);
auto* const histogram = histogram_owner.get();
auto histogram_ps_owner =
fbgemm::makeAlignedUniquePtr<int64_t>(64, array_size);
auto* const histogram_ps = histogram_ps_owner.get();

#else
alignas(64) int64_t histogram[RDX_HIST_SIZE * maxthreads];
Expand Down Expand Up @@ -719,10 +721,6 @@ std::pair<K*, V*> radix_sort_parallel(
}
}
}
#ifdef _MSC_VER
fbgemm::fbgemmAlignedFree(histogram);
fbgemm::fbgemmAlignedFree(histogram_ps);
#endif
return (
num_passes % 2 == 0 ? std::pair{inp_key_buf, inp_value_buf}
: std::pair{tmp_key_buf, tmp_value_buf});
Expand Down
Loading