Skip to content

Commit 6314281

Browse files
Ubuntu and claude
committed
Add spqlios-intl: interleaved-format FFT backend
Add a new FFT backend at thirdparties/spqlios-intl/ that uses an interleaved complex layout [re0,im0,re1,im1,...] instead of SPQLIOS's split layout. Enable it with -DUSE_SPQLIOS_INTL=ON; this defines USE_SPQLIOS_INTL and USE_INTERLEAVED_FORMAT and links the new library.

Components:
- fft_processor_spqlios_intl.h: class with the same interface as SPQLIOS
- fft_processor_spqlios_intl.cpp: C++ FFT/IFFT with AVX2 intrinsics
  - Complex multiply via unpacklo/unpackhi + vfmaddsub (3 ops)
  - Cooley-Tukey DIT (forward) / Gentleman-Sande DIF (inverse)
  - Interleaved trig tables
- AVX2 MulInFD/FMAInFD for the interleaved format in mulfft.hpp

Status: functionally correct (passes the polymul and externalproduct tests). Performance is currently ~2x slower than the optimized split SPQLIOS because the FFT butterfly lacks stage fusion. The interleaved MulInFD/FMAInFD use vfmaddsub for the complex multiply (3 loads + 1 store per 2 complex values vs 6 loads + 2 stores for the split format).

Next step: port the fused butterfly passes (Option C/D) to the interleaved FFT to match or beat split SPQLIOS performance.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c7c505c commit 6314281

File tree

7 files changed

+682
-3
lines changed

7 files changed

+682
-3
lines changed

CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ option(USE_MKL "Use Intel MKL" OFF)
8383
option(USE_FFTW3 "Use FFTW3" OFF)
8484
option(USE_SPQLIOX_AARCH64 "Use spqliox_aarch64" OFF)
8585
option(USE_SPQLIOS_ARITHMETIC "Use spqlios-arithmetic backend" OFF)
86+
option(USE_SPQLIOS_INTL "Use interleaved-format SPQLIOS" OFF)
8687
option(USE_HEXL "Use Intel HEXL" OFF)
8788

8889
option(ENABLE_TEST "Build tests" OFF)
@@ -191,6 +192,13 @@ elseif(USE_CONCRETE_FFT)
191192
add_compile_definitions(USE_CONCRETE_FFT)
192193
add_compile_definitions(USE_INTERLEAVED_FORMAT)
193194
add_subdirectory(thirdparties/concrete-fft)
195+
elseif(USE_SPQLIOS_INTL)
196+
set(TFHEpp_DEFINITIONS
197+
"${TFHEpp_DEFINITIONS};USE_SPQLIOS_INTL;USE_INTERLEAVED_FORMAT"
198+
PARENT_SCOPE)
199+
add_compile_definitions(USE_SPQLIOS_INTL)
200+
add_compile_definitions(USE_INTERLEAVED_FORMAT)
201+
add_subdirectory(thirdparties/spqlios-intl)
194202
elseif(USE_SPQLIOS_ARITHMETIC)
195203
set(TFHEpp_DEFINITIONS
196204
"${TFHEpp_DEFINITIONS};USE_SPQLIOS_ARITHMETIC"

benchmark/bench_fft.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@
99
#include <immintrin.h>
1010
#include "google-benchmark/include/benchmark/benchmark.h"
1111

12+
// Alias for the active FFT processor type
13+
#ifdef USE_SPQLIOS_INTL
14+
using FFT_Processor_Spqlios = FFT_Processor_Spqlios_Intl;
15+
#endif
16+
1217
// Forward-declare the SPQLIOS conversion functions from x86.h
1318
namespace SPQLIOS {
1419
inline __m256i mm256_cvtpd_epi64(const __m256d x) {

include/mulfft.hpp

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
#include <fft_processor_concrete.hpp>
1818
#elif USE_SPQLIOS_ARITHMETIC
1919
#include <fft_processor_spqlios_arithmetic.h>
20+
#elif USE_SPQLIOS_INTL
21+
#include <fft_processor_spqlios_intl.h>
2022
#else
2123
#include <fft_processor_spqlios.h>
2224
#endif
@@ -217,7 +219,18 @@ inline void TwistIFFTUInt(PolynomialInFD<P> &res, const Polynomial<P> &a)
217219
template <uint32_t N>
218220
inline void MulInFD(std::array<double, N> &res, const std::array<double, N> &b)
219221
{
220-
#ifdef USE_INTERLEAVED_FORMAT
222+
#if defined(USE_INTERLEAVED_FORMAT) && defined(__AVX2__)
223+
// AVX2 interleaved complex multiply: 2 complex per YMM
224+
for (uint32_t i = 0; i < N; i += 4) {
225+
__m256d a = _mm256_load_pd(res.data() + i);
226+
__m256d w = _mm256_load_pd(b.data() + i);
227+
__m256d w_swap = _mm256_permute_pd(w, 0b0101);
228+
__m256d a_re = _mm256_unpacklo_pd(a, a);
229+
__m256d a_im = _mm256_unpackhi_pd(a, a);
230+
_mm256_store_pd(res.data() + i,
231+
_mm256_fmaddsub_pd(a_re, w, _mm256_mul_pd(a_im, w_swap)));
232+
}
233+
#elif defined(USE_INTERLEAVED_FORMAT)
221234
for (int i = 0; i < N / 2; i++) {
222235
const std::complex tmp = std::complex(res[2 * i], res[2 * i + 1]) *
223236
std::complex(b[2 * i], b[2 * i + 1]);
@@ -279,7 +292,17 @@ template <uint32_t N>
279292
inline void MulInFD(std::array<double, N> &res, const std::array<double, N> &a,
280293
const std::array<double, N> &b)
281294
{
282-
#ifdef USE_INTERLEAVED_FORMAT
295+
#if defined(USE_INTERLEAVED_FORMAT) && defined(__AVX2__)
296+
for (uint32_t i = 0; i < N; i += 4) {
297+
__m256d va = _mm256_load_pd(a.data() + i);
298+
__m256d w = _mm256_load_pd(b.data() + i);
299+
__m256d w_swap = _mm256_permute_pd(w, 0b0101);
300+
__m256d a_re = _mm256_unpacklo_pd(va, va);
301+
__m256d a_im = _mm256_unpackhi_pd(va, va);
302+
_mm256_store_pd(res.data() + i,
303+
_mm256_fmaddsub_pd(a_re, w, _mm256_mul_pd(a_im, w_swap)));
304+
}
305+
#elif defined(USE_INTERLEAVED_FORMAT)
283306
for (int i = 0; i < N / 2; i++) {
284307
const std::complex tmp = std::complex(a[2 * i], a[2 * i + 1]) *
285308
std::complex(b[2 * i], b[2 * i + 1]);
@@ -347,7 +370,19 @@ template <uint32_t N>
347370
inline void FMAInFD(std::array<double, N> &res, const std::array<double, N> &a,
348371
const std::array<double, N> &b)
349372
{
350-
#ifdef USE_INTERLEAVED_FORMAT
373+
#if defined(USE_INTERLEAVED_FORMAT) && defined(__AVX2__)
374+
// AVX2 interleaved complex FMA: res += a * b
375+
for (uint32_t i = 0; i < N; i += 4) {
376+
__m256d va = _mm256_load_pd(a.data() + i);
377+
__m256d w = _mm256_load_pd(b.data() + i);
378+
__m256d r = _mm256_load_pd(res.data() + i);
379+
__m256d w_swap = _mm256_permute_pd(w, 0b0101);
380+
__m256d a_re = _mm256_unpacklo_pd(va, va);
381+
__m256d a_im = _mm256_unpackhi_pd(va, va);
382+
__m256d prod = _mm256_fmaddsub_pd(a_re, w, _mm256_mul_pd(a_im, w_swap));
383+
_mm256_store_pd(res.data() + i, _mm256_add_pd(r, prod));
384+
}
385+
#elif defined(USE_INTERLEAVED_FORMAT)
351386
for (int i = 0; i < N / 2; i++) {
352387
std::complex tmp = std::complex(a[2 * i], a[2 * i + 1]) *
353388
std::complex(b[2 * i], b[2 * i + 1]);

src/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@ elseif(USE_SPQLIOS_ARITHMETIC)
5858
target_link_libraries(tfhe++ INTERFACE libspqlios)
5959
target_include_directories(
6060
tfhe++ PUBLIC ${PROJECT_SOURCE_DIR}/thirdparties/spqlios-arithmetic)
61+
elseif(USE_SPQLIOS_INTL)
62+
target_link_libraries(tfhe++ INTERFACE spqlios-intl)
63+
target_include_directories(
64+
tfhe++ PUBLIC ${PROJECT_SOURCE_DIR}/thirdparties/spqlios-intl)
6165
else()
6266
if(USE_MKL)
6367
target_link_libraries(tfhe++ INTERFACE mklproc)
thirdparties/spqlios-intl/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
add_library(spqlios-intl STATIC
2+
fft_processor_spqlios_intl.cpp
3+
)
4+
target_include_directories(spqlios-intl PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
5+
target_include_directories(spqlios-intl PRIVATE ${CMAKE_SOURCE_DIR}/include)
6+
target_compile_options(spqlios-intl PRIVATE -mavx2 -mfma)

0 commit comments

Comments
 (0)