virtualsecureplatform
diff --git a/‎include/detwfa.hpp‎
Lines changed: 48 additions & 0 deletions b/‎include/detwfa.hpp‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎include/gatebootstrapping.hpp‎
Lines changed: 40 additions & 0 deletions b/‎include/gatebootstrapping.hpp‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎include/mulfft.hpp‎
Lines changed: 58 additions & 14 deletions b/‎include/mulfft.hpp‎
Lines changed: 58 additions & 14 deletions
diff --git a/‎include/params.hpp‎
Lines changed: 10 additions & 0 deletions b/‎include/params.hpp‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎include/params/128bit.hpp‎
Lines changed: 14 additions & 13 deletions b/‎include/params/128bit.hpp‎
Lines changed: 14 additions & 13 deletions
diff --git a/‎include/tlwe.hpp‎
Lines changed: 2 additions & 4 deletions b/‎include/tlwe.hpp‎
Lines changed: 2 additions & 4 deletions
@@ -116,4 +116,52 @@ void CMUXwithPolynomialMulByXaiMinusOne(TRLWE<P> &acc,
         for (int i = 0; i < P::n; i++) acc[k][i] += temp[k][i];
 }
 
+// Double Decomposition variants
+template <class P>  // res = ExternalProductDD(c1 - c0, cs) + c0
+void CMUXFFTDD(TRLWE<P> &res, const TRGSWFFT<P> &cs, const TRLWE<P> &c1,
+               const TRLWE<P> &c0)
+{  // NOTE: res must not alias c0; step 1 overwrites res before step 3 reads c0
+    for (int k = 0; k < P::k + 1; k++)  // step 1: res = c1 - c0
+        for (int i = 0; i < P::n; i++) res[k][i] = c1[k][i] - c0[k][i];
+    ExternalProductDD<P>(res, res, cs);  // step 2: res passed as both TRLWE args
+    for (int k = 0; k < P::k + 1; k++)  // step 3: res += c0
+        for (int i = 0; i < P::n; i++) res[k][i] += c0[k][i];
+}
+
+// One accumulator update per key element used: temp[k] =
+// PolynomialMulByXaiMinusOne(acc[k], e), temp = ExternalProductDD(temp, key),
+// acc += temp. With key_value_diff == 1 that is a single update with cs[0]
+// and e = a; otherwise every nonzero i in [key_value_min, key_value_max]
+// gets one update, taking cs[0], cs[1], ... in increasing order of i.
+template <class bkP>
+void CMUXwithPolynomialMulByXaiMinusOneDD(
+    TRLWE<typename bkP::targetP> &acc,
+    const BootstrappingKeyElementFFT<bkP> &cs, const int a)
+{
+    using tP = typename bkP::targetP;
+    // One 64-byte aligned scratch ciphertext shared by both branches.
+    alignas(64) TRLWE<tP> temp;
+    const auto update = [&](const auto &key, const int e) {
+        for (int k = 0; k < tP::k + 1; k++)
+            PolynomialMulByXaiMinusOne<tP>(temp[k], acc[k], e);
+        ExternalProductDD<tP>(temp, temp, key);
+        for (int k = 0; k < tP::k + 1; k++)
+            for (int j = 0; j < tP::n; j++) acc[k][j] += temp[k][j];
+    };
+    if constexpr (bkP::domainP::key_value_diff == 1) {
+        update(cs[0], a);
+    }
+    else {
+        int count = 0;  // next element of cs to consume
+        for (int i = bkP::domainP::key_value_min;
+             i <= bkP::domainP::key_value_max; i++) {
+            if (i == 0) continue;  // i == 0 consumes no element of cs
+            // e = (a * i) % 2n, moved up by 2n when it is not positive.
+            const int mod = (a * i) % (2 * tP::n);
+            const int index = mod > 0 ? mod : mod + (2 * tP::n);
+            update(cs[count++], index);
+        }
+    }
+}
+
 }  // namespace TFHEpp
@@ -263,6 +263,46 @@ constexpr Polynomial<P> μpolygen()
     return poly;
 }
 
+// Double Decomposition variants
+// Blind rotation built on CMUXwithPolynomialMulByXaiMinusOneDD. res is
+// cleared, its row tP::k is set to PolynomialMulByXai(testvector, rot_b),
+// and every tlwe[i], i < k * n, with a nonzero rot_a drives one CMUX step.
+template <class P, uint32_t num_out = 1>
+void BlindRotateDD(TRLWE<typename P::targetP> &res,
+                   const TLWE<typename P::domainP> &tlwe,
+                   const BootstrappingKeyFFT<P> &bkfft,
+                   const Polynomial<typename P::targetP> &testvector)
+{
+    using dT = typename P::domainP::T;
+    using tP = typename P::targetP;
+    constexpr auto dim = P::domainP::k * P::domainP::n;
+    constexpr uint32_t bitwidth = bits_needed<num_out - 1>();
+    // Keeps the top (tP::nbit + 1 - bitwidth) bits of a dT value.
+    constexpr uint32_t shift =
+        std::numeric_limits<dT>::digits - 1 - tP::nbit + bitwidth;
+    // 2^(shift - 1), built in dT so it also fits tori wider than 64 bits.
+    constexpr dT roundoffset = static_cast<dT>(1) << (shift - 1);
+    const uint32_t rot_b = 2 * tP::n - ((tlwe[dim] >> shift) << bitwidth);
+    res = {};
+    PolynomialMulByXai<tP>(res[tP::k], testvector, rot_b);
+    for (int i = 0; i < dim; i++) {
+        const uint32_t rot_a = (tlwe[i] + roundoffset) >> shift << bitwidth;
+        if (rot_a == 0) continue;
+        CMUXwithPolynomialMulByXaiMinusOneDD<P>(res, bkfft[i], rot_a);
+    }
+}
+
+template <class P>  // P provides the domainP and targetP parameter sets
+void GateBootstrappingTLWE2TLWEDD(
+    TLWE<typename P::targetP> &res, const TLWE<typename P::domainP> &tlwe,
+    const BootstrappingKeyFFT<P> &bkfft,
+    const Polynomial<typename P::targetP> &testvector)
+{  // BlindRotateDD into a scratch TRLWE, then SampleExtractIndex at index 0
+    alignas(64) TRLWE<typename P::targetP> acc;  // scratch accumulator
+    BlindRotateDD<P>(acc, tlwe, bkfft, testvector);
+    SampleExtractIndex<typename P::targetP>(res, acc, 0);  // writes res
+}
+
 template <class bkP, typename bkP::targetP::T μ, class iksP>
 void GateBootstrapping(TLWE<typename iksP::targetP> &res,
                        const TLWE<typename bkP::domainP> &tlwe,
 
@@ -68,10 +68,9 @@ inline void TwistNTT(Polynomial<P> &res, PolynomialNTT<P> &a)
         cuHEpp::TwistNTT<typename lvl1param::T, lvl1param::nbit>(
             res, a, (*ntttablelvl1)[0], (*ntttwistlvl1)[0]);
 #endif
-    else if constexpr (std::is_same_v<typename P::T, uint64_t>) {
+    else if constexpr (std::is_same_v<typename P::T, uint64_t>)
         cuHEpp::TwistNTT<typename lvl2param::T, lvl2param::nbit>(
             res, a, (*ntttablelvl2)[0], (*ntttwistlvl2)[0]);
-    }
     else
         static_assert(false_v<typename P::T>, "Undefined TwistNTT!");
 }
@@ -89,6 +88,14 @@ inline void TwistFFT(Polynomial<P> &res, const PolynomialInFD<P> &a)
         else if constexpr (std::is_same_v<typename P::T, uint64_t>)
             fftplvl1.execute_direct_torus64(res.data(), a.data());
     }
+    else if constexpr (std::is_same_v<P, lvl3param>) {
+        // For 128-bit lvl3param, use 64-bit FFT and shift result to top 64 bits
+        // This preserves the Torus semantics (most significant bits)
+        alignas(64) std::array<uint64_t, P::n> temp;
+        fftplvl3.execute_direct_torus64(temp.data(), a.data());
+        for (int i = 0; i < P::n; i++)
+            res[i] = static_cast<__uint128_t>(temp[i]) << 64;
+    }
     else if constexpr (std::is_same_v<typename P::T, uint64_t>)
         fftplvl2.execute_direct_torus64(res.data(), a.data());
     else
@@ -143,6 +150,14 @@ inline void TwistIFFT(PolynomialInFD<P> &res, const Polynomial<P> &a)
         if constexpr (std::is_same_v<typename P::T, uint64_t>)
             fftplvl1.execute_reverse_torus64(res.data(), a.data());
     }
+    else if constexpr (std::is_same_v<P, lvl3param>) {
+        // For 128-bit lvl3param, use top 64 bits for FFT
+        // This preserves the Torus semantics (most significant bits)
+        alignas(64) std::array<uint64_t, P::n> temp;
+        for (int i = 0; i < P::n; i++)
+            temp[i] = static_cast<uint64_t>(a[i] >> 64);
+        fftplvl3.execute_reverse_torus64(res.data(), temp.data());
+    }
     else if constexpr (std::is_same_v<typename P::T, uint64_t>)
         fftplvl2.execute_reverse_torus64(res.data(), a.data());
     else
@@ -301,8 +316,21 @@ inline void PolyMul(Polynomial<P> &res, const Polynomial<P> &a,
         for (int i = 0; i < P::n; i++) ntta[i] *= nttb[i];
         TwistNTT<P>(res, ntta);
     }
+    else if constexpr (std::is_same_v<typename P::T, __uint128_t>) {
+        // Naive for 128-bit types (FFT/NTT don't support 128-bit precision).
+        // Products are taken unsigned so they wrap modulo 2^128; multiplying
+        // as __int128_t would be signed overflow (undefined behaviour).
+        for (int i = 0; i < P::n; i++) {
+            __uint128_t ri = 0;
+            for (int j = 0; j <= i; j++)
+                ri += static_cast<__uint128_t>(a[j]) * b[i - j];
+            for (int j = i + 1; j < P::n; j++)
+                ri -= static_cast<__uint128_t>(a[j]) * b[P::n + i - j];
+            res[i] = ri;
+        }
+    }
     else {
-        // Naieve
+        // Naive for other types
         for (int i = 0; i < P::n; i++) {
             typename P::T ri = 0;
             for (int j = 0; j <= i; j++)
@@ -339,17 +367,33 @@ template <class P>
 inline void PolyMulNaive(Polynomial<P> &res, const Polynomial<P> &a,
                          const Polynomial<P> &b)
 {
-    for (int i = 0; i < P::n; i++) {
-        typename P::T ri = 0;
-        for (int j = 0; j <= i; j++)
-            ri += static_cast<typename std::make_signed<typename P::T>::type>(
-                      a[j]) *
-                  b[i - j];
-        for (int j = i + 1; j < P::n; j++)
-            ri -= static_cast<typename std::make_signed<typename P::T>::type>(
-                      a[j]) *
-                  b[P::n + i - j];
-        res[i] = ri;
+    // Schoolbook product of a and b modulo X^n + 1: the terms
+    // a[j] * b[n + i - j] (j > i) wrap past degree n - 1 and are subtracted.
+    // res[i] is stored while a and b are still being read, so res must not
+    // alias either input.
+    if constexpr (std::is_same_v<typename P::T, __uint128_t>) {
+        // Multiply as unsigned so products wrap modulo 2^128; an __int128_t
+        // product of two full-width values is signed overflow (undefined).
+        for (int i = 0; i < P::n; i++) {
+            __uint128_t ri = 0;
+            for (int j = 0; j <= i; j++)
+                ri += static_cast<__uint128_t>(a[j]) * b[i - j];
+            for (int j = i + 1; j < P::n; j++)
+                ri -= static_cast<__uint128_t>(a[j]) * b[P::n + i - j];
+            res[i] = ri;
+        }
+    }
+    else {
+        // make_signed is only named here, where P::T is not __uint128_t.
+        using S = typename std::make_signed<typename P::T>::type;
+        for (int i = 0; i < P::n; i++) {
+            typename P::T ri = 0;
+            for (int j = 0; j <= i; j++)
+                ri += static_cast<S>(a[j]) * b[i - j];
+            for (int j = i + 1; j < P::n; j++)
+                ri -= static_cast<S>(a[j]) * b[P::n + i - j];
+            res[i] = ri;
+        }
     }
 }
 
 
@@ -67,6 +67,16 @@ struct lvl02param {
 #endif
 };
 
+struct lvl03param {  // parameter pair with domain lvl0param, target lvl3param
+    using domainP = lvl0param;
+    using targetP = lvl3param;
+#ifdef USE_KEY_BUNDLE
+    static constexpr uint32_t Addends = 2;  // NOTE(review): meaning of Addends is not visible here - confirm
+#else
+    static constexpr uint32_t Addends = 1;
+#endif
+};
+
 struct lvlh2param {
     using domainP = lvlhalfparam;
     using targetP = lvl2param;
 
@@ -153,35 +153,36 @@ struct AHlvl2param {
     static constexpr std::uint32_t B̅gₐbit = baseP::B̅gₐbit;
 };
 
-// lvl3param with 64-bit Torus and non-trivial Double Decomposition
-// Double decomposition constraint: l * Bgbit + l̅ * B̅gbit <= 64
-// Using l=2, Bgbit=16, l̅=2, B̅gbit=16: 2*16 + 2*16 = 64 bits (fully utilized)
+// lvl3param with 128-bit Torus and non-trivial Double Decomposition
+// Double decomposition constraint: l * Bgbit + l̅ * B̅gbit <= 128
+// Using l=4, Bgbit=16, l̅=4, B̅gbit=16: 4*16 + 4*16 = 128 bits (fully utilized)
 struct lvl3param {
     static constexpr int32_t key_value_max = 1;
     static constexpr int32_t key_value_min = -1;
     static const std::uint32_t nbit = 12;  // dimension must be a power of 2 for
     // ease of polynomial multiplication.
     static constexpr std::uint32_t n = 1 << nbit;  // dimension = 4096
     static constexpr std::uint32_t k = 1;
-    static constexpr std::uint32_t lₐ = 2;
-    static constexpr std::uint32_t l = 2;
+    static constexpr std::uint32_t lₐ = 4;
+    static constexpr std::uint32_t l = 4;
     static constexpr std::uint32_t Bgbit = 16;
     static constexpr std::uint32_t Bgₐbit = 16;
     static constexpr uint32_t Bg = 1U << Bgbit;
     static constexpr uint32_t Bgₐ = 1U << Bgₐbit;
     static constexpr ErrorDistribution errordist =
         ErrorDistribution::ModularGaussian;
-    static const inline double α = std::pow(2.0, -51);  // fresh noise
-    using T = uint64_t;                                 // Torus representation
-    static constexpr T μ = 1ULL << 61;
+    static const inline double α = std::pow(2.0, -105);  // fresh noise
+    using T = __uint128_t;                               // Torus representation
+    static constexpr T μ = static_cast<T>(1) << 125;
     static constexpr uint32_t plain_modulusbit = 31;
-    static constexpr uint64_t plain_modulus = 1ULL << plain_modulusbit;
-    static constexpr double Δ = 1ULL << (64 - plain_modulusbit - 1);
+    static constexpr T plain_modulus = static_cast<T>(1) << plain_modulusbit;
+    static constexpr double Δ =
+        static_cast<double>(static_cast<T>(1) << (128 - plain_modulusbit - 1));
     // Double Decomposition (bivariate representation) parameters
     // Non-trivial values for testing actual double decomposition
-    // Constraint: l * Bgbit + l̅ * B̅gbit <= 64
-    static constexpr std::uint32_t l̅ = 2;    // auxiliary decomposition levels
-    static constexpr std::uint32_t l̅ₐ = 2;
+    // Constraint: l * Bgbit + l̅ * B̅gbit <= 128
+    static constexpr std::uint32_t l̅ = 4;    // auxiliary decomposition levels
+    static constexpr std::uint32_t l̅ₐ = 4;
     static constexpr std::uint32_t B̅gbit = 16;   // 2^16 base for auxiliary
     static constexpr std::uint32_t B̅gₐbit = 16;
 };
 
@@ -13,13 +13,11 @@ template <class P>
 void tlweSymEncrypt(TLWE<P> &res, const typename P::T p, const double α,
                     const Key<P> &key)
 {
-    std::uniform_int_distribution<typename P::T> Torusdist(
-        0, std::numeric_limits<typename P::T>::max());
     res = {};
     res[P::k * P::n] = ModularGaussian<P>(p, α);
     for (int k = 0; k < P::k; k++)
         for (int i = 0; i < P::n; i++) {
-            res[k * P::n + i] = Torusdist(generator);
+            res[k * P::n + i] = UniformTorusRandom<P>();
             res[P::k * P::n] += res[k * P::n + i] * key[k * P::n + i];
         }
 }
@@ -122,7 +120,7 @@ typename P::T tlweSymIntDecrypt(const TLWE<P> &c, const Key<P> &key)
     constexpr double Δ =
         2 *
         static_cast<double>(
-            1ULL << (std::numeric_limits<typename P::T>::digits - 1)) /
+            static_cast<typename P::T>(1) << (std::numeric_limits<typename P::T>::digits - 1)) /
         plain_modulus;
     const typename P::T phase = tlweSymPhase<P>(c, key);
     typename P::T res = static_cast<typename P::T>(std::round(phase / Δ));