FFT with modulo - YessineJallouli/Competitive-Programming GitHub Wiki

#include <bits/stdc++.h>
typedef long long ll;
using namespace std;
using cx = complex<double>;

constexpr auto pi = M_PI;

// runs in 2 sec for N = 10^6
// you can use it for every modulo

void inplace_fft2(vector<cx> &a, bool inv=false) {
    int n = a.size();
    for (int i = 1, j = 0; i < n; i++) {
        int bit = n >> 1;
        for (; j & bit; bit >>= 1)
            j ^= bit;
        j ^= bit;
        if (i < j)
            swap(a[i], a[j]);
    }
    for (int len = 2; len <= n; len <<= 1) {
        double ang = 2 * pi / len * (inv ? -1 : 1);
        cx wlen = polar(1., ang);
        for (int i = 0; i < n; i += len) {
            cx w(1);
            for (int j = 0; j < len / 2; j++) {
                cx u = a[i + j], v = a[i + j + len / 2] * w;
                a[i + j] = u + v;
                a[i + j + len / 2] = u - v;
                w *= wlen;
            }
        }
    }
    if (inv)
        for (auto &z: a)
            z /= n;
}

vector<cx> multiply(vector<cx> a, vector<cx> b) {
    int r = a.size() + b.size()-1;
    int n = 1;
    while (n < r)
        n*= 2;

    a.resize(n);
    b.resize(n);
    inplace_fft2(a, false);
    inplace_fft2(b, false); // true for cross correlation
    for (int i = 0; i < n; i++)
        a[i]*= b[i];
    inplace_fft2(a, true);
    a.resize(r);
    return a;
}

ll bit_ceil(ll r) {
    ll n = 1;
    while (n < r)
        n*= 2;
    return n;
}

const int mod = 1e9+7;

vector<ll> fast_modular_multiplication_real(const vector<ll> &a,const vector<ll> &b, int decompositions = 4) {
    vector<vector<cx>> A(decompositions), B(decompositions);
    ll block = ceil(pow(mod, 1. / decompositions));
    for (int i = 0; i < a.size(); i++) {
        auto z = a[i];
        for (int j = 0; j < decompositions; j++) {
            auto [q, r] = div(z, block);
            A[j].push_back(r);
            z = q;
        }
    }
    for (int i = 0; i < b.size(); i++) {
        auto z = b[i];
        for (int j = 0; j < decompositions; j++) {
            auto [q, r] = div(z, block);
            B[j].push_back(r);
            z = q;
        }
    }
    vector<vector<vector<cx>>> C(decompositions, vector<vector<cx>>(decompositions));
    for (int i = 0; i < decompositions; i++)
        for (int j = 0; j < decompositions; j++)
            C[i][j] = multiply(A[i], B[j]);
    vector<ll> R(a.size() + b.size() - 1);
    for (int i = 0; i < R.size(); i++) {
        ll x = 0;
        ll t1 = 1;
        for (int j = 0; j < decompositions; j++) {
            ll t2 = t1;
            for (int k = 0; k < decompositions; k++) {
                x += llround(C[j][k][i].real()) % mod * t2 % mod;
                x %= mod;
                t2 *= block;
                t2 %= mod;
            }
            t1 *= block;
        }
        R[i] = x;
    }
    return R;
}

int main() {
    ios_base::sync_with_stdio(false); cin.tie(nullptr);
    int n,m; cin >> n >> m;
    vector<ll> A(n), B(m);
    for (auto &a : A)
        cin >> a;
    for (auto &b : B)
        cin >> b;
    vector<ll> ans = fast_modular_multiplication_real(A, B);
    for (auto c : ans)
        cout << c << ' ';
    cout << '\n';
}      
⚠️ **GitHub.com Fallback** ⚠️