Submission #32608

# Submission time Handle Problem Language Result Execution time Memory
32608 2017-10-12T01:01:02 Z ho94949 씽크스몰 (kriii3_TT) C++14
30 / 30
6979 ms 278140 KB
#include<bits/stdc++.h>
#include <smmintrin.h>
using namespace std;
#pragma GCC target("avx2")
#pragma GCC target("fma")

//Who might read this code
//This code uses AVX2 operation for fast FFT calculation.
//This depends on judging environment, and should be used carefully


//simple debug code of "print" operation
void print(__m256d a)
{
    cout << a[0] << " ";
    cout << a[1] << " ";
    cout << a[2] << " ";
    cout << a[3] << endl;
}

/*

Flow of function "mult" a, b consist of two complex number.
Return value is multiple of each complex number.
Required operations: 6

    +-------+----movedup----+-------+    +-------+-------+-------+-------+    +-------+----shuffle----+-------+
    |  a1R  |  a1R  |  a2R  |  a2R  | <- |  a1R  |  a1I  |  a2R  |  a2I  | -> |  a1I  |  a1I  |  a2I  |  a2I  |
    +-------+-------+-------+-------+    +-------+-------+-------+-------+    +-------+-------+-------+-------+
                    |                                                                         |
                    v                                                                         v
    +-------+------mul------+-------+    +-------+-------+-------+-------+    +-------+------mul------+-------+
    |a1R*b1R|a1R*b1I|a2R*b2R|a2R*b2I| <- |  b1R  |  b1I  |  b2R  |  b2I  | -> |a1I*b1R|a1I*b1I|a2I*b2R|a2I*b2I|
    +-------+-------+-------+-------+    +-------+-------+-------+-------+    +-------+-------+-------+-------+
                                                                                              |
        -       +       -       +                                                             |
                                                                                              |
    +-------+----shuffle----+-------+                                                         |
    |a1I*b1I|a1I*b1R|a2I*b2I|a2I*b2R| <-------------------------------------------------------+
    +-------+-------+-------+-------+                    
                    | addsub
                    v
    +---------------+---------------+---------------+---------------+
    |a1R*b1R-a1I*b1I|a1R*b1I+a1I*b1R|a2R*b2R-a2I*b2I|a2R*b2I+a2I*b2R|
    +---------------+---------------+---------------+---------------+
*/

__m256d mult(__m256d a, __m256d b)
{
    __m256d c = _mm256_movedup_pd(a);
    __m256d d = _mm256_shuffle_pd(a, a, 15);
    __m256d cb = _mm256_mul_pd(c, b);
    __m256d db = _mm256_mul_pd(d, b);
    __m256d e = _mm256_shuffle_pd(db, db, 5);
    __m256d r = _mm256_addsub_pd(cb, e);
    return r;    
}

/*
Just plain FFT operation.
kth power of root of unity is calculated by w * wlen, not precalculated one.
To reduce memory access and gain speed for two complex multiplication.
And also reduce for calling sincos system call.
*/
void fft(int n, __m128d a[], bool invert)
{
    //xor bit-reversal
    unsigned int sl = 32 - __builtin_ctz(n);
	for(unsigned i=0,j=0; i<n-1;++i)
	{
	    if(i<j) swap(a[i], a[j]);
	    unsigned t = i^(i+1);
	    t <<= __builtin_clz(t);
	    t >>= sl;
	    j ^= t;
	}
	
    for(int len=2; len<=n; len<<=1)
    {
        double ang = 2*3.14159265358979/len*(invert?-1:1);
        __m256d wlen; wlen[0] = cos(ang), wlen[1] = sin(ang);
        for(int i=0; i<n; i += len)
        {
            __m256d w; w[0] = 1; w[1] = 0;
            for(int j=0; j<len/2; ++j)
            {
                w = _mm256_permute2f128_pd(w, w, 0);
                wlen = _mm256_insertf128_pd(wlen, a[i+j+len/2], 1);
                w = mult(w, wlen);
                __m128d vw = _mm256_extractf128_pd(w, 1);
                __m128d u = a[i+j];
                a[i+j] = _mm_add_pd(u, vw);
                a[i+j+len/2] = _mm_sub_pd(u, vw);
            }
        }
    }
    if(invert)
    {
        __m128d inv; inv[0] = inv[1] = 1.0/n;
        for(int i=0; i<n; ++i)
            a[i] = _mm_mul_pd(a[i], inv);
    }
}
vector<int64_t> multiply(vector<int64_t>& v, vector<int64_t>& w)
{
    //v+w-1 is enough for size. odd n will cause error.
    int n = 2; while(n < v.size()+w.size()-1) n<<=1; 
    __m128d* fv = new __m128d[n];
    __m128d* fw = new __m128d[n];
    
    for(int i=0; i<n; ++i)
        fv[i][0] = fv[i][1] = fw[i][0] = fw[i][1] = 0;
    for(int i=0; i<v.size(); ++i)
        fv[i][0] = v[i];
    for(int i=0; i<w.size(); ++i)
        fw[i][0] = w[i];

    fft(n, fv, 0); fft(n, fw, 0); 
    
    for(int i=0; i<n; i += 2)
    {
        __m256d a, b;
        a = _mm256_insertf128_pd(a, fv[i], 0);
        a = _mm256_insertf128_pd(a, fv[i+1], 1);
        b = _mm256_insertf128_pd(b, fw[i], 0);
        b = _mm256_insertf128_pd(b, fw[i+1], 1);
        a = mult(a, b);
        fv[i] = _mm256_extractf128_pd(a, 0);
        fv[i+1] = _mm256_extractf128_pd(a, 1);
    }
    
    fft(n, fv, 1);
    
    vector<int64_t> ret(n);
    for(int i=0; i<n; ++i) ret[i] = (int64_t)round(fv[i][0]);
    
    delete[] fv;
    delete[] fw;
    return ret;
}



static char _buffer[1 << 19]; // Can be changed
static int _currentChar = 0;
static int _charsNumber = 0;
static inline int _read() {
	if (_charsNumber < 0) exit(1);
	if (!_charsNumber || _currentChar == _charsNumber) {
		_charsNumber = (int)fread(_buffer, sizeof(_buffer[0]), sizeof(_buffer), stdin);
		_currentChar = 0;
	}
	if (_charsNumber <= 0) return -1;
	return _buffer[_currentChar++];
}
static inline int _readInt() {
	int c, x, s;
	c = _read();
	while (c <= 32) c = _read();
	x = 0;
	s = 1;
	if (c == '-') {
		s = -1;
		c = _read();
	}
	while (c > 32) {
		x *= 10;
		x += c - '0';
		c = _read();
	}
	if (s < 0) x = -x;
	return x;
}


int main()
{
    int N = _readInt()+1; int M = _readInt()+1;
    vector<int64_t> _a(N), _b(M);
    for(int i=0; i<N; ++i) _a[i] = _readInt();
    for(int i=0; i<M; ++i) _b[i] = _readInt();
    vector<int64_t> a[3], b[3];
    for(int i=0; i<3; ++i)
    {
        a[i].resize(N);
        b[i].resize(M);
        for(int j=0; j<N; ++j)
        {
            a[i][j] = _a[j] & 127;
            _a[j] >>= 7;
        }
        for(int j=0; j<M; ++j)
        {
            b[i][j] = _b[j] & 127;
            _b[j] >>= 7;
        }
    }
    vector<int64_t> res[3][3];
    for(int i=0; i<3; ++i)
        for(int j=0; j<3; ++j)
        {
            res[i][j] = multiply(a[i], b[j]);
        }
    int64_t ans = 0;
    for(int i=0; i<res[0][0].size(); ++i)
    {
        int64_t t = 0;
        for(int j=0; j<3; ++j)
            for(int k=0; k<3; ++k)
                t += res[j][k][i] << (7*(j+k));
        ans ^= t;
    }
    std::cout << ans << endl;
    return 0;
    
    
    
    
}







Compilation message

tt.cpp: In function 'void fft(int, __m128d*, bool)':
tt.cpp:69:25: warning: comparison between signed and unsigned integer expressions [-Wsign-compare]
  for(unsigned i=0,j=0; i<n-1;++i)
                         ^
tt.cpp: In function 'std::vector<long int> multiply(std::vector<long int>&, std::vector<long int>&)':
tt.cpp:107:24: warning: comparison between signed and unsigned integer expressions [-Wsign-compare]
     int n = 2; while(n < v.size()+w.size()-1) n<<=1; 
                        ^
tt.cpp:113:19: warning: comparison between signed and unsigned integer expressions [-Wsign-compare]
     for(int i=0; i<v.size(); ++i)
                   ^
tt.cpp:115:19: warning: comparison between signed and unsigned integer expressions [-Wsign-compare]
     for(int i=0; i<w.size(); ++i)
                   ^
tt.cpp: In function 'int main()':
tt.cpp:205:19: warning: comparison between signed and unsigned integer expressions [-Wsign-compare]
     for(int i=0; i<res[0][0].size(); ++i)
                   ^
# Verdict Execution time Memory Grader output
1 Correct 0 ms 2576 KB Output is correct
2 Correct 0 ms 2576 KB Output is correct
3 Correct 0 ms 2576 KB Output is correct
4 Correct 0 ms 2712 KB Output is correct
5 Correct 0 ms 2856 KB Output is correct
6 Correct 0 ms 2852 KB Output is correct
7 Correct 0 ms 2844 KB Output is correct
8 Correct 0 ms 2844 KB Output is correct
# Verdict Execution time Memory Grader output
1 Correct 46 ms 6756 KB Output is correct
2 Correct 236 ms 19140 KB Output is correct
3 Correct 239 ms 19760 KB Output is correct
4 Correct 506 ms 34624 KB Output is correct
5 Correct 516 ms 35280 KB Output is correct
6 Correct 489 ms 34912 KB Output is correct
7 Correct 513 ms 35552 KB Output is correct
8 Correct 523 ms 35552 KB Output is correct
# Verdict Execution time Memory Grader output
1 Correct 1163 ms 65648 KB Output is correct
2 Correct 2953 ms 126848 KB Output is correct
3 Correct 2963 ms 130000 KB Output is correct
4 Correct 6643 ms 268188 KB Output is correct
5 Correct 6979 ms 261900 KB Output is correct
6 Correct 6643 ms 273740 KB Output is correct
7 Correct 6703 ms 278140 KB Output is correct
8 Correct 6639 ms 278140 KB Output is correct