제출 #1259085

#제출 시각아이디문제언어결과실행 시간메모리
1259085Sorting3개의 봉우리 (IOI25_triples)C++20
17 / 100
1812 ms15292 KiB
#include <iostream>
#include <algorithm>
#include <vector>
#include <map>
#include <set>
#include <array>
#include <iomanip>
#include <queue>
#include <stack>
#include <numeric>
#include <cassert>
#include <cmath>
#include <random>
#include <ctime>
#include <chrono>
#include <x86intrin.h>
#pragma GCC optimize("O3")
#pragma GCC target("avx2")

using namespace std;
typedef long long ll;
typedef vector<int> vi;
typedef vector<ll> vll;
typedef pair<int, int> pii;
typedef pair<ll, ll> pll;
#define all(x) (x).begin(), (x).end()
#define sz(x) ((int)x.size())
#define rep(i, a, b) for(int i = a; i < b; ++i)

template<typename C> struct rge{C l, r;};
template<typename C> rge<C> range(C i, C j) { return rge<C>{i, j}; }
template<typename C> ostream& operator<<(ostream &os, rge<C> r) { os << '{'; for(auto it = r.l; it != r.r; it++) os << "," + (it == r.l) << *it; os << '}'; return os; }
template<typename A, typename B> ostream& operator<<(ostream &os, const pair<A, B> &p) { return os << '{' << p.first << "," << p.second << '}'; }
template<typename T_container, typename T = typename enable_if<!is_same<T_container, string>::value, typename T_container::value_type>::type> ostream& operator<<(ostream &os, const T_container &v) { os << '{'; string sep; for (const T &x : v) os << sep << x, sep = ","; return os << '}'; }
void dbg_out() { cerr << ']' << endl; }
template<typename A> void dbg_out(A H) { cerr << H; dbg_out(); }
template<typename A, typename B, typename... C> void dbg_out(A H, B G, C... T) { cerr << H << ","; dbg_out(G, T...); }
#ifdef DEBUG
#define debug(...) cerr << "[" << #__VA_ARGS__ << "] = [", dbg_out(__VA_ARGS__)
#else
#define debug(...)
#endif

template <typename T>
void remove_duplicates(vector<T> &v){
    sort(all(v));
    v.resize(unique(all(v)) - v.begin());
}

template <typename T> void chmin(T &a, T b){ a = (b < a) ? b : a; }
template <typename T> void chmax(T &a, T b){ a = (b > a) ? b : a; } 

const int N = 2e5 + 3;

ll n;
alignas(64) int h[N];

ll count_easy() {
    set<array<int, 3>> triples;

    auto test3 = [&](int a, int b, int c) {
        if (c < 0 || c >= n) {
            return;
        }
        int arr[3]{a, b, c};
        sort(arr, arr + 3);
        a = arr[0];
        b = arr[1];
        c = arr[2];

        int diffs[3]{b - a, c - b, c - a};
        int vals[3]{h[a], h[b], h[c]};
        sort(diffs, diffs + 3);
        sort(vals, vals + 3);

        for (int i = 0; i < 3; ++i) {
            if (diffs[i] != vals[i]) {
                return;
            }
        }

        triples.insert({a, b, c});
    };

    auto test2 = [&](int a, int b) {
        if (b < 0 || b >= n) {
            return;
        }
        test3(a, b, a + h[b]);
        test3(a, b, a - h[b]);
        test3(a, b, b + h[b]);
        test3(a, b, b - h[b]);
    };

    for (int i = 0; i < n; ++i) {
        test2(i, i - h[i]);
        test2(i, i + h[i]);
    }
    return triples.size();
}

static inline __m256i seq8_from(int i) {
    const __m256i base = _mm256_set1_epi32(i);                 // [i,i,i,i,i,i,i,i]
    const __m256i inc  = _mm256_setr_epi32(0,1,2,3,4,5,6,7);   // [0..7] in ascending lane order
    return _mm256_add_epi32(base, inc);                        // [i..i+7]
}

int hsum(__m256i x) {
    __m128i l = _mm256_extracti128_si256(x, 0);
    __m128i h = _mm256_extracti128_si256(x, 1);
    l = _mm_add_epi32(l, h);
    l = _mm_hadd_epi32(l, l);
    return _mm_extract_epi32(l, 0) + _mm_extract_epi32(l, 1);
}

ll count_triples(vector<int> H) {
    n = H.size();
    for (int i = 0; i < n; ++i) {
        h[i] = H[i];
    }

    ll ans = count_easy();
    for (int i = 0; i < n; ++i) {
        int j = i + h[i];
        int k = j + h[i];

        if (k >= n) {
            continue;
        }
        ans -= h[k] == h[i] && h[j] == 2 * h[i];
    }

    for (int i = 0; i < n; ++i) {
        int j = i + 1;
        int k = j + h[i];
        
        __m256i vans = _mm256_set1_epi32(0);
        const __m256i ones = _mm256_set1_epi32(1);
        for (; k + 16 < n; k += 16, j += 16) {
            for (int iter = 0; iter < 2; ++iter, k += 8, j += 8) {
                __m256i hj = _mm256_loadu_si256((const __m256i*)&h[j]);
                __m256i hk = _mm256_loadu_si256((const __m256i*)&h[k]);
                __m256i kdiff = seq8_from(k - i);
                __m256i jdiff = seq8_from(j - i);
                
                // vector compares -> 0xFFFFFFFF per lane on match, else 0x0
                __m256i mj = _mm256_cmpeq_epi32(hj, kdiff);
                __m256i mk = _mm256_cmpeq_epi32(hk, jdiff);

                // lanes where both match
                __m256i m = _mm256_and_si256(mj, mk);

                // turn mask into 0/1 and accumulate
                __m256i inc = _mm256_and_si256(m, ones);
                vans = _mm256_add_epi32(vans, inc);
            }
        }
        ans += hsum(vans);
        for (; k < n; ++j, ++k) {
            ans += (h[j] == (k - i)) && (h[k] == (j - i));
        }
    }
    return ans;
}

std::vector<int> construct_range(int M, int K) { return {};}

#ifdef LOCAL_TEST
int main() {
    vector<int> H = {4,1,4,3,2,6,1};
    cout << count_triples(H) << "\n"; // 3
}
#endif
#Verdict Execution timeMemoryGrader output
Fetching results...
#Verdict Execution timeMemoryGrader output
Fetching results...
#Verdict Execution timeMemoryGrader output
Fetching results...
#Verdict Execution timeMemoryGrader output
Fetching results...
#Verdict Execution timeMemoryGrader output
Fetching results...
#Verdict Execution timeMemoryGrader output
Fetching results...
#Verdict Execution timeMemoryGrader output
Fetching results...
#Verdict Execution timeMemoryGrader output
Fetching results...
#Verdict Execution timeMemoryGrader output
Fetching results...
#Verdict Execution timeMemoryGrader output
Fetching results...
#Verdict Execution timeMemoryGrader output
Fetching results...
#Verdict Execution timeMemoryGrader output
Fetching results...
#Verdict Execution timeMemoryGrader output
Fetching results...