#include <iostream>
#include <algorithm>
#include <vector>
#include <map>
#include <set>
#include <array>
#include <iomanip>
#include <queue>
#include <stack>
#include <numeric>
#include <cassert>
#include <cmath>
#include <random>
#include <ctime>
#include <chrono>
#include <x86intrin.h>
#pragma GCC optimize("O3")
#pragma GCC target("avx2")
using namespace std;
typedef long long ll;
typedef vector<int> vi;
typedef vector<ll> vll;
typedef pair<int, int> pii;
typedef pair<ll, ll> pll;
#define all(x) (x).begin(), (x).end()
#define sz(x) ((int)x.size())
#define rep(i, a, b) for(int i = a; i < b; ++i)
template<typename C> struct rge{C l, r;};
template<typename C> rge<C> range(C i, C j) { return rge<C>{i, j}; }
template<typename C> ostream& operator<<(ostream &os, rge<C> r) { os << '{'; for(auto it = r.l; it != r.r; it++) os << "," + (it == r.l) << *it; os << '}'; return os; }
template<typename A, typename B> ostream& operator<<(ostream &os, const pair<A, B> &p) { return os << '{' << p.first << "," << p.second << '}'; }
template<typename T_container, typename T = typename enable_if<!is_same<T_container, string>::value, typename T_container::value_type>::type> ostream& operator<<(ostream &os, const T_container &v) { os << '{'; string sep; for (const T &x : v) os << sep << x, sep = ","; return os << '}'; }
void dbg_out() { cerr << ']' << endl; }
template<typename A> void dbg_out(A H) { cerr << H; dbg_out(); }
template<typename A, typename B, typename... C> void dbg_out(A H, B G, C... T) { cerr << H << ","; dbg_out(G, T...); }
#ifdef DEBUG
#define debug(...) cerr << "[" << #__VA_ARGS__ << "] = [", dbg_out(__VA_ARGS__)
#else
#define debug(...)
#endif
template <typename T>
void remove_duplicates(vector<T> &v){
sort(all(v));
v.resize(unique(all(v)) - v.begin());
}
template <typename T> void chmin(T &a, T b){ a = (b < a) ? b : a; }
template <typename T> void chmax(T &a, T b){ a = (b > a) ? b : a; }
const int N = 2e5 + 3;
ll n;
alignas(64) int h[N];
ll count_easy() {
set<array<int, 3>> triples;
auto test3 = [&](int a, int b, int c) {
if (c < 0 || c >= n) {
return;
}
int arr[3]{a, b, c};
sort(arr, arr + 3);
a = arr[0];
b = arr[1];
c = arr[2];
int diffs[3]{b - a, c - b, c - a};
int vals[3]{h[a], h[b], h[c]};
sort(diffs, diffs + 3);
sort(vals, vals + 3);
for (int i = 0; i < 3; ++i) {
if (diffs[i] != vals[i]) {
return;
}
}
triples.insert({a, b, c});
};
auto test2 = [&](int a, int b) {
if (b < 0 || b >= n) {
return;
}
test3(a, b, a + h[b]);
test3(a, b, a - h[b]);
test3(a, b, b + h[b]);
test3(a, b, b - h[b]);
};
for (int i = 0; i < n; ++i) {
test2(i, i - h[i]);
test2(i, i + h[i]);
}
return triples.size();
}
static inline __m256i seq8_from(int i) {
const __m256i base = _mm256_set1_epi32(i); // [i,i,i,i,i,i,i,i]
const __m256i inc = _mm256_setr_epi32(0,1,2,3,4,5,6,7); // [0..7] in ascending lane order
return _mm256_add_epi32(base, inc); // [i..i+7]
}
int hsum(__m256i x) {
__m128i l = _mm256_extracti128_si256(x, 0);
__m128i h = _mm256_extracti128_si256(x, 1);
l = _mm_add_epi32(l, h);
l = _mm_hadd_epi32(l, l);
return _mm_extract_epi32(l, 0) + _mm_extract_epi32(l, 1);
}
ll count_triples(vector<int> H) {
n = H.size();
for (int i = 0; i < n; ++i) {
h[i] = H[i];
}
ll ans = count_easy();
for (int i = 0; i < n; ++i) {
int j = i + h[i];
int k = j + h[i];
if (k >= n) {
continue;
}
ans -= h[k] == h[i] && h[j] == 2 * h[i];
}
ll mx = *max_element(h, h + n);
for (int i = 0; i < n; ++i) {
int j = i + 1;
int k = j + h[i];
__m256i kdiff = seq8_from(k - i);
__m256i jdiff = seq8_from(j - i);
__m256i vans = _mm256_set1_epi32(0);
const __m256i ones = _mm256_set1_epi32(1);
const __m256i eights = _mm256_set1_epi32(8);
ll C = min(n, (ll)i + mx + 1);
for (; k + 8 < C; k += 8, j += 8) {
__m256i hj = _mm256_loadu_si256((const __m256i*)&h[j]);
__m256i hk = _mm256_loadu_si256((const __m256i*)&h[k]);
// vector compares -> 0xFFFFFFFF per lane on match, else 0x0
__m256i mj = _mm256_cmpeq_epi32(hj, kdiff);
__m256i mk = _mm256_cmpeq_epi32(hk, jdiff);
// lanes where both match
__m256i m = _mm256_and_si256(mj, mk);
vans = _mm256_add_epi32(vans, m);
kdiff = _mm256_add_epi32(kdiff, eights);
jdiff = _mm256_add_epi32(jdiff, eights);
}
ans -= hsum(vans);
for (; k < C; ++j, ++k) {
ans += (h[j] == (k - i)) && (h[k] == (j - i));
}
}
return ans;
}
std::vector<int> construct_range(int M, int K) { return {};}
#ifdef LOCAL_TEST
int main() {
vector<int> H = {4,1,4,3,2,6,1};
cout << count_triples(H) << "\n"; // 3
}
#endif
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |