This submission was migrated from a previous version of oj.uz, which used a different machine for grading. It may produce a different result if resubmitted.
#include <bits/stdc++.h>
#define Loop(x,l,r) for (ll x = (l); x < (ll)(r); ++x)
#define LoopR(x,l,r) for (ll x = (r)-1; x >= (ll)(l); --x)
typedef long long ll;
typedef std::pair<int, int> pii;
typedef std::pair<ll , ll > pll;
using namespace std;
const int N = 200'064;
int rg[N];
short reg[2*N];
char delta[2*N];
int len = 0, len31;
vector<int> A[N];
int n, r, q;
char sum[2*N/32][32][32];
char cnt[2*N/32][32];
char scnt[2*N/32][32];
#pragma GCC optimize("O3,unroll-loops")
#pragma GCC target("avx2,abm,bmi,bmi2")
typedef char c32 __attribute__((vector_size(32),aligned(1)));
typedef short s16 __attribute__((vector_size(32),aligned(1)));
void dfs(int v)
{
reg[len] = rg[v];
delta[len] = 1;
++len;
for (int u : A[v])
dfs(u);
reg[len] = rg[v];
delta[len] = -1;
++len;
}
void init()
{
len31 = len/31+1;
Loop (i,0,len31)
{
int ii = i*31;
Loop (ir1,0,31) {
short r1 = reg[ii + ir1];
Loop (j,ii,ii+31) {
cnt[i][ir1] += reg[j] == r1;
scnt[i][ir1] += delta[j] & -(reg[j] == r1);
}
Loop (ir2,0,31) {
short r2 = reg[ii + ir2];
char ans = 0, pre = 0;
Loop (j,ii,ii+31) {
ans += pre & -(reg[j] == r2);
pre += delta[j] & -(reg[j] == r1);
}
sum[i][ir1][ir2] = ans;
}
}
}
}
int solve(short r1, short r2)
{
int ans = 0, pre = 0;
for (int i = 0, ii = 0; i < len31; ++i, ii += 31) {
s16 rl = *(s16 *)(reg+ii);
s16 rr = *(s16 *)(reg+ii+16);
c32 is_r1 = __builtin_ia32_packsswb256(rl == r1, rr == r1);
c32 is_r2 = __builtin_ia32_packsswb256(rl == r2, rr == r2);
int r1msk = __builtin_ia32_pmovmskb256(is_r1) | INT_MIN;
int r2msk = __builtin_ia32_pmovmskb256(is_r2) | INT_MIN;
int fr1 = __builtin_ctz(r1msk);
int fr2 = __builtin_ctz(r2msk);
ans += pre * cnt[i][fr2];
ans += sum[i][fr1][fr2];
pre += scnt[i][fr1];
}
return ans/2;
}
int main()
{
cin.tie(0) -> sync_with_stdio(false);
cin >> n >> r >> q;
cin >> rg[0];
Loop (i,1,n) {
int p;
cin >> p >> rg[i];
A[p-1].push_back(i);
}
dfs(0);
init();
while (q--) {
int r1, r2;
cin >> r1 >> r2;
cout << solve(r1, r2) << '\n';
cout.flush();
}
}
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |