// 이 제출은 이전 버전의 oj.uz에서 채점하였습니다. 현재는 제출 당시와는 다른 서버에서 채점을 하기 때문에, 다시 제출하면 결과가 달라질 수도 있습니다.
#include <bits/stdc++.h>
// Shorthand integer and pair aliases used by the rest of the file.
using ll = long long;
using pii = std::pair<int, int>;
using pll = std::pair<ll, ll>;
using namespace std;

// Loop(idx, lo, hi): for-loop header; idx is an ll that ascends over [lo, hi).
#define Loop(idx, lo, hi) for (ll idx = (lo); idx < (hi); ++idx)
// LoopR(idx, lo, hi): for-loop header; idx is an ll that descends from hi-1 to lo.
#define LoopR(idx, lo, hi) for (ll idx = (hi) - 1; idx >= (lo); --idx)
// Capacity, in ints, of both global buffers below.
const int N = 400'000;
// Number of ints of `a` handed to up() per call by the blocked sweep in main().
const int S = 2000;
// main() walks `a` in steps of S and up() consumes 16 ints (two 8-lane
// vectors) at a time, so blocks must tile the buffer exactly and start on a
// value/mask pair boundary.
static_assert(S % 16 == 0, "S must be a whole number of 16-int value/mask pairs");
static_assert(N % S == 0, "blocks of S ints must tile the buffer exactly");
// Interleaved working buffer: each group of 16 ints is 8 current values
// followed by the 8 matching xor masks (first ^ second input number).
// The assembly kernel accesses it with vmovdqa, which requires 32-byte
// alignment, so the alignment is requested explicitly instead of relying on
// whatever the compiler happens to pick for a large global array.
alignas(32) int a[N];
// Thresholds in input order; slots past the last one read stay zero.
int q[N];
// Number of value pairs read from the input.
int n;
/*
up(x, y, z, l, r): apply three thresholds, in the order x, y, z, to the slice
of the global buffer `a` covering the 8-int vectors with index l/8 .. r/8 - 1.

`a` is treated as an array of 8-lane int vectors taken in pairs: an even
vector holds current values (v), the following one holds xor masks (u).
For each threshold t, every lane with v <= t (signed compare) is replaced by
v ^ u; the masks themselves are never written.

The function is defined by the hand-written assembly below. The C++ source it
corresponds to is kept here for reference:

__attribute__((optimize("O3,unroll-loops"),target("avx2")))
void up(int x, int y, int z, int l, int r)
{
typedef int ymm __attribute((vector_size(32),aligned(32)));
ymm *const b = (ymm*)a;
for (int i = l/8; i < r/8; i += 2) {
auto v = b[i], u = b[i+1];
v ^= v <= x? u: 0;
v ^= v <= y? u: 0;
v ^= v <= z? u: 0;
b[i] = v;
}
}

NOTE(review): all loads and stores use vmovdqa, which faults on addresses that
are not 32-byte aligned. Nothing in this block enforces that for `a`; confirm
that the definition of `a` guarantees it.
NOTE(review): the loop always consumes whole (v, u) pairs starting at vector
l/8, so callers presumably pass l and r as multiples of 16 inside the buffer;
no bounds or parity checks are performed here.
*/
void up(int,int,int,int,int);
// File-scope assembly that defines the symbol _Z2upiiiii, i.e. the mangled
// name of the up(int,int,int,int,int) declared just above.
asm("\n"
" .text\n"
" .p2align 4\n"
" .globl _Z2upiiiii\n"
" .type _Z2upiiiii, @function\n"
"_Z2upiiiii:\n"
".myLFB9897:\n"
" .cfi_startproc\n"
// Arguments arrive as x=%edi, y=%esi, z=%edx, l=%ecx, r=%r8d.
// z is parked in %eax; then %ecx = l/8 and %edx = r/8, both rounded toward
// zero (7 is added before the arithmetic shift when the operand is negative).
// If l/8 >= r/8 there is nothing to do and the function returns at .myL21.
" movl %edx, %eax\n"
" movl %ecx, %edx\n"
" leal 7(%rcx), %ecx\n"
" testl %edx, %edx\n"
" cmovns %edx, %ecx\n"
" leal 7(%r8), %edx\n"
" sarl $3, %ecx\n"
" testl %r8d, %r8d\n"
" cmovns %r8d, %edx\n"
" sarl $3, %edx\n"
" cmpl %edx, %ecx\n"
" jge .myL21\n"
// Broadcast the thresholds to all 8 lanes: %ymm5 = x, %ymm4 = y, %ymm3 = z.
// Interleaved with that, compute the byte pointers:
//   %rax = a + 32 * (l/8)                      (first value vector)
//   %rdx = a + 32 * (l/8 + ((r/8 - 1 - l/8) & ~1)) + 64   (end of last pair)
" vmovd %esi, %xmm4\n"
" subl $1, %edx\n"
" vmovd %edi, %xmm5\n"
" movslq %ecx, %rsi\n"
" subq %rcx, %rdx\n"
" leaq a(%rip), %rdi\n"
" vmovd %eax, %xmm3\n"
" movq %rsi, %rax\n"
" andl $4294967294, %edx\n"
" leaq 64(%rdi), %rcx\n"
" salq $5, %rax\n"
" vpbroadcastd %xmm5, %ymm5\n"
" addq %rsi, %rdx\n"
" vpbroadcastd %xmm4, %ymm4\n"
" vpbroadcastd %xmm3, %ymm3\n"
" addq %rdi, %rax\n"
" salq $5, %rdx\n"
" addq %rcx, %rdx\n"
// %rcx = (number of 64-byte pairs) mod 4. The main loop at .myL3 is unrolled
// four times, so the remainder (3, 2 or 1 pairs) is peeled off first by
// entering the fall-through chain below at the matching point.
" movq %rdx, %rcx\n"
" subq %rax, %rcx\n"
" subq $64, %rcx\n"
" shrq $6, %rcx\n"
" addq $1, %rcx\n"
" andl $3, %ecx\n"
" je .myL3\n"
" cmpq $1, %rcx\n"
" je .myL15\n"
" cmpq $2, %rcx\n"
" je .myL16\n"
// One pair step (this pattern repeats throughout the rest of the function):
//   %ymm0 = values, %ymm2 = masks
//   vpcmpgtd: %ymm1 = all-ones in lanes where value > threshold
//   vpandn:   %ymm1 = ~%ymm1 & masks  -> mask kept only where value <= threshold
//   vpxor:    values ^= %ymm1
// performed for x, then y, then z, after which the values are stored back.
// Remainder 3: first peeled pair.
" vmovdqa (%rax), %ymm0\n"
" vmovdqa 32(%rax), %ymm2\n"
" addq $64, %rax\n"
" vpcmpgtd %ymm5, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vpcmpgtd %ymm4, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vpcmpgtd %ymm3, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vmovdqa %ymm0, -64(%rax)\n"
// Remainder 2 enters here.
".myL16:\n"
" vmovdqa (%rax), %ymm0\n"
" vmovdqa 32(%rax), %ymm2\n"
" addq $64, %rax\n"
" vpcmpgtd %ymm5, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vpcmpgtd %ymm4, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vpcmpgtd %ymm3, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vmovdqa %ymm0, -64(%rax)\n"
// Remainder 1 enters here. After this pair, finish if the end was reached.
".myL15:\n"
" vmovdqa (%rax), %ymm0\n"
" vmovdqa 32(%rax), %ymm2\n"
" addq $64, %rax\n"
" vpcmpgtd %ymm5, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vpcmpgtd %ymm4, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vpcmpgtd %ymm3, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vmovdqa %ymm0, -64(%rax)\n"
" cmpq %rax, %rdx\n"
" je .myL22\n"
// Main loop: four pairs (256 bytes) per iteration, at offsets 0, 64 and 128
// from %rax and 128 from %rcx (= %rax + 64, i.e. offset 192). %rax then
// advances by 256 and the loop repeats until it equals the end pointer %rdx.
".myL3:\n"
" vmovdqa (%rax), %ymm0\n"
" vmovdqa 32(%rax), %ymm2\n"
" leaq 64(%rax), %rcx\n"
" vpcmpgtd %ymm5, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vpcmpgtd %ymm4, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vpcmpgtd %ymm3, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vmovdqa 96(%rax), %ymm2\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vmovdqa %ymm0, (%rax)\n"
" vmovdqa 64(%rax), %ymm0\n"
" vpcmpgtd %ymm5, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vpcmpgtd %ymm4, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vpcmpgtd %ymm3, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vmovdqa 160(%rax), %ymm2\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vmovdqa %ymm0, 64(%rax)\n"
" vmovdqa 128(%rax), %ymm0\n"
" vpcmpgtd %ymm5, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vpcmpgtd %ymm4, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vpcmpgtd %ymm3, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vmovdqa %ymm0, 128(%rax)\n"
" vmovdqa 128(%rcx), %ymm0\n"
" vmovdqa 160(%rcx), %ymm2\n"
" leaq 192(%rcx), %rax\n"
" vpcmpgtd %ymm5, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vpcmpgtd %ymm4, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vpcmpgtd %ymm3, %ymm0, %ymm1\n"
" vpandn %ymm2, %ymm1, %ymm1\n"
" vpxor %ymm1, %ymm0, %ymm0\n"
" vmovdqa %ymm0, 128(%rcx)\n"
" cmpq %rax, %rdx\n"
" jne .myL3\n"
// Exit after vector work: clear the upper halves of the ymm registers first.
// The early-out path (.myL21) skips this because no ymm register was touched.
".myL22:\n"
" vzeroupper\n"
".myL21:\n"
" ret\n"
" .cfi_endproc\n"
".myLFE9897:\n"
" .size _Z2upiiiii, .-_Z2upiiiii\n"
);
int main()
{
cin.tie(0) -> sync_with_stdio(false);
int k;
cin >> n >> k;
Loop (i,0,n) {
int x, y;
cin >> x >> y;
y ^= x;
a[i/8*16 + i%8] = x;
a[i/8*16 + i%8 + 8] = y;
}
Loop (i,0,k)
cin >> q[i];
for (int l = 0; l < N; l += S) {
for (int i = 0; i < k; i += 3)
up(q[i+0], q[i+1], q[i+2], l, l+S);
}
ll ans = 0;
Loop (i,0,n)
ans += a[i/8*16 + i%8];
cout << ans << '\n';
}
/*
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |
# | Verdict | Execution time | Memory | Grader output |
---|
Fetching results... |
*/