제출 #644659

# | 제출 시각 | 아이디 | 문제 | 언어 | 결과 | 실행 시간 | 메모리
644659 | — | ymm | 운세 보기 2 (JOI14_fortune_telling2) | C++17 | 35 / 100 | 3075 ms | 2444 KiB
#include <bits/stdc++.h>
// Short integer / pair aliases used by the rest of the file.
using ll = long long;
using pii = std::pair<int, int>;
using pll = std::pair<ll, ll>;
using namespace std;

// Loop(x, l, r): runs an `ll` counter x upwards over the half-open range [l, r).
#define Loop(x, l, r) for (ll x = (l); x < (r); ++x)
// LoopR(x, l, r): runs an `ll` counter x downwards from r-1 to l (inclusive).
#define LoopR(x, l, r) for (ll x = (r) - 1; x >= (l); --x)

// Capacity of `a` and `q`, in ints. main() stores every card as two ints
// (value and XOR mask), so `a` has room for N / 2 cards.
constexpr int N = 400'000;
// Length, in ints, of the slice of `a` that main() hands to up() per call.
constexpr int S = 2000;
// up() consumes `a` as pairs of 8-int vectors (16 ints per step) and main()
// walks `a` in steps of S, so both divisibility conditions must hold.
static_assert(N % S == 0 && S % 16 == 0,
              "a must split into whole blocks made of whole 16-int vector pairs");

// Card storage in groups of 16 ints: 8 current values followed by the 8
// matching XOR masks (main() writes a[i/8*16 + i%8] and a[i/8*16 + i%8 + 8]).
// up() reads and writes this array with vmovdqa, which requires 32-byte
// aligned addresses, so the alignment is requested explicitly instead of
// relying on where the compiler happens to place the array.
alignas(32) int a[N];
// Query thresholds in input order; entries never written stay zero.
int q[N];
// Number of cards, read by main().
int n;

/*
 * up(x, y, z, l, r) -- apply three queries to one slice of the global array `a`.
 *
 * The slice is viewed as 32-byte vectors of 8 ints, from vector index l/8 up
 * to (but excluding) r/8, consumed two vectors at a time: the first vector `v`
 * of each pair holds the current values, the second vector `u` holds the XOR
 * masks. For each of x, y, z, in that order, every lane with v <= threshold
 * (signed compare) is replaced by v ^ u. Only `v` is written back.
 *
 * The assembly further down looks like compiler output for the reference
 * implementation kept in this comment (labels renamed to .my*); presumably it
 * was pasted in so the AVX2 code does not depend on the judge's compiler
 * flags -- TODO confirm the motivation.
 *
 * NOTE: all vector loads/stores are vmovdqa, which requires 32-byte aligned
 * addresses, so `a` itself must be 32-byte aligned.
 *
 * Reference implementation:
 *
__attribute__((optimize("O3,unroll-loops"),target("avx2")))
void up(int x, int y, int z, int l, int r)
{
	typedef int ymm __attribute((vector_size(32),aligned(32)));
	ymm *const b = (ymm*)a;
	for (int i = l/8; i < r/8; i += 2) {
		auto v = b[i], u = b[i+1];
		v ^= v <= x? u: 0;
		v ^= v <= y? u: 0;
		v ^= v <= z? u: 0;
		b[i] = v;
	}
}
*/
void up(int,int,int,int,int);
asm("\n"
"	.text\n"
"	.p2align 4\n"
"	.globl	_Z2upiiiii\n"
"	.type	_Z2upiiiii, @function\n"
// _Z2upiiiii is the Itanium-ABI mangled name of up(int,int,int,int,int), so the
// declaration above resolves to this symbol.
"_Z2upiiiii:\n"
".myLFB9897:\n"
"	.cfi_startproc\n"
// System V x86-64 argument registers: edi = x, esi = y, edx = z, ecx = l, r8d = r.
"	movl	%edx, %eax\n"		// eax = z
"	movl	%ecx, %edx\n"		// edx = l
// ecx = l / 8 and edx = r / 8, both truncating toward zero (the +7 bias is
// kept only when the operand is negative).
"	leal	7(%rcx), %ecx\n"
"	testl	%edx, %edx\n"
"	cmovns	%edx, %ecx\n"
"	leal	7(%r8), %edx\n"
"	sarl	$3, %ecx\n"
"	testl	%r8d, %r8d\n"
"	cmovns	%r8d, %edx\n"
"	sarl	$3, %edx\n"
// Empty range (l/8 >= r/8): return immediately.
"	cmpl	%edx, %ecx\n"
"	jge	.myL21\n"
// Pointer setup, interleaved with moving the thresholds into vector registers.
"	vmovd	%esi, %xmm4\n"		// xmm4 = y
"	subl	$1, %edx\n"		// edx = r/8 - 1
"	vmovd	%edi, %xmm5\n"		// xmm5 = x
"	movslq	%ecx, %rsi\n"		// rsi = l/8
"	subq	%rcx, %rdx\n"		// rdx = r/8 - 1 - l/8
"	leaq	a(%rip), %rdi\n"	// rdi = address of a[0]
"	vmovd	%eax, %xmm3\n"		// xmm3 = z
"	movq	%rsi, %rax\n"
"	andl	$4294967294, %edx\n"	// clear bit 0: round down to an even vector offset
"	leaq	64(%rdi), %rcx\n"
"	salq	$5, %rax\n"		// rax = 32 * (l/8), byte offset of the first vector
"	vpbroadcastd	%xmm5, %ymm5\n"	// ymm5 = x in all 8 lanes
"	addq	%rsi, %rdx\n"
"	vpbroadcastd	%xmm4, %ymm4\n"	// ymm4 = y in all 8 lanes
"	vpbroadcastd	%xmm3, %ymm3\n"	// ymm3 = z in all 8 lanes
"	addq	%rdi, %rax\n"		// rax = address of the first (v, u) pair
"	salq	$5, %rdx\n"
"	addq	%rcx, %rdx\n"		// rdx = end address, just past the last pair to process
// rcx = (number of 64-byte pairs) mod 4, used to peel 0..3 pairs before the
// 4x-unrolled main loop.
"	movq	%rdx, %rcx\n"
"	subq	%rax, %rcx\n"
"	subq	$64, %rcx\n"
"	shrq	$6, %rcx\n"
"	addq	$1, %rcx\n"
"	andl	$3, %ecx\n"
"	je	.myL3\n"		// remainder 0: go straight to the main loop
"	cmpq	$1, %rcx\n"
"	je	.myL15\n"		// remainder 1: peel one pair
"	cmpq	$2, %rcx\n"
"	je	.myL16\n"		// remainder 2: peel two pairs
// Remainder 3: fall through and peel three pairs.
// Every pair below follows the same pattern:
//   ymm0 = v (8 ints at the pair's base), ymm2 = u (the next 8 ints);
//   vpcmpgtd -> ymm1 = lane mask of (v > threshold);
//   vpandn   -> ymm1 = ~ymm1 & u, i.e. u where v <= threshold, else 0;
//   vpxor    -> v ^= ymm1;
// done for x (ymm5), then y (ymm4), then z (ymm3), after which v is stored back.
"	vmovdqa	(%rax), %ymm0\n"
"	vmovdqa	32(%rax), %ymm2\n"
"	addq	$64, %rax\n"
"	vpcmpgtd	%ymm5, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vpcmpgtd	%ymm4, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vpcmpgtd	%ymm3, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vmovdqa	%ymm0, -64(%rax)\n"
".myL16:\n"
"	vmovdqa	(%rax), %ymm0\n"
"	vmovdqa	32(%rax), %ymm2\n"
"	addq	$64, %rax\n"
"	vpcmpgtd	%ymm5, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vpcmpgtd	%ymm4, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vpcmpgtd	%ymm3, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vmovdqa	%ymm0, -64(%rax)\n"
".myL15:\n"
"	vmovdqa	(%rax), %ymm0\n"
"	vmovdqa	32(%rax), %ymm2\n"
"	addq	$64, %rax\n"
"	vpcmpgtd	%ymm5, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vpcmpgtd	%ymm4, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vpcmpgtd	%ymm3, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vmovdqa	%ymm0, -64(%rax)\n"
// The peeled pairs may already have covered the whole range.
"	cmpq	%rax, %rdx\n"
"	je	.myL22\n"
// Main loop: four pairs per trip, at byte offsets 0, 64, 128 and 192 from rax.
".myL3:\n"
"	vmovdqa	(%rax), %ymm0\n"
"	vmovdqa	32(%rax), %ymm2\n"
"	leaq	64(%rax), %rcx\n"	// rcx = rax + 64, base for the fourth pair below
"	vpcmpgtd	%ymm5, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vpcmpgtd	%ymm4, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vpcmpgtd	%ymm3, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vmovdqa	96(%rax), %ymm2\n"	// preload u of the second pair
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vmovdqa	%ymm0, (%rax)\n"
"	vmovdqa	64(%rax), %ymm0\n"
"	vpcmpgtd	%ymm5, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vpcmpgtd	%ymm4, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vpcmpgtd	%ymm3, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vmovdqa	160(%rax), %ymm2\n"	// preload u of the third pair
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vmovdqa	%ymm0, 64(%rax)\n"
"	vmovdqa	128(%rax), %ymm0\n"
"	vpcmpgtd	%ymm5, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vpcmpgtd	%ymm4, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vpcmpgtd	%ymm3, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vmovdqa	%ymm0, 128(%rax)\n"
// Fourth pair, addressed through rcx: 128(%rcx) is byte offset 192 from the
// old rax.
"	vmovdqa	128(%rcx), %ymm0\n"
"	vmovdqa	160(%rcx), %ymm2\n"
"	leaq	192(%rcx), %rax\n"	// rax advances by 256 bytes (four pairs)
"	vpcmpgtd	%ymm5, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vpcmpgtd	%ymm4, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vpcmpgtd	%ymm3, %ymm0, %ymm1\n"
"	vpandn	%ymm2, %ymm1, %ymm1\n"
"	vpxor	%ymm1, %ymm0, %ymm0\n"
"	vmovdqa	%ymm0, 128(%rcx)\n"
"	cmpq	%rax, %rdx\n"
"	jne	.myL3\n"
".myL22:\n"
// Zero the upper halves of the ymm registers before returning.
"	vzeroupper\n"
".myL21:\n"
"	ret\n"
"	.cfi_endproc\n"
".myLFE9897:\n"
"	.size	_Z2upiiiii, .-_Z2upiiiii\n"
);

int main()
{
	cin.tie(0) -> sync_with_stdio(false);
	int k;
	cin >> n >> k;
	Loop (i,0,n) {
		int x, y;
		cin >> x >> y;
		y ^= x;
		a[i/8*16 + i%8] = x;
		a[i/8*16 + i%8 + 8] = y;
	}
	Loop (i,0,k)
		cin >> q[i];
	for (int l = 0; l < N; l += S) {
		for (int i = 0; i < k; i += 3)
			up(q[i+0], q[i+1], q[i+2], l, l+S);
	}
	ll ans = 0;
	Loop (i,0,n)
		ans += a[i/8*16 + i%8];
	cout << ans << '\n';
}
# | Verdict | Execution time | Memory | Grader output
Fetching results...
# | Verdict | Execution time | Memory | Grader output
Fetching results...
# | Verdict | Execution time | Memory | Grader output
Fetching results...