Submission #752241

# Submission time Handle Problem Language Result Execution time Memory
752241 2023-06-02T14:59:31 Z tch1cherin Olympic Bus (JOI20_ho_t4) C++17
37 / 100
1000 ms 12780 KB
#pragma clang attribute push (__attribute__((target("avx2,bmi,bmi2,lzcnt,popcnt"))), apply_to=function)
#pragma GCC optimize ("Ofast")
#pragma GCC target ("avx2,bmi,bmi2,lzcnt,popcnt")

#include <x86intrin.h>
#include <unistd.h>

#include <vector>
#include <ext/pb_ds/assoc_container.hpp>
using namespace std;
using namespace __gnu_pbds;
using namespace __gnu_cxx;
#include <sys/mman.h>
#include <bits/stdc++.h>
#define dprintf(...) //fprintf(stderr, __VA_ARGS__)
#define ll long long

// https://judge.yosupo.jp/submission/100577
template<typename T>
struct align_alloc
{
        T* oldptr=0;
        T* ptr = 0;
        int ptr_sz = 0;
        typedef T value_type;
        T* allocate(int n) {
                if (n <= 1<<14) {
                        int sz=(n*sizeof(T) + 63)>>6<<6;
                        if (sz <= ptr_sz) return ptr;

                        if(oldptr)free(oldptr);
                        oldptr=ptr;
                        ptr_sz = sz;
                        return ptr = (T*) aligned_alloc(64, sz);
                }
                int sz = (n*sizeof(T) + (1<<21) - 1) >> 21 << 21;
                if (sz <= ptr_sz) return ptr;

                if(oldptr)free(oldptr);
                oldptr=ptr;
                ptr_sz = sz;
                ptr = (T*)aligned_alloc(1<<21, sz);
                madvise(ptr, sz, MADV_HUGEPAGE);
                return ptr;
        }
        void deallocate(T* p, int n) {
                if (p==oldptr and oldptr) {
                        free(oldptr);
                        oldptr=0;
                }
        } ~align_alloc() {
                if(ptr) free(ptr);
                ptr=0;
                ptr_sz=0;
                if(oldptr) free(oldptr);
                oldptr=0;
        }
        template<typename U> struct rebind { typedef align_alloc<U> other; };
};

typedef __m256i mvi;
typedef __m256 mvf;
typedef __m256d mvd;
typedef unsigned long long ull;

pair</*min*/ll, /*ind*/int> argmin(mvi v1, mvi v2)
{
        mvi mmask = _mm256_cmpgt_epi64(v1, v2);
        mvi v=(mvi)_mm256_blendv_pd((mvd)v1, (mvd)v2, (mvd)mmask);
        mvi x=v;
        mvi y = (mvi)_mm256_permute_pd((mvd)x, 0b01'01); // 0<=>1 2<=>3
        mvi mask = _mm256_cmpgt_epi64(x, y);
        x = (mvi)_mm256_blendv_pd((mvd)x, (mvd)y, (mvd)mask);
        y = _mm256_permute4x64_epi64(x, 0b01'00'11'10);
        mask = _mm256_cmpgt_epi64(x, y);
        x = (mvi)_mm256_blendv_pd((mvd)x, (mvd)y, (mvd)mask);

        ll mn = _mm256_extract_epi64(x, 0);
        mask = _mm256_cmpeq_epi64(x, v);
        int ind = _tzcnt_u32(_mm256_movemask_pd((mvd)mask));
        int whichmask = _mm256_movemask_pd((mvd)mmask);
        return make_pair(mn, (whichmask & (1<<ind)) == 0?ind:ind+4);
}


struct simd_priority_queue
{
        vector<ll, align_alloc<ll>> heap;
        vector<int, align_alloc<int>> keys;
        // gp_hash_table<int, int> key_to_pos;
        vector<int> key_to_pos;

        simd_priority_queue(int nkeys) {key_to_pos.resize(nkeys);}
        int sz = 0;

        int par(int ind) { return (ind>>3)-1; }
        int ch(int ind, int c) { return (ind+1) * 8 + c; }

        void siftup(int ind, int key, ll val) {
                while ((ind & ~0x7) != 0 and heap[par(ind)] > val) {
                        heap[ind] = heap[par(ind)];
                        keys[ind] = keys[par(ind)];
                        key_to_pos[keys[ind]] = ind;
                        ind=par(ind);
                }
                heap[ind]=val;
                keys[ind]=key;
                key_to_pos[key] = ind;
        }
        void siftdown(int ind, int key, ll val) {
                while(1) {
                        dprintf("siftdown ind = %d key = %d val = %lld\n", ind, key, val);
                        if (__builtin_expect(ch(ind, 7) >= sz, 0)) {
                                int smallest = ch(ind, 0);
                                if (smallest >= sz) break;

                                for (int c = smallest+1; c < sz; c++)
                                        smallest += (heap[smallest] > heap[c]) * (c - smallest);

                                if (heap[smallest] >= val) break;

                                heap[ind] = heap[smallest];
                                keys[ind] = keys[smallest];
                                key_to_pos[keys[ind]] = ind;
                                ind = smallest;
                                break;
                        }
                        auto [mn, mind] = argmin(_mm256_load_si256((mvi*)&heap[ch(ind,0)]), _mm256_load_si256((mvi*)&heap[ch(ind,4)])); // loads 8 vals

                        if (mn >= val) break;
                        int smallest = ch(ind, mind);
                        heap[ind] = heap[smallest];
                        keys[ind] = keys[smallest];
                        key_to_pos[keys[ind]] = ind;
                        ind = smallest;
                }
                heap[ind]=val;
                keys[ind]=key;
                key_to_pos[key] = ind;
        }

        pair<int,ll> top() {
                if(sz==0)return make_pair(-999,-999);
                if(sz<8) {
                        int smallest=0;
                        for (int c=1;c<sz;c++)
                                smallest+=(heap[smallest]>heap[c])*(c-smallest);
                        return make_pair(keys[smallest], heap[smallest]);
                }
                auto [mn, amn] = argmin(_mm256_load_si256((mvi*)&heap[0]), _mm256_load_si256((mvi*)&heap[4]));
                return make_pair(keys[amn], mn);
        }
        int topind() {
                if(sz==0)return -999;
                if(sz<8) {
                        int smallest=0;
                        for (int c=1;c<sz;c++)
                                smallest+=(heap[smallest]>heap[c])*(c-smallest);
                        return smallest;
                }
                return argmin(_mm256_load_si256((mvi*)&heap[0]), _mm256_load_si256((mvi*)&heap[4])).second;
        }

        void decreasekey(int key, int nkey, ll nval) {
                int pos = key_to_pos[key];
                // key_to_pos.erase(key);
                siftup(pos, nkey, nval);
        }
        void insert(int key, ll val) {
                sz++;
                heap.resize(sz);
                keys.resize(sz);
                siftup(sz-1, key, val);
        }
        void extract() {
                int ti = topind();
                // key_to_pos.erase(keys[ti]);
                sz--;
                if(sz!=0 and ti != sz) siftdown(ti, keys.back(), heap.back());
                heap.resize(sz);
                keys.resize(sz);
        }
        void replace(int key, ll val) {
                siftdown(topind(), key, val);
        }
};

struct edge {
  int from, to, weight, cost, id;

  edge() {}

  edge(int _from, int _to, int _weight, int _cost, int _id) : from(_from), to(_to), weight(_weight), cost(_cost), id(_id) {}
};

struct node {
  int distance, parent;

  node() {
    distance = INT_MAX;
    parent = -1;
  }

  node(int _distance, int _parent) : distance(_distance), parent(_parent) {}
};

struct result {
  int edge;
  vector<int> distances; 
};

vector<node> dijkstra(vector<vector<edge>> graph, int start) {
  int n = (int)graph.size();
  vector<node> answer(n);
  simd_priority_queue q(n);
  q.insert(start, 0);
  answer[start].distance = 0;
  while (q.sz) {
    auto [u, d] = q.top();
    q.extract();
    if (answer[u].distance > d) {
      continue;
    }
    for (auto [from, to, weight, cost, id] : graph[u]) {
      if (answer[from].distance + weight < answer[to].distance) {
        answer[to] = node(answer[from].distance + weight, id);
        q.insert(to, answer[to].distance);
      }
    }
  }
  return answer;
}

vector<vector<edge>> transpose(vector<vector<edge>> graph) {
  int n = (int)graph.size();
  vector<vector<edge>> new_graph(n);
  for (int u = 0; u < n; u++) {
    for (auto [from, to, weight, cost, id] : graph[u]) {
      new_graph[to].emplace_back(to, from, weight, cost, id);
    }
  }
  return new_graph;
}

vector<vector<int>> find_shortest_paths_without_each_edge(vector<vector<edge>> graph, int start) {
  int n = (int)graph.size();
  int m = 0;
  for (int u = 0; u < n; u++) {
    for (auto [from, to, weight, cost, id] : graph[u]) {
      m = max(m, 1 + id);
    }
  }
  vector<node> result = dijkstra(graph, start);
  vector<int> dist(n);
  for (int i = 0; i < n; i++) {
    dist[i] = result[i].distance;
  }
  vector<vector<int>> answer(m);
  for (auto [distance, parent] : result) {
    if (parent != -1) {
      vector<vector<edge>> new_graph = graph;
      for (int u = 0; u < n; u++) {
        for (int i = 0; i < (int)new_graph[u].size(); i++) {
          auto [from, to, weight, cost, id] = new_graph[u][i];
          if (id == parent) {
            new_graph[u].erase(new_graph[u].begin() + i);
            break;
          }  
        }
      }
      vector<node> new_result = dijkstra(new_graph, start);
      answer[parent] = vector<int>(n);
      for (int i = 0; i < n; i++) {
        answer[parent][i] = new_result[i].distance;  
      }
    }
  }
  answer.push_back(dist);
  return answer;
}

void solve() {
  int n, m;
  cin >> n >> m;
  vector<vector<edge>> graph(n);
  for (int i = 0; i < m; i++) {
    int from, to, weight, cost;
    cin >> from >> to >> weight >> cost;
    from--, to--;
    graph[from].emplace_back(from, to, weight, cost, i);
  }
  vector<vector<edge>> rev_graph = transpose(graph);
  vector<vector<int>> result_a = find_shortest_paths_without_each_edge(graph, 0);
  vector<vector<int>> result_b = find_shortest_paths_without_each_edge(graph, n - 1);
  vector<vector<int>> rev_result_a = find_shortest_paths_without_each_edge(rev_graph, 0);
  vector<vector<int>> rev_result_b = find_shortest_paths_without_each_edge(rev_graph, n - 1);
  auto get = [](vector<vector<int>>& x, int i, int j) {
    if (x[i].empty()) {
      return x.back()[j];
    } else {
      return x[i][j];
    }
  };
  int ans = INT_MAX;
  if (result_a.back()[n - 1] != INT_MAX && result_b.back()[0] != INT_MAX) {
    ans = result_a.back()[n - 1] + result_b.back()[0];
  }
  for (int u = 0; u < n; u++) {
    for (auto [from, to, weight, cost, id] : graph[u]) {
      int AB = INT_MAX, BA = INT_MAX;
      AB = min(AB, get(result_a, id, n - 1));
      if (get(result_a, id, to) != INT_MAX && get(rev_result_b, id, from) != INT_MAX) {
        AB = min(AB, get(result_a, id, to) + get(rev_result_b, id, from) + weight);
      }
      BA = min(BA, get(result_b, id, 0));
      if (get(result_b, id, to) != INT_MAX && get(rev_result_a, id, from) != INT_MAX) {
        BA = min(BA, get(result_b, id, to) + get(rev_result_a, id, from) + weight); 
      }
      if (AB == INT_MAX || BA == INT_MAX) {
        continue;
      }
      ans = min(ans, AB + BA + cost);
    }
  }
  cout << (ans == INT_MAX ? -1 : ans) << "\n";
}

int main() {
  ios::sync_with_stdio(false);
  cin.tie(nullptr);
  solve();
}

Compilation message

ho_t4.cpp:1: warning: ignoring '#pragma clang attribute' [-Wunknown-pragmas]
    1 | #pragma clang attribute push (__attribute__((target("avx2,bmi,bmi2,lzcnt,popcnt"))), apply_to=function)
      |
# Verdict Execution time Memory Grader output
1 Correct 37 ms 1244 KB Output is correct
2 Correct 3 ms 468 KB Output is correct
3 Correct 47 ms 1284 KB Output is correct
4 Correct 51 ms 1284 KB Output is correct
5 Correct 4 ms 724 KB Output is correct
6 Correct 7 ms 512 KB Output is correct
7 Correct 0 ms 212 KB Output is correct
8 Correct 0 ms 212 KB Output is correct
9 Correct 1 ms 468 KB Output is correct
10 Correct 58 ms 1308 KB Output is correct
11 Correct 59 ms 1260 KB Output is correct
12 Correct 57 ms 1236 KB Output is correct
13 Correct 17 ms 852 KB Output is correct
14 Correct 33 ms 1124 KB Output is correct
15 Correct 33 ms 1100 KB Output is correct
16 Correct 34 ms 1108 KB Output is correct
# Verdict Execution time Memory Grader output
1 Correct 590 ms 12628 KB Output is correct
2 Correct 534 ms 12780 KB Output is correct
3 Correct 575 ms 12588 KB Output is correct
4 Correct 55 ms 1508 KB Output is correct
5 Correct 34 ms 1160 KB Output is correct
6 Correct 13 ms 724 KB Output is correct
7 Correct 1 ms 468 KB Output is correct
8 Correct 0 ms 212 KB Output is correct
9 Correct 186 ms 12496 KB Output is correct
10 Correct 160 ms 12520 KB Output is correct
11 Correct 439 ms 12404 KB Output is correct
12 Correct 355 ms 12356 KB Output is correct
13 Correct 415 ms 12468 KB Output is correct
14 Correct 391 ms 12720 KB Output is correct
# Verdict Execution time Memory Grader output
1 Correct 47 ms 1444 KB Output is correct
2 Correct 10 ms 628 KB Output is correct
3 Correct 265 ms 9956 KB Output is correct
4 Correct 6 ms 596 KB Output is correct
5 Correct 309 ms 12396 KB Output is correct
6 Correct 1 ms 212 KB Output is correct
7 Correct 0 ms 212 KB Output is correct
8 Correct 140 ms 12476 KB Output is correct
9 Correct 133 ms 12520 KB Output is correct
10 Correct 239 ms 12280 KB Output is correct
11 Correct 241 ms 12284 KB Output is correct
12 Correct 214 ms 12624 KB Output is correct
13 Correct 1 ms 212 KB Output is correct
14 Correct 1 ms 212 KB Output is correct
15 Correct 1 ms 212 KB Output is correct
16 Correct 1 ms 212 KB Output is correct
17 Correct 0 ms 212 KB Output is correct
18 Correct 0 ms 212 KB Output is correct
19 Correct 219 ms 12628 KB Output is correct
20 Correct 216 ms 12476 KB Output is correct
21 Correct 240 ms 12360 KB Output is correct
22 Correct 224 ms 12308 KB Output is correct
# Verdict Execution time Memory Grader output
1 Correct 37 ms 1244 KB Output is correct
2 Correct 3 ms 468 KB Output is correct
3 Correct 47 ms 1284 KB Output is correct
4 Correct 51 ms 1284 KB Output is correct
5 Correct 4 ms 724 KB Output is correct
6 Correct 7 ms 512 KB Output is correct
7 Correct 0 ms 212 KB Output is correct
8 Correct 0 ms 212 KB Output is correct
9 Correct 1 ms 468 KB Output is correct
10 Correct 58 ms 1308 KB Output is correct
11 Correct 59 ms 1260 KB Output is correct
12 Correct 57 ms 1236 KB Output is correct
13 Correct 17 ms 852 KB Output is correct
14 Correct 33 ms 1124 KB Output is correct
15 Correct 33 ms 1100 KB Output is correct
16 Correct 34 ms 1108 KB Output is correct
17 Correct 590 ms 12628 KB Output is correct
18 Correct 534 ms 12780 KB Output is correct
19 Correct 575 ms 12588 KB Output is correct
20 Correct 55 ms 1508 KB Output is correct
21 Correct 34 ms 1160 KB Output is correct
22 Correct 13 ms 724 KB Output is correct
23 Correct 1 ms 468 KB Output is correct
24 Correct 0 ms 212 KB Output is correct
25 Correct 186 ms 12496 KB Output is correct
26 Correct 160 ms 12520 KB Output is correct
27 Correct 439 ms 12404 KB Output is correct
28 Correct 355 ms 12356 KB Output is correct
29 Correct 415 ms 12468 KB Output is correct
30 Correct 391 ms 12720 KB Output is correct
31 Correct 47 ms 1444 KB Output is correct
32 Correct 10 ms 628 KB Output is correct
33 Correct 265 ms 9956 KB Output is correct
34 Correct 6 ms 596 KB Output is correct
35 Correct 309 ms 12396 KB Output is correct
36 Correct 1 ms 212 KB Output is correct
37 Correct 0 ms 212 KB Output is correct
38 Correct 140 ms 12476 KB Output is correct
39 Correct 133 ms 12520 KB Output is correct
40 Correct 239 ms 12280 KB Output is correct
41 Correct 241 ms 12284 KB Output is correct
42 Correct 214 ms 12624 KB Output is correct
43 Correct 1 ms 212 KB Output is correct
44 Correct 1 ms 212 KB Output is correct
45 Correct 1 ms 212 KB Output is correct
46 Correct 1 ms 212 KB Output is correct
47 Correct 0 ms 212 KB Output is correct
48 Correct 0 ms 212 KB Output is correct
49 Correct 219 ms 12628 KB Output is correct
50 Correct 216 ms 12476 KB Output is correct
51 Correct 240 ms 12360 KB Output is correct
52 Correct 224 ms 12308 KB Output is correct
53 Correct 679 ms 12412 KB Output is correct
54 Correct 663 ms 12440 KB Output is correct
55 Correct 608 ms 12636 KB Output is correct
56 Correct 53 ms 1304 KB Output is correct
57 Correct 53 ms 1312 KB Output is correct
58 Correct 530 ms 10088 KB Output is correct
59 Correct 529 ms 10140 KB Output is correct
60 Correct 573 ms 10236 KB Output is correct
61 Correct 479 ms 10132 KB Output is correct
62 Correct 478 ms 10188 KB Output is correct
63 Correct 539 ms 10048 KB Output is correct
64 Execution timed out 1080 ms 6300 KB Time limit exceeded
65 Halted 0 ms 0 KB -