#include "grader.h"
#include <bits/stdc++.h>
using namespace std;
/*
IOI 2010 "Language" style interactive solution.
- Feature hashing for n-grams (1,2,4 heavily weighted).
- Online perceptron-like updates: +1 to correct, -1 to predicted (if wrong).
- Uses presence (unique features per excerpt) to reduce domination by repeats.
Tuneable knobs:
BINS: hashing bins (power of two)
W1, W2, W4: gram weights
USE_3GRAM, W3: optional
*/
// Number of distinct languages in the task.
static constexpr int LANGS = 56;
// Hash bins (power of two). 2^17 = 131072.
// Memory per table: BINS * LANGS * 2 bytes ~ 14.7 MB (int16).
// With 3 tables (1-, 2-, 4-grams): ~44 MB total.
static constexpr int BINS_POW = 17;
static constexpr int BINS = 1 << BINS_POW;
static constexpr uint32_t MASK = BINS - 1; // low-bit mask; valid because BINS is a power of two
// Per-order gram weights: longer n-grams are more language-specific, so they vote with more weight.
static constexpr int W1 = 1;
static constexpr int W2 = 2;
static constexpr int W4 = 3;
// Optional 3-gram table (disabled by default; enabling it adds roughly one more 14.7 MB table).
static constexpr bool USE_3GRAM = false;
static constexpr int W3 = 1;
static constexpr int BIAS_W = 1; // small frequency prior: one point per previously seen excerpt of a language
// Weight tables, stored transposed as [bin][lang] so all 56 per-bin weights are contiguous
// (cache-friendly when scoring every language for a single feature).
static vector<int16_t> tab1, tab2, tab3, tab4;
static int seen_cnt[LANGS]; // excerpts observed per language (drives the prior and tie-breaks)
static int total_seen = 0;  // total excerpts processed so far
static inline uint64_t splitmix64(uint64_t x) {
    // Vigna's SplitMix64 finalizer: cheap, statistically strong 64-bit mixing.
    uint64_t z = x + 0x9e3779b97f4a7c15ULL;
    z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9ULL;
    z = (z ^ (z >> 27)) * 0x94d049bb133111ebULL;
    return z ^ (z >> 31);
}
static inline uint32_t h64_to_bin(uint64_t key) {
    // Mix the raw key, then keep the low BINS_POW bits as the table index.
    return static_cast<uint32_t>(splitmix64(key) & MASK);
}
static inline void sat_inc(vector<int16_t>& tab, uint32_t bin, int lang, int delta) {
    // Add delta into the transposed [bin][lang] cell, saturating at int16 limits
    // so long training runs cannot wrap the counters around.
    const size_t cell = (size_t)bin * LANGS + (size_t)lang;
    const int sum = (int)tab[cell] + delta;
    tab[cell] = (int16_t)std::clamp(sum, -32768, 32767);
}
static inline void make_features_1(const int E[100], vector<uint32_t>& out) {
    // Hash every single symbol (unigram) into a bin, then deduplicate so each
    // bin counts once per excerpt — presence features keep repeated symbols
    // from dominating the score.
    out.clear();
    out.reserve(100);
    for (int pos = 0; pos < 100; ++pos) {
        const uint64_t key = (uint64_t)(uint32_t)E[pos] * 0x9e3779b1u + 0x1234567u;
        out.push_back(h64_to_bin(key));
    }
    sort(out.begin(), out.end());
    out.erase(unique(out.begin(), out.end()), out.end());
}
static inline void make_features_2(const int E[100], vector<uint32_t>& out) {
    // Hash each adjacent symbol pair (bigram), deduplicated to presence bits.
    out.clear();
    out.reserve(99);
    for (int pos = 0; pos + 1 < 100; ++pos) {
        // Two 16-bit symbols fit in 32 bits (symbols are in [1..65535]).
        const uint32_t packed = (uint32_t)E[pos] | ((uint32_t)E[pos + 1] << 16);
        out.push_back(h64_to_bin((uint64_t)packed ^ 0xA5A5A5A5ULL));
    }
    sort(out.begin(), out.end());
    out.erase(unique(out.begin(), out.end()), out.end());
}
static inline void make_features_3(const int E[100], vector<uint32_t>& out) {
    // Hash each symbol triple (trigram); only consulted when USE_3GRAM is on.
    out.clear();
    out.reserve(98);
    for (int pos = 0; pos + 2 < 100; ++pos) {
        // Three 16-bit symbols packed into the low 48 bits of a 64-bit key.
        uint64_t packed = (uint64_t)(uint32_t)E[pos];
        packed |= (uint64_t)(uint32_t)E[pos + 1] << 16;
        packed |= (uint64_t)(uint32_t)E[pos + 2] << 32;
        out.push_back(h64_to_bin(packed ^ 0xC3C3C3C3C3ULL));
    }
    sort(out.begin(), out.end());
    out.erase(unique(out.begin(), out.end()), out.end());
}
static inline void make_features_4(const int E[100], vector<uint32_t>& out) {
    // Hash each run of four symbols (4-gram) — the most language-specific
    // feature order, hence its larger weight W4 in scoring.
    out.clear();
    out.reserve(97);
    for (int pos = 0; pos + 3 < 100; ++pos) {
        // Four 16-bit symbols fill the whole 64-bit key.
        uint64_t packed = (uint64_t)(uint32_t)E[pos];
        packed |= (uint64_t)(uint32_t)E[pos + 1] << 16;
        packed |= (uint64_t)(uint32_t)E[pos + 2] << 32;
        packed |= (uint64_t)(uint32_t)E[pos + 3] << 48;
        out.push_back(h64_to_bin(packed ^ 0x5F5F5F5F5F5FULL));
    }
    sort(out.begin(), out.end());
    out.erase(unique(out.begin(), out.end()), out.end());
}
static inline int predict_lang(const vector<uint32_t>& f1,
const vector<uint32_t>& f2,
const vector<uint32_t>& f3,
const vector<uint32_t>& f4) {
int32_t score[LANGS];
for (int l = 0; l < LANGS; l++) score[l] = (int32_t)seen_cnt[l] * BIAS_W;
// Add contributions feature-by-feature, updating all 56 languages contiguously.
for (uint32_t bin : f1) {
int base = (int)bin * LANGS;
for (int l = 0; l < LANGS; l++) score[l] += (int32_t)tab1[base + l] * W1;
}
for (uint32_t bin : f2) {
int base = (int)bin * LANGS;
for (int l = 0; l < LANGS; l++) score[l] += (int32_t)tab2[base + l] * W2;
}
if (USE_3GRAM) {
for (uint32_t bin : f3) {
int base = (int)bin * LANGS;
for (int l = 0; l < LANGS; l++) score[l] += (int32_t)tab3[base + l] * W3;
}
}
for (uint32_t bin : f4) {
int base = (int)bin * LANGS;
for (int l = 0; l < LANGS; l++) score[l] += (int32_t)tab4[base + l] * W4;
}
// Choose best; tie-break toward less-seen languages (gentle exploration).
int bestL = 0;
for (int l = 1; l < LANGS; l++) {
if (score[l] > score[bestL]) bestL = l;
else if (score[l] == score[bestL] && seen_cnt[l] < seen_cnt[bestL]) bestL = l;
}
return bestL;
}
static inline void update_model(int trueL, int predL,
const vector<uint32_t>& f1,
const vector<uint32_t>& f2,
const vector<uint32_t>& f3,
const vector<uint32_t>& f4) {
// Always reinforce the true language; penalize predicted if wrong.
for (uint32_t bin : f1) {
sat_inc(tab1, bin, trueL, +1);
if (predL != trueL) sat_inc(tab1, bin, predL, -1);
}
for (uint32_t bin : f2) {
sat_inc(tab2, bin, trueL, +1);
if (predL != trueL) sat_inc(tab2, bin, predL, -1);
}
if (USE_3GRAM) {
for (uint32_t bin : f3) {
sat_inc(tab3, bin, trueL, +1);
if (predL != trueL) sat_inc(tab3, bin, predL, -1);
}
}
for (uint32_t bin : f4) {
sat_inc(tab4, bin, trueL, +1);
if (predL != trueL) sat_inc(tab4, bin, predL, -1);
}
seen_cnt[trueL]++;
total_seen++;
}
void excerpt(int E[100]) {
// Lazy init (avoids static init order headaches)
static bool inited = false;
if (!inited) {
tab1.assign((size_t)BINS * LANGS, 0);
tab2.assign((size_t)BINS * LANGS, 0);
tab4.assign((size_t)BINS * LANGS, 0);
if (USE_3GRAM) tab3.assign((size_t)BINS * LANGS, 0);
memset(seen_cnt, 0, sizeof(seen_cnt));
inited = true;
}
vector<uint32_t> f1, f2, f3, f4;
make_features_1(E, f1);
make_features_2(E, f2);
if (USE_3GRAM) make_features_3(E, f3);
make_features_4(E, f4);
int guess = predict_lang(f1, f2, f3, f4);
// interactive call: returns the correct language
int correct = language(guess);
update_model(correct, guess, f1, f2, f3, f4);
}