/*
* IOI 2010 - Language
* N-gram approach (unigrams + bigrams + trigrams + 4-grams) with log-frequency scoring.
*
* Strategy:
* - For each language L, maintain frequency hash maps for n-grams (n=1..4).
* - For a new excerpt E, score each language as:
* sum over all distinct n-grams g in E of weight(n) * log(1 + freq[L][g])
* - Choose the language with highest score.
* - After getting correct label, update all n-gram counts.
*
* Memory:
* - Unigrams: freq[56][65536] int array (~14MB)
 * - 2/3/4-grams: per-language unordered_maps (grow with training data seen so far)
*
* Weights: higher-order grams are more discriminative.
* W1=1, W2=3, W3=6, W4=10
*/
#include "grader.h"
#include <bits/stdc++.h>
using namespace std;
static constexpr int NLANGS = 56;      // number of languages in the task
static constexpr int NSYMBOLS = 65536; // symbols are 16-bit values
// Unigram table (flat array: fast O(1) access, ~14MB total)
static int freq1[NLANGS][NSYMBOLS];
// Higher-order n-gram tables (hash maps per language; keys pack the
// n symbols into one uint64_t, 16 bits per symbol)
static std::unordered_map<uint64_t, int> freq2[NLANGS];
static std::unordered_map<uint64_t, int> freq3[NLANGS];
static std::unordered_map<uint64_t, int> freq4[NLANGS];
// Number of excerpts seen so far per language (0 => language still unseen)
static int lang_count[NLANGS];
// N-gram score weights. Per the design in the file header, higher-order
// grams are more discriminative, so weights increase with n:
// W1=1, W2=3, W3=6, W4=10.  (Previous values 9/1 for W2/W3 were a typo
// that inverted the intended ordering.)
static constexpr double W1 = 1.0;
static constexpr double W2 = 3.0;
static constexpr double W3 = 6.0;
static constexpr double W4 = 10.0;
// Scratch buffers (static to avoid per-call stack allocation / clearing)
static bool seen1[NSYMBOLS]; // deduplication bitmap for unigrams
static int distinct1[100];   // distinct unigrams in current excerpt
static std::unordered_map<uint64_t,int> bg_count; // bigrams in excerpt
static std::unordered_map<uint64_t,int> tg_count; // trigrams in excerpt
static std::unordered_map<uint64_t,int> fg_count; // 4-grams in excerpt
// ── Key packing ─────────────────────────────────────────────────────────────
// Pack 2..4 consecutive 16-bit symbols of E starting at index i into one
// 64-bit key, 16 bits per symbol. Symbols are < 65536, so shift+or is
// bit-identical to the multiply+add form. Shared by the collection and
// model-update phases so the two can never drift apart.
static inline uint64_t pack2(const int *E, int i) {
    return ((uint64_t)E[i] << 16) | (uint64_t)E[i + 1];
}
static inline uint64_t pack3(const int *E, int i) {
    return ((uint64_t)E[i] << 32) | ((uint64_t)E[i + 1] << 16)
         | (uint64_t)E[i + 2];
}
static inline uint64_t pack4(const int *E, int i) {
    return ((uint64_t)E[i] << 48) | ((uint64_t)E[i + 1] << 32)
         | ((uint64_t)E[i + 2] << 16) | (uint64_t)E[i + 3];
}

// Sum weight * log(1 + model_freq) over the distinct n-grams of the current
// excerpt (the keys of `grams`). The mapped counts are deliberately unused:
// each distinct n-gram contributes once, matching the original scoring.
static double score_ngrams(const std::unordered_map<uint64_t, int>& grams,
                           const std::unordered_map<uint64_t, int>& model,
                           double weight) {
    double s = 0.0;
    for (const auto& [k, cnt] : grams) {
        (void)cnt;
        auto it = model.find(k);
        if (it != model.end()) s += weight * log(1.0 + it->second);
    }
    return s;
}

// Classify one excerpt, submit the guess via language(), then fold the
// excerpt's n-grams into the model of the (now known) correct language.
//
// @param E  array of 100 symbols, each in [0, 65535] (grader contract).
void excerpt(int *E) {
    // ── Collect n-grams from E ──────────────────────────────────────────────
    // Distinct unigrams via the seen1 scratch bitmap (reset at the end,
    // touching only the entries we set — cheaper than a full memset).
    int nd1 = 0;
    for (int i = 0; i < 100; i++) {
        if (!seen1[E[i]]) { seen1[E[i]] = true; distinct1[nd1++] = E[i]; }
    }
    bg_count.clear();
    for (int i = 0; i + 1 < 100; i++) bg_count[pack2(E, i)]++;
    tg_count.clear();
    for (int i = 0; i + 2 < 100; i++) tg_count[pack3(E, i)]++;
    fg_count.clear();
    for (int i = 0; i + 3 < 100; i++) fg_count[pack4(E, i)]++;

    // ── Score each language ─────────────────────────────────────────────────
    double best_score = -1e30;
    int best_lang = 0;
    for (int l = 0; l < NLANGS; l++) {
        if (lang_count[l] == 0) {
            // Unseen language: rank below every seen one; the +l*1e-6 term
            // makes the tie-break deterministic (highest index wins).
            double s = -1e10 + l * 1e-6;
            if (s > best_score) { best_score = s; best_lang = l; }
            continue;
        }
        double score = 0.0;
        // Unigrams from the fast flat table.
        for (int i = 0; i < nd1; i++) {
            int f = freq1[l][distinct1[i]];
            if (f) score += W1 * log(1.0 + f);
        }
        score += score_ngrams(bg_count, freq2[l], W2);
        score += score_ngrams(tg_count, freq3[l], W3);
        score += score_ngrams(fg_count, freq4[l], W4);
        if (score > best_score) { best_score = score; best_lang = l; }
    }

    // Reset seen1 scratch buffer (only the entries set above).
    for (int i = 0; i < nd1; i++) seen1[distinct1[i]] = false;

    // ── Guess and receive correct label ─────────────────────────────────────
    int correct = language(best_lang);

    // ── Update model with the correct language's counts ─────────────────────
    for (int i = 0; i < 100; i++) freq1[correct][E[i]]++;
    for (int i = 0; i + 1 < 100; i++) freq2[correct][pack2(E, i)]++;
    for (int i = 0; i + 2 < 100; i++) freq3[correct][pack3(E, i)]++;
    for (int i = 0; i + 3 < 100; i++) freq4[correct][pack4(E, i)]++;
    lang_count[correct]++;
}