/*
* IOI 2010 - Language
* N-gram approach (unigrams through 4-grams) with log-frequency scoring.
* No hashing — all flat arrays.
*
* Alphabet reduction: symbols are folded to 4 bits (0–15) via E[i] & 0xF
* so that 4-gram tables fit in memory.
*
* Memory:
* - freq1[56][16] = 3.6 KB
* - freq2[56][16][16] = 57.3 KB
* - freq3[56][16][16][16] = 917 KB
* - freq4[56][16][16][16][16] = ~14 MB
* Total: ~15 MB
*/
#include "grader.h"
#include <bits/stdc++.h>
using namespace std;
static const int NLANGS = 56;
static const int ALPHA = 16; // 4-bit folded alphabet
static int freq1[NLANGS][ALPHA];
static int freq2[NLANGS][ALPHA][ALPHA];
static int freq3[NLANGS][ALPHA][ALPHA][ALPHA];
static int freq4[NLANGS][ALPHA][ALPHA][ALPHA][ALPHA];
static int lang_count[NLANGS];
static const double W1 = 1.0;
static const double W2 = 3.0;
static const double W3 = 6.0;
static const double W4 = 10.0;
void excerpt(int *E) {
// Fold each symbol to 4 bits
int B[100];
for (int i = 0; i < 100; i++) B[i] = E[i] & 0xF;
// ── Score each language ─────────────────────────────────────────────────
double best_score = -1e30;
int best_lang = 0;
for (int l = 0; l < NLANGS; l++) {
if (lang_count[l] == 0) {
double s = -1e10 + l * 1e-6;
if (s > best_score) { best_score = s; best_lang = l; }
continue;
}
double score = 0.0;
// Unigrams
bool seen1[ALPHA] = {};
for (int i = 0; i < 100; i++) {
int a = B[i];
if (!seen1[a]) {
seen1[a] = true;
int f = freq1[l][a];
if (f) score += W1 * log(1.0 + f);
}
}
// Bigrams
bool seen2[ALPHA][ALPHA] = {};
for (int i = 0; i < 99; i++) {
int a = B[i], b = B[i+1];
if (!seen2[a][b]) {
seen2[a][b] = true;
int f = freq2[l][a][b];
if (f) score += W2 * log(1.0 + f);
}
}
// Trigrams
bool seen3[ALPHA][ALPHA][ALPHA] = {};
for (int i = 0; i < 98; i++) {
int a = B[i], b = B[i+1], c = B[i+2];
if (!seen3[a][b][c]) {
seen3[a][b][c] = true;
int f = freq3[l][a][b][c];
if (f) score += W3 * log(1.0 + f);
}
}
// 4-grams
bool seen4[ALPHA][ALPHA][ALPHA][ALPHA] = {};
for (int i = 0; i < 97; i++) {
int a = B[i], b = B[i+1], c = B[i+2], d = B[i+3];
if (!seen4[a][b][c][d]) {
seen4[a][b][c][d] = true;
int f = freq4[l][a][b][c][d];
if (f) score += W4 * log(1.0 + f);
}
}
if (score > best_score) { best_score = score; best_lang = l; }
}
// ── Guess and receive correct label ─────────────────────────────────────
int correct = language(best_lang);
// ── Update model ─────────────────────────────────────────────────────────
for (int i = 0; i < 100; i++)
freq1[correct][B[i]]++;
for (int i = 0; i < 99; i++)
freq2[correct][B[i]][B[i+1]]++;
for (int i = 0; i < 98; i++)
freq3[correct][B[i]][B[i+1]][B[i+2]]++;
for (int i = 0; i < 97; i++)
freq4[correct][B[i]][B[i+1]][B[i+2]][B[i+3]]++;
lang_count[correct]++;
}