/*
* IOI 2010 - Language
* Improved Rocchio's method with symbol frequency profiles.
*
* Strategy:
* - For each language L, maintain a frequency count of each symbol seen.
* - For a new excerpt E, compute similarity to each language as:
* sum over distinct symbols s in E of: freq[L][s] / total_symbols[L]
* (i.e., weighted by how characteristic the symbol is for that language)
* - Choose the language with highest similarity.
* - Also maintain count of how many excerpts we've seen per language
* (prior) to break ties.
*
* Better: use log-likelihood or a smoothed probability model.
* We'll use: score(L, E) = sum_{s in E} log(1 + freq[L][s])
* normalized by sqrt(sum_s freq[L][s]^2) for cosine-like similarity.
* But simpler: just sum of freq[L][s] for s in E (counts multiple occurrences).
*/
#include "grader.h"
#include <cstring>
#include <cmath>
#include <algorithm>
// 56 languages, symbols 1..65535
static const int NLANGS = 56;
static const int NSYMBOLS = 65536;
// Use short to save memory: cap at 255 (use unsigned short for up to 65535)
// Actually 10000 excerpts * 100 symbols = 1,000,000 total, per language ~17857 max per symbol
// Use int but that's 56 * 65536 * 4 = ~14MB, within 256MB limit
static int freq[NLANGS][NSYMBOLS];
static long long total[NLANGS]; // total symbol tokens per language
static int count[NLANGS]; // excerpts seen per language
void excerpt(int *E) {
// Compute scores for each language
double best_score = -1e18;
int best_lang = 0;
// Collect distinct symbols in E
static bool in_excerpt[NSYMBOLS];
static int symbols[100];
int nsym = 0;
for (int i = 0; i < 100; i++) {
int s = E[i];
if (!in_excerpt[s]) {
in_excerpt[s] = true;
symbols[nsym++] = s;
}
}
// Find language with best score
// For languages with data: use weighted similarity
// Score = sum over distinct s in E of log(1 + freq[L][s])
// For unseen languages, use 0
int total_seen = 0;
for (int l = 0; l < NLANGS; l++) total_seen += count[l];
for (int l = 0; l < NLANGS; l++) {
double score;
if (count[l] == 0) {
// Unseen language: small prior score
score = -1000.0 + (double)l * 0.001; // slight tie-break
} else {
score = 0.0;
for (int i = 0; i < nsym; i++) {
int s = symbols[i];
// Probability-based: log(freq[l][s] / total[l])
// Use freq directly for speed
score += log(1.0 + freq[l][s]);
}
// Normalize by number of distinct symbols seen for this language
// (optional, skip for speed)
}
if (score > best_score) {
best_score = score;
best_lang = l;
}
}
// Clear in_excerpt
for (int i = 0; i < nsym; i++) {
in_excerpt[symbols[i]] = false;
}
// Make the guess and get correct answer
int correct = language(best_lang);
// Update model with correct answer
for (int i = 0; i < 100; i++) {
freq[correct][E[i]]++;
}
total[correct] += 100;
count[correct]++;
}