/*
* IOI 2010 - Language
* N-gram approach (unigrams + bigrams + trigrams + 4-grams) with log-frequency scoring.
*
* Strategy:
* - For each language L, maintain frequency hash maps for n-grams (n=1..4).
* - For a new excerpt E, score each language as:
* sum over all distinct n-grams g in E of weight(n) * log(1 + freq[L][g])
* - Choose the language with highest score.
* - After getting correct label, update all n-gram counts.
*
* Memory:
* - Unigrams: freq[56][65536] int array (~14MB)
 * - 2/3/4-grams: per-language unordered_maps (grow with training data seen so far)
*
* Weights: higher-order grams are more discriminative.
* W1=1, W2=3, W3=6, W4=10
*/
#include "grader.h"
#include <bits/stdc++.h>
using namespace std;
static constexpr int NLANGS = 56;      // number of languages in the task
static constexpr int NSYMBOLS = 65536; // symbols are 16-bit values
// Unigram table (flat array: fast O(1) access, ~14MB total)
static int freq1[NLANGS][NSYMBOLS];
// Higher-order n-gram tables (hash maps per language; keys pack the
// n symbols into one uint64_t, 16 bits per symbol)
static std::unordered_map<uint64_t, int> freq2[NLANGS];
static std::unordered_map<uint64_t, int> freq3[NLANGS];
static std::unordered_map<uint64_t, int> freq4[NLANGS];
// Number of excerpts seen so far per language (0 => language still unseen)
static int lang_count[NLANGS];
// N-gram score weights. Per the design in the file header, higher-order
// grams are more discriminative, so weights increase with n:
// W1=1, W2=3, W3=6, W4=10.  (Previous values 9/1 for W2/W3 were a typo
// that inverted the intended ordering.)
static constexpr double W1 = 1.0;
static constexpr double W2 = 3.0;
static constexpr double W3 = 6.0;
static constexpr double W4 = 10.0;
// Scratch buffers (static to avoid per-call stack allocation / clearing)
static bool seen1[NSYMBOLS]; // deduplication bitmap for unigrams
static int distinct1[100];   // distinct unigrams in current excerpt
static std::unordered_map<uint64_t,int> bg_count; // bigrams in excerpt
static std::unordered_map<uint64_t,int> tg_count; // trigrams in excerpt
static std::unordered_map<uint64_t,int> fg_count; // 4-grams in excerpt
// ── Key packing ─────────────────────────────────────────────────────────────
// Pack 2..4 consecutive 16-bit symbols of E starting at index i into one
// 64-bit key, 16 bits per symbol. Symbols are < 65536, so shift+or is
// bit-identical to the multiply+add form. Shared by the collection and
// model-update phases so the two can never drift apart.
static inline uint64_t pack2(const int *E, int i) {
    return ((uint64_t)E[i] << 16) | (uint64_t)E[i + 1];
}
static inline uint64_t pack3(const int *E, int i) {
    return ((uint64_t)E[i] << 32) | ((uint64_t)E[i + 1] << 16)
         | (uint64_t)E[i + 2];
}
static inline uint64_t pack4(const int *E, int i) {
    return ((uint64_t)E[i] << 48) | ((uint64_t)E[i + 1] << 32)
         | ((uint64_t)E[i + 2] << 16) | (uint64_t)E[i + 3];
}

// Sum weight * log(1 + model_freq) over the distinct n-grams of the current
// excerpt (the keys of `grams`). The mapped counts are deliberately unused:
// each distinct n-gram contributes once, matching the original scoring.
static double score_ngrams(const std::unordered_map<uint64_t, int>& grams,
                           const std::unordered_map<uint64_t, int>& model,
                           double weight) {
    double s = 0.0;
    for (const auto& [k, cnt] : grams) {
        (void)cnt;
        auto it = model.find(k);
        if (it != model.end()) s += weight * log(1.0 + it->second);
    }
    return s;
}

// Classify one excerpt, submit the guess via language(), then fold the
// excerpt's n-grams into the model of the (now known) correct language.
//
// @param E  array of 100 symbols, each in [0, 65535] (grader contract).
void excerpt(int *E) {
    // ── Collect n-grams from E ──────────────────────────────────────────────
    // Distinct unigrams via the seen1 scratch bitmap (reset at the end,
    // touching only the entries we set — cheaper than a full memset).
    int nd1 = 0;
    for (int i = 0; i < 100; i++) {
        if (!seen1[E[i]]) { seen1[E[i]] = true; distinct1[nd1++] = E[i]; }
    }
    bg_count.clear();
    for (int i = 0; i + 1 < 100; i++) bg_count[pack2(E, i)]++;
    tg_count.clear();
    for (int i = 0; i + 2 < 100; i++) tg_count[pack3(E, i)]++;
    fg_count.clear();
    for (int i = 0; i + 3 < 100; i++) fg_count[pack4(E, i)]++;

    // ── Score each language ─────────────────────────────────────────────────
    double best_score = -1e30;
    int best_lang = 0;
    for (int l = 0; l < NLANGS; l++) {
        if (lang_count[l] == 0) {
            // Unseen language: rank below every seen one; the +l*1e-6 term
            // makes the tie-break deterministic (highest index wins).
            double s = -1e10 + l * 1e-6;
            if (s > best_score) { best_score = s; best_lang = l; }
            continue;
        }
        double score = 0.0;
        // Unigrams from the fast flat table.
        for (int i = 0; i < nd1; i++) {
            int f = freq1[l][distinct1[i]];
            if (f) score += W1 * log(1.0 + f);
        }
        score += score_ngrams(bg_count, freq2[l], W2);
        score += score_ngrams(tg_count, freq3[l], W3);
        score += score_ngrams(fg_count, freq4[l], W4);
        if (score > best_score) { best_score = score; best_lang = l; }
    }

    // Reset seen1 scratch buffer (only the entries set above).
    for (int i = 0; i < nd1; i++) seen1[distinct1[i]] = false;

    // ── Guess and receive correct label ─────────────────────────────────────
    int correct = language(best_lang);

    // ── Update model with the correct language's counts ─────────────────────
    for (int i = 0; i < 100; i++) freq1[correct][E[i]]++;
    for (int i = 0; i + 1 < 100; i++) freq2[correct][pack2(E, i)]++;
    for (int i = 0; i + 2 < 100; i++) freq3[correct][pack3(E, i)]++;
    for (int i = 0; i + 3 < 100; i++) freq4[correct][pack4(E, i)]++;
    lang_count[correct]++;
}