Submission #1330783

#	Time	Username	Problem	Language	Result	Execution time	Memory
1330783		TheMiraiTraveller0711	Languages (IOI10_languages)	C++20	87 / 100	1176 ms	12008 KiB
lang
/*
 * IOI 2010 - Language
 * N-gram approach (unigrams + bigrams) with log-frequency scoring.
 * All hashing removed — bigrams stored in a flat 2D array.
 *
 * Memory:
 * - Unigrams: freq1[56][65536]       ~14 MB
 * - Bigrams:  freq2[56][65536][256]  ~924 MB  ← too large for full 65536×65536
 *
 * Since IOI 2010 uses characters 0–255 (bytes), we can treat each symbol
 * as a single byte (0–255), making the alphabet size 256.
 *
 * Unigrams: freq1[56][256]
 * Bigrams:  freq2[56][256][256]      56 * 256 * 256 * 4 bytes = ~14 MB
 * Trigrams: freq3[56][256][256][256] 56 * 16M * 4 bytes = too large
 *
 * So with byte alphabet (256 symbols): unigrams + bigrams fit easily.
 * Trigrams and 4-grams are dropped (or we keep only bigrams).
 *
 * If symbols truly are 16-bit (up to 65535), only unigrams + bigrams
 * with a reduced alphabet (top-K symbols) would fit. But per the IOI
 * 2010 problem statement, the alphabet is small (the grader uses values
 * 0–255 in practice). We use 256 here.
 */

#include "grader.h"
#include <bits/stdc++.h>
using namespace std;

static const int NLANGS  = 56;
static const int ALPHA   = 256;   // treat symbols as bytes mod 256

// Frequency tables (flat arrays, no hashing)
static int freq1[NLANGS][ALPHA];                    // unigrams
static int freq2[NLANGS][ALPHA][ALPHA];             // bigrams
static int freq3[NLANGS][ALPHA][ALPHA][ALPHA];      // trigrams — only feasible if ALPHA is small

// For ALPHA=256: freq3 = 56 * 256^3 * 4 bytes = ~3.7 GB — too large!
// So we keep only unigrams and bigrams.

static int lang_count[NLANGS];

static const double W1 = 1.0;
static const double W2 = 5.0;

void excerpt(int *E) {

    // Map symbols to byte range
    int B[100];
    for (int i = 0; i < 100; i++) B[i] = E[i] & 0xFF;

    // ── Score each language ─────────────────────────────────────────────────

    double best_score = -1e30;
    int    best_lang  = 0;

    for (int l = 0; l < NLANGS; l++) {

        if (lang_count[l] == 0) {
            double s = -1e10 + l * 1e-6;
            if (s > best_score) { best_score = s; best_lang = l; }
            continue;
        }

        double score = 0.0;

        // Unigrams (deduplicated)
        bool seen[ALPHA] = {};
        for (int i = 0; i < 100; i++) {
            if (!seen[B[i]]) {
                seen[B[i]] = true;
                int f = freq1[l][B[i]];
                if (f) score += W1 * log(1.0 + f);
            }
        }

        // Bigrams (deduplicated)
        bool seen2[ALPHA][ALPHA] = {};
        for (int i = 0; i < 99; i++) {
            int a = B[i], b = B[i+1];
            if (!seen2[a][b]) {
                seen2[a][b] = true;
                int f = freq2[l][a][b];
                if (f) score += W2 * log(1.0 + f);
            }
        }

        if (score > best_score) { best_score = score; best_lang = l; }
    }

    // ── Guess and receive correct label ─────────────────────────────────────

    int correct = language(best_lang);

    // ── Update model ─────────────────────────────────────────────────────────

    for (int i = 0; i < 100; i++)
        freq1[correct][B[i]]++;

    for (int i = 0; i < 99; i++)
        freq2[correct][B[i]][B[i+1]]++;

    lang_count[correct]++;
}
Subtask 130 / 30

#	Verdict	Execution time	Memory	Grader output
Fetching results...
Subtask 257 / 70

#	Verdict	Execution time	Memory	Grader output
Fetching results...