Submission #1330762

#	Time	Username	Problem	Language	Result	Execution time	Memory
1330762		TheMiraiTraveller0711	Languages (IOI10_languages)	C++20	65 / 100	182 ms	3004 KiB
lang
/*
 * IOI 2010 - Language
 * Improved Rocchio's method with symbol frequency profiles.
 *
 * Strategy:
 * - For each language L, maintain a frequency count of each symbol seen.
 * - For a new excerpt E, compute similarity to each language as:
 *     sum over distinct symbols s in E of: freq[L][s] / total_symbols[L]
 *   (i.e., weighted by how characteristic the symbol is for that language)
 * - Choose the language with highest similarity.
 * - Also maintain count of how many excerpts we've seen per language
 *   (prior) to break ties.
 *
 * Better: use log-likelihood or a smoothed probability model.
 * We'll use: score(L, E) = sum_{s in E} log(1 + freq[L][s])
 * normalized by sqrt(sum_s freq[L][s]^2) for cosine-like similarity.
 * But simpler: just sum of freq[L][s] for s in E (counts multiple occurrences).
 */

#include "grader.h"
#include <cstring>
#include <cmath>
#include <algorithm>

// 56 languages, symbols 1..65535
static const int NLANGS = 56;
static const int NSYMBOLS = 65536;

// Use short to save memory: cap at 255 (use unsigned short for up to 65535)
// Actually 10000 excerpts * 100 symbols = 1,000,000 total, per language ~17857 max per symbol
// Use int but that's 56 * 65536 * 4 = ~14MB, within 256MB limit
static int freq[NLANGS][NSYMBOLS];
static long long total[NLANGS]; // total symbol tokens per language
static int count[NLANGS];       // excerpts seen per language

void excerpt(int *E) {
    // Compute scores for each language
    double best_score = -1e18;
    int best_lang = 0;
    
    // Collect distinct symbols in E
    static bool in_excerpt[NSYMBOLS];
    static int symbols[100];
    int nsym = 0;
    
    for (int i = 0; i < 100; i++) {
        int s = E[i];
        if (!in_excerpt[s]) {
            in_excerpt[s] = true;
            symbols[nsym++] = s;
        }
    }
    
    // Find language with best score
    // For languages with data: use weighted similarity
    // Score = sum over distinct s in E of log(1 + freq[L][s])
    // For unseen languages, use 0
    
    int total_seen = 0;
    for (int l = 0; l < NLANGS; l++) total_seen += count[l];
    
    for (int l = 0; l < NLANGS; l++) {
        double score;
        if (count[l] == 0) {
            // Unseen language: small prior score
            score = -1000.0 + (double)l * 0.001; // slight tie-break
        } else {
            score = 0.0;
            for (int i = 0; i < nsym; i++) {
                int s = symbols[i];
                // Probability-based: log(freq[l][s] / total[l])
                // Use freq directly for speed
                score += log(1.0 + freq[l][s]);
            }
            // Normalize by number of distinct symbols seen for this language
            // (optional, skip for speed)
        }
        if (score > best_score) {
            best_score = score;
            best_lang = l;
        }
    }
    
    // Clear in_excerpt
    for (int i = 0; i < nsym; i++) {
        in_excerpt[symbols[i]] = false;
    }
    
    // Make the guess and get correct answer
    int correct = language(best_lang);
    
    // Update model with correct answer
    for (int i = 0; i < 100; i++) {
        freq[correct][E[i]]++;
    }
    total[correct] += 100;
    count[correct]++;
}
Subtask 130 / 30

#	Verdict	Execution time	Memory	Grader output
Fetching results...
Subtask 235 / 70

#	Verdict	Execution time	Memory	Grader output
Fetching results...