mirror of
https://github.com/kuhyx/testsAndMisc.git
synced 2026-07-04 12:03:11 +02:00
- C/lichess_random_engine, vocabulary_curve, misc/split, 1dvelocitysimulator, opening_learner: test suites added - CPP/miscelanious: tests added - TS/battery-status, champions_leauge_scores, two-inputs: tests added - python_pkg/fm24_searcher, wake_alarm: new packages added - Fix ruff/cppcheck/eslint/clang-format failures - Update .gitignore for C/C++ build artifacts
309 lines
9.5 KiB
C
309 lines
9.5 KiB
C
/*
 * Vocabulary Learning Curve Analyzer - thin driver.
 */
|
|
|
|
#include "vocabulary.h"

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
|
|
|
|
/* Print excerpt words */
|
|
static void print_excerpt(const VocabContext *ctx, int start, int length)
|
|
{
|
|
for (int i = start; i < start + length; i++)
|
|
{
|
|
if (i > start)
|
|
printf(" ");
|
|
printf("%s", ctx->word_sequence[i]->word);
|
|
}
|
|
}
|
|
|
|
/* Print words needed (sorted by rank) */
|
|
static void print_words_needed(const VocabContext *ctx, int start, int length)
|
|
{
|
|
static WordEntry *unique_entries[MAX_UNIQUE_WORDS];
|
|
static bool seen_rank[MAX_UNIQUE_WORDS + 1];
|
|
memset(seen_rank, 0, (ctx->num_unique_words + 1) * sizeof(bool));
|
|
|
|
int count = 0;
|
|
for (int i = start; i < start + length; i++)
|
|
{
|
|
WordEntry *entry = ctx->word_sequence[i];
|
|
if (!seen_rank[entry->rank])
|
|
{
|
|
seen_rank[entry->rank] = true;
|
|
unique_entries[count++] = entry;
|
|
}
|
|
}
|
|
|
|
for (int i = 0; i < count - 1; i++)
|
|
{
|
|
for (int j = i + 1; j < count; j++)
|
|
{
|
|
if (unique_entries[i]->rank > unique_entries[j]->rank)
|
|
{
|
|
WordEntry *tmp = unique_entries[i];
|
|
unique_entries[i] = unique_entries[j];
|
|
unique_entries[j] = tmp;
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int i = 0; i < count; i++)
|
|
{
|
|
if (i > 0)
|
|
printf(", ");
|
|
printf("%s(#%d)", unique_entries[i]->word, unique_entries[i]->rank);
|
|
}
|
|
}
|
|
|
|
/* Print results */
|
|
static void print_results(const VocabContext *ctx, ExcerptResult *results, int max_length)
|
|
{
|
|
printf("======================================================================\n");
|
|
printf("VOCABULARY LEARNING CURVE\n");
|
|
printf("======================================================================\n");
|
|
printf("\n");
|
|
printf("For each excerpt length, the minimum number of top-frequency\n");
|
|
printf("words you need to learn to understand 100%%%% of some excerpt.\n");
|
|
printf("\n");
|
|
printf("Total words in text: %d\n", ctx->num_words);
|
|
printf("Unique words: %d\n", ctx->num_unique_words);
|
|
printf("\n");
|
|
printf("----------------------------------------------------------------------\n");
|
|
|
|
int prev_vocab = 0;
|
|
int actual_max = max_length;
|
|
if (actual_max > ctx->num_words)
|
|
actual_max = ctx->num_words;
|
|
|
|
for (int i = 0; i < actual_max; i++)
|
|
{
|
|
ExcerptResult *r = &results[i];
|
|
printf("\n[Length %d] Vocab needed: %d", r->excerpt_length, r->min_vocab_needed);
|
|
if (r->min_vocab_needed > prev_vocab)
|
|
printf(" (+%d)", r->min_vocab_needed - prev_vocab);
|
|
printf("\n");
|
|
printf(" Excerpt: \"");
|
|
print_excerpt(ctx, r->start_pos, r->excerpt_length);
|
|
printf("\"\n");
|
|
printf(" Words: ");
|
|
print_words_needed(ctx, r->start_pos, r->excerpt_length);
|
|
printf("\n");
|
|
prev_vocab = r->min_vocab_needed;
|
|
}
|
|
|
|
printf("\n----------------------------------------------------------------------\n");
|
|
|
|
if (actual_max > 0)
|
|
{
|
|
ExcerptResult *final = &results[actual_max - 1];
|
|
printf("\nTo understand a %d-word excerpt,\n", final->excerpt_length);
|
|
printf("you need to learn at minimum %d top words.\n", final->min_vocab_needed);
|
|
}
|
|
}
|
|
|
|
static void dump_vocabulary(const VocabContext *ctx, int max_rank)
|
|
{
|
|
printf("VOCAB_DUMP_START\n");
|
|
for (int i = 0; i < ctx->num_unique_words; i++)
|
|
{
|
|
if (ctx->all_entries[i]->rank <= max_rank)
|
|
printf("%s;%d\n", ctx->all_entries[i]->word, ctx->all_entries[i]->rank);
|
|
}
|
|
printf("VOCAB_DUMP_END\n");
|
|
}
|
|
|
|
static void print_longest_excerpt_result(const VocabContext *ctx, int max_vocab, int best_start,
|
|
int best_length)
|
|
{
|
|
printf("======================================================================\n");
|
|
printf("INVERSE MODE: LONGEST EXCERPT WITH TOP %d WORDS\n", max_vocab);
|
|
printf("======================================================================\n");
|
|
printf("\n");
|
|
printf("Total words in text: %d\n", ctx->num_words);
|
|
printf("Unique words: %d\n", ctx->num_unique_words);
|
|
printf("Vocabulary limit: top %d words\n", max_vocab);
|
|
printf("\n");
|
|
printf("----------------------------------------------------------------------\n");
|
|
printf("\n");
|
|
|
|
if (best_length == 0)
|
|
{
|
|
printf("No valid excerpt found with top %d words.\n", max_vocab);
|
|
printf("The text may require rarer words from the very beginning.\n");
|
|
}
|
|
else
|
|
{
|
|
printf("LONGEST EXCERPT: %d words\n", best_length);
|
|
printf("Position: words %d to %d\n", best_start + 1, best_start + best_length);
|
|
printf("\n");
|
|
printf("Excerpt:\n \"");
|
|
print_excerpt(ctx, best_start, best_length);
|
|
printf("\"\n");
|
|
printf("\n");
|
|
|
|
int max_rank_used = 0;
|
|
const char *rarest_word = NULL;
|
|
for (int i = best_start; i < best_start + best_length; i++)
|
|
{
|
|
if (ctx->word_sequence[i]->rank > max_rank_used)
|
|
{
|
|
max_rank_used = ctx->word_sequence[i]->rank;
|
|
rarest_word = ctx->word_sequence[i]->word;
|
|
}
|
|
}
|
|
// cppcheck-suppress nullPointer
|
|
printf("Rarest word used: %s (#%d)\n", rarest_word, max_rank_used);
|
|
|
|
static bool seen_rank[MAX_UNIQUE_WORDS + 1];
|
|
memset(seen_rank, 0, (ctx->num_unique_words + 1) * sizeof(bool));
|
|
int unique_count = 0;
|
|
for (int i = best_start; i < best_start + best_length; i++)
|
|
{
|
|
if (!seen_rank[ctx->word_sequence[i]->rank])
|
|
{
|
|
seen_rank[ctx->word_sequence[i]->rank] = true;
|
|
unique_count++;
|
|
}
|
|
}
|
|
printf("Unique words in excerpt: %d\n", unique_count);
|
|
}
|
|
|
|
printf("\n----------------------------------------------------------------------\n");
|
|
}
|
|
|
|
int main(int argc, char *argv[])
|
|
{
|
|
if (argc < 2)
|
|
{
|
|
fprintf(stderr, "Usage: %s <file.txt> [options]\n", argv[0]);
|
|
fprintf(stderr, "\nModes:\n");
|
|
fprintf(stderr, " (default) Find minimum vocab needed for each excerpt length\n");
|
|
fprintf(stderr,
|
|
" --max-vocab N INVERSE: Find longest excerpt using only top N words\n");
|
|
fprintf(stderr, "\nOptions:\n");
|
|
fprintf(stderr, " max_length Maximum excerpt length to analyze (default: 30)\n");
|
|
fprintf(stderr, " --dump-vocab [N] Output all words with ranks up to N\n");
|
|
fprintf(stderr, "\nExamples:\n");
|
|
fprintf(stderr, " %s book.txt 50 # Analyze excerpts up to 50 words\n",
|
|
argv[0]);
|
|
fprintf(stderr, " %s book.txt --max-vocab 500 # Find longest excerpt with top 500 words\n",
|
|
argv[0]);
|
|
return 1;
|
|
}
|
|
|
|
const char *filename = argv[1];
|
|
int max_length = 30;
|
|
bool dump_vocab = false;
|
|
int dump_max_rank = 0;
|
|
int max_vocab_mode = 0;
|
|
|
|
for (int i = 2; i < argc; i++)
|
|
{
|
|
if (strcmp(argv[i], "--dump-vocab") == 0)
|
|
{
|
|
dump_vocab = true;
|
|
if (i + 1 < argc && argv[i + 1][0] != '-')
|
|
dump_max_rank = atoi(argv[++i]);
|
|
}
|
|
else if (strcmp(argv[i], "--max-vocab") == 0)
|
|
{
|
|
if (i + 1 < argc)
|
|
{
|
|
max_vocab_mode = atoi(argv[++i]);
|
|
if (max_vocab_mode < 1)
|
|
{
|
|
fprintf(stderr, "Error: --max-vocab requires a positive number\n");
|
|
return 1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
fprintf(stderr, "Error: --max-vocab requires a number\n");
|
|
return 1;
|
|
}
|
|
}
|
|
else if (argv[i][0] != '-')
|
|
{
|
|
max_length = atoi(argv[i]);
|
|
if (max_length < 1)
|
|
max_length = 1;
|
|
if (max_length > 1000)
|
|
max_length = 1000;
|
|
}
|
|
}
|
|
|
|
VocabContext ctx;
|
|
vocab_init(&ctx);
|
|
|
|
FILE *fp = fopen(filename, "r");
|
|
if (!fp)
|
|
{
|
|
fprintf(stderr, "Cannot open file: %s\n", filename);
|
|
return 1;
|
|
}
|
|
|
|
bool ok = vocab_process_stream(&ctx, fp);
|
|
fclose(fp);
|
|
|
|
if (!ok)
|
|
{
|
|
vocab_cleanup(&ctx);
|
|
return 1;
|
|
}
|
|
|
|
if (ctx.num_words == 0)
|
|
{
|
|
fprintf(stderr, "No words found in file\n");
|
|
vocab_cleanup(&ctx);
|
|
return 1;
|
|
}
|
|
|
|
vocab_assign_ranks(&ctx);
|
|
|
|
if (max_vocab_mode > 0)
|
|
{
|
|
int best_start = 0;
|
|
int best_length = 0;
|
|
vocab_find_longest_excerpt(&ctx, max_vocab_mode, &best_start, &best_length);
|
|
print_longest_excerpt_result(&ctx, max_vocab_mode, best_start, best_length);
|
|
|
|
if (dump_vocab)
|
|
{
|
|
if (dump_max_rank == 0)
|
|
dump_max_rank = max_vocab_mode;
|
|
dump_vocabulary(&ctx, dump_max_rank);
|
|
}
|
|
|
|
vocab_cleanup(&ctx);
|
|
return 0;
|
|
}
|
|
|
|
ExcerptResult *results = malloc(max_length * sizeof(ExcerptResult));
|
|
if (!results)
|
|
{
|
|
fprintf(stderr, "Memory allocation failed\n");
|
|
vocab_cleanup(&ctx);
|
|
return 1;
|
|
}
|
|
|
|
vocab_find_optimal_excerpts(&ctx, max_length, results);
|
|
print_results(&ctx, results, max_length);
|
|
|
|
if (dump_vocab)
|
|
{
|
|
if (dump_max_rank == 0 && max_length > 0)
|
|
dump_max_rank = results[max_length - 1].min_vocab_needed;
|
|
if (dump_max_rank > 0)
|
|
dump_vocabulary(&ctx, dump_max_rank);
|
|
}
|
|
|
|
free(results);
|
|
vocab_cleanup(&ctx);
|
|
|
|
return 0;
|
|
}
|