testsAndMisc/C/vocabulary_curve/main.c
Krzysztof kuhy Rudnicki f6b6995b0e Add tests and fix pre-commit issues across all projects
- C/lichess_random_engine, vocabulary_curve, misc/split,
  1dvelocitysimulator, opening_learner: test suites added
- CPP/miscelanious: tests added
- TS/battery-status, champions_leauge_scores, two-inputs: tests added
- python_pkg/fm24_searcher, wake_alarm: new packages added
- Fix ruff/cppcheck/eslint/clang-format failures
- Update .gitignore for C/C++ build artifacts
2026-04-12 20:45:24 +02:00

309 lines
9.5 KiB
C

/*
* Vocabulary Learning Curve Analyzer - thin driver.
*/
#include "vocabulary.h"
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Write `length` consecutive words of the text to stdout, beginning at index
 * `start` of ctx->word_sequence. Words are separated by a single space; no
 * leading or trailing separator and no newline is emitted.
 */
static void print_excerpt(const VocabContext *ctx, int start, int length)
{
    const int end = start + length;
    const char *separator = "";

    for (int pos = start; pos < end; pos++)
    {
        printf("%s%s", separator, ctx->word_sequence[pos]->word);
        /* Every word after the first is preceded by one space. */
        separator = " ";
    }
}
/* Print words needed (sorted by rank) */
static void print_words_needed(const VocabContext *ctx, int start, int length)
{
static WordEntry *unique_entries[MAX_UNIQUE_WORDS];
static bool seen_rank[MAX_UNIQUE_WORDS + 1];
memset(seen_rank, 0, (ctx->num_unique_words + 1) * sizeof(bool));
int count = 0;
for (int i = start; i < start + length; i++)
{
WordEntry *entry = ctx->word_sequence[i];
if (!seen_rank[entry->rank])
{
seen_rank[entry->rank] = true;
unique_entries[count++] = entry;
}
}
for (int i = 0; i < count - 1; i++)
{
for (int j = i + 1; j < count; j++)
{
if (unique_entries[i]->rank > unique_entries[j]->rank)
{
WordEntry *tmp = unique_entries[i];
unique_entries[i] = unique_entries[j];
unique_entries[j] = tmp;
}
}
}
for (int i = 0; i < count; i++)
{
if (i > 0)
printf(", ");
printf("%s(#%d)", unique_entries[i]->word, unique_entries[i]->rank);
}
}
/*
 * Print the vocabulary learning curve report to stdout.
 *
 * ctx        - processed text; num_words, num_unique_words and word_sequence
 *              are read.
 * results    - array of per-length results; for each printed entry the
 *              fields excerpt_length, min_vocab_needed and start_pos are used.
 * max_length - number of entries requested; only the first
 *              min(max_length, ctx->num_words) entries are printed.
 *
 * For every printed entry the report shows the vocabulary size needed, the
 * increase over the previous entry (when there is one), the excerpt itself
 * and the ranked list of words it contains. A short summary for the last
 * printed entry closes the report.
 */
static void print_results(const VocabContext *ctx, ExcerptResult *results, int max_length)
{
    printf("======================================================================\n");
    printf("VOCABULARY LEARNING CURVE\n");
    printf("======================================================================\n");
    printf("\n");
    printf("For each excerpt length, the minimum number of top-frequency\n");
    /* "%%" is the printf escape for one literal percent sign. */
    printf("words you need to learn to understand 100%% of some excerpt.\n");
    printf("\n");
    printf("Total words in text: %d\n", ctx->num_words);
    printf("Unique words: %d\n", ctx->num_unique_words);
    printf("\n");
    printf("----------------------------------------------------------------------\n");

    /* Never report more lengths than the text has words. */
    int actual_max = max_length;
    if (actual_max > ctx->num_words)
        actual_max = ctx->num_words;

    int prev_vocab = 0;
    for (int i = 0; i < actual_max; i++)
    {
        const ExcerptResult *r = &results[i];

        printf("\n[Length %d] Vocab needed: %d", r->excerpt_length, r->min_vocab_needed);
        if (r->min_vocab_needed > prev_vocab)
            printf(" (+%d)", r->min_vocab_needed - prev_vocab);
        printf("\n");

        printf(" Excerpt: \"");
        print_excerpt(ctx, r->start_pos, r->excerpt_length);
        printf("\"\n");

        printf(" Words: ");
        print_words_needed(ctx, r->start_pos, r->excerpt_length);
        printf("\n");

        prev_vocab = r->min_vocab_needed;
    }

    printf("\n----------------------------------------------------------------------\n");
    if (actual_max > 0)
    {
        const ExcerptResult *final = &results[actual_max - 1];
        printf("\nTo understand a %d-word excerpt,\n", final->excerpt_length);
        printf("you need to learn at minimum %d top words.\n", final->min_vocab_needed);
    }
}
/*
 * Emit every vocabulary entry whose rank is at most `max_rank` as one
 * "word;rank" line on stdout, framed by the VOCAB_DUMP_START and
 * VOCAB_DUMP_END marker lines. Entries are visited in the order in which
 * they are stored in ctx->all_entries.
 */
static void dump_vocabulary(const VocabContext *ctx, int max_rank)
{
    const int total = ctx->num_unique_words;

    fputs("VOCAB_DUMP_START\n", stdout);
    for (int idx = 0; idx < total; idx++)
    {
        /* Skip anything rarer than the requested cut-off. */
        if (ctx->all_entries[idx]->rank > max_rank)
            continue;
        printf("%s;%d\n", ctx->all_entries[idx]->word, ctx->all_entries[idx]->rank);
    }
    fputs("VOCAB_DUMP_END\n", stdout);
}
static void print_longest_excerpt_result(const VocabContext *ctx, int max_vocab, int best_start,
int best_length)
{
printf("======================================================================\n");
printf("INVERSE MODE: LONGEST EXCERPT WITH TOP %d WORDS\n", max_vocab);
printf("======================================================================\n");
printf("\n");
printf("Total words in text: %d\n", ctx->num_words);
printf("Unique words: %d\n", ctx->num_unique_words);
printf("Vocabulary limit: top %d words\n", max_vocab);
printf("\n");
printf("----------------------------------------------------------------------\n");
printf("\n");
if (best_length == 0)
{
printf("No valid excerpt found with top %d words.\n", max_vocab);
printf("The text may require rarer words from the very beginning.\n");
}
else
{
printf("LONGEST EXCERPT: %d words\n", best_length);
printf("Position: words %d to %d\n", best_start + 1, best_start + best_length);
printf("\n");
printf("Excerpt:\n \"");
print_excerpt(ctx, best_start, best_length);
printf("\"\n");
printf("\n");
int max_rank_used = 0;
const char *rarest_word = NULL;
for (int i = best_start; i < best_start + best_length; i++)
{
if (ctx->word_sequence[i]->rank > max_rank_used)
{
max_rank_used = ctx->word_sequence[i]->rank;
rarest_word = ctx->word_sequence[i]->word;
}
}
// cppcheck-suppress nullPointer
printf("Rarest word used: %s (#%d)\n", rarest_word, max_rank_used);
static bool seen_rank[MAX_UNIQUE_WORDS + 1];
memset(seen_rank, 0, (ctx->num_unique_words + 1) * sizeof(bool));
int unique_count = 0;
for (int i = best_start; i < best_start + best_length; i++)
{
if (!seen_rank[ctx->word_sequence[i]->rank])
{
seen_rank[ctx->word_sequence[i]->rank] = true;
unique_count++;
}
}
printf("Unique words in excerpt: %d\n", unique_count);
}
printf("\n----------------------------------------------------------------------\n");
}
/*
 * Convert a command-line argument to int the way atoi() does (leading
 * whitespace skipped, parsing stops at the first non-digit, 0 when no digits
 * are present), but saturate at INT_MIN/INT_MAX: atoi() has undefined
 * behaviour when the value does not fit in an int.
 */
static int parse_int_arg(const char *text)
{
    const long value = strtol(text, NULL, 10);

    if (value > INT_MAX)
        return INT_MAX;
    if (value < INT_MIN)
        return INT_MIN;
    return (int)value;
}

/*
 * Program entry point.
 *
 * Usage: <prog> <file.txt> [max_length] [--max-vocab N] [--dump-vocab [N]]
 *
 * Default mode computes a result for every excerpt length up to max_length
 * (default 30, clamped to 1..1000) and prints the learning curve. With
 * --max-vocab N the longest excerpt using only the top N words is reported
 * instead. --dump-vocab additionally prints the vocabulary up to rank N; when
 * N is omitted the limit is the --max-vocab value in inverse mode, or the
 * vocabulary needed by the last reported excerpt in default mode.
 * Arguments starting with '-' that are not recognised are ignored.
 *
 * Returns 0 on success and 1 on a usage error, an unreadable input file, a
 * processing failure, an input without words, or an allocation failure.
 */
int main(int argc, char *argv[])
{
    if (argc < 2)
    {
        fprintf(stderr, "Usage: %s <file.txt> [options]\n", argv[0]);
        fprintf(stderr, "\nModes:\n");
        fprintf(stderr, " (default) Find minimum vocab needed for each excerpt length\n");
        fprintf(stderr,
                " --max-vocab N INVERSE: Find longest excerpt using only top N words\n");
        fprintf(stderr, "\nOptions:\n");
        fprintf(stderr, " max_length Maximum excerpt length to analyze (default: 30)\n");
        fprintf(stderr, " --dump-vocab [N] Output all words with ranks up to N\n");
        fprintf(stderr, "\nExamples:\n");
        fprintf(stderr, " %s book.txt 50 # Analyze excerpts up to 50 words\n",
                argv[0]);
        fprintf(stderr, " %s book.txt --max-vocab 500 # Find longest excerpt with top 500 words\n",
                argv[0]);
        return 1;
    }

    const char *filename = argv[1];
    int max_length = 30;
    bool dump_vocab = false;
    int dump_max_rank = 0;
    int max_vocab_mode = 0;

    for (int i = 2; i < argc; i++)
    {
        if (strcmp(argv[i], "--dump-vocab") == 0)
        {
            dump_vocab = true;
            /* The rank limit is optional: consume it only if it is not another option. */
            if (i + 1 < argc && argv[i + 1][0] != '-')
                dump_max_rank = parse_int_arg(argv[++i]);
        }
        else if (strcmp(argv[i], "--max-vocab") == 0)
        {
            if (i + 1 >= argc)
            {
                fprintf(stderr, "Error: --max-vocab requires a number\n");
                return 1;
            }
            max_vocab_mode = parse_int_arg(argv[++i]);
            if (max_vocab_mode < 1)
            {
                fprintf(stderr, "Error: --max-vocab requires a positive number\n");
                return 1;
            }
        }
        else if (argv[i][0] != '-')
        {
            /* A bare argument is the maximum excerpt length, clamped to 1..1000. */
            max_length = parse_int_arg(argv[i]);
            if (max_length < 1)
                max_length = 1;
            if (max_length > 1000)
                max_length = 1000;
        }
    }

    /*
     * Open the input before setting up the context, so that a missing file
     * can be reported without leaving an initialised context behind.
     */
    FILE *fp = fopen(filename, "r");
    if (!fp)
    {
        fprintf(stderr, "Cannot open file: %s\n", filename);
        return 1;
    }

    VocabContext ctx;
    vocab_init(&ctx);

    bool ok = vocab_process_stream(&ctx, fp);
    fclose(fp);
    if (!ok)
    {
        vocab_cleanup(&ctx);
        return 1;
    }
    if (ctx.num_words == 0)
    {
        fprintf(stderr, "No words found in file\n");
        vocab_cleanup(&ctx);
        return 1;
    }

    vocab_assign_ranks(&ctx);

    if (max_vocab_mode > 0)
    {
        int best_start = 0;
        int best_length = 0;
        vocab_find_longest_excerpt(&ctx, max_vocab_mode, &best_start, &best_length);
        print_longest_excerpt_result(&ctx, max_vocab_mode, best_start, best_length);
        if (dump_vocab)
        {
            if (dump_max_rank == 0)
                dump_max_rank = max_vocab_mode;
            dump_vocabulary(&ctx, dump_max_rank);
        }
        vocab_cleanup(&ctx);
        return 0;
    }

    /* Zero-filled, so entries that are never written still hold defined values. */
    ExcerptResult *results = calloc((size_t)max_length, sizeof *results);
    if (!results)
    {
        fprintf(stderr, "Memory allocation failed\n");
        vocab_cleanup(&ctx);
        return 1;
    }

    vocab_find_optimal_excerpts(&ctx, max_length, results);
    print_results(&ctx, results, max_length);

    if (dump_vocab)
    {
        /*
         * Default the dump limit to the vocabulary of the last excerpt that
         * print_results() reports, i.e. entry min(max_length, num_words) - 1.
         * NOTE(review): entries past num_words are presumably not filled in
         * by vocab_find_optimal_excerpts() - confirm against its definition.
         */
        int reported = max_length;
        if (reported > ctx.num_words)
            reported = ctx.num_words;
        if (dump_max_rank == 0 && reported > 0)
            dump_max_rank = results[reported - 1].min_vocab_needed;
        if (dump_max_rank > 0)
            dump_vocabulary(&ctx, dump_max_rank);
    }

    free(results);
    vocab_cleanup(&ctx);
    return 0;
}