testsAndMisc-archive/C/vocabulary_curve/vocabulary.h
Krzysztof kuhy Rudnicki 01091c09ce Add tests and fix pre-commit issues across all projects
- C/lichess_random_engine, vocabulary_curve, misc/split,
  1dvelocitysimulator, opening_learner: test suites added
- CPP/miscelanious: tests added
- TS/battery-status, champions_leauge_scores, two-inputs: tests added
- python_pkg/fm24_searcher, wake_alarm: new packages added
- Fix ruff/cppcheck/eslint/clang-format failures
- Update .gitignore for C/C++ build artifacts
2026-04-12 20:45:24 +02:00

79 lines
2.5 KiB
C

/*
* vocabulary.h - Core vocabulary analysis logic, extracted for testability.
*/
#pragma once
#include <stdbool.h>
#include <stdio.h>
/* Capacity limits. Each macro sizes one fixed array declared in this header. */
/* Size in bytes of the WordEntry.word buffer.
 * NOTE(review): if words are stored NUL-terminated, the longest storable
 * word is MAX_WORD_LEN - 1 characters - confirm in the implementation. */
#define MAX_WORD_LEN 64
/* Number of slots in VocabContext.word_sequence */
#define MAX_WORDS 500000
/* Number of slots in VocabContext.all_entries */
#define MAX_UNIQUE_WORDS 100000
/* Number of slots in VocabContext.hash_table */
#define HASH_SIZE 200003 /* Prime number for better distribution */
/*
 * Word entry for hash table.
 *
 * One record per word: its text, a count and a frequency rank. The `next`
 * pointer lets entries be linked into a singly linked list.
 * NOTE(review): presumably that list is the collision chain of one
 * hash_table bucket - confirm in the implementation.
 */
typedef struct WordEntry
{
/* Word text, held inline in a fixed-size buffer of MAX_WORD_LEN bytes */
char word[MAX_WORD_LEN];
/* Presumably the number of occurrences seen; vocab_compare_by_count is
 * described as ordering by this field, descending - confirm */
int count;
int rank; /* 1-indexed rank by frequency (1 = most common) */
/* Link to another WordEntry (see the note above the struct) */
struct WordEntry *next;
} WordEntry;
/*
 * Result for each excerpt length.
 *
 * vocab_find_optimal_excerpts fills a caller-supplied array of these.
 */
typedef struct
{
/* Length of the excerpt this result describes; presumably counted in
 * words - confirm */
int excerpt_length;
/* NOTE(review): presumably the smallest top-N vocabulary size that covers
 * the chosen excerpt, i.e. the highest word rank it contains - confirm */
int min_vocab_needed;
int start_pos; /* Start position in word_sequence */
} ExcerptResult;
/*
 * Context holding all mutable state (replaces static globals).
 *
 * The three arrays are embedded by value: HASH_SIZE + MAX_UNIQUE_WORDS +
 * MAX_WORDS = 800003 pointers, roughly 6.4 MB where pointers are 8 bytes.
 * An object this large may exceed the default stack limit on some
 * platforms, so static or heap storage is the safer choice.
 */
typedef struct
{
/* HASH_SIZE WordEntry pointers. NOTE(review): presumably bucket heads
 * selected from vocab_hash_word() - confirm */
WordEntry *hash_table[HASH_SIZE];
/* Up to MAX_UNIQUE_WORDS WordEntry pointers. NOTE(review): presumably one
 * per distinct word - confirm */
WordEntry *all_entries[MAX_UNIQUE_WORDS];
/* Presumably the number of slots of all_entries in use - confirm */
int num_unique_words;
/* Up to MAX_WORDS WordEntry pointers; ExcerptResult.start_pos is a position
 * in this array. NOTE(review): presumably the words in input order - confirm */
WordEntry *word_sequence[MAX_WORDS];
/* Presumably the number of slots of word_sequence in use - confirm */
int num_words;
} VocabContext;
/*
 * Initialise a fresh context (zero everything).
 *
 * ctx: context to initialise; it is written through, so it must point to
 *      storage for a complete VocabContext.
 */
void vocab_init(VocabContext *ctx);
/*
 * Free all allocated WordEntry nodes inside ctx.
 *
 * ctx: context whose entries are released.
 * NOTE(review): whether ctx is left in a reusable (re-zeroed) state is not
 * visible from this header - call vocab_init again before reuse unless the
 * implementation confirms otherwise.
 */
void vocab_cleanup(VocabContext *ctx);
/*
 * Hash a word (public for tests).
 *
 * word: text to hash, read-only; presumably a NUL-terminated string - confirm.
 * Returns an unsigned hash value.
 * NOTE(review): the prototype does not show whether the result is already
 * reduced modulo HASH_SIZE - confirm before using it as an index.
 */
unsigned int vocab_hash_word(const char *word);
/*
 * Find or create a word entry in the context.
 *
 * ctx:  context to search and, when the word is absent, to add to.
 * word: word to look up, read-only.
 * Returns a pointer to the entry for word.
 * NOTE(review): this header declares vocab_test_fail_malloc_count to
 * exercise a malloc-failure path, so creation can presumably fail and
 * return NULL - confirm, and check the result at call sites.
 */
WordEntry *vocab_get_or_create_word(VocabContext *ctx, const char *word);
/*
 * Check if a character can be part of a word.
 *
 * c: character code, passed as int.
 *    NOTE(review): presumably a value as returned by fgetc/getc - confirm
 *    how EOF and negative values are treated.
 * Returns true when c may appear inside a word, false otherwise.
 */
bool vocab_is_word_char(int c);
/*
 * Comparator for qsort (descending count).
 *
 * a, b: pointers to the two array elements under comparison, as qsort
 *       supplies them.
 *       NOTE(review): presumably the elements are WordEntry pointers (so a
 *       and b are really WordEntry **), matching all_entries - confirm.
 * Returns the usual qsort ordering value (negative, zero or positive);
 * "descending" means the entry with the higher count sorts first.
 */
int vocab_compare_by_count(const void *a, const void *b);
/*
 * Assign frequency ranks to all entries in ctx.
 *
 * ctx: context whose entries receive a rank. WordEntry.rank is documented
 *      as 1-indexed, with 1 the most common word.
 * NOTE(review): presumably sorts all_entries with vocab_compare_by_count
 * and therefore reorders that array - confirm.
 */
void vocab_assign_ranks(VocabContext *ctx);
/*
 * Analyse one excerpt window and return the max rank required.
 *
 * ctx:    context to read; not modified (const).
 * start:  presumably the index in word_sequence of the window's first
 *         word - confirm.
 * length: presumably the number of words in the window - confirm.
 * Returns the maximum rank required for the window.
 * NOTE(review): behaviour for a window outside the stored words is not
 * visible from this header - confirm whether callers must range-check.
 */
int vocab_analyze_excerpt(const VocabContext *ctx, int start, int length);
/*
 * Read and index words from an open FILE stream into ctx.
 *
 * ctx: context that receives the words.
 * fp:  stream the caller has already opened.
 *      NOTE(review): presumably left open for the caller to close - confirm.
 * Returns a bool status.
 * NOTE(review): presumably true on success and false on failure (for
 * example an allocation failure) - confirm the exact conditions.
 */
bool vocab_process_stream(VocabContext *ctx, FILE *fp);
/*
 * Find optimal excerpts for lengths 1..max_length; results[] must be
 * pre-allocated to max_length elements.
 *
 * ctx:        context to read; not modified (const).
 * max_length: largest excerpt length to evaluate.
 * results:    caller-owned output array with room for max_length elements.
 *             NOTE(review): presumably results[i] describes length i + 1,
 *             and "optimal" means the lowest min_vocab_needed - confirm.
 */
void vocab_find_optimal_excerpts(const VocabContext *ctx, int max_length, ExcerptResult *results);
/*
 * Inverse mode: find longest contiguous excerpt using only top-N vocab.
 *
 * ctx:        context to read; not modified (const).
 * max_vocab:  the N in "top-N".
 *             NOTE(review): presumably only words with rank <= max_vocab
 *             are allowed in the excerpt - confirm.
 * out_start:  output; receives where the excerpt starts (presumably an
 *             index into word_sequence - confirm).
 * out_length: output; receives the excerpt's length.
 */
void vocab_find_longest_excerpt(const VocabContext *ctx, int max_vocab, int *out_start,
int *out_length);
/*
 * Test hook: set to non-zero to make the next malloc call(s) return NULL.
 * Only used by test_vocabulary.c to exercise the malloc-failure path.
 *
 * This line is only a declaration; the variable has to be defined in
 * exactly one translation unit.
 * NOTE(review): the name suggests a countdown of how many allocations
 * fail - confirm against the implementation.
 */
extern int vocab_test_fail_malloc_count;