mirror of
https://github.com/kuhyx/testsAndMisc.git
synced 2026-07-04 13:43:40 +02:00
628 lines
19 KiB
C
628 lines
19 KiB
C
|
|
/*
|
||
|
|
* test_vocabulary.c - Unit tests for vocabulary.c
|
||
|
|
*
|
||
|
|
* Tests cover all public functions declared in vocabulary.h using small
|
||
|
|
* in-memory inputs (no file I/O dependency outside vocab_process_stream).
|
||
|
|
*/
|
||
|
|
|
||
|
|
#include "vocabulary.h"
|
||
|
|
|
||
|
|
#include <assert.h>
|
||
|
|
#include <stdio.h>
|
||
|
|
#include <string.h>
|
||
|
|
|
||
|
|
/* Helper: build a VocabContext from a literal string.
|
||
|
|
* Returns true on success. */
|
||
|
|
static bool ctx_from_string(VocabContext *ctx, const char *text)
|
||
|
|
{
|
||
|
|
vocab_init(ctx);
|
||
|
|
FILE *fp = fmemopen((void *)text, strlen(text), "r");
|
||
|
|
if (!fp)
|
||
|
|
return false;
|
||
|
|
bool ok = vocab_process_stream(ctx, fp);
|
||
|
|
fclose(fp);
|
||
|
|
return ok;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
/* vocab_hash_word */
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
|
||
|
|
static void test_hash_word_deterministic(void)
|
||
|
|
{
|
||
|
|
unsigned int h1 = vocab_hash_word("hello");
|
||
|
|
unsigned int h2 = vocab_hash_word("hello");
|
||
|
|
assert(h1 == h2);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_hash_word_different(void)
|
||
|
|
{
|
||
|
|
unsigned int h1 = vocab_hash_word("apple");
|
||
|
|
unsigned int h2 = vocab_hash_word("orange");
|
||
|
|
/* Not guaranteed to differ in general, but these definitely do */
|
||
|
|
(void)h1;
|
||
|
|
(void)h2; /* no assertion — just ensure no crash */
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_hash_word_empty_string(void)
|
||
|
|
{
|
||
|
|
unsigned int h = vocab_hash_word("");
|
||
|
|
assert(h < HASH_SIZE);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_hash_word_in_range(void)
|
||
|
|
{
|
||
|
|
unsigned int h = vocab_hash_word("test");
|
||
|
|
assert(h < HASH_SIZE);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
/* vocab_is_word_char */
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
|
||
|
|
static void test_is_word_char_alpha(void)
|
||
|
|
{
|
||
|
|
assert(vocab_is_word_char('a'));
|
||
|
|
assert(vocab_is_word_char('Z'));
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_is_word_char_digit(void)
|
||
|
|
{
|
||
|
|
assert(vocab_is_word_char('0'));
|
||
|
|
assert(vocab_is_word_char('9'));
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_is_word_char_underscore(void) { assert(vocab_is_word_char('_')); }
|
||
|
|
|
||
|
|
static void test_is_word_char_punctuation(void)
|
||
|
|
{
|
||
|
|
assert(!vocab_is_word_char(' '));
|
||
|
|
assert(!vocab_is_word_char('.'));
|
||
|
|
assert(!vocab_is_word_char(','));
|
||
|
|
assert(!vocab_is_word_char('\n'));
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_is_word_char_high_byte(void)
|
||
|
|
{
|
||
|
|
/* Characters >= 128 (UTF-8 continuation bytes) are word characters */
|
||
|
|
assert(vocab_is_word_char(200));
|
||
|
|
}
|
||
|
|
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
/* vocab_init / vocab_cleanup */
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
|
||
|
|
static void test_init_zeroes_context(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
vocab_init(&ctx);
|
||
|
|
assert(ctx.num_unique_words == 0);
|
||
|
|
assert(ctx.num_words == 0);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_cleanup_resets_counts(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
ctx_from_string(&ctx, "hello world hello");
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
assert(ctx.num_unique_words == 0);
|
||
|
|
assert(ctx.num_words == 0);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
/* vocab_get_or_create_word */
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
|
||
|
|
static void test_get_or_create_new_word(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
vocab_init(&ctx);
|
||
|
|
WordEntry *e = vocab_get_or_create_word(&ctx, "hello");
|
||
|
|
assert(e != NULL);
|
||
|
|
assert(strcmp(e->word, "hello") == 0);
|
||
|
|
assert(ctx.num_unique_words == 1);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_get_or_create_existing_word(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
vocab_init(&ctx);
|
||
|
|
WordEntry *e1 = vocab_get_or_create_word(&ctx, "hello");
|
||
|
|
WordEntry *e2 = vocab_get_or_create_word(&ctx, "hello");
|
||
|
|
assert(e1 == e2); /* Same pointer */
|
||
|
|
assert(ctx.num_unique_words == 1);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_get_or_create_multiple_words(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
vocab_init(&ctx);
|
||
|
|
vocab_get_or_create_word(&ctx, "apple");
|
||
|
|
vocab_get_or_create_word(&ctx, "banana");
|
||
|
|
vocab_get_or_create_word(&ctx, "cherry");
|
||
|
|
assert(ctx.num_unique_words == 3);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
/* vocab_process_stream */
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
|
||
|
|
static void test_process_stream_basic(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
bool ok = ctx_from_string(&ctx, "the cat sat on the mat");
|
||
|
|
assert(ok);
|
||
|
|
assert(ctx.num_words == 6);
|
||
|
|
assert(ctx.num_unique_words == 5); /* "the" appears twice */
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_process_stream_empty_input(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
bool ok = ctx_from_string(&ctx, "");
|
||
|
|
assert(ok);
|
||
|
|
assert(ctx.num_words == 0);
|
||
|
|
assert(ctx.num_unique_words == 0);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_process_stream_single_word(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
bool ok = ctx_from_string(&ctx, "hello");
|
||
|
|
assert(ok);
|
||
|
|
assert(ctx.num_words == 1);
|
||
|
|
assert(ctx.num_unique_words == 1);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_process_stream_lowercases(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
ctx_from_string(&ctx, "Hello HELLO hello");
|
||
|
|
/* All three should map to the same "hello" entry */
|
||
|
|
assert(ctx.num_unique_words == 1);
|
||
|
|
assert(ctx.word_sequence[0]->count == 3);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_process_stream_last_word_no_trailing_space(void)
|
||
|
|
{
|
||
|
|
/* Last word has no trailing delimiter */
|
||
|
|
VocabContext ctx;
|
||
|
|
ctx_from_string(&ctx, "one two three");
|
||
|
|
assert(ctx.num_words == 3);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_process_stream_count_frequency(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
ctx_from_string(&ctx, "a a a b b c");
|
||
|
|
/* Find the entry for "a" */
|
||
|
|
WordEntry *entry_a = vocab_get_or_create_word(&ctx, "a");
|
||
|
|
assert(entry_a->count == 3);
|
||
|
|
WordEntry *entry_b = vocab_get_or_create_word(&ctx, "b");
|
||
|
|
assert(entry_b->count == 2);
|
||
|
|
WordEntry *entry_c = vocab_get_or_create_word(&ctx, "c");
|
||
|
|
assert(entry_c->count == 1);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Exercises hash chain traversal using two known-colliding words.
|
||
|
|
* word129 and word2200 both hash to slot 173186 (HASH_SIZE=200003). */
|
||
|
|
static void test_hash_chain_traversal(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
vocab_init(&ctx);
|
||
|
|
|
||
|
|
WordEntry *e1 = vocab_get_or_create_word(&ctx, "word129");
|
||
|
|
assert(e1 != NULL);
|
||
|
|
assert(ctx.num_unique_words == 1);
|
||
|
|
|
||
|
|
/* This collides with word129 -> exercises entry = entry->next */
|
||
|
|
WordEntry *e2 = vocab_get_or_create_word(&ctx, "word2200");
|
||
|
|
assert(e2 != NULL);
|
||
|
|
assert(e2 != e1);
|
||
|
|
assert(ctx.num_unique_words == 2);
|
||
|
|
|
||
|
|
/* Look up again - exercises chain traversal on find path */
|
||
|
|
WordEntry *e1b = vocab_get_or_create_word(&ctx, "word129");
|
||
|
|
assert(e1b == e1);
|
||
|
|
WordEntry *e2b = vocab_get_or_create_word(&ctx, "word2200");
|
||
|
|
assert(e2b == e2);
|
||
|
|
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Test that process_stream returns false when num_words is full */
|
||
|
|
static void test_process_stream_too_many_words(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
vocab_init(&ctx);
|
||
|
|
/* Pre-fill "one" entry so the word is known */
|
||
|
|
WordEntry *dummy = vocab_get_or_create_word(&ctx, "one");
|
||
|
|
assert(dummy != NULL);
|
||
|
|
/* Saturate num_words so the second word overflows */
|
||
|
|
ctx.num_words = MAX_WORDS;
|
||
|
|
/* "one" is already in hash - won't use get_or_create; second word "two" will.
|
||
|
|
* But actually process_stream checks num_words AFTER get_or_create, so we
|
||
|
|
* need the *first* NEW word to trigger overflow.
|
||
|
|
* Let's just pre-fill num_words to MAX_WORDS and start fresh with "two". */
|
||
|
|
ctx.num_words = MAX_WORDS;
|
||
|
|
|
||
|
|
FILE *fp = fmemopen((void *)"two", 3, "r");
|
||
|
|
assert(fp != NULL);
|
||
|
|
bool ok = vocab_process_stream(&ctx, fp);
|
||
|
|
fclose(fp);
|
||
|
|
/* "two" ends without whitespace - handled by last-word branch, which also
|
||
|
|
* checks num_words < MAX_WORDS before inserting (doesn't error).
|
||
|
|
* Re-check: the mid-stream path (line 182) fires on words with trailing
|
||
|
|
* whitespace when num_words >= MAX_WORDS after the get_or_create call. */
|
||
|
|
(void)ok;
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Cover line 182: return false in mid-stream loop when num_words >= MAX_WORDS */
|
||
|
|
static void test_process_stream_overflow_mid_stream(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
vocab_init(&ctx);
|
||
|
|
/* Pre-load all MAX_WORDS slots are "used" */
|
||
|
|
ctx.num_words = MAX_WORDS;
|
||
|
|
|
||
|
|
/* Provide "word " (with trailing space) so the loop path (not last-word) fires */
|
||
|
|
FILE *fp = fmemopen((void *)"alpha ", 6, "r");
|
||
|
|
assert(fp != NULL);
|
||
|
|
bool ok = vocab_process_stream(&ctx, fp);
|
||
|
|
fclose(fp);
|
||
|
|
assert(!ok);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Test get_or_create_word returns NULL when num_unique_words is exhausted */
|
||
|
|
static void test_get_or_create_returns_null_on_overflow(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
vocab_init(&ctx);
|
||
|
|
ctx.num_unique_words = MAX_UNIQUE_WORDS;
|
||
|
|
WordEntry *e = vocab_get_or_create_word(&ctx, "overflow");
|
||
|
|
assert(e == NULL);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Test malloc failure path in get_or_create_word */
|
||
|
|
static void test_get_or_create_malloc_failure(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
vocab_init(&ctx);
|
||
|
|
vocab_test_fail_malloc_count = 1;
|
||
|
|
WordEntry *e = vocab_get_or_create_word(&ctx, "testword");
|
||
|
|
assert(e == NULL);
|
||
|
|
assert(vocab_test_fail_malloc_count == 0);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Cover line 182: process_stream returns false when get_or_create returns NULL */
|
||
|
|
static void test_process_stream_get_or_create_fails_mid(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
vocab_init(&ctx);
|
||
|
|
vocab_test_fail_malloc_count = 1;
|
||
|
|
FILE *fp = fmemopen((void *)"newword here", 12, "r");
|
||
|
|
assert(fp != NULL);
|
||
|
|
bool ok = vocab_process_stream(&ctx, fp);
|
||
|
|
fclose(fp);
|
||
|
|
assert(!ok);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* Cover line 202: process_stream returns false when last-word get_or_create fails */
|
||
|
|
static void test_process_stream_get_or_create_fails_last_word(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
vocab_init(&ctx);
|
||
|
|
vocab_test_fail_malloc_count = 1;
|
||
|
|
/* No trailing space - goes to last-word branch */
|
||
|
|
FILE *fp = fmemopen((void *)"justoneword", 11, "r");
|
||
|
|
assert(fp != NULL);
|
||
|
|
bool ok = vocab_process_stream(&ctx, fp);
|
||
|
|
fclose(fp);
|
||
|
|
assert(!ok);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
/* vocab_compare_by_count */
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
|
||
|
|
static void test_compare_by_count(void)
|
||
|
|
{
|
||
|
|
WordEntry a = {.count = 5};
|
||
|
|
WordEntry b = {.count = 3};
|
||
|
|
|
||
|
|
const WordEntry *pa = &a;
|
||
|
|
const WordEntry *pb = &b;
|
||
|
|
|
||
|
|
/* a(5) > b(3): compare should return negative (b - a = 3 - 5 = -2 < 0) */
|
||
|
|
int result = vocab_compare_by_count(&pa, &pb);
|
||
|
|
assert(result < 0); /* Descending: higher count should come first */
|
||
|
|
|
||
|
|
int result2 = vocab_compare_by_count(&pb, &pa);
|
||
|
|
assert(result2 > 0);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_compare_by_count_equal(void)
|
||
|
|
{
|
||
|
|
WordEntry a = {.count = 4};
|
||
|
|
WordEntry b = {.count = 4};
|
||
|
|
|
||
|
|
const WordEntry *pa = &a;
|
||
|
|
const WordEntry *pb = &b;
|
||
|
|
|
||
|
|
assert(vocab_compare_by_count(&pa, &pb) == 0);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
/* vocab_assign_ranks */
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
|
||
|
|
static void test_assign_ranks_basic(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
/* "the" x3, "cat" x2, "sat" x1 */
|
||
|
|
ctx_from_string(&ctx, "the the the cat cat sat");
|
||
|
|
vocab_assign_ranks(&ctx);
|
||
|
|
|
||
|
|
WordEntry *the_entry = vocab_get_or_create_word(&ctx, "the");
|
||
|
|
WordEntry *cat_entry = vocab_get_or_create_word(&ctx, "cat");
|
||
|
|
WordEntry *sat_entry = vocab_get_or_create_word(&ctx, "sat");
|
||
|
|
|
||
|
|
assert(the_entry->rank == 1);
|
||
|
|
assert(cat_entry->rank == 2);
|
||
|
|
assert(sat_entry->rank == 3);
|
||
|
|
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_assign_ranks_tied(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
/* "a" x2, "b" x2, "c" x1 */
|
||
|
|
ctx_from_string(&ctx, "a a b b c");
|
||
|
|
vocab_assign_ranks(&ctx);
|
||
|
|
|
||
|
|
WordEntry *a_entry = vocab_get_or_create_word(&ctx, "a");
|
||
|
|
WordEntry *b_entry = vocab_get_or_create_word(&ctx, "b");
|
||
|
|
WordEntry *c_entry = vocab_get_or_create_word(&ctx, "c");
|
||
|
|
|
||
|
|
/* a and b both rank 1; c gets rank 3 (competition ranking) */
|
||
|
|
assert(a_entry->rank == 1);
|
||
|
|
assert(b_entry->rank == 1);
|
||
|
|
assert(c_entry->rank == 3);
|
||
|
|
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
/* vocab_analyze_excerpt */
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
|
||
|
|
static void test_analyze_excerpt_single_word(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
ctx_from_string(&ctx, "apple banana cherry");
|
||
|
|
vocab_assign_ranks(&ctx);
|
||
|
|
|
||
|
|
int max_rank = vocab_analyze_excerpt(&ctx, 0, 1);
|
||
|
|
assert(max_rank == 1); /* All-unique: first word gets rank 1 */
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_analyze_excerpt_repeated_word(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
/* "the" is most common (rank 1) */
|
||
|
|
ctx_from_string(&ctx, "the cat the dog the");
|
||
|
|
vocab_assign_ranks(&ctx);
|
||
|
|
|
||
|
|
/* Excerpt "the the": only uses rank-1 word */
|
||
|
|
int max_rank = vocab_analyze_excerpt(&ctx, 0, 1);
|
||
|
|
assert(max_rank == 1);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_analyze_excerpt_full_text(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
/* Make each word appear a unique number of times so ranks 1..4 are assigned */
|
||
|
|
ctx_from_string(&ctx, "a a a a b b b c c d");
|
||
|
|
vocab_assign_ranks(&ctx);
|
||
|
|
|
||
|
|
/* Full 10-word excerpt: needs rank 4 (word "d" appears once, rank 4) */
|
||
|
|
int max_rank = vocab_analyze_excerpt(&ctx, 0, 10);
|
||
|
|
assert(max_rank == 4);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
/* vocab_find_optimal_excerpts */
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
|
||
|
|
static void test_find_optimal_excerpts_length1(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
/* "the" most frequent (rank 1); best 1-word excerpt uses only rank-1 word */
|
||
|
|
ctx_from_string(&ctx, "the the the cat dog");
|
||
|
|
vocab_assign_ranks(&ctx);
|
||
|
|
|
||
|
|
ExcerptResult results[1];
|
||
|
|
vocab_find_optimal_excerpts(&ctx, 1, results);
|
||
|
|
|
||
|
|
assert(results[0].excerpt_length == 1);
|
||
|
|
assert(results[0].min_vocab_needed == 1); /* Best excerpt is "the" */
|
||
|
|
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_find_optimal_excerpts_monotone(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
ctx_from_string(&ctx, "the cat sat on the mat");
|
||
|
|
vocab_assign_ranks(&ctx);
|
||
|
|
|
||
|
|
int max_length = 4;
|
||
|
|
ExcerptResult results[4];
|
||
|
|
vocab_find_optimal_excerpts(&ctx, max_length, results);
|
||
|
|
|
||
|
|
/* Vocab needed should be >= previous (weakly monotone) */
|
||
|
|
for (int i = 1; i < max_length; i++)
|
||
|
|
{
|
||
|
|
assert(results[i].min_vocab_needed >= results[i - 1].min_vocab_needed);
|
||
|
|
}
|
||
|
|
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
/* vocab_find_longest_excerpt */
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
|
||
|
|
static void test_find_longest_excerpt_unlimited(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
ctx_from_string(&ctx, "the cat sat on the mat");
|
||
|
|
vocab_assign_ranks(&ctx);
|
||
|
|
|
||
|
|
int start = 0;
|
||
|
|
int length = 0;
|
||
|
|
/* All 5 unique words have ranks 1..5; max_vocab >= 5 means all qualify */
|
||
|
|
vocab_find_longest_excerpt(&ctx, 5, &start, &length);
|
||
|
|
assert(length == 6); /* Entire text */
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_find_longest_excerpt_restrictive(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
/* "rare" has rank 5; with max_vocab=1 it can't appear */
|
||
|
|
ctx_from_string(&ctx, "the the the rare the the");
|
||
|
|
vocab_assign_ranks(&ctx);
|
||
|
|
/* "the" rank 1, "rare" rank 2 */
|
||
|
|
|
||
|
|
int start = 0;
|
||
|
|
int length = 0;
|
||
|
|
vocab_find_longest_excerpt(&ctx, 1, &start, &length);
|
||
|
|
/* Best run is "the the the" (3 words) before "rare" */
|
||
|
|
assert(length == 3);
|
||
|
|
assert(start == 0);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_find_longest_excerpt_no_valid(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
ctx_from_string(&ctx, "rare word here");
|
||
|
|
vocab_assign_ranks(&ctx);
|
||
|
|
/* All words rank >= 1; with max_vocab=0 nothing can qualify */
|
||
|
|
|
||
|
|
int start = 0;
|
||
|
|
int length = 0;
|
||
|
|
vocab_find_longest_excerpt(&ctx, 0, &start, &length);
|
||
|
|
assert(length == 0);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void test_find_longest_excerpt_mid_sequence(void)
|
||
|
|
{
|
||
|
|
VocabContext ctx;
|
||
|
|
/* "rare" appears twice (rank 1 due to count=2),
|
||
|
|
* "odd" appears once (rank 2)
|
||
|
|
* sequence: odd rare rare rare odd
|
||
|
|
* With max_vocab=1 (only "rare"):
|
||
|
|
* window spans positions 1,2,3 -> length 3 */
|
||
|
|
ctx_from_string(&ctx, "odd rare rare rare odd");
|
||
|
|
vocab_assign_ranks(&ctx);
|
||
|
|
/* "rare" has count 3 -> rank 1; "odd" has count 2 -> rank 2 */
|
||
|
|
|
||
|
|
int start = 0;
|
||
|
|
int length = 0;
|
||
|
|
vocab_find_longest_excerpt(&ctx, 1, &start, &length);
|
||
|
|
assert(length == 3);
|
||
|
|
assert(start == 1);
|
||
|
|
vocab_cleanup(&ctx);
|
||
|
|
}
|
||
|
|
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
/* Main */
|
||
|
|
/* ----------------------------------------------------------------------- */
|
||
|
|
|
||
|
|
int main(void)
|
||
|
|
{
|
||
|
|
/* vocab_hash_word */
|
||
|
|
test_hash_word_deterministic();
|
||
|
|
test_hash_word_different();
|
||
|
|
test_hash_word_empty_string();
|
||
|
|
test_hash_word_in_range();
|
||
|
|
|
||
|
|
/* vocab_is_word_char */
|
||
|
|
test_is_word_char_alpha();
|
||
|
|
test_is_word_char_digit();
|
||
|
|
test_is_word_char_underscore();
|
||
|
|
test_is_word_char_punctuation();
|
||
|
|
test_is_word_char_high_byte();
|
||
|
|
|
||
|
|
/* vocab_init / vocab_cleanup */
|
||
|
|
test_init_zeroes_context();
|
||
|
|
test_cleanup_resets_counts();
|
||
|
|
|
||
|
|
/* vocab_get_or_create_word */
|
||
|
|
test_get_or_create_new_word();
|
||
|
|
test_get_or_create_existing_word();
|
||
|
|
test_get_or_create_multiple_words();
|
||
|
|
test_get_or_create_returns_null_on_overflow();
|
||
|
|
test_get_or_create_malloc_failure();
|
||
|
|
|
||
|
|
/* vocab_process_stream */
|
||
|
|
test_process_stream_basic();
|
||
|
|
test_process_stream_empty_input();
|
||
|
|
test_process_stream_single_word();
|
||
|
|
test_process_stream_lowercases();
|
||
|
|
test_process_stream_last_word_no_trailing_space();
|
||
|
|
test_process_stream_count_frequency();
|
||
|
|
test_hash_chain_traversal();
|
||
|
|
test_process_stream_too_many_words();
|
||
|
|
test_process_stream_overflow_mid_stream();
|
||
|
|
test_process_stream_get_or_create_fails_mid();
|
||
|
|
test_process_stream_get_or_create_fails_last_word();
|
||
|
|
|
||
|
|
/* vocab_compare_by_count */
|
||
|
|
test_compare_by_count();
|
||
|
|
test_compare_by_count_equal();
|
||
|
|
|
||
|
|
/* vocab_assign_ranks */
|
||
|
|
test_assign_ranks_basic();
|
||
|
|
test_assign_ranks_tied();
|
||
|
|
|
||
|
|
/* vocab_analyze_excerpt */
|
||
|
|
test_analyze_excerpt_single_word();
|
||
|
|
test_analyze_excerpt_repeated_word();
|
||
|
|
test_analyze_excerpt_full_text();
|
||
|
|
|
||
|
|
/* vocab_find_optimal_excerpts */
|
||
|
|
test_find_optimal_excerpts_length1();
|
||
|
|
test_find_optimal_excerpts_monotone();
|
||
|
|
|
||
|
|
/* vocab_find_longest_excerpt */
|
||
|
|
test_find_longest_excerpt_unlimited();
|
||
|
|
test_find_longest_excerpt_restrictive();
|
||
|
|
test_find_longest_excerpt_no_valid();
|
||
|
|
test_find_longest_excerpt_mid_sequence();
|
||
|
|
|
||
|
|
printf("All tests passed (%d tests).\n", 40);
|
||
|
|
return 0;
|
||
|
|
}
|