testsAndMisc/C/vocabulary_curve/test_vocabulary.c

/*
 * test_vocabulary.c - Unit tests for vocabulary.c
 *
 * Tests cover all public functions declared in vocabulary.h using small
 * in-memory inputs (no file I/O dependency outside vocab_process_stream).
 */

#include "vocabulary.h"

#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Helper: initialise *ctx and feed it the words of a NUL-terminated string.
 *
 * The text is handed to vocab_process_stream() through a read-only in-memory
 * stream opened with fmemopen().
 *
 * Returns the result of vocab_process_stream(), or false when no stream could
 * be opened.  In both cases *ctx has already been passed to vocab_init(). */
static bool ctx_from_string(VocabContext *ctx, const char *text)
{
    vocab_init(ctx);

    size_t len = strlen(text);
    FILE  *fp  = fmemopen((void *)text, len, "r");

    /* POSIX allows fmemopen() to reject a zero-length buffer.  An empty
     * temporary file is an equivalent stream that is immediately at EOF, so
     * fall back to it for the empty-string case only. */
    if (!fp && len == 0)
        fp = tmpfile();
    if (!fp)
        return false;

    bool ok = vocab_process_stream(ctx, fp);
    fclose(fp);
    return ok;
}

/* ----------------------------------------------------------------------- */
/* vocab_hash_word                                                           */
/* ----------------------------------------------------------------------- */

/* Hashing the same word twice must produce the same value. */
static void test_hash_word_deterministic(void)
{
    unsigned int first  = vocab_hash_word("hello");
    unsigned int second = vocab_hash_word("hello");
    assert(second == first);
}

/* Smoke test: hashing two unrelated words must not crash.
 * The two values are deliberately not compared, because distinct words are
 * not guaranteed to hash to different values. */
static void test_hash_word_different(void)
{
    (void)vocab_hash_word("apple");
    (void)vocab_hash_word("orange");
}

/* The empty string must still hash to a value below HASH_SIZE. */
static void test_hash_word_empty_string(void)
{
    unsigned int slot = vocab_hash_word("");
    assert(HASH_SIZE > slot);
}

/* An ordinary word must hash to a value below HASH_SIZE. */
static void test_hash_word_in_range(void)
{
    unsigned int slot = vocab_hash_word("test");
    assert(HASH_SIZE > slot);
}

/* ----------------------------------------------------------------------- */
/* vocab_is_word_char                                                        */
/* ----------------------------------------------------------------------- */

/* Lower- and upper-case ASCII letters are word characters. */
static void test_is_word_char_alpha(void)
{
    static const int letters[] = {'a', 'Z'};

    for (size_t i = 0; i < sizeof(letters) / sizeof(letters[0]); i++)
    {
        assert(vocab_is_word_char(letters[i]));
    }
}

/* The first and last ASCII digits are word characters. */
static void test_is_word_char_digit(void)
{
    static const int digits[] = {'0', '9'};

    for (size_t i = 0; i < sizeof(digits) / sizeof(digits[0]); i++)
    {
        assert(vocab_is_word_char(digits[i]));
    }
}

/* The underscore counts as a word character. */
static void test_is_word_char_underscore(void)
{
    const int underscore = '_';
    assert(vocab_is_word_char(underscore));
}

/* Whitespace and punctuation must be rejected as word characters. */
static void test_is_word_char_punctuation(void)
{
    static const int separators[] = {' ', '.', ',', '\n'};

    for (size_t i = 0; i < sizeof(separators) / sizeof(separators[0]); i++)
    {
        assert(!vocab_is_word_char(separators[i]));
    }
}

/* A byte value >= 128 must be accepted as a word character.
 * 200 is 0xC8, which in UTF-8 is the lead byte of a multi-byte sequence. */
static void test_is_word_char_high_byte(void)
{
    const int high_byte = 200;
    assert(vocab_is_word_char(high_byte));
}

/* ----------------------------------------------------------------------- */
/* vocab_init / vocab_cleanup                                                */
/* ----------------------------------------------------------------------- */

/* A freshly initialised context reports zero words and zero unique words. */
static void test_init_zeroes_context(void)
{
    VocabContext fresh;
    vocab_init(&fresh);

    assert(0 == fresh.num_unique_words);
    assert(0 == fresh.num_words);
}

/* vocab_cleanup() must bring both counters of a populated context back to 0. */
static void test_cleanup_resets_counts(void)
{
    VocabContext ctx;
    bool         ok = ctx_from_string(&ctx, "hello world hello");
    assert(ok);

    /* Without something to reset, the checks below would pass vacuously. */
    assert(ctx.num_words != 0);
    assert(ctx.num_unique_words != 0);

    vocab_cleanup(&ctx);
    assert(ctx.num_unique_words == 0);
    assert(ctx.num_words == 0);
}

/* ----------------------------------------------------------------------- */
/* vocab_get_or_create_word                                                  */
/* ----------------------------------------------------------------------- */

/* Requesting an unseen word creates an entry that stores the word and bumps
 * the unique-word counter to 1. */
static void test_get_or_create_new_word(void)
{
    VocabContext ctx;
    vocab_init(&ctx);

    WordEntry *entry = vocab_get_or_create_word(&ctx, "hello");
    assert(NULL != entry);
    assert(0 == strcmp(entry->word, "hello"));
    assert(1 == ctx.num_unique_words);

    vocab_cleanup(&ctx);
}

/* Requesting the same word twice returns the same entry and does not add a
 * second unique word. */
static void test_get_or_create_existing_word(void)
{
    VocabContext ctx;
    vocab_init(&ctx);

    WordEntry *first  = vocab_get_or_create_word(&ctx, "hello");
    WordEntry *second = vocab_get_or_create_word(&ctx, "hello");

    assert(second == first); /* identical pointer, not merely equal content */
    assert(1 == ctx.num_unique_words);

    vocab_cleanup(&ctx);
}

/* Three distinct words yield a unique-word count of three. */
static void test_get_or_create_multiple_words(void)
{
    VocabContext ctx;
    vocab_init(&ctx);

    (void)vocab_get_or_create_word(&ctx, "apple");
    (void)vocab_get_or_create_word(&ctx, "banana");
    (void)vocab_get_or_create_word(&ctx, "cherry");

    assert(3 == ctx.num_unique_words);
    vocab_cleanup(&ctx);
}

/* ----------------------------------------------------------------------- */
/* vocab_process_stream                                                      */
/* ----------------------------------------------------------------------- */

/* Six words, one of them ("the") repeated: expect 6 words and 5 unique words. */
static void test_process_stream_basic(void)
{
    VocabContext ctx;
    bool         parsed = ctx_from_string(&ctx, "the cat sat on the mat");

    assert(parsed);
    assert(6 == ctx.num_words);
    assert(5 == ctx.num_unique_words);

    vocab_cleanup(&ctx);
}

/* Empty input is processed successfully and records no words at all. */
static void test_process_stream_empty_input(void)
{
    VocabContext ctx;
    bool         parsed = ctx_from_string(&ctx, "");

    assert(parsed);
    assert(0 == ctx.num_words);
    assert(0 == ctx.num_unique_words);

    vocab_cleanup(&ctx);
}

/* A lone word counts as one word and one unique word. */
static void test_process_stream_single_word(void)
{
    VocabContext ctx;
    bool         parsed = ctx_from_string(&ctx, "hello");

    assert(parsed);
    assert(1 == ctx.num_words);
    assert(1 == ctx.num_unique_words);

    vocab_cleanup(&ctx);
}

static void test_process_stream_lowercases(void)
{
    VocabContext ctx;
    ctx_from_string(&ctx, "Hello HELLO hello");
    /* All three should map to the same "hello" entry */
    assert(ctx.num_unique_words == 1);
    assert(ctx.word_sequence[0]->count == 3);
    vocab_cleanup(&ctx);
}

/* The final word must be counted even though the input ends right after it,
 * with no trailing delimiter. */
static void test_process_stream_last_word_no_trailing_space(void)
{
    VocabContext ctx;
    bool         ok = ctx_from_string(&ctx, "one two three");
    assert(ok);
    assert(ctx.num_words == 3);
    vocab_cleanup(&ctx);
}

/* Each entry's count must equal the number of times its word occurred. */
static void test_process_stream_count_frequency(void)
{
    VocabContext ctx;
    bool         ok = ctx_from_string(&ctx, "a a a b b c");
    assert(ok);

    /* vocab_get_or_create_word() doubles as the lookup for existing entries;
     * check each result before dereferencing it. */
    WordEntry *entry_a = vocab_get_or_create_word(&ctx, "a");
    assert(entry_a != NULL);
    assert(entry_a->count == 3);

    WordEntry *entry_b = vocab_get_or_create_word(&ctx, "b");
    assert(entry_b != NULL);
    assert(entry_b->count == 2);

    WordEntry *entry_c = vocab_get_or_create_word(&ctx, "c");
    assert(entry_c != NULL);
    assert(entry_c->count == 1);

    vocab_cleanup(&ctx);
}

/* Inserts and re-reads two words intended to land in the same hash bucket.
 * NOTE(review): "word129" and "word2200" are reported to share slot 173186
 * when HASH_SIZE is 200003; this test does not verify the collision itself. */
static void test_hash_chain_traversal(void)
{
    VocabContext ctx;
    vocab_init(&ctx);

    /* First word of the pair. */
    WordEntry *head = vocab_get_or_create_word(&ctx, "word129");
    assert(NULL != head);
    assert(1 == ctx.num_unique_words);

    /* Second word of the pair must get its own, distinct entry. */
    WordEntry *tail = vocab_get_or_create_word(&ctx, "word2200");
    assert(NULL != tail);
    assert(head != tail);
    assert(2 == ctx.num_unique_words);

    /* Looking both up again must return the entries created above. */
    WordEntry *head_again = vocab_get_or_create_word(&ctx, "word129");
    assert(head == head_again);
    WordEntry *tail_again = vocab_get_or_create_word(&ctx, "word2200");
    assert(tail == tail_again);

    vocab_cleanup(&ctx);
}

/* Smoke test: run vocab_process_stream() on a context whose word counter is
 * already at MAX_WORDS, feeding one word that has no trailing delimiter.
 *
 * The return value is deliberately not asserted; the test only checks that the
 * call completes.  The input with a trailing delimiter is asserted separately
 * in test_process_stream_overflow_mid_stream(). */
static void test_process_stream_too_many_words(void)
{
    VocabContext ctx;
    vocab_init(&ctx);

    /* Register one known word before the counter is forced to its limit. */
    WordEntry *known = vocab_get_or_create_word(&ctx, "one");
    assert(known != NULL);

    /* Mark the word counter as full. */
    ctx.num_words = MAX_WORDS;

    char  input[] = "two"; /* the word ends at EOF, not at whitespace */
    FILE *fp      = fmemopen(input, sizeof(input) - 1, "r");
    assert(fp != NULL);
    bool ok = vocab_process_stream(&ctx, fp);
    fclose(fp);
    (void)ok; /* outcome intentionally unchecked */

    vocab_cleanup(&ctx);
}

/* With the word counter already at MAX_WORDS, a word followed by a delimiter
 * must make vocab_process_stream() report failure. */
static void test_process_stream_overflow_mid_stream(void)
{
    VocabContext ctx;
    vocab_init(&ctx);
    ctx.num_words = MAX_WORDS; /* mark the word counter as full */

    /* The trailing space ends the word with a delimiter rather than with EOF. */
    char  input[] = "alpha ";
    FILE *stream  = fmemopen(input, sizeof(input) - 1, "r");
    assert(stream != NULL);

    bool ok = vocab_process_stream(&ctx, stream);
    fclose(stream);
    assert(!ok);

    vocab_cleanup(&ctx);
}

/* With the unique-word counter forced to MAX_UNIQUE_WORDS, asking for an
 * unseen word must yield NULL.
 * NOTE(review): vocab_cleanup() is not called here; confirm it is safe with an
 * artificially forced counter before adding it. */
static void test_get_or_create_returns_null_on_overflow(void)
{
    VocabContext ctx;
    vocab_init(&ctx);

    /* Simulate a full vocabulary without inserting any words. */
    ctx.num_unique_words = MAX_UNIQUE_WORDS;

    WordEntry *entry = vocab_get_or_create_word(&ctx, "overflow");
    assert(NULL == entry);
}

/* Arms the vocab_test_fail_malloc_count hook with 1 and expects the next
 * vocab_get_or_create_word() call to return NULL and leave the hook at 0. */
static void test_get_or_create_malloc_failure(void)
{
    VocabContext ctx;
    vocab_init(&ctx);

    vocab_test_fail_malloc_count = 1;

    WordEntry *entry = vocab_get_or_create_word(&ctx, "testword");
    assert(NULL == entry);
    assert(0 == vocab_test_fail_malloc_count);

    vocab_cleanup(&ctx);
}

/* With the allocation-failure hook armed, processing input whose first word is
 * followed by a delimiter must make vocab_process_stream() return false. */
static void test_process_stream_get_or_create_fails_mid(void)
{
    VocabContext ctx;
    vocab_init(&ctx);

    vocab_test_fail_malloc_count = 1;

    char  input[] = "newword here";
    FILE *stream  = fmemopen(input, sizeof(input) - 1, "r");
    assert(stream != NULL);

    bool ok = vocab_process_stream(&ctx, stream);
    fclose(stream);
    assert(!ok);

    vocab_cleanup(&ctx);
}

/* With the allocation-failure hook armed, processing a single word that ends
 * at EOF (no trailing delimiter) must make vocab_process_stream() return false. */
static void test_process_stream_get_or_create_fails_last_word(void)
{
    VocabContext ctx;
    vocab_init(&ctx);

    vocab_test_fail_malloc_count = 1;

    char  input[] = "justoneword"; /* no trailing whitespace */
    FILE *stream  = fmemopen(input, sizeof(input) - 1, "r");
    assert(stream != NULL);

    bool ok = vocab_process_stream(&ctx, stream);
    fclose(stream);
    assert(!ok);

    vocab_cleanup(&ctx);
}

/* ----------------------------------------------------------------------- */
/* vocab_compare_by_count                                                    */
/* ----------------------------------------------------------------------- */

/* The comparator orders by descending count: the entry with the higher count
 * compares as "less" (negative result), and the reverse order is positive. */
static void test_compare_by_count(void)
{
    WordEntry frequent = {.count = 5};
    WordEntry rare     = {.count = 3};

    /* The comparator receives pointers to WordEntry pointers. */
    const WordEntry *frequent_ptr = &frequent;
    const WordEntry *rare_ptr     = &rare;

    int forward = vocab_compare_by_count(&frequent_ptr, &rare_ptr);
    assert(forward < 0);

    int backward = vocab_compare_by_count(&rare_ptr, &frequent_ptr);
    assert(backward > 0);
}

/* Entries with identical counts compare as equal (result 0). */
static void test_compare_by_count_equal(void)
{
    WordEntry lhs = {.count = 4};
    WordEntry rhs = {.count = 4};

    const WordEntry *lhs_ptr = &lhs;
    const WordEntry *rhs_ptr = &rhs;

    assert(0 == vocab_compare_by_count(&lhs_ptr, &rhs_ptr));
}

/* ----------------------------------------------------------------------- */
/* vocab_assign_ranks                                                        */
/* ----------------------------------------------------------------------- */

/* Words with strictly decreasing counts receive ranks 1, 2, 3 in that order. */
static void test_assign_ranks_basic(void)
{
    VocabContext ctx;
    /* "the" x3, "cat" x2, "sat" x1 */
    bool         ok = ctx_from_string(&ctx, "the the the cat cat sat");
    assert(ok);
    vocab_assign_ranks(&ctx);

    WordEntry *the_entry = vocab_get_or_create_word(&ctx, "the");
    WordEntry *cat_entry = vocab_get_or_create_word(&ctx, "cat");
    WordEntry *sat_entry = vocab_get_or_create_word(&ctx, "sat");

    /* Check the lookups before dereferencing their results. */
    assert(the_entry != NULL);
    assert(cat_entry != NULL);
    assert(sat_entry != NULL);

    assert(the_entry->rank == 1);
    assert(cat_entry->rank == 2);
    assert(sat_entry->rank == 3);

    vocab_cleanup(&ctx);
}

/* Words with equal counts share a rank, and the next rank skips the tied
 * positions (competition ranking: 1, 1, 3). */
static void test_assign_ranks_tied(void)
{
    VocabContext ctx;
    /* "a" x2, "b" x2, "c" x1 */
    bool         ok = ctx_from_string(&ctx, "a a b b c");
    assert(ok);
    vocab_assign_ranks(&ctx);

    WordEntry *a_entry = vocab_get_or_create_word(&ctx, "a");
    WordEntry *b_entry = vocab_get_or_create_word(&ctx, "b");
    WordEntry *c_entry = vocab_get_or_create_word(&ctx, "c");

    /* Check the lookups before dereferencing their results. */
    assert(a_entry != NULL);
    assert(b_entry != NULL);
    assert(c_entry != NULL);

    /* a and b both rank 1; c gets rank 3 */
    assert(a_entry->rank == 1);
    assert(b_entry->rank == 1);
    assert(c_entry->rank == 3);

    vocab_cleanup(&ctx);
}

/* ----------------------------------------------------------------------- */
/* vocab_analyze_excerpt                                                     */
/* ----------------------------------------------------------------------- */

/* A one-word excerpt taken from a text in which every word occurs exactly once
 * is expected to need rank 1. */
static void test_analyze_excerpt_single_word(void)
{
    VocabContext ctx;
    bool         ok = ctx_from_string(&ctx, "apple banana cherry");
    assert(ok);
    vocab_assign_ranks(&ctx);

    /* Arguments (0, 1): the excerpt consisting of the first word only. */
    int max_rank = vocab_analyze_excerpt(&ctx, 0, 1);
    assert(max_rank == 1);
    vocab_cleanup(&ctx);
}

/* An excerpt made only of the most frequent word needs rank 1. */
static void test_analyze_excerpt_repeated_word(void)
{
    VocabContext ctx;
    /* "the" x3 is the most common word (rank 1); "cat" and "dog" occur once. */
    bool         ok = ctx_from_string(&ctx, "the cat the dog the");
    assert(ok);
    vocab_assign_ranks(&ctx);

    /* Arguments (0, 1): the excerpt is just the leading "the". */
    int max_rank = vocab_analyze_excerpt(&ctx, 0, 1);
    assert(max_rank == 1);
    vocab_cleanup(&ctx);
}

/* An excerpt spanning the whole text needs the rank of its rarest word. */
static void test_analyze_excerpt_full_text(void)
{
    VocabContext ctx;
    /* Counts are all different (a x4, b x3, c x2, d x1), so ranks 1..4 are
     * assigned without ties. */
    bool         ok = ctx_from_string(&ctx, "a a a a b b b c c d");
    assert(ok);
    vocab_assign_ranks(&ctx);

    /* Full 10-word excerpt: "d" occurs once and therefore has rank 4. */
    int max_rank = vocab_analyze_excerpt(&ctx, 0, 10);
    assert(max_rank == 4);
    vocab_cleanup(&ctx);
}

/* ----------------------------------------------------------------------- */
/* vocab_find_optimal_excerpts                                               */
/* ----------------------------------------------------------------------- */

/* The best one-word excerpt consists of the most frequent word, so it needs a
 * vocabulary of size 1. */
static void test_find_optimal_excerpts_length1(void)
{
    VocabContext ctx;
    /* "the" x3 is the most frequent word (rank 1). */
    bool         ok = ctx_from_string(&ctx, "the the the cat dog");
    assert(ok);
    vocab_assign_ranks(&ctx);

    ExcerptResult results[1];
    vocab_find_optimal_excerpts(&ctx, 1, results);

    assert(results[0].excerpt_length == 1);
    assert(results[0].min_vocab_needed == 1); /* best excerpt is "the" */

    vocab_cleanup(&ctx);
}

/* The vocabulary needed for the best excerpt must never shrink as the excerpt
 * length grows. */
static void test_find_optimal_excerpts_monotone(void)
{
    /* One constant sizes the result array and bounds the loop, so the two
     * cannot drift apart. */
    enum { MAX_LENGTH = 4 };

    VocabContext ctx;
    bool         ok = ctx_from_string(&ctx, "the cat sat on the mat");
    assert(ok);
    vocab_assign_ranks(&ctx);

    ExcerptResult results[MAX_LENGTH];
    vocab_find_optimal_excerpts(&ctx, MAX_LENGTH, results);

    /* Each entry must need at least as much vocabulary as the previous one. */
    for (int i = 1; i < MAX_LENGTH; i++)
    {
        assert(results[i].min_vocab_needed >= results[i - 1].min_vocab_needed);
    }

    vocab_cleanup(&ctx);
}

/* ----------------------------------------------------------------------- */
/* vocab_find_longest_excerpt                                                */
/* ----------------------------------------------------------------------- */

/* When the vocabulary limit admits every word, the longest excerpt is the
 * entire text. */
static void test_find_longest_excerpt_unlimited(void)
{
    VocabContext ctx;
    bool         ok = ctx_from_string(&ctx, "the cat sat on the mat");
    assert(ok);
    vocab_assign_ranks(&ctx);

    int start  = 0;
    int length = 0;
    /* The text has 5 unique words, so a limit of 5 is expected to admit all of
     * them, whatever ranks the tied words receive. */
    vocab_find_longest_excerpt(&ctx, 5, &start, &length);
    assert(length == 6); /* entire text */
    assert(start == 0);  /* a 6-word excerpt of a 6-word text starts at 0 */
    vocab_cleanup(&ctx);
}

/* With a vocabulary limit of 1, a lower-ranked word splits the text and the
 * longest admissible run wins. */
static void test_find_longest_excerpt_restrictive(void)
{
    VocabContext ctx;
    /* "the" x5 has rank 1, "rare" x1 has rank 2; with max_vocab=1 "rare" cannot
     * be part of an excerpt. */
    bool         ok = ctx_from_string(&ctx, "the the the rare the the");
    assert(ok);
    vocab_assign_ranks(&ctx);

    int start  = 0;
    int length = 0;
    vocab_find_longest_excerpt(&ctx, 1, &start, &length);
    /* The run before "rare" has 3 words, the run after it only 2. */
    assert(length == 3);
    assert(start == 0);
    vocab_cleanup(&ctx);
}

/* A vocabulary limit of 0 admits no word, so the reported length stays 0. */
static void test_find_longest_excerpt_no_valid(void)
{
    VocabContext ctx;
    bool         ok = ctx_from_string(&ctx, "rare word here");
    assert(ok);
    vocab_assign_ranks(&ctx);
    /* Every word has rank >= 1; with max_vocab=0 nothing can qualify. */

    int start  = 0;
    int length = 0;
    vocab_find_longest_excerpt(&ctx, 0, &start, &length);
    assert(length == 0);
    vocab_cleanup(&ctx);
}

/* The longest admissible excerpt may start in the middle of the text. */
static void test_find_longest_excerpt_mid_sequence(void)
{
    VocabContext ctx;
    /* sequence: odd rare rare rare odd
     *   "rare" occurs 3 times -> rank 1
     *   "odd"  occurs 2 times -> rank 2
     * With max_vocab=1 only "rare" is admissible, so the best window covers
     * positions 1, 2, 3 -> start 1, length 3. */
    bool         ok = ctx_from_string(&ctx, "odd rare rare rare odd");
    assert(ok);
    vocab_assign_ranks(&ctx);

    int start  = 0;
    int length = 0;
    vocab_find_longest_excerpt(&ctx, 1, &start, &length);
    assert(length == 3);
    assert(start == 1);
    vocab_cleanup(&ctx);
}

/* ----------------------------------------------------------------------- */
/* Main                                                                      */
/* ----------------------------------------------------------------------- */

int main(void)
{
    /* vocab_hash_word */
    test_hash_word_deterministic();
    test_hash_word_different();
    test_hash_word_empty_string();
    test_hash_word_in_range();

    /* vocab_is_word_char */
    test_is_word_char_alpha();
    test_is_word_char_digit();
    test_is_word_char_underscore();
    test_is_word_char_punctuation();
    test_is_word_char_high_byte();

    /* vocab_init / vocab_cleanup */
    test_init_zeroes_context();
    test_cleanup_resets_counts();

    /* vocab_get_or_create_word */
    test_get_or_create_new_word();
    test_get_or_create_existing_word();
    test_get_or_create_multiple_words();
    test_get_or_create_returns_null_on_overflow();
    test_get_or_create_malloc_failure();

    /* vocab_process_stream */
    test_process_stream_basic();
    test_process_stream_empty_input();
    test_process_stream_single_word();
    test_process_stream_lowercases();
    test_process_stream_last_word_no_trailing_space();
    test_process_stream_count_frequency();
    test_hash_chain_traversal();
    test_process_stream_too_many_words();
    test_process_stream_overflow_mid_stream();
    test_process_stream_get_or_create_fails_mid();
    test_process_stream_get_or_create_fails_last_word();

    /* vocab_compare_by_count */
    test_compare_by_count();
    test_compare_by_count_equal();

    /* vocab_assign_ranks */
    test_assign_ranks_basic();
    test_assign_ranks_tied();

    /* vocab_analyze_excerpt */
    test_analyze_excerpt_single_word();
    test_analyze_excerpt_repeated_word();
    test_analyze_excerpt_full_text();

    /* vocab_find_optimal_excerpts */
    test_find_optimal_excerpts_length1();
    test_find_optimal_excerpts_monotone();

    /* vocab_find_longest_excerpt */
    test_find_longest_excerpt_unlimited();
    test_find_longest_excerpt_restrictive();
    test_find_longest_excerpt_no_valid();
    test_find_longest_excerpt_mid_sequence();

    printf("All tests passed (%d tests).\n", 40);
    return 0;
}