/*
 * vocabulary.c - Core vocabulary analysis logic.
 */

#include "vocabulary.h"

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Test hook: test code can set this to make the next N malloc calls fail */
int vocab_test_fail_malloc_count = 0;

/*
 * Allocation wrapper used for every WordEntry.
 *
 * While vocab_test_fail_malloc_count is positive, each call consumes one
 * unit of it and returns NULL instead of allocating; otherwise this is a
 * plain malloc().
 */
static void *vocab_malloc(size_t size)
{
    if (vocab_test_fail_malloc_count > 0) {
        vocab_test_fail_malloc_count--;
        return NULL;
    }
    return malloc(size);
}

/* ----------------------------------------------------------------------- */
/* Initialise / cleanup                                                    */
/* ----------------------------------------------------------------------- */

/*
 * Put a context into the empty state: every hash bucket is cleared and both
 * word counters are reset to zero. Nothing is freed here, so a context that
 * already owns entries must go through vocab_cleanup() first.
 */
void vocab_init(VocabContext *ctx)
{
    memset(ctx->hash_table, 0, sizeof(ctx->hash_table));
    ctx->num_unique_words = 0;
    ctx->num_words = 0;
}

/*
 * Free every entry owned by the context and return it to the empty state.
 *
 * The hash buckets and the all_entries slots are cleared as well: they
 * would otherwise keep pointing at freed entries, and a lookup on the
 * same context would walk freed memory.
 */
void vocab_cleanup(VocabContext *ctx)
{
    for (int i = 0; i < ctx->num_unique_words; i++) {
        free(ctx->all_entries[i]);
        ctx->all_entries[i] = NULL;
    }
    memset(ctx->hash_table, 0, sizeof(ctx->hash_table));
    ctx->num_unique_words = 0;
    ctx->num_words = 0;
}

/* ----------------------------------------------------------------------- */
/* Hash table helpers                                                      */
/* ----------------------------------------------------------------------- */

/*
 * djb2 string hash (seed 5381, multiplier 33) reduced modulo HASH_SIZE.
 * Returns the bucket index for `word`.
 */
unsigned int vocab_hash_word(const char *word)
{
    unsigned int hash = 5381;
    int c;

    while ((c = *word++)) {
        hash = hash * 33u + (unsigned int)c;
    }
    return hash % HASH_SIZE;
}

/*
 * Look up `word` in the context, creating an entry for it if necessary.
 *
 * Keys are limited to MAX_WORD_LEN - 1 characters. The word is truncated
 * to that length *before* hashing and comparing, so an over-long word maps
 * onto the same entry every time instead of creating a new duplicate on
 * each call.
 *
 * Returns the existing entry, or a new entry with count and rank set to 0.
 * Returns NULL, after printing a message to stderr, when MAX_UNIQUE_WORDS
 * has been reached or the allocation fails.
 */
WordEntry *vocab_get_or_create_word(VocabContext *ctx, const char *word)
{
    char key[MAX_WORD_LEN];
    size_t key_len = 0;

    while (key_len < (size_t)(MAX_WORD_LEN - 1) && word[key_len] != '\0') {
        key_len++;
    }
    memcpy(key, word, key_len);
    key[key_len] = '\0';

    unsigned int bucket = vocab_hash_word(key);

    for (WordEntry *node = ctx->hash_table[bucket]; node; node = node->next) {
        if (strcmp(node->word, key) == 0) {
            return node;
        }
    }

    /* Create new entry */
    if (ctx->num_unique_words >= MAX_UNIQUE_WORDS) {
        fprintf(stderr, "Too many unique words\n");
        return NULL;
    }

    WordEntry *entry = vocab_malloc(sizeof *entry);
    if (!entry) {
        fprintf(stderr, "Memory allocation failed\n");
        return NULL;
    }

    memcpy(entry->word, key, key_len + 1);
    entry->count = 0;
    entry->rank = 0;

    /* Push onto the front of the bucket chain and record ownership. */
    entry->next = ctx->hash_table[bucket];
    ctx->hash_table[bucket] = entry;
    ctx->all_entries[ctx->num_unique_words++] = entry;
    return entry;
}

/* ----------------------------------------------------------------------- */
/* Character classification                                                */
/* ----------------------------------------------------------------------- */

/*
 * Return true when `c` can be part of a word: an alphanumeric character,
 * an underscore, or any byte with the high bit set.
 *
 * The value is reduced to unsigned char first and high bytes are accepted
 * before isalnum() is consulted, so a negative argument (for example a
 * plain signed char holding a high byte) never reaches <ctype.h>, where it
 * would be undefined behaviour.
 */
bool vocab_is_word_char(int c)
{
    unsigned char byte = (unsigned char)c;

    if (byte >= 128) {
        return true;
    }
    return isalnum(byte) || byte == '_';
}

/* ----------------------------------------------------------------------- */
/* Sorting / ranking                                                       */
/* ----------------------------------------------------------------------- */

/*
 * qsort() comparator over WordEntry pointers: higher counts sort first.
 * Written as a pair of comparisons rather than a subtraction so the result
 * cannot overflow whatever the counts are.
 */
int vocab_compare_by_count(const void *a, const void *b)
{
    const WordEntry *wa = *(const WordEntry *const *)a;
    const WordEntry *wb = *(const WordEntry *const *)b;

    return (wb->count > wa->count) - (wb->count < wa->count); /* Descending */
}

/*
 * Sort all_entries by descending count and assign ranks.
 *
 * The most frequent word gets rank 1. Entries with equal counts share a
 * rank, and the next distinct count takes its 1-based position in the
 * sorted array (1, 1, 3, ...).
 */
void vocab_assign_ranks(VocabContext *ctx)
{
    qsort(ctx->all_entries, (size_t)ctx->num_unique_words,
          sizeof(WordEntry *), vocab_compare_by_count);

    for (int i = 0; i < ctx->num_unique_words; i++) {
        WordEntry *cur = ctx->all_entries[i];

        if (i > 0 && cur->count == ctx->all_entries[i - 1]->count) {
            cur->rank = ctx->all_entries[i - 1]->rank;
        } else {
            cur->rank = i + 1;
        }
    }
}

/* ----------------------------------------------------------------------- */
/* Sliding-window analysis                                                 */
/* ----------------------------------------------------------------------- */

/*
 * Return the largest rank among the words in
 * word_sequence[start .. start + length - 1], or 0 for an empty window.
 *
 * The window is clamped to the recorded sequence, so an out-of-range
 * request reads only valid elements. No scratch state is kept between
 * calls; the answer is simply the running maximum.
 */
int vocab_analyze_excerpt(const VocabContext *ctx, int start, int length)
{
    long long first = start < 0 ? 0 : start;
    /* Wide arithmetic so start + length cannot overflow int. */
    long long limit = (long long)start + (long long)length;
    int max_rank = 0;

    if (limit > ctx->num_words) {
        limit = ctx->num_words;
    }

    for (long long i = first; i < limit; i++) {
        int rank = ctx->word_sequence[i]->rank;

        if (rank > max_rank) {
            max_rank = rank;
        }
    }
    return max_rank;
}

/* ----------------------------------------------------------------------- */
/* File I/O                                                                */
/* ----------------------------------------------------------------------- */

/*
 * Count one occurrence of `word` and append it to the word sequence.
 *
 * Capacity is checked before anything is modified, so a failure leaves the
 * context exactly as it was. Returns false (the reason has already been
 * printed to stderr) when the sequence is full or the entry cannot be
 * obtained.
 */
static bool vocab_record_word(VocabContext *ctx, const char *word)
{
    if (ctx->num_words >= MAX_WORDS) {
        fprintf(stderr, "Too many words in file\n");
        return false;
    }

    WordEntry *entry = vocab_get_or_create_word(ctx, word);
    if (!entry) {
        return false;
    }

    entry->count++;
    ctx->word_sequence[ctx->num_words++] = entry;
    return true;
}

/*
 * Read `fp` to end of file, splitting it into lower-cased words made of
 * vocab_is_word_char() characters, and record every word in `ctx`.
 *
 * Characters beyond MAX_WORD_LEN - 1 in a single word are discarded.
 * Returns true on success. Returns false when a word cannot be recorded
 * (limits reached or allocation failure) or when the stream reports a read
 * error; a word that does not fit is always an error, including the final
 * word of input that has no trailing separator.
 */
bool vocab_process_stream(VocabContext *ctx, FILE *fp)
{
    char word[MAX_WORD_LEN];
    int word_len = 0;
    int c;

    while ((c = fgetc(fp)) != EOF) {
        if (vocab_is_word_char(c)) {
            if (word_len < MAX_WORD_LEN - 1) {
                word[word_len++] = (char)tolower(c);
            }
        } else if (word_len > 0) {
            word[word_len] = '\0';
            if (!vocab_record_word(ctx, word)) {
                return false;
            }
            word_len = 0;
        }
    }

    /* fgetc() returns EOF for read errors too; do not report those as success. */
    if (ferror(fp)) {
        fprintf(stderr, "Error reading input\n");
        return false;
    }

    /* Handle last word if file doesn't end with whitespace */
    if (word_len > 0) {
        word[word_len] = '\0';
        if (!vocab_record_word(ctx, word)) {
            return false;
        }
    }
    return true;
}

/* ----------------------------------------------------------------------- */
/* Optimal-excerpt search                                                  */
/* ----------------------------------------------------------------------- */

/*
 * For every excerpt length from 1 up to min(max_length, num_words), find
 * the window whose vocab_analyze_excerpt() value is smallest and store it
 * in results[length - 1]. On ties the earliest start position wins.
 *
 * `results` must have room for that many elements; elements beyond them
 * are not written.
 */
void vocab_find_optimal_excerpts(const VocabContext *ctx, int max_length,
                                 ExcerptResult *results)
{
    for (int length = 1;
         length <= max_length && length <= ctx->num_words;
         length++) {
        const int last_start = ctx->num_words - length;
        int best_vocab = ctx->num_unique_words + 1; /* above any real rank */
        int best_start = 0;

        for (int start = 0; start <= last_start; start++) {
            int vocab_needed = vocab_analyze_excerpt(ctx, start, length);

            if (vocab_needed < best_vocab) {
                best_vocab = vocab_needed;
                best_start = start;
            }
        }

        ExcerptResult *slot = &results[length - 1];
        slot->excerpt_length = length;
        slot->min_vocab_needed = best_vocab;
        slot->start_pos = best_start;
    }
}

/* ----------------------------------------------------------------------- */
/* Inverse mode                                                            */
/* ----------------------------------------------------------------------- */

/*
 * Find the longest run of consecutive words whose rank does not exceed
 * `max_vocab`. The earliest such run wins on ties.
 *
 * Writes the run's start index and length through `out_start` and
 * `out_length`; both are 0 when no word qualifies.
 */
void vocab_find_longest_excerpt(const VocabContext *ctx, int max_vocab,
                                int *out_start, int *out_length)
{
    int best_start = 0;
    int best_length = 0;
    int run_start = 0;

    for (int pos = 0; pos < ctx->num_words; pos++) {
        if (ctx->word_sequence[pos]->rank > max_vocab) {
            /* This word is too rare: the next run can start after it. */
            run_start = pos + 1;
            continue;
        }

        int run_length = pos - run_start + 1;
        if (run_length > best_length) {
            best_length = run_length;
            best_start = run_start;
        }
    }

    *out_start = best_start;
    *out_length = best_length;
}