From 510440e02d6e54d8546f41354a7ac502c4ecba46 Mon Sep 17 00:00:00 2001
From: Krzysztof Rudnicki <krzysztofrudnicki0@gmail.com>
Date: Sun, 28 Dec 2025 16:15:38 +0100
Subject: [PATCH] feat: vocabulary curve in C

---
 C/vocabulary_curve/Makefile                   |  13 +
 C/vocabulary_curve/main.c                     | 359 ++++++++++++++++++
 C/vocabulary_curve/vocabulary_curve           | Bin 0 -> 16768 bytes
 .../tests/test_vocabulary_curve.py            | 244 ++++++++++++
 python_pkg/word_frequency/vocabulary_curve.py | 319 ++++++++++++++++
 5 files changed, 935 insertions(+)
 create mode 100644 C/vocabulary_curve/Makefile
 create mode 100644 C/vocabulary_curve/main.c
 create mode 100755 C/vocabulary_curve/vocabulary_curve
 create mode 100644 python_pkg/word_frequency/tests/test_vocabulary_curve.py
 create mode 100644 python_pkg/word_frequency/vocabulary_curve.py

diff --git a/C/vocabulary_curve/Makefile b/C/vocabulary_curve/Makefile
new file mode 100644
index 0000000..6311950
--- /dev/null
+++ b/C/vocabulary_curve/Makefile
@@ -0,0 +1,13 @@
+CC = gcc
+CFLAGS = -O3 -Wall -Wextra -march=native
+TARGET = vocabulary_curve
+
+all: $(TARGET)  # Default goal: build the analyzer
+
+$(TARGET): main.c  # Single source file, compiled and linked in one step
+	$(CC) $(CFLAGS) -o $(TARGET) main.c
+
+clean:  # Delete the built binary
+	rm -f $(TARGET)
+
+.PHONY: all clean
diff --git a/C/vocabulary_curve/main.c b/C/vocabulary_curve/main.c
new file mode 100644
index 0000000..0a76857
--- /dev/null
+++ b/C/vocabulary_curve/main.c
@@ -0,0 +1,359 @@
+/*
+ * Vocabulary Learning Curve Analyzer
+ * 
+ * For each excerpt length (1, 2, 3, ... N words), finds the excerpt that
+ * requires the minimum number of top-frequency words to understand 100%.
+ * 
+ * Usage:
+ *   ./vocabulary_curve <file.txt> [max_length]
+ *   ./vocabulary_curve test.txt 50
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdbool.h>
+
+#define MAX_WORD_LEN 64  /* Size of each word buffer, including the terminating NUL */
+#define MAX_WORDS 500000  /* Capacity of word_sequence (total words stored) */
+#define MAX_UNIQUE_WORDS 100000  /* Capacity of all_entries (distinct words) */
+#define HASH_SIZE 200003  /* Prime number for better distribution */
+
+/* One distinct word: a hash-table chain node that also carries its statistics */
+typedef struct WordEntry {
+    char word[MAX_WORD_LEN];  /* NUL-terminated text (lowercased by process_file) */
+    int count;  /* Occurrences; incremented by process_file for each token */
+    int rank;  /* 1-indexed rank by frequency (1 = most common) */
+    struct WordEntry *next;  /* Next entry in the same hash bucket, or NULL */
+} WordEntry;
+
+/* Hash table for word lookup (each bucket is a singly linked chain) */
+static WordEntry *hash_table[HASH_SIZE];
+static WordEntry *all_entries[MAX_UNIQUE_WORDS];  /* Every entry; sorted by assign_ranks */
+static int num_unique_words = 0;  /* Number of used slots in all_entries */
+
+/* All words in order of appearance - store POINTERS not indices */
+static WordEntry *word_sequence[MAX_WORDS];
+static int num_words = 0;  /* Number of used slots in word_sequence */
+
+/* Result for each excerpt length */
+typedef struct {
+    int excerpt_length;  /* Window size in words */
+    int min_vocab_needed;  /* Lowest analyze_excerpt() value over all windows of this size */
+    int start_pos;  /* Start position in word_sequence */
+} ExcerptResult;
+
+/* djb2-style string hash (h = h * 33 + byte), reduced to a bucket index */
+static unsigned int hash_word(const char *word) {
+    unsigned int h = 5381;
+    for (const char *p = word; *p != '\0'; p++) {
+        /* Same arithmetic as (h << 5) + h + c, with c the (possibly signed) char */
+        h = h * 33u + (unsigned int)*p;
+    }
+    return h % HASH_SIZE;
+}
+
+/*
+ * Find the entry for `word`, inserting a zero-count one if it is new.
+ * Exits with status 1 when all_entries is full or malloc fails.
+ */
+static WordEntry *get_or_create_word(const char *word) {
+    const unsigned int bucket = hash_word(word);
+    /* Walk the bucket's chain looking for an existing entry */
+    for (WordEntry *node = hash_table[bucket]; node != NULL; node = node->next) {
+        if (strcmp(node->word, word) == 0) {
+            return node;
+        }
+    }
+
+    /* Not found: make sure there is still room in all_entries */
+    if (num_unique_words >= MAX_UNIQUE_WORDS) {
+        fprintf(stderr, "Too many unique words\n");
+        exit(1);
+    }
+
+    WordEntry *created = malloc(sizeof(WordEntry));
+    if (created == NULL) {
+        fprintf(stderr, "Memory allocation failed\n");
+        exit(1);
+    }
+
+    /* Bounded copy; the explicit terminator covers the truncation case */
+    strncpy(created->word, word, MAX_WORD_LEN - 1);
+    created->word[MAX_WORD_LEN - 1] = '\0';
+    created->count = 0;
+    created->rank = 0;
+    created->next = hash_table[bucket];  /* Push onto the front of the chain */
+    hash_table[bucket] = created;
+    all_entries[num_unique_words++] = created;
+    return created;
+}
+
+/* qsort comparator (descending count); three-way compare avoids subtraction overflow */
+static int compare_by_count(const void *a, const void *b) {
+    const WordEntry *wa = *(const WordEntry *const *)a;
+    const WordEntry *wb = *(const WordEntry *const *)b;
+    return (wb->count > wa->count) - (wb->count < wa->count);  /* Descending */
+}
+
+/* True for alphanumerics (per isalnum), '_' and any byte value >= 128 */
+static bool is_word_char(int c) {
+    return c == '_' || (unsigned char)c >= 0x80 || isalnum(c);
+}
+
+/*
+ * Tokenize `filename` into lowercased words: bump each word's count and
+ * append it to word_sequence. Returns false (with a message on stderr) if
+ * the file cannot be opened, a read fails, or it exceeds MAX_WORDS words.
+ * Words longer than MAX_WORD_LEN - 1 bytes are truncated.
+ */
+static bool process_file(const char *filename) {
+    FILE *fp = fopen(filename, "r");
+    if (!fp) {
+        fprintf(stderr, "Cannot open file: %s\n", filename);
+        return false;
+    }
+
+    char word[MAX_WORD_LEN];
+    int word_len = 0;
+    bool ok = true;
+    int c;
+
+    /* EOF is treated as one more delimiter so the last word takes the same path */
+    do {
+        c = fgetc(fp);
+        if (c != EOF && is_word_char(c)) {
+            if (word_len < MAX_WORD_LEN - 1) {
+                word[word_len++] = (char)tolower(c);
+            }
+            continue;
+        }
+        if (word_len == 0) {
+            continue;  /* Delimiter with no pending word */
+        }
+        /* Check capacity before counting so a rejected word is never tallied */
+        if (num_words >= MAX_WORDS) {
+            fprintf(stderr, "Too many words in file\n");
+            ok = false;
+            break;
+        }
+        word[word_len] = '\0';
+        WordEntry *entry = get_or_create_word(word);
+        entry->count++;
+        word_sequence[num_words++] = entry;  /* Pointer survives later sorting */
+        word_len = 0;
+    } while (c != EOF);
+
+    if (ok && ferror(fp)) {
+        fprintf(stderr, "Error reading file: %s\n", filename);
+        ok = false;
+    }
+    fclose(fp);
+    return ok;
+}
+
+/* Sort all_entries by descending count and record each entry's 1-indexed rank */
+static void assign_ranks(void) {
+    /* Only the pointer array is permuted, so word_sequence stays valid */
+    qsort(all_entries, num_unique_words, sizeof all_entries[0], compare_by_count);
+
+    int next_rank = 1;
+    for (WordEntry **slot = all_entries; slot < all_entries + num_unique_words; slot++) {
+        (*slot)->rank = next_rank++;
+    }
+}
+
+/*
+ * Return the highest (i.e. rarest) rank among the `length` words starting at
+ * word_sequence[start]. With ranks assigned by frequency, that value is the
+ * number of top-frequency words needed to cover the whole excerpt.
+ * Returns 0 when length <= 0.
+ *
+ * The caller must keep start >= 0 and start + length <= num_words, and must
+ * have run assign_ranks() first.
+ *
+ * A maximum is insensitive to duplicates, so no per-window "seen" table
+ * (and no O(unique words) reset of it) is needed.
+ */
+static int analyze_excerpt(int start, int length) {
+    int max_rank = 0;
+    for (int i = start; i < start + length; i++) {
+        const int rank = word_sequence[i]->rank;
+        if (rank > max_rank) {
+            max_rank = rank;
+        }
+    }
+
+    return max_rank;
+}
+
+/* For each length 1..min(max_length, num_words), store in results[length - 1]
+ * the earliest window with the lowest analyze_excerpt() value. */
+static void find_optimal_excerpts(int max_length, ExcerptResult *results) {
+    const int limit = (max_length < num_words) ? max_length : num_words;
+
+    for (int len = 1; len <= limit; len++) {
+        ExcerptResult *out = &results[len - 1];
+        out->excerpt_length = len;
+        out->min_vocab_needed = num_unique_words + 1;  /* Above any real rank */
+        out->start_pos = 0;
+
+        const int last_start = num_words - len;
+        for (int pos = 0; pos <= last_start; pos++) {
+            const int needed = analyze_excerpt(pos, len);
+            if (needed < out->min_vocab_needed) {
+                out->min_vocab_needed = needed;
+                out->start_pos = pos;
+            }
+        }
+    }
+}
+
+/* Write the excerpt's words to stdout, separated by single spaces */
+static void print_excerpt(int start, int length) {
+    for (int offset = 0; offset < length; offset++) {
+        const char *sep = (offset == 0) ? "" : " ";
+        printf("%s%s", sep, word_sequence[start + offset]->word);
+    }
+}
+
+/*
+ * Print the distinct words of the excerpt as "word(#rank)", comma separated,
+ * in ascending rank order.
+ */
+static void print_words_needed(int start, int length) {
+    static WordEntry *by_rank[MAX_UNIQUE_WORDS];
+    static bool rank_seen[MAX_UNIQUE_WORDS + 1];
+    memset(rank_seen, 0, (num_unique_words + 1) * sizeof(bool));
+
+    /* Collect each distinct entry once, keeping by_rank sorted as we go */
+    int distinct = 0;
+    for (int pos = start; pos < start + length; pos++) {
+        WordEntry *cur = word_sequence[pos];
+        if (rank_seen[cur->rank]) {
+            continue;
+        }
+        rank_seen[cur->rank] = true;
+        /* Insertion step: shift larger ranks right to open a slot */
+        int slot = distinct++;
+        while (slot > 0 && by_rank[slot - 1]->rank > cur->rank) {
+            by_rank[slot] = by_rank[slot - 1];
+            slot--;
+        }
+        by_rank[slot] = cur;
+    }
+
+    for (int k = 0; k < distinct; k++) {
+        if (k > 0) {
+            printf(", ");
+        }
+        printf("%s(#%d)", by_rank[k]->word, by_rank[k]->rank);
+    }
+}
+
+/* Print the banner, one section per analyzed length, and a closing summary.
+ * Reads only the first min(max_length, num_words) entries of `results`. */
+static void print_results(ExcerptResult *results, int max_length) {
+    const int shown = (max_length > num_words) ? num_words : max_length;
+
+    printf("======================================================================\n");
+    printf("VOCABULARY LEARNING CURVE\n");
+    printf("======================================================================\n");
+    printf("\n");
+    printf("For each excerpt length, the minimum number of top-frequency\n");
+    printf("words you need to learn to understand 100%% of some excerpt.\n");
+    printf("\n");
+    printf("Total words in text: %d\n", num_words);
+    printf("Unique words: %d\n", num_unique_words);
+    printf("\n");
+    printf("----------------------------------------------------------------------\n");
+
+    int previous = 0;  /* Vocab needed by the preceding length */
+    for (int idx = 0; idx < shown; idx++) {
+        const ExcerptResult *r = &results[idx];
+        const int delta = r->min_vocab_needed - previous;
+        printf("\n[Length %d] Vocab needed: %d", r->excerpt_length, r->min_vocab_needed);
+        if (delta > 0) {
+            printf(" (+%d)", delta);
+        }
+        printf("\n");
+
+        printf("  Excerpt: \"");
+        print_excerpt(r->start_pos, r->excerpt_length);
+        printf("\"\n");
+
+        printf("  Words: ");
+        print_words_needed(r->start_pos, r->excerpt_length);
+        printf("\n");
+
+        previous = r->min_vocab_needed;
+    }
+
+    printf("\n----------------------------------------------------------------------\n");
+
+    if (shown > 0) {
+        const ExcerptResult *last = &results[shown - 1];
+        printf("\nTo understand a %d-word excerpt,\n", last->excerpt_length);
+        printf("you need to learn at minimum %d top words.\n", last->min_vocab_needed);
+    }
+}
+
+/* Release every WordEntry recorded in all_entries */
+static void cleanup(void) {
+    for (WordEntry **slot = all_entries; slot < all_entries + num_unique_words; slot++) {
+        free(*slot);
+    }
+}
+
+/* Usage: vocabulary_curve <file.txt> [max_length]
+ * max_length defaults to 30 and is clamped to 1..1000.
+ * Returns 0 on success and 1 on any failure. */
+int main(int argc, char *argv[]) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s <file.txt> [max_length]\n", argv[0]);
+        fprintf(stderr, "  max_length: maximum excerpt length to analyze (default: 30)\n");
+        return 1;
+    }
+
+    const char *filename = argv[1];
+    int max_length = 30;
+
+    if (argc >= 3) {
+        /* strtol instead of atoi: out-of-range input saturates rather than
+         * being undefined, and the clamp below then brings it into range */
+        long requested = strtol(argv[2], NULL, 10);
+        if (requested < 1) requested = 1;
+        if (requested > 1000) requested = 1000;
+        max_length = (int)requested;
+    }
+
+    /* hash_table has static storage, so it already starts out all-NULL */
+    if (!process_file(filename)) {
+        cleanup();  /* Entries created before the failure */
+        return 1;
+    }
+
+    if (num_words == 0) {
+        fprintf(stderr, "No words found in file\n");
+        cleanup();
+        return 1;
+    }
+
+    /* Ranks must exist before any excerpt is analyzed */
+    assign_ranks();
+
+    ExcerptResult *results = malloc((size_t)max_length * sizeof *results);
+    if (!results) {
+        fprintf(stderr, "Memory allocation failed\n");
+        cleanup();
+        return 1;
+    }
+
+    find_optimal_excerpts(max_length, results);
+    print_results(results, max_length);
+
+    free(results);
+    cleanup();
+    return 0;
+}
diff --git a/C/vocabulary_curve/vocabulary_curve b/C/vocabulary_curve/vocabulary_curve
new file mode 100755
index 0000000000000000000000000000000000000000..ee232fae6092c1cd9dcfb98406f63612442771e2
GIT binary patch
literal 16768
zcmeHOdw5humah&8L`>4Cpp1&%)y_;%(x!Pyz({xKz-{b6gupn0UYgDgX-T?6Ka@BQ
z;3V<$+P1RmV{rB%?l%u-_M3HdU6(;ugMkp8-P!mUH!D6EchuVxP;^}upS{0Rx2n@=
z)3Y<*`0Zb*{`#IezdGmCsdMXfUppK8%ZqF_CetK#B_nRrxfbG<1&d{dF*jSpDsa4j
z&0>=Q7vt|1#rbsFB&$HuQXxk$={+vW^Xd6QPEzS1QPL}vPE55*BxTbqp7cs6EBJi+
zn=(sIQdyp@PqE5b9_<m%NSdohu^Fni^oHuj(%~YDA3WY^WrSY@lipFGcU0(QDLF*t
zg35T38}x4y@yw@glu>f?BV+k`yM<mpbqIM$sSZj+>E9v>@~MI{N(w)Yp0l-Y$6H7*
z-#^K_g}kKF&nBVgxY!cNr}KrLq*Mo`tD#5n{6mxM9&1E>h0b4%IDe9g)4^j>gUYC{
ztyr~=742eu<cln~F&pPk6Al##p7%{)Bs(2{Ivu1xo0aaF^7#E-duAE`@#x>(aK&G*
z{$>2K_R!{q^Dk>}scsL26Wgn|FIrf=aK1AZan5Dr*4OZ-`gpsoin+Ih*M69kLkirN
zQzf7?zYMd>3gDXy;53}Zvr`Ix6qMzsDFDWkf3^VrqXIa^Zah1;6u^I00H?5xXXnWR
z__qt-jsoqys(}240{G<x@JkBdj~Bq_74YYKkiQ83{L~F#yg1ObG#(xYeg=AApBot@
zIeuB|7Cjzhor!pi#p2O$u(OM4S}Yz2Zq<TqTea3esGYS&b)D(kLvfM|wgsZBHPWev
zAsCEzb?Vw?tvwP17m7uKbLT-j8jrNIjzGHwV^{>qWHhdI1VUld)1pUVqca)`$6K>j
zpcA=Cj|yAC?E$Sd6b`h9Zq`|AusssfS%=;c)8nl5rf4Xxle@u=PPQ!;iN+b#K_hJ;
zTjBTBFVp5a=Z)qTWwZ0Mxp|}6xy}Vl^EEeWa7*73ipBM4bK|o1NLX(UY;M=#;FgX^
zShPU1M8_3GT#N9VgujjFiZE{wV*PBGeMa*|m<J|<w-S{NA}i;)vqGVAnkils*uFo%
zAJaw&drk1ud3iB=SMZPX@=5Fi!6(iQVJa(PpMj?&*ExCKo6fWFVX=-$oC$od+bWm1
z<olLdIL%>VY2v_O4ld_RDj3Sa?Of724Cml8a`2-$_}MwQ?6+i_=F4nyTREDiXl{_H
z)NSPmmoX+}nwyS{5iZvv;%B<)XyJT*wpgaPtqjT2U#2Rzl_N~Xh>#jL9W7b8mJq+l
zO-BnCTOrQ7-JEdSr(0QmRC92#1>$&94vr3zP1oe$6<HMf#T;DrEh=uy!OtK-q)xY$
zp>~}qFe2OC7DM=1R+b-kxh;lpn&V}<-EHLvKSzR~yWJK;_)IH1ah!<2L<A-xFcE>A
z2>h4*f=|@$W2I`c<N)^gYHwe>C^MvXA1v+XQkezM1I|?b3y$^+-N+H&MrG-v8KlZ*
ziKitoJ;eF_#1|3Y$N9&IrzJf7bI$*icv=$EdpZBN#M5${-p%>%6HiNHdKc&af_R$7
z)192ZgLqmB)0;T|UE*m8OgC}<X5wk-OS?HAC7zbNbPeZkAfA@Gw8Ht##M2U&p2qoW
ziKnG4%{ae?csubYz5r0Ul6YF;(nmR8PdqJc=^@T9CZ3kGbRT%o-<v$&^sM)+^E9tp
ztKNF-VTiV<$>Pc?WK^Tau?N)E*PXH1?|L2~P?Oh}dY--jV0Wg--urKe(*Bt@%w(>U
zYO3zBN3LP!>!?yqE!Y76RpU+535R{gL7Y{4&kDrMb=;q-4sEluR{MoLFvFPNc-Pzq
zC8NLlR3^UByaF2j;HSo`YLeBTEZ$z4ILjFBGpBTas@d;7P&#PunTlGwKfiM4q3)4g
zw|=@4Xa1&g^S`NHXxN{azE2Gvg4N8S+nbA<%#G4^;s?FQ?Y({|Q<WuvlQa4|?X34j
zd(UB@YH-0&m!GMr^5@{_e*BTIdmWHD_&J1UJ%()WksZ})@TGm!-O$W`;tDHmt*uVk
z9yK|Az^M1CnLf34F!6ye_=UPPQ)bRUeLmw555$M4Ez_SYX+Z6MqurtUjo%p?9W`q3
zh(A?#6;M>%lP*UDH#$m<LtZ29SVV$u!|QN(Qp;_iRosWx-Ae%3?|v8l9QHc+#v}<-
z4X)!2u9FZ`Jcz#0?3g*~U$djsb<&)VHZ{0jRE@)a;{*+?oHqP1SzmJcVKunYyXy)&
zOPubheSV<WZu8pxFCYS*+Jiga^whrkol-BnpYAnY@?;MAG6SB>K~F7C292jAd{mQg
zF!Qg6J`N8eY~9b69sJL27xmlFkm-j$ex?2+<J!;A%qn}Y9SVqz8>7XWq@W}b6!UAJ
zar82bDYM+lUjPZyhWs-$5Y2!8luI-bI%4H}kazW&v$8V(06e+i(J+tx=NPXj&2a1&
zH&M5!D}sGsq;FZh)v|OmS-p=WOJdOW)DC#<t4}oap0wZfA*Pwe;5&5g;Oi|8RZUfo
zY${{Etq05eMvJ4ypIUI_QuIihsnR)h_0cJ_U+875A-Sa(0Z?3hsQLl8<4IobHQw?<
zOF@5_ZU#}Kr}j1b-A`d|vm83{32)|Z%lUc;G$bRXi12i;&Ggoq_Pc%p+0->OLVw#~
zXQ<5KtsU{$SHDg@%@=&#pK`oGy>6!07-=-#_9f>#P`#fU%=G&(E1M&zq!!ZOc?*Ue
zjmELs;T=mcCT0)A>Rqs^+J@BB`LFm>a|Y}evS$<;D&761<{I)Y89N1opW>__lzV~r
zYey1C0KEf9P0ebA0aLZwF>FcQ3@%wR|3*JeGsgR=CYu_3(To#{mZ_7xj+ts|e!uHT
zc24=@`fssA4K{gq-GZyLM8$x227~>V&{(7R6>6}r_M^nB-JfCx{5FE;PnFNT1Si87
z_NLZmenInq-}n{US^LW!>%7L$>_OEw*qA#1fcgiHDmA(Ol$u=RP2S-3)<(SFnW7py
zPN~K;wY#s>Upt(5Q_UPuy9bazkoZeqav|>zgII2o>%FG-aV7)T;)@N;9F-5uX3Up5
z{|!X*5eT^Y`1x7_86O4<#eFx2NwbpB+Sd{<@t))BK3rx#b_!=>$t^U_c4BhyC#ip8
zfGvXJ8#oo_qYyy;T;3aesf~_|y7fqz-!_t*XZ{*x$yhOpFSdH$L9%ANZ~93oSdxfi
z^6cAj5l-<P$f~J1{Wx9d=W22Z?;Q@3Ltl6w4NTt47oHaN(&>C9viI&spw(LsK8$d0
zxX!cDbDigUkJeAqR`2mdnd^A^AJHDmUhue@y2a5n;GNUN%nF>cjN7dm&%%da-(Y8T
z7d|wF+3%hUdnwiCGnx-!O)|>&qbR)%Hq_*fY5UPw^fmKpxa2bi_L(puMm3*wF|@t*
zRVUImk{?sugV=Utt7{#t?hZ(zI-`6IFsv|Jh#1ok@G5a`6vXW%tgo)V*>9M4{7)v6
zzSNq>h5(wi03h%j7cjSiOO{l|#tsLalxp}QSRVQY)-SpiecD>4>ORMUWA;OWyPy6m
zE_Sw>tXm36jN03&mbFw%vV0CHUzwwP4;9{q!jS_gE~nyL+XmU*f?`@#A2|uJ@59%>
z$MR|33NC#LUCI10a!?-nI+W?AhIH&)D7%iBRb>4vHB}OcqH83t7*;d==2_6GomB^9
z=iBLJnE35IpCc&gt6&hW_rR8lT|#QX2iUt|GP#FpI}dWBYccf?+F!bk`-~T^SB<yS
z?%$qhYHo1#xt@cQHdLFLv80l*k4ANXpFyTmB~yvio{G=F*7O1h(bn{|%u)M=yU+rL
z&c`X_e?x_-1+NmB{RIZTdf>gK>VZ>}RNH|1!WZ%BFmT*q87R#hwfeo(C;xWU{WEk~
zV%|EnyY6{NqGfN#%hAdmz%w)U0WyPdcA(^50JiJ;NB!l4AZNlOd+99{F5zEtvYI?6
z>2_aQ?}jYP?)OyV^R=|`wm*GN54J$ZsMgeK+rARagE{t$0Zc##-YK!2unpOtzB<$M
zV&WL~6#F+!IrL+{{gJnP2adTsp_iAHPhzu&P=FoGPkwj}Tb-QqZe#E9IQA_gBW0*p
zO>R)t-XrmEsK$G=Gkp1@Os2OlVbV@tHHLFWwG|(KaDk`O&pfw&@!)MXwsXTXO>ofs
z6|_{NpH4;Re!7FktsOQe1E-DAz_O*xC@F;^PT-#rIlRmbF-+21)vJv2-(6{(e_|v_
zad?Uvnr=j-&@&g(=}FbaFw*za-8kMs&E2s242>c4MyfYiZnwmOB$n;RW+*GR8aaq(
zuW8V>e()1%A_5Z;7(W8^{Gd4!Q91(QE+r8TZA<9NO_69zY$|KiJ0j68g+E;h#6yv=
zLJw*5mZ>btmIcD$NL-<(GfFGK#Y!bKvWB<?C1jONWve2VO{F!G2)B&l>sK%H)UWe<
z)_hCxH+a^p@~v8-EL*o`eFH4T1MNZyX5#wxII2L^>vCMQkf|H|dU#8`4aL_f>m$Ly
zW+klaEqV(TGo|X%%9hzoQ5v=f^=K!oT*NA4;J-<YT&#epx|k&9C_F&xVu3BZHOdv#
zE@ymu{7Pj*M_{|w&TF}jHYZ35A||oLln-?zIuzZiM6qO*cti={$x_$Nx>D7mw+0gJ
zsDEC~Y}ly#N)ltsBT+>U1l#heo}<LubfqH{=5B=(9h>#25^04;oz-~ChO-<Fc2PU5
zGu0JI@Su^$?Rp@Jh)2*H@SF;d({RRIH8qu$q#uiP=&~v&g`)Z^NsLYXO6*j#sm)^?
z51<cJQyZnzbHw@ka*G4;Y`iL4C{R}SLo03PO<GcewwB=k*|AKf1$5(knGCJjSJ2jr
zC^oBig3^9_FQ|M*VY_(^vu&SdJFjd?>29oLgwq3f6Wb+haBv|#g+DIA%Esp7zZ>@5
zwB}VzTV649rTz4qN_VlP=PtQqp5r2*{JGsm(1%ekE_``XaqPtZ2fzZPTVFBl?xOmN
znRiX9uTWCO^%YfjmUt>^Zkz0_SkyhmTj4J1o?5ZUTT$bwsDcC}p;2E^y5e+3ZFvg+
zw&R)1i`YdLw&fvPD=LoKCheUJYp|bZtpQgeJQd?ejU?CTod!yT(SIUML|`HU6A_q*
zz(fQlA}|qwi3t2@5s>d&$@i;h&qZm_BCJeM`2v{SL%vL#QY87#lYEcsJPXdgdvvbg
z<@;PhEnAKUfe{iJ!aW*Fv}dNopXh>4<{V?EM7h|Gvqs^l+%w8|c5stuOcH9$6z^}*
zgJw#DLSOE?ao3b5`A$$3ZoN~IcoXeykpyAG7R!WK_MIU-#Nq9cqPTa<`4hqr?q6|U
z#`%QcpAhA^^UTYo-=doTcOu^(&#!nPzxu#+q@e2sZ58xJL2ncEenEdE=o5kt2>Obk
z?+AKQ(8=QEud@aHx}XaMT`uT)L8YI2-BxbJvSo{vDqKCxR$L35bDb__Zq3{UE|+Vr
zlpLSZc`0TeeCatno%bSE&Nhh-F2@1&f5JNj&abC{&B5P>(Xs2mHOS$eGCX#tbUFTn
zZ@QDNoJOx{RDtgjI9p(iyk>!K>b8?OW2e<|vKdFR;}y8PHfk66-p?!#AHv;OO4|i~
zG-275c#p*0$WRjXQg*yJ+y~soX0ZJJ@i(M^NkYD7&siKE#ovZE?d15A_+ez|rHA}P
z<7zzoSONSrG;cilS-_Qi<KXE|v4YXM$33>N7WjB}Ru;fF0-pvu59iql7LdP@@S+MP
z=S^ArEd}IXFMxlq0RGbg_*1~i&;0g2S3v%?0{98wigY@MW`LhjG?U43CyUB3(VoH3
zXN8jNpXUJ|FU|`K;6C8f(9ic=x=otq^jXw<CXGk2!0>w`C|M=s``ly1*|pq$Iol=X
z3x2NxA{xiX$`ZG&0RG(q_<g`B4*C7zA>hiF&9f(gUo;N>3h*<Fu&Rl9P>Q@pxMXtZ
zCk5~`v4D*GUOm0(?hLYEq@yzs#Vhh%S`ar`;|#CcYmulHjOu~7uF)+p))t7hX>tCl
zIqnE**4<uS6l1u{tLfo*G^EG)JwYvI-6Yf*iSoOIET-#WEgA@K#djv7c^4Cggu1ha
zY2I(G@-+IE;f;38*jj_CEnnklY|vEi8m6uAudes_wbjd)uWe}7nmzUY293Y@@0M@W
z(|hvb1AsA@rsciipDo2p_Tr<0EcAc9Y)@Yzj1lJV_m4p|y(JJ2h>rrsmf(wl5L!3(
z`-QB?eDMWA79eN%2M}WrO(Xl(`~Ko9gE8ge!-X+e{x<?zODv+b;T|o0Kj2#pV=bYu
zmWb&saBs|a5?M3z7Ku+LvIvir9wwE|v6yg(e?q~(zZfG&pJ$8#tgj@-K$?c{8nlL0
zUhVgN#elCma=yJ7YXISm!yD_fkDL!SvQqQ1rTn9i9NhZqBMVU5m^0SZfsPM~PewRx
zlQ}%b(4AYt31?>%Pcx$NF6PWan-d{?WEE;*PJBRurw5D+(lBDqmaZ_2S~MQDN^Zpc
zay%y)!)Pdr>g@qi5V_9wI9Wsso%CRXIeBk#MkBntIrTQJH5%y9wYC=MN=~@tsrOaK
z16wS8Ef9?c;4taSJgNzFgo5xP0?Rl!d3Ba)oL%O`RMCO4nltJA30c}E(N-7J7Ekg!
zK1tmImqhNjCGSKgSAVzIC`;;)f`ZC*N5URtuykbWFZ3O$lqcET<O73S@e%3&kf>i$
zS--rF$@hOP^y!*K>dWgINjpUWUH4GkQs3G=;&mEeRHf9H*GZDf>mrEol<$8KC|!d{
zeR<s>DLqr5B>k6il6Inuu4yDMuWKaTM9N6{{#z(xJCUJtBK75Um87))qa^FkZ@+;u
z(yu2WBzc`Csl2WueY%#**S`lC#YA3LX4hG<-imEJB&?L*f9?XDtFH(>N!@t?m&>@M
z59H~~{fMMj%L`gjzgtjA9z-U$ez~8NR9=@z`F#I>DD-9gnIa?wb%_2f&}{r9|1)H$
zeyK0_`;yB3Bjxk!e=<-1nu{$-Nz3x$C(oa>cL<r>@n7gWVkw_r|4S&Km$hXx<$VkJ
z-EMw;l9%*N=+c##)R*@)hN}T0NjZxf<t09X0_uBGUtULLzuy+^m;F=f$^P>ZbV)?&
z%kSR$E(3^EWs&Sr>Pz}DN^<q(b(tyjf1cH}GEz^{A}HtT%j=3gHI}^eQ<{-_Ssu!S
zW&L+sB@YYz;oo&iQBIXH9Ao98ej%PMmHPSPmwcs^OJ(1+O046cFnCII7`jh24S(ss
ooPX(lkM!Nrw2pn)HBhR|E0pzGrL-p>U;phzmc-e43VAI1Uyvt56#xJL

literal 0
HcmV?d00001

diff --git a/python_pkg/word_frequency/tests/test_vocabulary_curve.py b/python_pkg/word_frequency/tests/test_vocabulary_curve.py
new file mode 100644
index 0000000..74d7877
--- /dev/null
+++ b/python_pkg/word_frequency/tests/test_vocabulary_curve.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+"""Tests for vocabulary_curve C implementation."""
+
+from __future__ import annotations
+
+import subprocess
+import tempfile
+from pathlib import Path
+
+import pytest
+
+# Path to the compiled C executable, located by going up four .parent levels from this file
+C_EXECUTABLE = Path(__file__).parent.parent.parent.parent / "C" / "vocabulary_curve" / "vocabulary_curve"
+
+
+@pytest.fixture
+def sample_text_file(tmp_path: Path) -> Path:
+    """Write a three-line English passage to a temporary file and return its path."""
+    target = tmp_path / "sample.txt"
+    content = """The quick brown fox jumps over the lazy dog.
+The fox was very quick and the dog was very lazy.
+Quick foxes and lazy dogs are common in stories."""
+    target.write_text(content, encoding="utf-8")
+    return target
+
+
+@pytest.fixture
+def polish_text_file(tmp_path: Path) -> Path:
+    """Write a four-line Polish passage to a temporary file and return its path."""
+    target = tmp_path / "polish.txt"
+    content = """Litwo! Ojczyzno moja! Ty jesteś jak zdrowie.
+Ile cię trzeba cenić, ten tylko się dowie,
+Kto cię stracił. Dziś piękność twą w całej ozdobie
+Widzę i opisuję, bo tęsknię po tobie."""
+    target.write_text(content, encoding="utf-8")
+    return target
+
+
+def run_vocabulary_curve(filepath: Path, max_length: int = 10) -> str:
+    """Run the C binary on ``filepath`` and return its stdout; skip if it is not built."""
+    if not C_EXECUTABLE.exists():
+        pytest.skip(f"C executable not found at {C_EXECUTABLE}")
+
+    result = subprocess.run(
+        [str(C_EXECUTABLE), str(filepath), str(max_length)],
+        capture_output=True, text=True, timeout=30, check=False,
+    )
+    # Fail loudly (with stderr) instead of parsing the output of a crashed run
+    assert result.returncode == 0, f"vocabulary_curve failed: {result.stderr}"
+    return result.stdout
+
+
+def extract_excerpts_from_output(output: str) -> list[tuple[int, str]]:
+    """Extract (length, excerpt) pairs from the program's report.
+
+    Each ``[Length N]`` header is paired with the next ``Excerpt: "..."``
+    line; the excerpt is the text between the first and last double quote.
+
+    Args:
+        output: Full stdout of the vocabulary_curve executable.
+
+    Returns:
+        Pairs in order of appearance. A header whose excerpt line has no
+        double quote contributes nothing.
+    """
+    pairs = []
+    remaining = iter(output.split("\n"))
+    for header in remaining:
+        if not header.strip().startswith("[Length "):
+            continue
+        length = int(header.split("]")[0].split()[-1])
+        # Consume lines up to and including the next "Excerpt:" line
+        for candidate in remaining:
+            body = candidate.strip()
+            if not body.startswith("Excerpt:"):
+                continue
+            if '"' in body:
+                pairs.append((length, body[body.index('"') + 1 : body.rindex('"')]))
+            break
+    return pairs
+
+
+class TestExcerptValidity:
+    """Tests that verify excerpts are actually found in the source text."""
+
+    @staticmethod
+    def _tokenize(path: Path) -> list[str]:
+        """Return the lowercased word tokens of the file at ``path``.
+
+        The file is read as UTF-8 and split with a word-boundary regex.
+        """
+        import re
+
+        return re.findall(r'\b[\w]+\b', path.read_text(encoding="utf-8").lower())
+
+    @staticmethod
+    def _occurs(source_words: list[str], excerpt_words: list[str], width: int) -> bool:
+        """Report whether the excerpt occurs contiguously in the source.
+
+        Args:
+            source_words: Tokens of the whole source text.
+            excerpt_words: Tokens of the excerpt being searched for.
+            width: Slice width to compare at each source position.
+
+        Returns:
+            True if some ``width``-long slice equals ``excerpt_words``.
+        """
+        return any(
+            source_words[i : i + width] == excerpt_words
+            for i in range(len(source_words) - width + 1)
+        )
+
+    def test_excerpt_exists_in_source_text(self, sample_text_file: Path) -> None:
+        """Every reported excerpt occurs in the source as a contiguous word run."""
+        source_words = self._tokenize(sample_text_file)
+        output = run_vocabulary_curve(sample_text_file, max_length=10)
+        excerpts = extract_excerpts_from_output(output)
+
+        assert len(excerpts) > 0, "No excerpts found in output"
+
+        for length, excerpt in excerpts:
+            excerpt_words = excerpt.lower().split()
+            found = self._occurs(source_words, excerpt_words, len(excerpt_words))
+            assert found, (
+                f"Excerpt of length {length} not found in source text:\n"
+                f"  Excerpt words: {excerpt_words}\n"
+                f"  First 30 source words: {source_words[:30]}"
+            )
+
+    def test_excerpt_word_count_matches_length(self, sample_text_file: Path) -> None:
+        """The number of words in each excerpt equals its reported length."""
+        output = run_vocabulary_curve(sample_text_file, max_length=10)
+
+        for length, excerpt in extract_excerpts_from_output(output):
+            word_count = len(excerpt.split())
+            assert word_count == length, (
+                f"Expected {length} words, got {word_count}: '{excerpt}'"
+            )
+
+    def test_polish_excerpt_exists_in_source(self, polish_text_file: Path) -> None:
+        """Excerpts from the Polish sample occur in it as contiguous word runs."""
+        source_words = self._tokenize(polish_text_file)
+        output = run_vocabulary_curve(polish_text_file, max_length=8)
+        excerpts = extract_excerpts_from_output(output)
+
+        assert len(excerpts) > 0, "No excerpts found in output"
+
+        for length, excerpt in excerpts:
+            excerpt_words = excerpt.lower().split()
+            found = self._occurs(source_words, excerpt_words, len(excerpt_words))
+            assert found, (
+                f"Polish excerpt of length {length} not found:\n"
+                f"  Excerpt words: {excerpt_words}\n"
+                f"  Source words: {source_words}"
+            )
+
+    def test_excerpt_is_contiguous(self, sample_text_file: Path) -> None:
+        """Each excerpt matches a source slice whose width is the reported length."""
+        source_words = self._tokenize(sample_text_file)
+        output = run_vocabulary_curve(sample_text_file, max_length=5)
+
+        for length, excerpt in extract_excerpts_from_output(output):
+            excerpt_words = excerpt.lower().split()
+            # Unlike the tests above, the slice width is the reported length
+            found = self._occurs(source_words, excerpt_words, length)
+            assert found, (
+                f"Excerpt words not found as contiguous sequence:\n"
+                f"  Excerpt: {excerpt_words}\n"
+                f"  First 20 source words: {source_words[:20]}"
+            )
+
+
+class TestVocabNeeded:
+    """Tests for vocabulary count calculations."""
+
+    def test_length_1_needs_vocab_1(self, sample_text_file: Path) -> None:
+        """Test that a 1-word excerpt needs exactly 1 vocabulary word."""
+        output = run_vocabulary_curve(sample_text_file, max_length=1)
+
+        assert "[Length 1] Vocab needed: 1" in output
+
+    def test_vocab_needed_increases_monotonically(self, sample_text_file: Path) -> None:
+        """Test that vocab needed never decreases as length increases."""
+        output = run_vocabulary_curve(sample_text_file, max_length=10)
+
+        # Parse the number after every "Vocab needed:" marker, in report order
+        vocab_counts = [
+            int(line.split("Vocab needed:")[1].split()[0])
+            for line in output.split("\n")
+            if "Vocab needed:" in line
+        ]
+
+        # Guard against a vacuous pass when the report has no such lines
+        assert vocab_counts, "No 'Vocab needed:' lines found in output"
+        for prev_vocab, vocab in zip(vocab_counts, vocab_counts[1:]):
+            assert vocab >= prev_vocab, (
+                f"Vocab decreased from {prev_vocab} to {vocab}"
+            )
+
+
+class TestEdgeCases:
+    """Edge case tests."""
+
+    def test_empty_file(self, tmp_path: Path) -> None:
+        """An empty input yields a non-zero exit status or a "No words" message."""
+        empty = tmp_path / "empty.txt"
+        empty.write_text("", encoding="utf-8")
+
+        if not C_EXECUTABLE.exists():
+            pytest.skip("C executable not found")
+
+        completed = subprocess.run(
+            [str(C_EXECUTABLE), str(empty), "5"],
+            capture_output=True,
+            text=True,
+        )
+
+        failed = completed.returncode != 0
+        assert failed or "No words" in completed.stderr
+
+    def test_single_word_file(self, tmp_path: Path) -> None:
+        """A one-word file produces a length-1 result and nothing longer."""
+        single = tmp_path / "single.txt"
+        single.write_text("hello", encoding="utf-8")
+
+        report = run_vocabulary_curve(single, max_length=5)
+
+        assert "[Length 1] Vocab needed: 1" in report
+        # Only one word exists, so no length-2 section may appear
+        assert "[Length 2]" not in report
+
+    def test_repeated_word_file(self, tmp_path: Path) -> None:
+        """Five copies of one word need a vocabulary of 1 at every length."""
+        repeated = tmp_path / "repeated.txt"
+        repeated.write_text("hello hello hello hello hello", encoding="utf-8")
+
+        report = run_vocabulary_curve(repeated, max_length=5)
+
+        for expected in (f"[Length {n}] Vocab needed: 1" for n in range(1, 6)):
+            assert expected in report
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/python_pkg/word_frequency/vocabulary_curve.py b/python_pkg/word_frequency/vocabulary_curve.py
new file mode 100644
index 0000000..5163c0e
--- /dev/null
+++ b/python_pkg/word_frequency/vocabulary_curve.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+"""Vocabulary learning curve analyzer.
+
+Finds the minimum vocabulary needed to understand excerpts of increasing length.
+For each excerpt length (1, 2, 3, ... N words), finds the excerpt that requires
+the fewest top-frequency words to understand 100%.
+
+Usage:
+    python -m python_pkg.word_frequency.vocabulary_curve --file text.txt
+    python -m python_pkg.word_frequency.vocabulary_curve --file text.txt --max-length 50
+    python -m python_pkg.word_frequency.vocabulary_curve --text "some text here"
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING, NamedTuple
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+try:
+    from python_pkg.word_frequency.analyzer import analyze_text, read_file
+except ImportError:
+    from analyzer import analyze_text, read_file
+
+
+class ExcerptAnalysis(NamedTuple):
+    """Best excerpt found for one excerpt length, with the vocabulary it requires."""
+
+    excerpt_length: int  # number of words in the excerpt
+    min_vocab_needed: int  # highest frequency rank among the excerpt's words
+    best_excerpt: str  # the excerpt's words joined by single spaces
+    words_needed: list[str]  # the excerpt's distinct words, most frequent first
+
+
+def get_word_rank(word: str, ranked_words: list[str]) -> int | None:
+    """Look up a word's 1-based position in a frequency-ordered word list.
+
+    Args:
+        word: Word whose rank is wanted.
+        ranked_words: Words ordered from most to least frequent.
+
+    Returns:
+        Position of the first match counting from 1, or None when absent.
+    """
+    for rank, candidate in enumerate(ranked_words, start=1):
+        if candidate == word:
+            return rank
+    return None
+
+
+def analyze_excerpt(
+    excerpt_words: list[str],
+    ranked_words: list[str],
+) -> tuple[int, list[str]]:
+    """Analyze how many top words are needed to understand an excerpt 100%.
+
+    Args:
+        excerpt_words: List of words in the excerpt.
+        ranked_words: List of all words sorted by frequency (most common first).
+
+    Returns:
+        Tuple of (max_rank_needed, list_of_words_needed_sorted_by_rank);
+        (0, []) for an empty excerpt, and (float("inf"), []) when any excerpt
+        word is absent from ranked_words.
+    """
+    missing = set(excerpt_words)
+    words_needed: list[str] = []
+    max_rank = 0
+
+    # One pass in rank order replaces a full list search per word and yields
+    # the words already sorted; discarding hits keeps a duplicate's first rank.
+    for rank, word in enumerate(ranked_words, start=1):
+        if not missing:
+            break
+        if word in missing:
+            missing.discard(word)
+            words_needed.append(word)
+            max_rank = rank
+
+    if missing:
+        # Word not in vocabulary - would need infinite learning
+        return float("inf"), []  # type: ignore[return-value]
+    return max_rank, words_needed
+
+
+def find_optimal_excerpts(
+    text: str,
+    *,
+    max_length: int = 30,
+    case_sensitive: bool = False,
+) -> list[ExcerptAnalysis]:
+    """Pick, for every excerpt length, the window needing the smallest vocabulary.
+
+    Each run of consecutive words is scored by the highest frequency rank it
+    contains; the lowest-scoring run of each length wins (earliest on ties).
+
+    Args:
+        text: The source text to analyze.
+        max_length: Longest excerpt length, in words, to consider.
+        case_sensitive: Forwarded to analyze_text; when false, tokens are lower-cased.
+
+    Returns:
+        One ExcerptAnalysis per length from 1 up to max_length or the number of
+        words in the text, whichever is smaller. Lengths for which no excerpt
+        can be fully ranked are left out; text without words gives [].
+    """
+    import re
+
+    # Words in the order returned by most_common() on the analyzer's counts.
+    frequency_table = analyze_text(text, case_sensitive=case_sensitive)
+    ranked_words = [entry[0] for entry in frequency_table.most_common()]
+
+    # Tokens in reading order, lower-cased unless case_sensitive is set.
+    tokens = re.findall(r"\b[\w]+\b", text, re.UNICODE)
+    if not case_sensitive:
+        tokens = [token.lower() for token in tokens]
+    if not tokens:
+        return []
+
+    unreachable = float("inf")
+    longest = min(max_length, len(tokens))
+    analyses: list[ExcerptAnalysis] = []
+
+    for length in range(1, longest + 1):
+        best_score = unreachable
+        best_window: list[str] = []
+        best_vocabulary: list[str] = []
+        # Score every window of this length; strict "<" keeps the earliest best.
+        for end in range(length, len(tokens) + 1):
+            window = tokens[end - length : end]
+            score, vocabulary = analyze_excerpt(window, ranked_words)
+            if score < best_score:
+                best_score, best_window, best_vocabulary = score, window, vocabulary
+
+        if best_score == unreachable:
+            continue
+        analysis = ExcerptAnalysis(
+            excerpt_length=length,
+            min_vocab_needed=int(best_score),
+            best_excerpt=" ".join(best_window),
+            words_needed=best_vocabulary,
+        )
+        analyses.append(analysis)
+
+    return analyses
+
+
+def format_results(
+    results: list[ExcerptAnalysis],
+    *,
+    show_excerpts: bool = False,
+    show_words: bool = False,
+) -> str:
+    """Render the analysis as a plain-text report.
+
+    Args:
+        results: Analyses to print, one table row each, in the given order.
+        show_excerpts: Add an excerpt column (cut to 50 characters) in place
+            of the "(+N)" growth marker.
+        show_words: Follow each row with the list of words it requires.
+
+    Returns:
+        The report as one newline-joined string, or "No excerpts found."
+        when results is empty.
+    """
+    if not results:
+        return "No excerpts found."
+
+    heavy_rule = "=" * 70
+    light_rule = "-" * 70
+    if show_excerpts:
+        header = ["Length  Vocab  Excerpt", "------  -----  -------"]
+    else:
+        header = ["Length  Vocab Needed", "------  ------------"]
+
+    report = [
+        heavy_rule,
+        "VOCABULARY LEARNING CURVE",
+        heavy_rule,
+        "",
+        "For each excerpt length, the minimum number of top-frequency",
+        "words you need to learn to understand 100% of some excerpt.",
+        "",
+        light_rule,
+        *header,
+    ]
+
+    previous = 0
+    for row in results:
+        needed = row.min_vocab_needed
+        if show_excerpts:
+            # Keep the table narrow: 47 characters plus an ellipsis.
+            shown = row.best_excerpt
+            if len(shown) > 50:
+                shown = shown[:47] + "..."
+            report.append(f"{row.excerpt_length:>6}  {needed:>5}  {shown}")
+        else:
+            # Flag rows where the required vocabulary grew since the last row.
+            growth = f" (+{needed - previous})" if needed > previous else ""
+            report.append(f"{row.excerpt_length:>6}  {needed:>12}{growth}")
+        previous = needed
+
+        if show_words and row.words_needed:
+            report.append(f"        Words: {', '.join(row.words_needed)}")
+
+    # Closing summary, taken from the final row.
+    last = results[-1]
+    report += [
+        light_rule,
+        "",
+        f"To understand a {last.excerpt_length}-word excerpt,",
+        f"you need to learn at minimum {last.min_vocab_needed} top words.",
+    ]
+    return "\n".join(report)
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    """Run the command line interface.
+
+    Args:
+        argv: Command line arguments; None lets argparse use sys.argv[1:].
+
+    Returns:
+        Exit code: 0 on success, 1 when the input file is missing or cannot
+        be decoded.
+    """
+    parser = argparse.ArgumentParser(
+        description="Analyze minimum vocabulary needed for excerpt lengths.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+
+    input_group = parser.add_mutually_exclusive_group(required=True)
+    input_group.add_argument(
+        "--text",
+        "-t",
+        type=str,
+        help="Raw text to analyze",
+    )
+    input_group.add_argument(
+        "--file",
+        "-f",
+        type=str,
+        help="Path to a file to analyze",
+    )
+
+    parser.add_argument(
+        "--max-length",
+        "-m",
+        type=int,
+        default=30,
+        help="Maximum excerpt length to analyze (default: 30)",
+    )
+    parser.add_argument(
+        "--show-excerpts",
+        "-e",
+        action="store_true",
+        help="Show the actual excerpt text for each length",
+    )
+    parser.add_argument(
+        "--show-words",
+        "-w",
+        action="store_true",
+        help="Show which words are needed for each excerpt",
+    )
+    parser.add_argument(
+        "--case-sensitive",
+        "-c",
+        action="store_true",
+        help="Treat words case-sensitively",
+    )
+    parser.add_argument(
+        "--output",
+        "-o",
+        type=str,
+        help="Output file path (default: print to stdout)",
+    )
+
+    args = parser.parse_args(argv)
+
+    try:
+        # Compare against None: an explicit empty --text "" is still text input
+        # and must not fall through to read_file(None).
+        text = args.text if args.text is not None else read_file(args.file)
+
+        results = find_optimal_excerpts(
+            text,
+            max_length=args.max_length,
+            case_sensitive=args.case_sensitive,
+        )
+
+        output = format_results(
+            results,
+            show_excerpts=args.show_excerpts,
+            show_words=args.show_words,
+        )
+
+        if args.output:
+            Path(args.output).write_text(output, encoding="utf-8")
+            print(f"Output written to {args.output}")  # noqa: T201
+        else:
+            print(output)  # noqa: T201
+
+    except FileNotFoundError as e:
+        print(f"Error: File not found - {e}", file=sys.stderr)  # noqa: T201
+        return 1
+    except UnicodeDecodeError as e:
+        print(f"Error: Could not decode file - {e}", file=sys.stderr)  # noqa: T201
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":  # script entry point
+    raise SystemExit(main())  # process exit status is main()'s return value