From 510440e02d6e54d8546f41354a7ac502c4ecba46 Mon Sep 17 00:00:00 2001 From: Krzysztof Rudnicki Date: Sun, 28 Dec 2025 16:15:38 +0100 Subject: [PATCH] feat: vocabulary curve in C --- C/vocabulary_curve/Makefile | 13 + C/vocabulary_curve/main.c | 359 ++++++++++++++++++ C/vocabulary_curve/vocabulary_curve | Bin 0 -> 16768 bytes .../tests/test_vocabulary_curve.py | 244 ++++++++++++ python_pkg/word_frequency/vocabulary_curve.py | 319 ++++++++++++++++ 5 files changed, 935 insertions(+) create mode 100644 C/vocabulary_curve/Makefile create mode 100644 C/vocabulary_curve/main.c create mode 100755 C/vocabulary_curve/vocabulary_curve create mode 100644 python_pkg/word_frequency/tests/test_vocabulary_curve.py create mode 100644 python_pkg/word_frequency/vocabulary_curve.py diff --git a/C/vocabulary_curve/Makefile b/C/vocabulary_curve/Makefile new file mode 100644 index 0000000..6311950 --- /dev/null +++ b/C/vocabulary_curve/Makefile @@ -0,0 +1,13 @@ +CC = gcc +CFLAGS = -O3 -Wall -Wextra -march=native +TARGET = vocabulary_curve + +all: $(TARGET) + +$(TARGET): main.c + $(CC) $(CFLAGS) -o $(TARGET) main.c + +clean: + rm -f $(TARGET) + +.PHONY: all clean diff --git a/C/vocabulary_curve/main.c b/C/vocabulary_curve/main.c new file mode 100644 index 0000000..0a76857 --- /dev/null +++ b/C/vocabulary_curve/main.c @@ -0,0 +1,359 @@ +/* + * Vocabulary Learning Curve Analyzer + * + * For each excerpt length (1, 2, 3, ... N words), finds the excerpt that + * requires the minimum number of top-frequency words to understand 100%. 
/*
 * Vocabulary Learning Curve Analyzer
 *
 * For each excerpt length (1, 2, 3, ... N words), finds the excerpt that
 * requires the minimum number of top-frequency words to understand 100%.
 *
 * Usage:
 *   ./vocabulary_curve <file> [max_length]
 *   ./vocabulary_curve test.txt 50
 */

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_WORD_LEN 64
#define MAX_WORDS 500000
#define MAX_UNIQUE_WORDS 100000
#define HASH_SIZE 200003 /* Prime number for better distribution */
#define RULE_WIDTH 70    /* Width of the horizontal rules in the report */

/* Word entry for hash table (separate chaining through `next`) */
typedef struct WordEntry {
    char word[MAX_WORD_LEN];
    int count;
    int rank; /* 1-indexed rank by frequency (1 = most common) */
    struct WordEntry *next;
} WordEntry;

/* Hash table for word lookup, plus a flat list of every entry created */
static WordEntry *hash_table[HASH_SIZE];
static WordEntry *all_entries[MAX_UNIQUE_WORDS];
static int num_unique_words = 0;

/* All words in order of appearance - store POINTERS not indices, so the
 * sequence stays valid when all_entries is re-ordered by assign_ranks(). */
static WordEntry *word_sequence[MAX_WORDS];
static int num_words = 0;

/* Result for each excerpt length */
typedef struct {
    int excerpt_length;
    int min_vocab_needed;
    int start_pos; /* Start position in word_sequence */
} ExcerptResult;

/*
 * djb2-style string hash reduced modulo HASH_SIZE.
 * Bytes are read as unsigned char so the bucket chosen for non-ASCII
 * (UTF-8) words does not depend on whether plain `char` is signed.
 */
static unsigned int hash_word(const char *word) {
    const unsigned char *p = (const unsigned char *)word;
    unsigned int hash = 5381;

    while (*p != '\0') {
        hash = ((hash << 5) + hash) + *p++;
    }
    return hash % HASH_SIZE;
}

/*
 * Return the entry for `word`, creating it (count 0, rank 0) if needed.
 *
 * Only the first MAX_WORD_LEN - 1 bytes of `word` are significant. The key
 * is truncated BEFORE hashing and comparing; otherwise an over-long word
 * would never match its own (truncated) stored copy and every call would
 * allocate a fresh duplicate entry.
 *
 * Exits the process when MAX_UNIQUE_WORDS is exceeded or malloc fails.
 */
static WordEntry *get_or_create_word(const char *word) {
    char key[MAX_WORD_LEN];
    size_t len = strlen(word);

    if (len > MAX_WORD_LEN - 1) {
        len = MAX_WORD_LEN - 1;
    }
    memcpy(key, word, len);
    key[len] = '\0';

    unsigned int h = hash_word(key);
    for (WordEntry *e = hash_table[h]; e != NULL; e = e->next) {
        if (strcmp(e->word, key) == 0) {
            return e;
        }
    }

    if (num_unique_words >= MAX_UNIQUE_WORDS) {
        fprintf(stderr, "Too many unique words\n");
        exit(1);
    }

    WordEntry *entry = malloc(sizeof *entry);
    if (!entry) {
        fprintf(stderr, "Memory allocation failed\n");
        exit(1);
    }

    memcpy(entry->word, key, len + 1);
    entry->count = 0;
    entry->rank = 0;
    entry->next = hash_table[h]; /* push on the front of the bucket chain */
    hash_table[h] = entry;

    all_entries[num_unique_words++] = entry;
    return entry;
}

/*
 * qsort comparator: higher count first.
 * Ties are broken on the current `rank` field (lower first). assign_ranks()
 * seeds that field with first-appearance order before sorting, which makes
 * the final ranking deterministic even though qsort is not a stable sort.
 */
static int compare_by_count(const void *a, const void *b) {
    const WordEntry *wa = *(const WordEntry *const *)a;
    const WordEntry *wb = *(const WordEntry *const *)b;

    if (wa->count != wb->count) {
        return (wb->count > wa->count) - (wb->count < wa->count);
    }
    return (wa->rank > wb->rank) - (wa->rank < wb->rank);
}

/* qsort comparator: lower rank (more frequent word) first. */
static int compare_by_rank(const void *a, const void *b) {
    const WordEntry *wa = *(const WordEntry *const *)a;
    const WordEntry *wb = *(const WordEntry *const *)b;

    return (wa->rank > wb->rank) - (wa->rank < wb->rank);
}

/*
 * A byte belongs to a word when it is alphanumeric, an underscore, or
 * >= 128 (i.e. part of a multi-byte UTF-8 sequence).
 * The value is narrowed to unsigned char first so <ctype.h> is never
 * handed a negative value.
 */
static bool is_word_char(int c) {
    unsigned char uc = (unsigned char)c;

    return uc >= 128 || uc == '_' || isalnum(uc);
}

/*
 * Count one occurrence of `word` and append it to word_sequence.
 * Returns false (after reporting on stderr) when MAX_WORDS is reached;
 * the capacity check comes first so counts and sequence never disagree.
 */
static bool record_word(const char *word) {
    if (num_words >= MAX_WORDS) {
        fprintf(stderr, "Too many words in file\n");
        return false;
    }

    WordEntry *entry = get_or_create_word(word);
    entry->count++;
    word_sequence[num_words++] = entry;
    return true;
}

/*
 * Tokenise `filename` into lower-cased words and record each of them.
 * Words longer than MAX_WORD_LEN - 1 bytes are truncated.
 * Returns false if the file cannot be opened or read, or holds more than
 * MAX_WORDS words (including when the overflowing word is the last one,
 * which used to be dropped silently).
 */
static bool process_file(const char *filename) {
    FILE *fp = fopen(filename, "r");
    if (!fp) {
        fprintf(stderr, "Cannot open file: %s\n", filename);
        return false;
    }

    char word[MAX_WORD_LEN];
    int word_len = 0;
    bool ok = true;
    int c;

    while (ok && (c = fgetc(fp)) != EOF) {
        if (is_word_char(c)) {
            if (word_len < MAX_WORD_LEN - 1) {
                word[word_len++] = (char)tolower(c);
            }
        } else if (word_len > 0) {
            word[word_len] = '\0';
            word_len = 0;
            ok = record_word(word);
        }
    }

    /* Handle last word if file doesn't end with a separator */
    if (ok && word_len > 0) {
        word[word_len] = '\0';
        ok = record_word(word);
    }

    if (ok && ferror(fp)) {
        fprintf(stderr, "Error reading file: %s\n", filename);
        ok = false;
    }

    fclose(fp);
    return ok;
}

/*
 * Assign 1-indexed frequency ranks (1 = most common) to every entry.
 * Sorting re-orders all_entries only; word_sequence holds pointers and is
 * unaffected.
 */
static void assign_ranks(void) {
    /* Seed ranks with creation order so compare_by_count can break ties
     * on first appearance. */
    for (int i = 0; i < num_unique_words; i++) {
        all_entries[i]->rank = i + 1;
    }

    qsort(all_entries, (size_t)num_unique_words, sizeof all_entries[0],
          compare_by_count);

    for (int i = 0; i < num_unique_words; i++) {
        all_entries[i]->rank = i + 1;
    }
}

/*
 * Return the highest (least frequent) rank among the `length` words that
 * start at word_sequence[start]. Knowing the top-N words with N equal to
 * that rank is enough to understand the whole excerpt.
 *
 * A maximum is unaffected by repeated words, so no "seen" bookkeeping is
 * needed; the former per-call memset over all unique words dominated the
 * run time of the sliding-window search.
 */
static int analyze_excerpt(int start, int length) {
    int max_rank = 0;

    for (int i = start; i < start + length; i++) {
        int rank = word_sequence[i]->rank;
        if (rank > max_rank) {
            max_rank = rank;
        }
    }
    return max_rank;
}

/*
 * For every length 1..min(max_length, num_words) store in
 * results[length - 1] the first window needing the smallest vocabulary.
 * `results` must have room for max_length elements.
 */
static void find_optimal_excerpts(int max_length, ExcerptResult *results) {
    for (int length = 1; length <= max_length && length <= num_words; length++) {
        int best_vocab = num_unique_words + 1; /* above any real rank */
        int best_start = 0;

        /* Slide window through text */
        for (int start = 0; start <= num_words - length; start++) {
            int vocab_needed = analyze_excerpt(start, length);

            if (vocab_needed < best_vocab) {
                best_vocab = vocab_needed;
                best_start = start;
            }
        }

        ExcerptResult *slot = &results[length - 1];
        slot->excerpt_length = length;
        slot->min_vocab_needed = best_vocab;
        slot->start_pos = best_start;
    }
}

/* Print the excerpt's words separated by single spaces. */
static void print_excerpt(int start, int length) {
    for (int i = start; i < start + length; i++) {
        if (i > start) printf(" ");
        printf("%s", word_sequence[i]->word);
    }
}

/*
 * Print the distinct words of the excerpt as "word(#rank)", ordered by
 * rank and separated by ", ".
 * Ranks are unique per entry, so after sorting by rank duplicates are
 * adjacent and can be skipped by comparing with the previous pointer.
 */
static void print_words_needed(int start, int length) {
    if (length <= 0) {
        return;
    }

    size_t n = (size_t)length;
    WordEntry **entries = malloc(n * sizeof *entries);
    if (!entries) {
        fprintf(stderr, "Memory allocation failed\n");
        exit(1);
    }

    memcpy(entries, &word_sequence[start], n * sizeof *entries);
    qsort(entries, n, sizeof *entries, compare_by_rank);

    const WordEntry *previous = NULL;
    for (size_t i = 0; i < n; i++) {
        if (entries[i] == previous) {
            continue;
        }
        if (previous != NULL) printf(", ");
        printf("%s(#%d)", entries[i]->word, entries[i]->rank);
        previous = entries[i];
    }

    free(entries);
}

/* Print a horizontal rule of RULE_WIDTH copies of `ch` and a newline. */
static void print_rule(char ch) {
    for (int i = 0; i < RULE_WIDTH; i++) {
        putchar(ch);
    }
    putchar('\n');
}

/*
 * Print the full report: header, one section per analysed length (vocab
 * needed, increase over the previous length, excerpt, words), and a
 * closing summary for the longest analysed excerpt.
 */
static void print_results(ExcerptResult *results, int max_length) {
    print_rule('=');
    printf("VOCABULARY LEARNING CURVE\n");
    print_rule('=');
    printf("\n");
    printf("For each excerpt length, the minimum number of top-frequency\n");
    printf("words you need to learn to understand 100%% of some excerpt.\n");
    printf("\n");
    printf("Total words in text: %d\n", num_words);
    printf("Unique words: %d\n", num_unique_words);
    printf("\n");
    print_rule('-');

    /* Only lengths up to num_words were filled in by find_optimal_excerpts */
    int actual_max = max_length < num_words ? max_length : num_words;
    int prev_vocab = 0;

    for (int i = 0; i < actual_max; i++) {
        const ExcerptResult *r = &results[i];

        printf("\n[Length %d] Vocab needed: %d", r->excerpt_length, r->min_vocab_needed);
        if (r->min_vocab_needed > prev_vocab) {
            printf(" (+%d)", r->min_vocab_needed - prev_vocab);
        }
        printf("\n");

        printf(" Excerpt: \"");
        print_excerpt(r->start_pos, r->excerpt_length);
        printf("\"\n");

        printf(" Words: ");
        print_words_needed(r->start_pos, r->excerpt_length);
        printf("\n");

        prev_vocab = r->min_vocab_needed;
    }

    printf("\n");
    print_rule('-');

    if (actual_max > 0) {
        const ExcerptResult *final = &results[actual_max - 1];
        printf("\nTo understand a %d-word excerpt,\n", final->excerpt_length);
        printf("you need to learn at minimum %d top words.\n", final->min_vocab_needed);
    }
}
num_unique_words; i++) { + free(all_entries[i]); + } +} + +int main(int argc, char *argv[]) { + if (argc < 2) { + fprintf(stderr, "Usage: %s [max_length]\n", argv[0]); + fprintf(stderr, " max_length: maximum excerpt length to analyze (default: 30)\n"); + return 1; + } + + const char *filename = argv[1]; + int max_length = 30; + + if (argc >= 3) { + max_length = atoi(argv[2]); + if (max_length < 1) max_length = 1; + if (max_length > 1000) max_length = 1000; + } + + /* Initialize hash table */ + memset(hash_table, 0, sizeof(hash_table)); + + /* Process file */ + if (!process_file(filename)) { + return 1; + } + + if (num_words == 0) { + fprintf(stderr, "No words found in file\n"); + return 1; + } + + /* Assign ranks by frequency */ + assign_ranks(); + + /* Find optimal excerpts */ + ExcerptResult *results = malloc(max_length * sizeof(ExcerptResult)); + if (!results) { + fprintf(stderr, "Memory allocation failed\n"); + cleanup(); + return 1; + } + + find_optimal_excerpts(max_length, results); + + /* Print results */ + print_results(results, max_length); + + /* Cleanup */ + free(results); + cleanup(); + + return 0; +} diff --git a/C/vocabulary_curve/vocabulary_curve b/C/vocabulary_curve/vocabulary_curve new file mode 100755 index 0000000000000000000000000000000000000000..ee232fae6092c1cd9dcfb98406f63612442771e2 GIT binary patch literal 16768 zcmeHOdw5humah&8L`>4Cpp1&%)y_;%(x!Pyz({xKz-{b6gupn0UYgDgX-T?6Ka@BQ z;3V<$+P1RmV{rB%?l%u-_M3HdU6(;ugMkp8-P!mUH!D6EchuVxP;^}upS{0Rx2n@= z)3Y<*`0Zb*{`#IezdGmCsdMXfUppK8%ZqF_CetK#B_nRrxfbG<1&d{dF*jSpDsa4j z&0>=Q7vt|1#rbsFB&$HuQXxk$={+vW^Xd6QPEzS1QPL}vPE55*BxTbqp7cs6EBJi+ zn=(sIQdyp@PqE5b9_f?BV+k`yME9v>@~MI{N(w)Yp0l-Y$6H7* z-#^K_g}kKF&nBVgxY!cNr}KrLq*Mo`tD#5n{6mxM9&1E>h0b4%IDe9g)4^j>gUYC{ ztyr~=742eu(aK&G* z{$>2K_R!{q^Dk>}scsL26Wgn|FIrf=aK1AZan5Dr*4OZ-`gpsoin+Ih*M69kLkirN zQzf7?zYMd>3gDXy;53}Zvr`Ix6qMzsDFDWkf3^VrqXIa^Zah1;6u^I00H?5xXXnWR z__qt-jsoqys(}240{GqWHhdI1VUld)1pUVqca)`$6K>j 
zpcA=Cj|yAC?E$Sd6b`h9Zq`|AusssfS%=;c)8nl5rf4Xxle@u=PPQ!;iN+b#K_hJ; zTjBTBFVp5a=Z)qTWwZ0Mxp|}6xy}Vl^EEeWa7*73ipBM4bK|o1NLX(UY;M=#;FgX^ zShPU1M8_3GT#N9VgujjFiZE{wV*PBGeMa*|mVY2v_O4ld_RDj3Sa?Of724Cml8a`2-$_}MwQ?6+i_=F4nyTREDiXl{_H z)NSPmmoX+}nwyS{5iZvv;%B<)XyJT*wpgaPtqjT2U#2Rzl_N~Xh>#jL9W7b8mJq+l zO-BnCTOrQ7-JEdSr(0QmRC92#1>$&94vr3zP1oe$6~}qFe2OC7DM=1R+b-kxh;lpn&V}<-EHLvKSzR~yWJK;_)IH1ah!<2LA z2>h4*f=|@$W2I`cC^MvXA1v+XQkezM1I|?b3y$^+-N+H&MrG-v8KlZ* ziKitoJ;eF_#1|3Y$N9&IrzJf7bI$*icv=$EdpZBN#M5${-p%>%6HiNHdKc&af_R$7 z)192ZgLqmB)0;T|UE*m8OgC}Pc?WK^Tau?N)E*PXH1?|L2~P?Oh}dY--jV0Wg--urKe(*Bt@%w(>U zYO3zBN3LP!>!?yqE!Y76RpU+535R{gL7Y{4&kDrMb=;q-4sEluR{MoLFvFPNc-Pzq zC8NLlR3^UByaF2j;HSo`YLeBTEZ$z4ILjFBGpBTas@d;7P&#PunTlGwKfiM4q3)4g zw|=@4Xa1&g^S`NHXxN{azE2Gvg4N8S+nbA<%#G4^;s?FQ?Y({|Q%lP*UDH#$mK~F7C292jAd{mQg zF!Qg6J`N8eY~9b69sJL27xmlFkm-j$ex?2+7XWq@W}b6!UAJ zar82bDYM+lUjPZyhWs-$5Y2!8luI-bI%4H}kazW&v$8V(06e+i(J+tx=NPXj&2a1& zH&M5!D}sGsq;FZh)v|OmS-p=WOJdOW)DC#OLVw#~ zXQ<5KtsU{$SHDg@%@=&#pK`oGy>6!07-=-#_9f>#P`#fU%=G&(E1M&zq!!ZOc?*Ue zjmELs;T=mcCT0)A>Rqs^+J@BB`LFm>a|Y}evS$<;D&761<{I)Y89N1opW>__lzV~r zYey1C0KEf9P0ebA0aLZwF>FcQ3@%wR|3*JeGsgR=CYu_3(To#{mZ_7xj+ts|e!uHT zc24=@`fssA4K{gq-GZyLM8$x227~>V&{(7R6>6}r_M^nB-JfCx{5FE;PnFNT1Si87 z_NLZmenInq-}n{US^LW!>%7L$>_OEw*qA#1fcgiHDmA(Ol$u=RP2S-3)<(SFnW7py zPN~K;wY#s>Upt(5Q_UPuy9bazkoZeqav|>zgII2o>%FG-aV7)T;)@N;9F-5uX3Up5 z{|!X*5eT^Y`1x7_86O4<#eFx2NwbpB+Sd{<@t))BK3rx#b_!=>$t^U_c4BhyC#ip8 zfGvXJ8#oo_qYyy;T;3aesf~_|y7fqz-!_t*XZ{*x$yhOpFSdH$L9%ANZ~93oSdxfi z^6cAj5l-C9viI&spw(LsK8$d0 zxX!cDbDigUkJeAqR`2mdnd^A^AJHDmUhue@y2a5n;GNUN%nF>cjN7dm&%%da-(Y8T z7d|wF+3%hUdnwiCGnx-!O)|>&qbR)%Hq_*fY5UPw^fmKpxa2bi_L(puMm3*wF|@t* zRVUImk{?sugV=Utt7{#t?hZ(zI-`6IFsv|Jh#1ok@G5a`6vXW%tgo)V*>9M4{7)v6 zzSNq>h5(wi03h%j7cjSiOO{l|#tsLalxp}QSRVQY)-SpiecD>4>ORMUWA;OWyPy6m zE_Sw>tXm36jN03&mbFw%vV0CHUzwwP4;9{q!jS_gE~nyL+XmU*f?`@#A2|uJ@59%> z$MR|33NC#LUCI10a!?-nI+W?AhIH&)D7%iBRb>4vHB}OcqH83t7*;d==2_6GomB^9 
z=iBLJnE35IpCc>6&hW_rR8lT|#QX2iUt|GP#FpI}dWBYccf?+F!bk`-~T^SBgK>VZ>}RNH|1!WZ%BFmT*q87R#hwfeo(C;xWU{WEk~ zV%|EnyY6{NqGfN#%hAdmz%w)U0WyPdcA(^50JiJ;NB!l4AZNlOd+99{F5zEtvYI?6 z>2_aQ?}jYP?)OyV^R=|`wm*GN54J$ZsMgeK+rARagE{t$0Zc##-YK!2unpOtzB<$M zV&WL~6#F+!IrL+{{gJnP2adTsp_iAHPhzu&P=FoGPkwj}Tb-QqZe#E9IQA_gBW0*p zO>R)t-XrmEsK$G=Gkp1@Os2OlVbV@tHHLFWwG|(KaDk`O&pfw&@!)MXwsXTXO>ofs z6|_{NpH4;Re!7FktsOQe1E-DAz_O*xC@F;^PT-#rIlRmbF-+21)vJv2-(6{(e_|v_ zad?Uvnr=j-&@&g(=}FbaFw*za-8kMs&E2s242>c4MyfYiZnwmOB$n;RW+*GR8aaq( zuW8V>e()1%A_5Z;7(W8^{Gd4!Q91(QE+r8TZA<9NO_69zY$|KiJ0j68g+E;h#6yv= zLJw*5mZ>btmIcD$NL-<(GfFGK#Y!bKvWBsK%H)UWe< z)_hCxH+a^p@~v8-EL*o`eFH4T1MNZyX5#wxII2L^>vCMQkf|H|dU#8`4aL_f>m$Ly zW+klaEqV(TGo|X%%9hzoQ5v=f^=K!oT*NA4;J-D7mw+0gJ zsDEC~Y}ly#N)ltsBT+>U1l#heo}#25^04;oz-~ChO-29oLgwq3f6Wb+haBv|#g+DIA%Esp7zZ>@5 zwB}VzTV649rTz4qN_VlP=PtQqp5r2*{JGsm(1%ekE_``XaqPtZ2fzZPTVFBl?xOmN znRiX9uTWCO^%YfjmUt>^Zkz0_SkyhmTj4J1o?5ZUTT$bwsDcC}p;2E^y5e+3ZFvg+ zw&R)1i`YdLw&fvPD=LoKCheUJYp|bZtpQgeJQd?ejU?CTod!yT(SIUML|`HU6A_q* zz(fQlA}|qwi3t2@5s>d&$@i;h&qZm_BCJeM`2v{SL%vL#QY87#lYEcsJPXdgdvvbg z<@;PhEnAKUfe{iJ!aW*Fv}dNopXh>4<{V?EM7h|Gvqs^l+%w8|c5stuOcH9$6z^}* zgJw#DLSOE?ao3b5`A$$3ZoN~IcoXeykpyAG7R!WK_MIU-#Nq9cqPTa<`4hqr?q6|U z#`%QcpAhA^^UTYo-=doTcOu^(&#!nPzxu#+q@e2sZ58xJL2ncEenEdE=o5kt2>Obk z?+AKQ(8=QEud@aHx}XaMT`uT)L8YI2-BxbJvSo{vDqKCxR$L35bDb__Zq3{UE|+Vr zlpLSZc`0TeeCatno%bSE&Nhh-F2@1&f5JNj&abC{&B5P>(Xs2mHOS$eGCX#tbUFTn zZ@QDNoJOx{RDtgjI9p(iyk>!K>b8?OW2e<|vKdFR;}y8PHfk66-p?!#AHv;OO4|i~ zG-275c#p*0$WRjXQg*yJ+y~soX0ZJJ@i(M^NkYD7&siKE#ovZE?d15A_+ez|rHA}P z<7zzoSONSrG;cilS-_QiN7WjB}Ru;fF0-pvu59iql7LdP@@S+MP z=S^ArEd}IXFMxlq0RGbg_*1~i&;0g2S3v%?0{98wigY@MW`LhjG?U43CyUB3(VoH3 zXN8jNpXUJ|FU|`K;6C8f(9ic=x=otq^jXww`C|M=s``ly1*|pq$Iol=X z3x2NxA{xiX$`ZG&0RG(q_hiF&9f(gUo;N>3h*Q@pxMXtZ zCk5~`v4D*GUOm0(?hLYEq@yzs#Vhh%S`ar`;|#CcYmulHjOu~7uF)+p))t7hX>tCl zIqnE**4rrsmf(wl5L!3( z`-QB?eDMWA79eN%2M}WrO(Xl(`~Ko9gE8ge!-X+e{x%Pcx$NF6PWan-d{?WEE;*PJBRurw5D+(lBDqmaZ_2S~MQDN^Zpc 
zay%y)!)Pdr>g@qi5V_9wI9Wsso%CRXIeBk#MkBntIrTQJH5%y9wYC=MN=~@tsrOaK z16wS8Ef9?c;4taSJgNzFgo5xP0?Rl!d3Ba)oL%O`RMCO4nltJA30c}E(N-7J7Ekg! zK1tmImqhNjCGSKgSAVzIC`;;)f`ZC*N5URtuykbWFZ3O$lqcETi$ zS--rF$@hOP^y!*K>dWgINjpUWUH4GkQs3G=;&mEeRHf9H*GZDf>mrEol<$8KC|!d{ zeRDLqr5B>k6il6Inuu4yDMuWKaTM9N6{{#z(xJCUJtBK75Um87))qa^FkZ@+;u z(yu2WBzc`Csl2WueY%#**S`lC#YA3LX4hG<-imEJB&?L*f9?XDtFH(>N!@t?m&>@M z59H~~{fMMj%L`gjzgtjA9z-U$ez~8NR9=@z`F#I>DD-9gnIa?wb%_2f&}{r9|1)H$ zeyK0_`;yB3Bjxk!e=<-1nu{$-Nz3x$C(oa>cL@n7gWVkw_r|4S&Km$hXx<$VkJ z-EMw;l9%*N=+c##)R*@)hN}T0NjZxfZbV)?& z%kSR$E(3^EWs&Sr>Pz}DN^U;phzmc-e43VAI1Uyvt56#xJL literal 0 HcmV?d00001 diff --git a/python_pkg/word_frequency/tests/test_vocabulary_curve.py b/python_pkg/word_frequency/tests/test_vocabulary_curve.py new file mode 100644 index 0000000..74d7877 --- /dev/null +++ b/python_pkg/word_frequency/tests/test_vocabulary_curve.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 +"""Tests for vocabulary_curve C implementation.""" + +from __future__ import annotations + +import subprocess +import tempfile +from pathlib import Path + +import pytest + +# Path to the C executable +C_EXECUTABLE = Path(__file__).parent.parent.parent.parent / "C" / "vocabulary_curve" / "vocabulary_curve" + + +@pytest.fixture +def sample_text_file(tmp_path: Path) -> Path: + """Create a sample text file for testing.""" + text = """The quick brown fox jumps over the lazy dog. +The fox was very quick and the dog was very lazy. +Quick foxes and lazy dogs are common in stories.""" + filepath = tmp_path / "sample.txt" + filepath.write_text(text, encoding="utf-8") + return filepath + + +@pytest.fixture +def polish_text_file(tmp_path: Path) -> Path: + """Create a Polish sample text file.""" + text = """Litwo! Ojczyzno moja! Ty jesteś jak zdrowie. +Ile cię trzeba cenić, ten tylko się dowie, +Kto cię stracił. 
Dziś piękność twą w całej ozdobie +Widzę i opisuję, bo tęsknię po tobie.""" + filepath = tmp_path / "polish.txt" + filepath.write_text(text, encoding="utf-8") + return filepath + + +def run_vocabulary_curve(filepath: Path, max_length: int = 10) -> str: + """Run the vocabulary_curve executable and return output.""" + if not C_EXECUTABLE.exists(): + pytest.skip(f"C executable not found at {C_EXECUTABLE}") + + result = subprocess.run( + [str(C_EXECUTABLE), str(filepath), str(max_length)], + capture_output=True, + text=True, + timeout=30, + ) + return result.stdout + + +def extract_excerpts_from_output(output: str) -> list[tuple[int, str]]: + """Extract (length, excerpt) pairs from output.""" + excerpts = [] + lines = output.split("\n") + + i = 0 + while i < len(lines): + line = lines[i] + if line.strip().startswith("[Length "): + # Parse length + length = int(line.split("]")[0].split()[-1]) + + # Find excerpt line + i += 1 + while i < len(lines) and not lines[i].strip().startswith("Excerpt:"): + i += 1 + + if i < len(lines): + excerpt_line = lines[i].strip() + # Extract text between quotes + if '"' in excerpt_line: + start = excerpt_line.index('"') + 1 + end = excerpt_line.rindex('"') + excerpt = excerpt_line[start:end] + excerpts.append((length, excerpt)) + i += 1 + + return excerpts + + +class TestExcerptValidity: + """Tests that verify excerpts are actually found in the source text.""" + + def test_excerpt_exists_in_source_text(self, sample_text_file: Path) -> None: + """Test that each excerpt can be found in the source text as contiguous words.""" + import re + source_text = sample_text_file.read_text(encoding="utf-8").lower() + source_words = re.findall(r'\b[\w]+\b', source_text) + output = run_vocabulary_curve(sample_text_file, max_length=10) + excerpts = extract_excerpts_from_output(output) + + assert len(excerpts) > 0, "No excerpts found in output" + + for length, excerpt in excerpts: + excerpt_words = excerpt.lower().split() + # Find this sequence in 
source_words + found = False + for i in range(len(source_words) - len(excerpt_words) + 1): + if source_words[i:i+len(excerpt_words)] == excerpt_words: + found = True + break + assert found, ( + f"Excerpt of length {length} not found in source text:\n" + f" Excerpt words: {excerpt_words}\n" + f" First 30 source words: {source_words[:30]}" + ) + + def test_excerpt_word_count_matches_length(self, sample_text_file: Path) -> None: + """Test that excerpt has the expected number of words.""" + output = run_vocabulary_curve(sample_text_file, max_length=10) + excerpts = extract_excerpts_from_output(output) + + for length, excerpt in excerpts: + word_count = len(excerpt.split()) + assert word_count == length, ( + f"Expected {length} words, got {word_count}: '{excerpt}'" + ) + + def test_polish_excerpt_exists_in_source(self, polish_text_file: Path) -> None: + """Test Polish text excerpts are found in source as contiguous words.""" + import re + source_text = polish_text_file.read_text(encoding="utf-8").lower() + source_words = re.findall(r'\b[\w]+\b', source_text) + output = run_vocabulary_curve(polish_text_file, max_length=8) + excerpts = extract_excerpts_from_output(output) + + assert len(excerpts) > 0, "No excerpts found in output" + + for length, excerpt in excerpts: + excerpt_words = excerpt.lower().split() + # Find this sequence in source_words + found = False + for i in range(len(source_words) - len(excerpt_words) + 1): + if source_words[i:i+len(excerpt_words)] == excerpt_words: + found = True + break + assert found, ( + f"Polish excerpt of length {length} not found:\n" + f" Excerpt words: {excerpt_words}\n" + f" Source words: {source_words}" + ) + + def test_excerpt_is_contiguous(self, sample_text_file: Path) -> None: + """Test that excerpt words appear contiguously in source.""" + import re + + source_text = sample_text_file.read_text(encoding="utf-8").lower() + # Extract words from source + source_words = re.findall(r'\b[\w]+\b', source_text) + + output = 
run_vocabulary_curve(sample_text_file, max_length=5) + excerpts = extract_excerpts_from_output(output) + + for length, excerpt in excerpts: + excerpt_words = excerpt.lower().split() + + # Find this sequence in source_words + found = False + for i in range(len(source_words) - length + 1): + if source_words[i:i+length] == excerpt_words: + found = True + break + + assert found, ( + f"Excerpt words not found as contiguous sequence:\n" + f" Excerpt: {excerpt_words}\n" + f" First 20 source words: {source_words[:20]}" + ) + + +class TestVocabNeeded: + """Tests for vocabulary count calculations.""" + + def test_length_1_needs_vocab_1(self, sample_text_file: Path) -> None: + """Test that a 1-word excerpt needs exactly 1 vocabulary word.""" + output = run_vocabulary_curve(sample_text_file, max_length=1) + + assert "[Length 1] Vocab needed: 1" in output + + def test_vocab_needed_increases_monotonically(self, sample_text_file: Path) -> None: + """Test that vocab needed never decreases as length increases.""" + output = run_vocabulary_curve(sample_text_file, max_length=10) + excerpts = extract_excerpts_from_output(output) + + # Extract vocab needed from output + prev_vocab = 0 + for line in output.split("\n"): + if "Vocab needed:" in line: + # Parse "Vocab needed: X" + parts = line.split("Vocab needed:") + if len(parts) > 1: + vocab = int(parts[1].split()[0]) + assert vocab >= prev_vocab, ( + f"Vocab decreased from {prev_vocab} to {vocab}" + ) + prev_vocab = vocab + + +class TestEdgeCases: + """Edge case tests.""" + + def test_empty_file(self, tmp_path: Path) -> None: + """Test handling of empty file.""" + filepath = tmp_path / "empty.txt" + filepath.write_text("", encoding="utf-8") + + if not C_EXECUTABLE.exists(): + pytest.skip("C executable not found") + + result = subprocess.run( + [str(C_EXECUTABLE), str(filepath), "5"], + capture_output=True, + text=True, + ) + + assert result.returncode != 0 or "No words" in result.stderr + + def test_single_word_file(self, tmp_path: 
Path) -> None: + """Test file with single word.""" + filepath = tmp_path / "single.txt" + filepath.write_text("hello", encoding="utf-8") + + output = run_vocabulary_curve(filepath, max_length=5) + + assert "[Length 1] Vocab needed: 1" in output + # Should only have 1 length since there's only 1 word + assert "[Length 2]" not in output + + def test_repeated_word_file(self, tmp_path: Path) -> None: + """Test file with same word repeated.""" + filepath = tmp_path / "repeated.txt" + filepath.write_text("hello hello hello hello hello", encoding="utf-8") + + output = run_vocabulary_curve(filepath, max_length=5) + + # All excerpts should need only 1 vocabulary word + for i in range(1, 6): + assert f"[Length {i}] Vocab needed: 1" in output + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/python_pkg/word_frequency/vocabulary_curve.py b/python_pkg/word_frequency/vocabulary_curve.py new file mode 100644 index 0000000..5163c0e --- /dev/null +++ b/python_pkg/word_frequency/vocabulary_curve.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +"""Vocabulary learning curve analyzer. + +Finds the minimum vocabulary needed to understand excerpts of increasing length. +For each excerpt length (1, 2, 3, ... N words), finds the excerpt that requires +the fewest top-frequency words to understand 100%. 
class ExcerptAnalysis(NamedTuple):
    """Analysis result for an excerpt length."""

    # Number of words in the excerpt.
    excerpt_length: int
    # Highest frequency rank among the excerpt's words, i.e. how many top
    # words must be known to understand the whole excerpt.
    min_vocab_needed: int
    # The excerpt's words joined with single spaces.
    best_excerpt: str
    # Distinct words of the excerpt, most frequent first.
    words_needed: list[str]


def get_word_rank(word: str, ranked_words: list[str]) -> int | None:
    """Get the rank (1-indexed) of a word in the frequency list.

    Args:
        word: The word to look up.
        ranked_words: List of words sorted by frequency (most common first).

    Returns:
        1-indexed rank, or None if word not in list.
    """
    try:
        return ranked_words.index(word) + 1
    except ValueError:
        return None


def _build_rank_table(ranked_words: list[str]) -> dict[str, int]:
    """Map each word to its 1-indexed position in ``ranked_words``.

    The first occurrence wins, matching ``list.index`` as used by
    ``get_word_rank``.
    """
    table: dict[str, int] = {}
    for position, word in enumerate(ranked_words, start=1):
        table.setdefault(word, position)
    return table


def _rank_excerpt(
    excerpt_words: list[str],
    rank_of: dict[str, int],
) -> tuple[int, list[str]]:
    """Return (max rank, distinct words sorted by rank) using a rank table."""
    ranks: list[tuple[int, str]] = []

    for word in set(excerpt_words):
        rank = rank_of.get(word)
        if rank is None:
            # Word not in vocabulary - would need infinite learning
            return float("inf"), []  # type: ignore[return-value]
        ranks.append((rank, word))

    if not ranks:
        return 0, []

    ranks.sort()
    return ranks[-1][0], [word for _, word in ranks]


def analyze_excerpt(
    excerpt_words: list[str],
    ranked_words: list[str],
) -> tuple[int, list[str]]:
    """Analyze how many top words are needed to understand an excerpt 100%.

    Args:
        excerpt_words: List of words in the excerpt.
        ranked_words: List of all words sorted by frequency (most common first).

    Returns:
        Tuple of (max_rank_needed, list_of_words_needed_sorted_by_rank).
        ``(0, [])`` for an empty excerpt; ``(float("inf"), [])`` when any
        word of the excerpt is missing from ``ranked_words``.
    """
    # One pass over ranked_words instead of one list.index() scan per word.
    return _rank_excerpt(excerpt_words, _build_rank_table(ranked_words))


def find_optimal_excerpts(
    text: str,
    *,
    max_length: int = 30,
    case_sensitive: bool = False,
) -> list[ExcerptAnalysis]:
    """Find optimal excerpts for each length.

    For each excerpt length from 1 to max_length, finds the first excerpt
    that requires the minimum number of top-frequency words to understand.

    Args:
        text: The source text to analyze.
        max_length: Maximum excerpt length to analyze.
        case_sensitive: Whether to treat words case-sensitively.

    Returns:
        List of ExcerptAnalysis for each length from 1 to max_length (capped
        at the number of words in the text). A length is omitted when every
        excerpt of that length contains a word absent from the frequency
        ranking.
    """
    import re

    # Get word frequencies and create the word -> rank table once; looking
    # ranks up with list.index() inside the window loop made the search
    # scale with vocabulary size for every word of every window.
    word_counts = analyze_text(text, case_sensitive=case_sensitive)
    rank_of = _build_rank_table([word for word, _ in word_counts.most_common()])

    # Extract all words from text (preserving order)
    all_words = re.findall(r"\b[\w]+\b", text, re.UNICODE)
    if not case_sensitive:
        all_words = [w.lower() for w in all_words]

    if not all_words:
        return []

    unknown = float("inf")
    # Rank of the word at each position; `unknown` marks words that the
    # ranking does not contain.
    position_ranks = [rank_of.get(word, unknown) for word in all_words]

    results: list[ExcerptAnalysis] = []

    for length in range(1, min(max_length, len(all_words)) + 1):
        best_vocab_needed = unknown
        best_start: int | None = None

        # Slide window through text; strict "<" keeps the earliest window.
        for start in range(len(all_words) - length + 1):
            vocab_needed = max(position_ranks[start : start + length])
            if vocab_needed < best_vocab_needed:
                best_vocab_needed = vocab_needed
                best_start = start

        if best_start is None:
            continue

        best_excerpt_words = all_words[best_start : best_start + length]
        _, best_words_needed = _rank_excerpt(best_excerpt_words, rank_of)
        results.append(
            ExcerptAnalysis(
                excerpt_length=length,
                min_vocab_needed=int(best_vocab_needed),
                best_excerpt=" ".join(best_excerpt_words),
                words_needed=best_words_needed,
            )
        )

    return results
def format_results(
    results: list[ExcerptAnalysis],
    *,
    show_excerpts: bool = False,
    show_words: bool = False,
) -> str:
    """Format analysis results as a table.

    Args:
        results: List of ExcerptAnalysis results.
        show_excerpts: If True, show the actual excerpt text (truncated to
            50 characters) instead of the "(+n)" increase marker.
        show_words: If True, show which words are needed.

    Returns:
        Formatted string with results, or "No excerpts found." when
        ``results`` is empty.
    """
    if not results:
        return "No excerpts found."

    heavy_rule = "=" * 70
    light_rule = "-" * 70

    lines: list[str] = [
        heavy_rule,
        "VOCABULARY LEARNING CURVE",
        heavy_rule,
        "",
        "For each excerpt length, the minimum number of top-frequency",
        "words you need to learn to understand 100% of some excerpt.",
        "",
        light_rule,
    ]

    # Header
    if show_excerpts:
        lines.append(f"{'Length':>6} {'Vocab':>5} Excerpt")
        lines.append(f"{'------':>6} {'-----':>5} {'-------'}")
    else:
        lines.append(f"{'Length':>6} {'Vocab Needed':>12}")
        lines.append(f"{'------':>6} {'------------':>12}")

    prev_vocab = 0
    for r in results:
        # Mark increases relative to the previous (shorter) excerpt length
        increase = r.min_vocab_needed - prev_vocab
        marker = f" (+{increase})" if increase > 0 else ""
        prev_vocab = r.min_vocab_needed

        if show_excerpts:
            # Truncate long excerpts
            excerpt = r.best_excerpt
            if len(excerpt) > 50:
                excerpt = excerpt[:47] + "..."
            lines.append(f"{r.excerpt_length:>6} {r.min_vocab_needed:>5} {excerpt}")
        else:
            lines.append(f"{r.excerpt_length:>6} {r.min_vocab_needed:>12}{marker}")

        if show_words and r.words_needed:
            lines.append(f" Words: {', '.join(r.words_needed)}")

    lines.append(light_rule)
    lines.append("")

    # Summary statistics for the longest analysed excerpt
    final = results[-1]
    lines.append(f"To understand a {final.excerpt_length}-word excerpt,")
    lines.append(f"you need to learn at minimum {final.min_vocab_needed} top words.")

    return "\n".join(lines)


def main(argv: Sequence[str] | None = None) -> int:
    """Main entry point.

    Args:
        argv: Command line arguments (defaults to ``sys.argv[1:]`` when None).

    Returns:
        Exit code: 0 on success, 1 when the input file is missing or cannot
        be decoded.
    """
    parser = argparse.ArgumentParser(
        description="Analyze minimum vocabulary needed for excerpt lengths.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--text",
        "-t",
        type=str,
        help="Raw text to analyze",
    )
    input_group.add_argument(
        "--file",
        "-f",
        type=str,
        help="Path to a file to analyze",
    )

    parser.add_argument(
        "--max-length",
        "-m",
        type=int,
        default=30,
        help="Maximum excerpt length to analyze (default: 30)",
    )
    parser.add_argument(
        "--show-excerpts",
        "-e",
        action="store_true",
        help="Show the actual excerpt text for each length",
    )
    parser.add_argument(
        "--show-words",
        "-w",
        action="store_true",
        help="Show which words are needed for each excerpt",
    )
    parser.add_argument(
        "--case-sensitive",
        "-c",
        action="store_true",
        help="Treat words case-sensitively",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        help="Output file path (default: print to stdout)",
    )

    args = parser.parse_args(argv)

    try:
        # Compare against None: `--text ""` is a valid (empty) input and
        # must not fall through to read_file(None).
        if args.text is not None:
            text = args.text
        else:
            text = read_file(args.file)

        results = find_optimal_excerpts(
            text,
            max_length=args.max_length,
            case_sensitive=args.case_sensitive,
        )

        output = format_results(
            results,
            show_excerpts=args.show_excerpts,
            show_words=args.show_words,
        )

        if args.output:
            Path(args.output).write_text(output, encoding="utf-8")
            print(f"Output written to {args.output}")  # noqa: T201
        else:
            print(output)  # noqa: T201

    except FileNotFoundError as e:
        print(f"Error: File not found - {e}", file=sys.stderr)  # noqa: T201
        return 1
    except UnicodeDecodeError as e:
        print(f"Error: Could not decode file - {e}", file=sys.stderr)  # noqa: T201
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())