From aacb4b03efa2f578ed4c3f3d915b70a252fbba5f Mon Sep 17 00:00:00 2001 From: Krzysztof Rudnicki Date: Mon, 29 Dec 2025 16:10:26 +0100 Subject: [PATCH] feat: added inverse mode for anki --- python_pkg/word_frequency/anki_generator.py | 259 ++++++++- .../polish_pan_tadeusz_anki_top500.txt | 536 ++++++++++++++++++ 2 files changed, 793 insertions(+), 2 deletions(-) create mode 100644 python_pkg/word_frequency/test_texts/polish_pan_tadeusz_anki_top500.txt diff --git a/python_pkg/word_frequency/anki_generator.py b/python_pkg/word_frequency/anki_generator.py index 5c0bcf4..48a97a7 100644 --- a/python_pkg/word_frequency/anki_generator.py +++ b/python_pkg/word_frequency/anki_generator.py @@ -94,6 +94,106 @@ def run_vocabulary_curve(filepath: Path, max_length: int, *, dump_vocab: bool = return result.stdout +def run_vocabulary_curve_inverse(filepath: Path, max_vocab: int, *, dump_vocab: bool = False) -> str: + """Run the C vocabulary_curve executable in inverse mode. + + Args: + filepath: Path to the text file. + max_vocab: Maximum vocabulary size (top N words). + dump_vocab: If True, also dump all vocabulary up to max_vocab. + + Returns: + Output from the executable. + + Raises: + FileNotFoundError: If executable not found. + subprocess.CalledProcessError: If execution fails. + """ + if not C_EXECUTABLE.exists(): + raise FileNotFoundError( + f"C executable not found at {C_EXECUTABLE}. " + "Please compile it first: cd C/vocabulary_curve && make" + ) + + cmd = [str(C_EXECUTABLE), str(filepath), "--max-vocab", str(max_vocab)] + if dump_vocab: + cmd.append("--dump-vocab") + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=120, + check=True, + ) + return result.stdout + + +def parse_inverse_mode_output(output: str) -> tuple[str, int, int, list[tuple[str, int]]]: + """Parse output from vocabulary_curve inverse mode. + + Args: + output: Raw output from vocabulary_curve --max-vocab. + + Returns: + Tuple of (excerpt_text, excerpt_length, max_rank_used, all_vocab_words). + """ + lines = output.split("\n") + excerpt = "" + excerpt_length = 0 + max_rank_used = 0 + all_vocab: list[tuple[str, int]] = [] + + for i, line in enumerate(lines): + line = line.strip() + + if line.startswith("LONGEST EXCERPT:"): + parts = line.split() + if len(parts) >= 3: + excerpt_length = int(parts[2]) + + elif line.startswith("Excerpt:"): + # Next line(s) contain the excerpt + i += 1 + excerpt_parts = [] + while i < len(lines): + next_line = lines[i].strip() + if next_line.startswith('"'): + next_line = next_line[1:] + if next_line.endswith('"'): + next_line = next_line[:-1] + excerpt_parts.append(next_line) + break + excerpt_parts.append(next_line) + i += 1 + excerpt = " ".join(excerpt_parts) + + elif line.startswith("Rarest word used:"): + # Parse "word (#rank)" + match = re.search(r"\(#(\d+)\)", line) + if match: + max_rank_used = int(match.group(1)) + + # Parse VOCAB_DUMP section if present + in_vocab_dump = False + for line in lines: + if line.strip() == "VOCAB_DUMP_START": + in_vocab_dump = True + continue + if line.strip() == "VOCAB_DUMP_END": + break + if in_vocab_dump and ";" in line: + parts = line.strip().split(";") + if len(parts) == 2: + word, rank_str = parts + try: + all_vocab.append((word, int(rank_str))) + except ValueError: + pass + + return excerpt, excerpt_length, max_rank_used, all_vocab + + def parse_vocabulary_curve_output(output: str, target_length: int) -> tuple[str, list[tuple[str, int]], list[tuple[str, int]]]: """Parse output from vocabulary_curve to get words needed. @@ -524,6 +624,101 @@ def generate_flashcards( return anki_content, excerpt, len(words_with_ranks), max_rank +def generate_flashcards_inverse( + filepath: str | Path, + max_vocab: int, + source_lang: str | None = None, + target_lang: str = "en", + include_context: bool = False, + deck_name: str | None = None, + no_translate: bool = False, + *, + force: bool = False, +) -> tuple[str, str, int, int, int]: + """Generate Anki flashcards for the longest excerpt using top N words. + + This is the inverse mode: given a vocabulary size, find the longest + excerpt that can be understood with only those words. + + Args: + filepath: Path to the source text file. + max_vocab: Maximum vocabulary size (top N words to learn). + source_lang: Source language (auto-detected if None). + target_lang: Target language for translations. + include_context: Whether to include example contexts. + deck_name: Optional deck name. + no_translate: If True, skip translation. + force: If True, ignore all caches and regenerate. + + Returns: + Tuple of (anki_content, excerpt, excerpt_length, num_words, max_rank_used). + """ + filepath = Path(filepath) + + # Read the text (only needed for context finding) + text = read_file(filepath) if include_context else "" + + # Auto-detect language if not provided + if source_lang is None: + sample_text = read_file(filepath)[:1000] if not text else text[:1000] + source_lang = detect_language(sample_text) + if source_lang is None: + raise ValueError( + "Could not auto-detect source language. " + "Please specify with --from (e.g., --from pl for Polish). " + "Install langdetect for auto-detection: pip install langdetect" + ) + + # Run vocabulary curve in inverse mode + output = run_vocabulary_curve_inverse(filepath, max_vocab, dump_vocab=True) + + # Parse the output + excerpt, excerpt_length, max_rank_used, all_vocab_words = parse_inverse_mode_output(output) + + if excerpt_length == 0: + raise ValueError( + f"No valid excerpt found using only top {max_vocab} words. " + "Try increasing the vocabulary limit." + ) + + if not all_vocab_words: + raise ValueError(f"No vocabulary returned for max_vocab={max_vocab}") + + # Use all vocabulary up to max_vocab + words_with_ranks = all_vocab_words + + # Find words that appear in the excerpt (for highlighting) + excerpt_word_set = set(excerpt.lower().split()) + excerpt_words = [(w, r) for w, r in all_vocab_words if w.lower() in excerpt_word_set] + + # Get contexts if requested + contexts = None + if include_context: + if not text: + text = read_file(filepath) + words = [w for w, _ in words_with_ranks] + contexts = find_word_contexts(text, words) + + # Generate deck name + if deck_name is None: + deck_name = f"{filepath.stem}_top{max_vocab}" + + # Generate Anki content + anki_content = generate_anki_deck( + words_with_ranks, + source_lang, + target_lang, + contexts, + deck_name, + include_context, + no_translate, + excerpt, + excerpt_words if excerpt_words else None, + ) + + return anki_content, excerpt, excerpt_length, len(words_with_ranks), max_rank_used + + def main(argv: Sequence[str] | None = None) -> int: """Main entry point. @@ -553,6 +748,13 @@ def main(argv: Sequence[str] | None = None) -> int: default=None, help="Target excerpt length (how many words you want to understand)", ) + parser.add_argument( + "--max-vocab", + "-v", + type=int, + default=None, + help="INVERSE MODE: Learn top N words, find longest excerpt you can understand", + ) parser.add_argument( "--from", dest="source_lang", @@ -669,8 +871,10 @@ def main(argv: Sequence[str] | None = None) -> int: # Validate required arguments for main functionality if args.file is None: parser.error("--file/-f is required") - if args.length is None: - parser.error("--length/-l is required") + if args.length is None and args.max_vocab is None: + parser.error("Either --length/-l or --max-vocab/-v is required") + if args.length is not None and args.max_vocab is not None: + parser.error("Cannot use both --length and --max-vocab. Choose one mode.") try: filepath = Path(args.file) @@ -678,6 +882,57 @@ def main(argv: Sequence[str] | None = None) -> int: print(f"Error: File not found: {args.file}", file=sys.stderr) # noqa: T201 return 1 + # INVERSE MODE: --max-vocab + if args.max_vocab is not None: + if not args.quiet: + print(f"Analyzing {filepath.name}...") # noqa: T201 + print(f"Finding longest excerpt using top {args.max_vocab} words...") # noqa: T201 + + # Generate flashcards in inverse mode + anki_content, excerpt, excerpt_length, num_words, max_rank_used = generate_flashcards_inverse( + filepath, + args.max_vocab, + source_lang=args.source_lang, + target_lang=args.target_lang, + include_context=args.include_context, + deck_name=args.deck_name, + no_translate=args.no_translate, + force=args.force, + ) + + # Determine output path + if args.output: + output_path = Path(args.output) + else: + output_path = filepath.parent / f"{filepath.stem}_anki_top{args.max_vocab}.txt" + + # Write output + output_path.write_text(anki_content, encoding="utf-8") + + if not args.quiet: + print("") # noqa: T201 + print("=" * 60) # noqa: T201 + print("FLASHCARD GENERATION COMPLETE (INVERSE MODE)") # noqa: T201 + print("=" * 60) # noqa: T201 + print(f"Learning: top {args.max_vocab} words") # noqa: T201 + print(f"Longest excerpt you can understand: {excerpt_length} words") # noqa: T201 + print(f' "{excerpt}"') # noqa: T201 + print("") # noqa: T201 + print(f"Rarest word in excerpt: #{max_rank_used}") # noqa: T201 + print(f"Flashcards: {num_words}") # noqa: T201 + print(f"Output file: {output_path}") # noqa: T201 + print("") # noqa: T201 + print("To import into Anki:") # noqa: T201 + print(" 1. Open Anki") # noqa: T201 + print(" 2. File -> Import") # noqa: T201 + print(f" 3. Select: {output_path}") # noqa: T201 + print(" 4. Click Import") # noqa: T201 + else: + print(output_path) # noqa: T201 + + return 0 + + # NORMAL MODE: --length if not args.quiet: print(f"Analyzing {filepath.name}...") # noqa: T201 print(f"Finding vocabulary for {args.length}-word excerpt...") # noqa: T201 diff --git a/python_pkg/word_frequency/test_texts/polish_pan_tadeusz_anki_top500.txt b/python_pkg/word_frequency/test_texts/polish_pan_tadeusz_anki_top500.txt new file mode 100644 index 0000000..92d83f7 --- /dev/null +++ b/python_pkg/word_frequency/test_texts/polish_pan_tadeusz_anki_top500.txt @@ -0,0 +1,536 @@ +#separator:semicolon +#html:true +#deck:polish_pan_tadeusz_top500 +#tags:vocabulary pl +#columns:Front;Back;Rank + +📖 TARGET EXCERPT;— rzekł protazy — że o tej to zosi o której rękę teraz nasz tadeusz;#0 +i;and;#1 +w;In;#2 +się;au;#3 +z;ed;#4 +na;au;#5 +nie;no;#6 +—;—;#7 +jak;how to;#8 +do;au;#9 +a;a;#10 +to;This;#11 +że;That;#12 +o;o;#13 +za;for;#14 +po;au;#15 +już;Now.;#16 +tak;Yes;#17 +co;a;#18 +od;from;#19 +»;»;#20 +lecz;but;#21 +bo;because;#22 +gdy;when;#23 +pan;Mr;#24 +ja;me;#24 +jest;is;#26 +ale;but;#27 +był;was;#28 +nim;dic;#29 +rzekł;He said;#30 +go;dic;#31 +tylko;only;#32 +jako;as;#33 +mu;au;#34 +Że;That;#35 +mnie;me;#36 +tu;Here.;#37 +on;he;#37 +ten;is;#39 +czy;is;#40 +hrabia;Count;#41 +sędzia;Judge;#42 +tam;There.;#43 +pod;under;#44 +aż;au;#45 +dla;for;#45 +u;au;#47 +nad;over;#48 +więc;Well;#49 +ich;their;#49 +tadeusz;tadeus;#51 +tym;with;#52 +przed;before;#53 +przy;with;#54 +sam;alone;#54 +przez;au;#56 +ze;ed;#56 +jeszcze;more;#58 +kto;who;#59 +gdzie;where;#59 +bez;without;#59 +jej;her;#59 +ku;the;#63 +wszyscy;All;#63 +wojski;military;#65 +choć;Come on.;#66 +było;was;#67 +potem;Then;#68 +miał;had;#69 +mi;me;#69 +jego;his;#71 +teraz;Now;#71 +ją;her;#73 +by;by;#74 +oczy;eyes;#74 +dziś;Today;#76 +może;may;#77 +domu;home;#77 +kiedy;when;#79 +ma;has;#80 +który;which;#81 +nawet;even;#82 +znowu;again;#82 +nas;us;#84 +jakby;like;#85 +jeśli;if;#85 +wszystko;everything;#87 +raz;once;#88 +szlachta;nobles;#89 +niech;Let;#89 +we;au;#91 +tej;this;#91 +lub;or;#91 +też;also;#94 +sobie;I'm sorry.;#94 +albo;either;#96 +ręce;Hands;#96 +te;These;#96 +gerwazy;gervas;#96 +między;between;#100 +są;are;#100 +była;was;#102 +cóż;Well;#102 +będzie;will be;#104 +głowy;Common;#104 +telimena;telimena;#104 +bardzo;Very;#107 +tadeusza;tadeus;#108 +razem;Total;#108 +głowę;Head;#110 +je;eat;#111 +ziemi;soil;#112 +mój;mine;#112 +robak;worm;#112 +ręką;hand;#112 +klucznik;keypad;#112 +ci;Common;#117 +dwa;two;#117 +ty;you;#119 +podkomorzy;sub-chambers;#119 +zamku;castle;#119 +gdyby;if;#119 +nic;Nothing.;#123 +wszystkie;all;#123 +widać;You can see;#125 +rękę;Hand;#125 +siebie;self;#125 +krzyknął;He screamed;#125 +tymczasem;Meanwhile;#129 +wtem;int;#129 +które;which;#129 +rzecz;thing;#129 +ani;neither;#129 +ledwie;Only;#129 +ni;au;#129 +zaraz;Wait.;#136 +jeden;one;#136 +was;you;#136 +zawsze;Always;#139 +…;...;#139 +nich;them;#141 +tego;This;#141 +nigdy;never;#143 +coraz;getting;#143 +długo;long;#145 +chciał;He wanted to;#145 +czas;time;#145 +nikt;Nobody;#145 +zaś;and;#145 +tyle;That's it.;#145 +ksiądz;priest;#145 +panie;Sir;#145 +dotąd;so far;#145 +przecież;It's not like;#145 +wszystkich;all;#155 +pana;Mr;#155 +wielki;big;#155 +mam;Got it.;#155 +ryków;roar;#155 +strony;pages;#155 +sędziego;Judge;#161 +trzeba;need;#161 +właśnie;Yeah.;#161 +gości;guests;#161 +niej;her;#165 +drzwi;door;#165 +wtenczas;then;#165 +niby;As if;#165 +góry;mountains;#165 +szlachty;nobles;#165 +litwie;Lithuania;#171 +stał;constant;#171 +każdy;each;#171 +stąd;From here;#171 +pierwszy;first;#171 +bóg;God;#171 +wkoło;around;#177 +serce;heart;#177 +której;of;#177 +wnet;immediately;#177 +my;us;#177 +asesor;asessor;#177 +dwóch;two;#177 +tych;These;#184 +która;which;#184 +przerwał;discontinued;#184 +zosia;zosis;#184 +cię;you;#188 +dobrze;Good.;#188 +okiem;eye;#188 +stary;man;#188 +wie;knows;#188 +chociaż;Although;#188 +których;which;#188 +rejent;regint;#188 +koniec;End;#188 +zosi;ris;#188 +nam;us;#198 +głową;head;#198 +hrabiego;Count;#198 +człowiek;man;#201 +świecie;world;#201 +wszak;After all,;#201 +wielkie;big;#204 +tuż;right;#204 +coś;something;#204 +wy;you;#204 +dwie;two;#204 +broń;weapons;#204 +słychać;I can hear you.;#204 +«;«;#204 +widząc;seeing;#212 +mógł;could;#212 +ta;ta;#212 +nieraz;often;#212 +oba;both;#212 +tę;this;#212 +woźny;Janitor;#218 +kształt;shape;#218 +ręku;hands;#218 +im;i;#218 +końcu;End;#218 +pewnie;Sure.;#218 +nagle;suddenly;#218 +wreszcie;Finally;#218 +ów;This;#218 +wiem;I know.;#218 +często;Common;#218 +iż;that;#218 +usta;mouth;#218 +swe;his;#231 +dalej;Come on.;#231 +zamek;lock;#231 +horeszków;Horns;#231 +byłem;I was.;#231 +sobą;me;#231 +znak;character;#231 +czasem;Sometimes;#231 +także;also;#231 +ciebie;you;#231 +swą;his;#231 +zawołał;called;#231 +niż;than;#243 +były;were;#243 +niegdyś;Once;#243 +lat;years;#243 +panów;gentlemen;#243 +prawda;Right.;#243 +głos;voice;#243 +słowo;word;#243 +swej;his;#243 +drzewa;trees;#252 +trzy;three;#252 +twarz;face;#252 +cały;whole;#252 +naprzód;forward;#252 +mówiąc;Saying;#252 +śród;Mid;#252 +zaczął;started;#252 +wiesz;You know;#252 +major;Major;#252 +oczyma;eyes;#262 +pole;field;#262 +aby;au;#262 +swych;his;#262 +ona;She;#262 +jenerał;protein;#262 +sopliców;soplicas;#262 +jacek;jack;#262 +dworze;Court;#270 +widział;He saw;#270 +dosyć;Enough;#270 +jutro;tomorrow;#270 +młodzież;adolescents;#270 +wolna;free;#270 +którą;which;#270 +pomiędzy;between;#270 +pani;Madam;#270 +taki;such;#270 +środku;center;#270 +cała;whole;#270 +całą;whole;#270 +czym;with;#270 +ludzi;people;#270 +jestem;I am;#270 +ostatni;last;#286 +jesteś;You are;#286 +wiele;multiple;#286 +dał;gave;#286 +nieco;a little;#286 +wzrok;eyesight;#286 +ażeby;to;#286 +chce;I want;#286 +ach;ah;#286 +nogi;legs;#286 +oczu;eyes;#286 +zbyt;too;#286 +drugiej;second;#286 +cicho;Quiet.;#286 +wiatr;wind;#286 +maciej;maj;#286 +czyli;or;#302 +stołu;table;#302 +wojskiego;military;#302 +dworu;court;#302 +miała;had;#302 +całe;whole;#302 +którym;who;#302 +dość;Enough;#302 +kilka;several;#302 +serca;Cardiac;#302 +lata;years;#302 +swym;his;#302 +równie;same;#302 +drogi;roads;#302 +drugi;second;#302 +całej;whole;#317 +dawniej;past;#317 +siedzi;sits;#317 +białe;white;#317 +rąk;hands;#317 +wyszedł;He left;#317 +czuł;feeling;#317 +siedział;He was in prison.;#317 +szlachcic;noble;#317 +nasz;our;#317 +wziął;took;#317 +jeżeli;if;#317 +wpół;half;#317 +soplica;soplica;#317 +razy;times;#317 +panu;Sir;#317 +jedną;one;#317 +drugą;second;#317 +świat;world;#317 +lud;people;#317 +zosię;suspension;#317 +idzie;He's coming.;#317 +protazy;protease;#317 +moja;mine;#340 +tobie;You;#340 +widzi;see;#340 +lubił;He liked;#340 +niebo;sky;#340 +moskali;moscals;#340 +panny;Misses;#340 +mówił;said;#340 +bieży;Tracks;#340 +którego;of;#340 +goście;guests;#340 +należy;au;#340 +swój;own;#340 +czasu;time;#340 +nasze;our;#340 +wielkim;big;#340 +księdza;priest;#340 +jednej;one;#340 +nią;with her.;#340 +niedźwiedź;bear;#340 +tyłu;rear;#340 +strzelcy;shooters;#340 +oto;Here.;#340 +zdrowie;Health;#363 +blisko;close;#363 +wiedział;knew;#363 +myśl;Think;#363 +lasu;forest;#363 +pół;half;#363 +konia;horse;#363 +miejsce;place;#363 +moje;mine;#363 +ziemię;soil;#363 +prawa;rights;#363 +ludzie;people;#363 +innych;other;#363 +być;be;#363 +soplicowie;soplices;#363 +prawo;right;#363 +można;can;#363 +wszakże;But;#363 +dąbrowski;Dabrowski;#363 +długi;long;#363 +sami;Alone;#363 +prawie;almost;#363 +rad;councils;#363 +dawno;long;#386 +obok;next door;#386 +podniósł;raised;#386 +dzieje;history;#386 +słowa;words;#386 +myśli;thoughts;#386 +kobiety;women;#386 +dzieci;children;#386 +oko;eye;#386 +bernardyn;bernardine;#386 +wam;you;#386 +psy;Dogs;#386 +koło;wheel;#386 +kraju;country;#386 +widziałem;I saw;#386 +miałem;I had;#386 +muszę;I have to;#386 +żeby;To;#386 +bracie;Brother;#386 +wołał;called;#386 +broni;weapons;#386 +maciek;mat;#386 +dom;home;#408 +młody;young;#408 +poznał;met;#408 +stanął;stopped;#408 +mówić;speak;#408 +las;forest;#408 +każe;says;#408 +świata;world;#408 +sędziemu;Judge;#408 +wielka;big;#408 +więcej;more;#408 +rzeczy;things;#408 +różne;different;#408 +dziecię;child;#408 +wielkiej;big;#408 +człek;man;#408 +gadać;Talk;#408 +rozmowę;conversation;#408 +no;no;#408 +drugim;second;#408 +wsi;villages;#408 +spór;dispute;#408 +chwilę;Wait.;#408 +dawał;was;#408 +bracia;brothers;#408 +krwi;blood;#408 +kędy;where;#408 +niebie;sky;#408 +znów;again;#408 +stolnik;stool;#408 +«nie;«no;#408 +wpadł;He came by.;#408 +waść;weight;#408 +rzekła;said;#408 +me;me;#408 +cesarz;Emperor;#408 +chciałem;I wanted to.;#408 +drogę;road;#408 +chcąc;wanting;#408 +dobrzyńscy;Good;#408 +chrzciciel;baptizer;#408 +księga;Book;#449 +życie;Life;#449 +wciąż;Still;#449 +takim;such;#449 +całym;whole;#449 +któż;Who;#449 +jednym;one;#449 +stało;fixed;#449 +rana;wound;#449 +spojrzał;He looked;#449 +czasie;time;#449 +słońce;sun;#449 +niego;him;#449 +skoro;if;#449 +jedna;one;#449 +mają;have;#449 +dzień;day;#449 +lada;roll;#449 +wojna;War;#449 +tył;rear;#449 +palcem;finger;#449 +prosto;straight;#449 +krzycząc;screaming;#449 +młodzi;young;#449 +ust;mouth;#449 +ruszył;He's moving;#449 +pas;belt;#449 +mych;my;#449 +nań;ed;#449 +uszy;ears;#449 +krzyk;scream;#449 +koń;horse;#449 +wszędzie;everywhere;#449 +panowie;Gentlemen;#449 +masz;Here.;#449 +szlachtę;noble;#449 +chwili;moment;#449 +sercu;heart;#449 +rzecze;says;#449 +niechaj;Let's go.;#449 +książę;Prince;#449 +pono;pono;#449 +górę;top;#449 +imię;name;#449 +wasze;Yours;#449 +płut;Loop;#449 +ile;how much;#495 +patrząc;looking;#495 +ramiona;arms;#495 +wkrótce;soon;#495 +starzy;old;#495 +nieba;sky;#495 +szedł;He was walking;#495 +boku;side;#495 +wznosi;raises;#495 +kazał;He said;#495 +miejscu;site;#495 +miejsca;places;#495 +każdej;each;#495 +jaki;which;#495 +obie;both;#495 +byli;were;#495 +spod;from;#495 +charty;chart;#495 +sama;Alone;#495 +krzyknęli;They screamed;#495 +król;king;#495 +rozkazy;orders;#495 +swoim;his;#495 +woła;calls;#495 +rękami;hands;#495 +swoje;his;#495 +mię;me;#495 +tłum;crowd;#495 +łzami;tears;#495 +tysiąc;thousand;#495 +nimi;with;#495 +konewka;water;#495 +czoło;forehead;#495 +głupi;Stupid.;#495 +maćka;macaw;#495 \ No newline at end of file