feat: added inverse mode for anki

This commit is contained in:
Krzysztof Rudnicki 2025-12-29 16:10:26 +01:00
parent d2b6f00185
commit 22539837ab
2 changed files with 793 additions and 2 deletions

View File

@ -94,6 +94,106 @@ def run_vocabulary_curve(filepath: Path, max_length: int, *, dump_vocab: bool =
return result.stdout return result.stdout
def run_vocabulary_curve_inverse(filepath: Path, max_vocab: int, *, dump_vocab: bool = False) -> str:
    """Invoke the compiled vocabulary_curve binary with ``--max-vocab``.

    The binary is asked for the longest excerpt of the given text that can be
    read using only the ``max_vocab`` most frequent words.

    Args:
        filepath: Text file handed to the executable.
        max_vocab: Vocabulary limit passed as the ``--max-vocab`` value.
        dump_vocab: When true, ``--dump-vocab`` is appended to the command.

    Returns:
        Everything the executable wrote to standard output, as text.

    Raises:
        FileNotFoundError: The executable has not been built yet.
        subprocess.CalledProcessError: The executable exited non-zero.
        subprocess.TimeoutExpired: The run exceeded the 120 second limit.
    """
    if not C_EXECUTABLE.exists():
        missing_message = (
            f"C executable not found at {C_EXECUTABLE}. "
            "Please compile it first: cd C/vocabulary_curve && make"
        )
        raise FileNotFoundError(missing_message)

    # The optional flag goes last, after the fixed positional/option pairs.
    optional_flags = ["--dump-vocab"] if dump_vocab else []
    completed = subprocess.run(
        [str(C_EXECUTABLE), str(filepath), "--max-vocab", str(max_vocab), *optional_flags],
        capture_output=True,
        text=True,
        timeout=120,
        check=True,
    )
    return completed.stdout
def parse_inverse_mode_output(output: str) -> tuple[str, int, int, list[tuple[str, int]]]:
    """Parse output from vocabulary_curve inverse mode.

    Recognised pieces of the report:

    * ``LONGEST EXCERPT: <n> ...`` - the third whitespace-separated token is
      read as the excerpt length.
    * ``Excerpt:`` - the following line(s) hold the excerpt wrapped in double
      quotes; lines are joined with single spaces up to and including the
      first line that ends with a closing quote.
    * ``Rarest word used: ... (#<rank>)`` - rank of the rarest word used.
    * ``VOCAB_DUMP_START`` / ``VOCAB_DUMP_END`` - ``word;rank`` rows between
      the two markers.

    Numeric fields that cannot be parsed are skipped rather than raised, so a
    malformed report yields default values instead of an ``int()`` error.

    Args:
        output: Raw output from vocabulary_curve --max-vocab.

    Returns:
        Tuple of (excerpt_text, excerpt_length, max_rank_used, all_vocab_words).
        A field whose marker is missing keeps its default: "", 0, 0 or [].
    """
    lines = output.split("\n")
    excerpt = ""
    excerpt_length = 0
    max_rank_used = 0
    all_vocab: list[tuple[str, int]] = []

    # Pass 1: header-style lines (length, excerpt body, rarest rank).
    for index, raw_line in enumerate(lines):
        header = raw_line.strip()
        if header.startswith("LONGEST EXCERPT:"):
            fields = header.split()
            if len(fields) >= 3:
                try:
                    excerpt_length = int(fields[2])
                except ValueError:
                    # Non-numeric length token: keep the current value (0 by
                    # default) instead of aborting the whole parse.
                    pass
        elif header.startswith("Excerpt:"):
            # Collect the quoted excerpt from the lines after the marker
            # without disturbing the outer loop's position.
            excerpt_parts = []
            for candidate in lines[index + 1:]:
                piece = candidate.strip()
                if piece.startswith('"'):
                    piece = piece[1:]
                if piece.endswith('"'):
                    # Closing quote reached: this is the last excerpt line.
                    excerpt_parts.append(piece[:-1])
                    break
                excerpt_parts.append(piece)
            excerpt = " ".join(excerpt_parts)
        elif header.startswith("Rarest word used:"):
            # Parse "word (#rank)"
            match = re.search(r"\(#(\d+)\)", header)
            if match:
                max_rank_used = int(match.group(1))

    # Pass 2: VOCAB_DUMP section, if present.
    in_vocab_dump = False
    for raw_line in lines:
        row = raw_line.strip()
        if row == "VOCAB_DUMP_START":
            in_vocab_dump = True
            continue
        if row == "VOCAB_DUMP_END":
            break
        if in_vocab_dump and ";" in row:
            parts = row.split(";")
            if len(parts) == 2:
                word, rank_str = parts
                try:
                    all_vocab.append((word, int(rank_str)))
                except ValueError:
                    # Row with a non-integer rank: ignore it and keep going.
                    pass

    return excerpt, excerpt_length, max_rank_used, all_vocab
def parse_vocabulary_curve_output(output: str, target_length: int) -> tuple[str, list[tuple[str, int]], list[tuple[str, int]]]: def parse_vocabulary_curve_output(output: str, target_length: int) -> tuple[str, list[tuple[str, int]], list[tuple[str, int]]]:
"""Parse output from vocabulary_curve to get words needed. """Parse output from vocabulary_curve to get words needed.
@ -524,6 +624,101 @@ def generate_flashcards(
return anki_content, excerpt, len(words_with_ranks), max_rank return anki_content, excerpt, len(words_with_ranks), max_rank
def generate_flashcards_inverse(
    filepath: str | Path,
    max_vocab: int,
    source_lang: str | None = None,
    target_lang: str = "en",
    include_context: bool = False,
    deck_name: str | None = None,
    no_translate: bool = False,
    *,
    force: bool = False,
) -> tuple[str, str, int, int, int]:
    """Build an Anki deck for the top-N vocabulary of a text (inverse mode).

    Instead of starting from a desired excerpt length, the caller fixes a
    vocabulary budget; the longest excerpt readable with only those words is
    located and every word in the dumped vocabulary becomes a flashcard.

    Args:
        filepath: Path to the source text file.
        max_vocab: Vocabulary budget (the top N words to learn).
        source_lang: Source language code; detected from the text when None.
        target_lang: Language the cards are translated into.
        include_context: Whether example contexts are looked up in the text.
        deck_name: Deck name; defaults to ``<file stem>_top<max_vocab>``.
        no_translate: Passed through to deck generation to skip translation.
        force: Accepted for interface parity; not read anywhere in this
            function body.

    Returns:
        Tuple of (anki_content, excerpt, excerpt_length, num_words, max_rank_used).

    Raises:
        ValueError: If the language cannot be detected, if no excerpt was
            found, or if the vocabulary dump came back empty.
    """
    source_path = Path(filepath)

    # The whole text is only loaded up front when contexts were requested.
    text = ""
    if include_context:
        text = read_file(source_path)

    if source_lang is None:
        # Detect from the first 1000 characters, reusing text already loaded.
        detection_source = text if text else read_file(source_path)
        source_lang = detect_language(detection_source[:1000])
        if source_lang is None:
            raise ValueError(
                "Could not auto-detect source language. "
                "Please specify with --from (e.g., --from pl for Polish). "
                "Install langdetect for auto-detection: pip install langdetect"
            )

    # Ask the C tool for the excerpt and the full vocabulary dump, then parse.
    raw_report = run_vocabulary_curve_inverse(source_path, max_vocab, dump_vocab=True)
    excerpt, excerpt_length, max_rank_used, vocab_entries = parse_inverse_mode_output(raw_report)

    if excerpt_length == 0:
        raise ValueError(
            f"No valid excerpt found using only top {max_vocab} words. "
            "Try increasing the vocabulary limit."
        )
    if not vocab_entries:
        raise ValueError(f"No vocabulary returned for max_vocab={max_vocab}")

    # Vocabulary words that occur in the excerpt (used for highlighting).
    excerpt_tokens = set(excerpt.lower().split())
    highlighted = []
    for word, rank in vocab_entries:
        if word.lower() in excerpt_tokens:
            highlighted.append((word, rank))

    contexts = None
    if include_context:
        if not text:
            text = read_file(source_path)
        contexts = find_word_contexts(text, [word for word, _ in vocab_entries])

    if deck_name is None:
        deck_name = f"{source_path.stem}_top{max_vocab}"

    # Every dumped vocabulary entry becomes a card; the excerpt words are
    # passed separately, or None when none of them matched.
    anki_content = generate_anki_deck(
        vocab_entries,
        source_lang,
        target_lang,
        contexts,
        deck_name,
        include_context,
        no_translate,
        excerpt,
        highlighted or None,
    )
    return anki_content, excerpt, excerpt_length, len(vocab_entries), max_rank_used
def main(argv: Sequence[str] | None = None) -> int: def main(argv: Sequence[str] | None = None) -> int:
"""Main entry point. """Main entry point.
@ -553,6 +748,13 @@ def main(argv: Sequence[str] | None = None) -> int:
default=None, default=None,
help="Target excerpt length (how many words you want to understand)", help="Target excerpt length (how many words you want to understand)",
) )
parser.add_argument(
"--max-vocab",
"-v",
type=int,
default=None,
help="INVERSE MODE: Learn top N words, find longest excerpt you can understand",
)
parser.add_argument( parser.add_argument(
"--from", "--from",
dest="source_lang", dest="source_lang",
@ -669,8 +871,10 @@ def main(argv: Sequence[str] | None = None) -> int:
# Validate required arguments for main functionality # Validate required arguments for main functionality
if args.file is None: if args.file is None:
parser.error("--file/-f is required") parser.error("--file/-f is required")
if args.length is None: if args.length is None and args.max_vocab is None:
parser.error("--length/-l is required") parser.error("Either --length/-l or --max-vocab/-v is required")
if args.length is not None and args.max_vocab is not None:
parser.error("Cannot use both --length and --max-vocab. Choose one mode.")
try: try:
filepath = Path(args.file) filepath = Path(args.file)
@ -678,6 +882,57 @@ def main(argv: Sequence[str] | None = None) -> int:
print(f"Error: File not found: {args.file}", file=sys.stderr) # noqa: T201 print(f"Error: File not found: {args.file}", file=sys.stderr) # noqa: T201
return 1 return 1
# INVERSE MODE: --max-vocab
if args.max_vocab is not None:
if not args.quiet:
print(f"Analyzing {filepath.name}...") # noqa: T201
print(f"Finding longest excerpt using top {args.max_vocab} words...") # noqa: T201
# Generate flashcards in inverse mode
anki_content, excerpt, excerpt_length, num_words, max_rank_used = generate_flashcards_inverse(
filepath,
args.max_vocab,
source_lang=args.source_lang,
target_lang=args.target_lang,
include_context=args.include_context,
deck_name=args.deck_name,
no_translate=args.no_translate,
force=args.force,
)
# Determine output path
if args.output:
output_path = Path(args.output)
else:
output_path = filepath.parent / f"{filepath.stem}_anki_top{args.max_vocab}.txt"
# Write output
output_path.write_text(anki_content, encoding="utf-8")
if not args.quiet:
print("") # noqa: T201
print("=" * 60) # noqa: T201
print("FLASHCARD GENERATION COMPLETE (INVERSE MODE)") # noqa: T201
print("=" * 60) # noqa: T201
print(f"Learning: top {args.max_vocab} words") # noqa: T201
print(f"Longest excerpt you can understand: {excerpt_length} words") # noqa: T201
print(f' "{excerpt}"') # noqa: T201
print("") # noqa: T201
print(f"Rarest word in excerpt: #{max_rank_used}") # noqa: T201
print(f"Flashcards: {num_words}") # noqa: T201
print(f"Output file: {output_path}") # noqa: T201
print("") # noqa: T201
print("To import into Anki:") # noqa: T201
print(" 1. Open Anki") # noqa: T201
print(" 2. File -> Import") # noqa: T201
print(f" 3. Select: {output_path}") # noqa: T201
print(" 4. Click Import") # noqa: T201
else:
print(output_path) # noqa: T201
return 0
# NORMAL MODE: --length
if not args.quiet: if not args.quiet:
print(f"Analyzing {filepath.name}...") # noqa: T201 print(f"Analyzing {filepath.name}...") # noqa: T201
print(f"Finding vocabulary for {args.length}-word excerpt...") # noqa: T201 print(f"Finding vocabulary for {args.length}-word excerpt...") # noqa: T201

View File

@ -0,0 +1,536 @@
#separator:semicolon
#html:true
#deck:polish_pan_tadeusz_top500
#tags:vocabulary pl
#columns:Front;Back;Rank
📖 TARGET EXCERPT;— rzekł protazy — że o tej to zosi o której rękę teraz <b>nasz</b> tadeusz;#0
i;and;#1
w;In;#2
się;au;#3
z;ed;#4
na;au;#5
nie;no;#6
—;—;#7
jak;how to;#8
do;au;#9
a;a;#10
to;This;#11
że;That;#12
o;o;#13
za;for;#14
po;au;#15
już;Now.;#16
tak;Yes;#17
co;a;#18
od;from;#19
»;»;#20
lecz;but;#21
bo;because;#22
gdy;when;#23
pan;Mr;#24
ja;me;#24
jest;is;#26
ale;but;#27
był;was;#28
nim;dic;#29
rzekł;He said;#30
go;dic;#31
tylko;only;#32
jako;as;#33
mu;au;#34
Że;That;#35
mnie;me;#36
tu;Here.;#37
on;he;#37
ten;is;#39
czy;is;#40
hrabia;Count;#41
sędzia;Judge;#42
tam;There.;#43
pod;under;#44
aż;au;#45
dla;for;#45
u;au;#47
nad;over;#48
więc;Well;#49
ich;their;#49
tadeusz;tadeus;#51
tym;with;#52
przed;before;#53
przy;with;#54
sam;alone;#54
przez;au;#56
ze;ed;#56
jeszcze;more;#58
kto;who;#59
gdzie;where;#59
bez;without;#59
jej;her;#59
ku;the;#63
wszyscy;All;#63
wojski;military;#65
choć;Come on.;#66
było;was;#67
potem;Then;#68
miał;had;#69
mi;me;#69
jego;his;#71
teraz;Now;#71
ją;her;#73
by;by;#74
oczy;eyes;#74
dziś;Today;#76
może;may;#77
domu;home;#77
kiedy;when;#79
ma;has;#80
który;which;#81
nawet;even;#82
znowu;again;#82
nas;us;#84
jakby;like;#85
jeśli;if;#85
wszystko;everything;#87
raz;once;#88
szlachta;nobles;#89
niech;Let;#89
we;au;#91
tej;this;#91
lub;or;#91
też;also;#94
sobie;I'm sorry.;#94
albo;either;#96
ręce;Hands;#96
te;These;#96
gerwazy;gervas;#96
między;between;#100
są;are;#100
była;was;#102
cóż;Well;#102
będzie;will be;#104
głowy;Common;#104
telimena;telimena;#104
bardzo;Very;#107
tadeusza;tadeus;#108
razem;Total;#108
głowę;Head;#110
je;eat;#111
ziemi;soil;#112
mój;mine;#112
robak;worm;#112
ręką;hand;#112
klucznik;keypad;#112
ci;Common;#117
dwa;two;#117
ty;you;#119
podkomorzy;sub-chambers;#119
zamku;castle;#119
gdyby;if;#119
nic;Nothing.;#123
wszystkie;all;#123
widać;You can see;#125
rękę;Hand;#125
siebie;self;#125
krzyknął;He screamed;#125
tymczasem;Meanwhile;#129
wtem;int;#129
które;which;#129
rzecz;thing;#129
ani;neither;#129
ledwie;Only;#129
ni;au;#129
zaraz;Wait.;#136
jeden;one;#136
was;you;#136
zawsze;Always;#139
…;...;#139
nich;them;#141
tego;This;#141
nigdy;never;#143
coraz;getting;#143
długo;long;#145
chciał;He wanted to;#145
czas;time;#145
nikt;Nobody;#145
zaś;and;#145
tyle;That's it.;#145
ksiądz;priest;#145
panie;Sir;#145
dotąd;so far;#145
przecież;It's not like;#145
wszystkich;all;#155
pana;Mr;#155
wielki;big;#155
mam;Got it.;#155
ryków;roar;#155
strony;pages;#155
sędziego;Judge;#161
trzeba;need;#161
właśnie;Yeah.;#161
gości;guests;#161
niej;her;#165
drzwi;door;#165
wtenczas;then;#165
niby;As if;#165
góry;mountains;#165
szlachty;nobles;#165
litwie;Lithuania;#171
stał;constant;#171
każdy;each;#171
stąd;From here;#171
pierwszy;first;#171
bóg;God;#171
wkoło;around;#177
serce;heart;#177
której;of;#177
wnet;immediately;#177
my;us;#177
asesor;asessor;#177
dwóch;two;#177
tych;These;#184
która;which;#184
przerwał;discontinued;#184
zosia;zosis;#184
cię;you;#188
dobrze;Good.;#188
okiem;eye;#188
stary;man;#188
wie;knows;#188
chociaż;Although;#188
których;which;#188
rejent;regint;#188
koniec;End;#188
zosi;ris;#188
nam;us;#198
głową;head;#198
hrabiego;Count;#198
człowiek;man;#201
świecie;world;#201
wszak;After all,;#201
wielkie;big;#204
tuż;right;#204
coś;something;#204
wy;you;#204
dwie;two;#204
broń;weapons;#204
słychać;I can hear you.;#204
«;«;#204
widząc;seeing;#212
mógł;could;#212
ta;ta;#212
nieraz;often;#212
oba;both;#212
tę;this;#212
woźny;Janitor;#218
kształt;shape;#218
ręku;hands;#218
im;i;#218
końcu;End;#218
pewnie;Sure.;#218
nagle;suddenly;#218
wreszcie;Finally;#218
ów;This;#218
wiem;I know.;#218
często;Common;#218
iż;that;#218
usta;mouth;#218
swe;his;#231
dalej;Come on.;#231
zamek;lock;#231
horeszków;Horns;#231
byłem;I was.;#231
sobą;me;#231
znak;character;#231
czasem;Sometimes;#231
także;also;#231
ciebie;you;#231
swą;his;#231
zawołał;called;#231
niż;than;#243
były;were;#243
niegdyś;Once;#243
lat;years;#243
panów;gentlemen;#243
prawda;Right.;#243
głos;voice;#243
słowo;word;#243
swej;his;#243
drzewa;trees;#252
trzy;three;#252
twarz;face;#252
cały;whole;#252
naprzód;forward;#252
mówiąc;Saying;#252
śród;Mid;#252
zaczął;started;#252
wiesz;You know;#252
major;Major;#252
oczyma;eyes;#262
pole;field;#262
aby;au;#262
swych;his;#262
ona;She;#262
jenerał;protein;#262
sopliców;soplicas;#262
jacek;jack;#262
dworze;Court;#270
widział;He saw;#270
dosyć;Enough;#270
jutro;tomorrow;#270
młodzież;adolescents;#270
wolna;free;#270
którą;which;#270
pomiędzy;between;#270
pani;Madam;#270
taki;such;#270
środku;center;#270
cała;whole;#270
całą;whole;#270
czym;with;#270
ludzi;people;#270
jestem;I am;#270
ostatni;last;#286
jesteś;You are;#286
wiele;multiple;#286
dał;gave;#286
nieco;a little;#286
wzrok;eyesight;#286
ażeby;to;#286
chce;I want;#286
ach;ah;#286
nogi;legs;#286
oczu;eyes;#286
zbyt;too;#286
drugiej;second;#286
cicho;Quiet.;#286
wiatr;wind;#286
maciej;maj;#286
czyli;or;#302
stołu;table;#302
wojskiego;military;#302
dworu;court;#302
miała;had;#302
całe;whole;#302
którym;who;#302
dość;Enough;#302
kilka;several;#302
serca;Cardiac;#302
lata;years;#302
swym;his;#302
równie;same;#302
drogi;roads;#302
drugi;second;#302
całej;whole;#317
dawniej;past;#317
siedzi;sits;#317
białe;white;#317
rąk;hands;#317
wyszedł;He left;#317
czuł;feeling;#317
siedział;He was in prison.;#317
szlachcic;noble;#317
nasz;our;#317
wziął;took;#317
jeżeli;if;#317
wpół;half;#317
soplica;soplica;#317
razy;times;#317
panu;Sir;#317
jedną;one;#317
drugą;second;#317
świat;world;#317
lud;people;#317
zosię;suspension;#317
idzie;He's coming.;#317
protazy;protease;#317
moja;mine;#340
tobie;You;#340
widzi;see;#340
lubił;He liked;#340
niebo;sky;#340
moskali;moscals;#340
panny;Misses;#340
mówił;said;#340
bieży;Tracks;#340
którego;of;#340
goście;guests;#340
należy;au;#340
swój;own;#340
czasu;time;#340
nasze;our;#340
wielkim;big;#340
księdza;priest;#340
jednej;one;#340
nią;with her.;#340
niedźwiedź;bear;#340
tyłu;rear;#340
strzelcy;shooters;#340
oto;Here.;#340
zdrowie;Health;#363
blisko;close;#363
wiedział;knew;#363
myśl;Think;#363
lasu;forest;#363
pół;half;#363
konia;horse;#363
miejsce;place;#363
moje;mine;#363
ziemię;soil;#363
prawa;rights;#363
ludzie;people;#363
innych;other;#363
być;be;#363
soplicowie;soplices;#363
prawo;right;#363
można;can;#363
wszakże;But;#363
dąbrowski;Dabrowski;#363
długi;long;#363
sami;Alone;#363
prawie;almost;#363
rad;councils;#363
dawno;long;#386
obok;next door;#386
podniósł;raised;#386
dzieje;history;#386
słowa;words;#386
myśli;thoughts;#386
kobiety;women;#386
dzieci;children;#386
oko;eye;#386
bernardyn;bernardine;#386
wam;you;#386
psy;Dogs;#386
koło;wheel;#386
kraju;country;#386
widziałem;I saw;#386
miałem;I had;#386
muszę;I have to;#386
żeby;To;#386
bracie;Brother;#386
wołał;called;#386
broni;weapons;#386
maciek;mat;#386
dom;home;#408
młody;young;#408
poznał;met;#408
stanął;stopped;#408
mówić;speak;#408
las;forest;#408
każe;says;#408
świata;world;#408
sędziemu;Judge;#408
wielka;big;#408
więcej;more;#408
rzeczy;things;#408
różne;different;#408
dziecię;child;#408
wielkiej;big;#408
człek;man;#408
gadać;Talk;#408
rozmowę;conversation;#408
no;no;#408
drugim;second;#408
wsi;villages;#408
spór;dispute;#408
chwilę;Wait.;#408
dawał;was;#408
bracia;brothers;#408
krwi;blood;#408
kędy;where;#408
niebie;sky;#408
znów;again;#408
stolnik;stool;#408
«nie;«no;#408
wpadł;He came by.;#408
waść;weight;#408
rzekła;said;#408
me;me;#408
cesarz;Emperor;#408
chciałem;I wanted to.;#408
drogę;road;#408
chcąc;wanting;#408
dobrzyńscy;Good;#408
chrzciciel;baptizer;#408
księga;Book;#449
życie;Life;#449
wciąż;Still;#449
takim;such;#449
całym;whole;#449
któż;Who;#449
jednym;one;#449
stało;fixed;#449
rana;wound;#449
spojrzał;He looked;#449
czasie;time;#449
słońce;sun;#449
niego;him;#449
skoro;if;#449
jedna;one;#449
mają;have;#449
dzień;day;#449
lada;roll;#449
wojna;War;#449
tył;rear;#449
palcem;finger;#449
prosto;straight;#449
krzycząc;screaming;#449
młodzi;young;#449
ust;mouth;#449
ruszył;He's moving;#449
pas;belt;#449
mych;my;#449
nań;ed;#449
uszy;ears;#449
krzyk;scream;#449
koń;horse;#449
wszędzie;everywhere;#449
panowie;Gentlemen;#449
masz;Here.;#449
szlachtę;noble;#449
chwili;moment;#449
sercu;heart;#449
rzecze;says;#449
niechaj;Let's go.;#449
książę;Prince;#449
pono;pono;#449
górę;top;#449
imię;name;#449
wasze;Yours;#449
płut;Loop;#449
ile;how much;#495
patrząc;looking;#495
ramiona;arms;#495
wkrótce;soon;#495
starzy;old;#495
nieba;sky;#495
szedł;He was walking;#495
boku;side;#495
wznosi;raises;#495
kazał;He said;#495
miejscu;site;#495
miejsca;places;#495
każdej;each;#495
jaki;which;#495
obie;both;#495
byli;were;#495
spod;from;#495
charty;chart;#495
sama;Alone;#495
krzyknęli;They screamed;#495
król;king;#495
rozkazy;orders;#495
swoim;his;#495
woła;calls;#495
rękami;hands;#495
swoje;his;#495
mię;me;#495
tłum;crowd;#495
łzami;tears;#495
tysiąc;thousand;#495
nimi;with;#495
konewka;water;#495
czoło;forehead;#495
głupi;Stupid.;#495
maćka;macaw;#495