#!/usr/bin/env python3
"""
Anki Generator - Modular approach with 3 combinable strategies.
Usage:
python anki_generator.py [options]
Options:
--filter Apply strict filtering (answers > 100 chars)
--extract Use improved extraction algorithm
--main-only Only generate main exam questions (45 comprehensive cards)
Combinations:
python anki_generator.py # Basic extraction, no filter
python anki_generator.py --filter # Approach 1: Strict filter only
python anki_generator.py --extract # Approach 2: Better extraction only
python anki_generator.py --main-only # Approach 3: Main questions only
python anki_generator.py --filter --extract # Approach 4: Filter + Better extraction
python anki_generator.py --filter --main-only # Approach 5: Filter + Main only
python anki_generator.py --extract --main-only # Approach 6: Better extraction + Main only
python anki_generator.py --filter --extract --main-only # Approach 7: All three
"""
import os
import re
import argparse
from pathlib import Path
# =============================================================================
# SHARED UTILITIES
# =============================================================================
def clean_text(text):
    """Clean and format text for Anki.

    Strips Markdown emphasis markers (``**bold**`` and ``*italic*``) while
    keeping the emphasised text, turns tabs into spaces, collapses runs of
    spaces into one and trims leading/trailing whitespace.

    Args:
        text: Raw Markdown fragment; may be ``None`` or empty.

    Returns:
        The cleaned string, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""
    # Bold goes first so the single-asterisk pass never sees a "**" pair.
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    # The second substitution had degraded into a two-argument call with a
    # truncated pattern, which raised TypeError on every non-empty input.
    # It is rebuilt here to mirror the bold rule for *italic* spans. The
    # (?=\S) / (?<=\S) guards keep "2 * 3 * 4" and "* list item" untouched.
    # NOTE(review): pattern and replacement are a reconstruction - confirm
    # against the markup the cards are expected to carry.
    text = re.sub(r'(?<!\*)\*(?=\S)([^*\n]+?)(?<=\S)\*(?!\*)', r'\1', text)
    text = text.replace('\t', ' ')
    # NOTE(review): a replace of the double-quote character with itself (a
    # no-op) used to sit here; it may have been a quote-escaping step that
    # lost its replacement text. Confirm whether quotes need escaping for
    # the output format.
    text = re.sub(r' +', ' ', text)
    return text.strip()
def get_file_metadata(filepath):
    """Read a question file and return ``(num, subject, content)``.

    * ``num`` - the leading digit run of a ``<digits>-<name>.md`` file name,
      or ``"00"`` when the name does not have that shape.
    * ``subject`` - the first word following ``Przedmiot:`` in the file body,
      or ``"Ogólne"`` when no such marker is present.
    * ``content`` - the complete file text, decoded as UTF-8.
    """
    name_hit = re.match(r'(\d+)-(.+)\.md', os.path.split(filepath)[1])
    if name_hit:
        num = name_hit.group(1)
    else:
        num = "00"

    with open(filepath, 'r', encoding='utf-8') as handle:
        content = handle.read()

    subject_hit = re.search(r'Przedmiot:\s*(\w+)', content)
    if subject_hit:
        subject = subject_hit.group(1)
    else:
        subject = "Ogólne"

    return num, subject, content
def get_main_question(content):
    """Return the bold question found under the ``## Pytanie`` heading.

    An optional quote character directly inside the bold markers is dropped
    and every whitespace run in the question is collapsed to a single space.
    Returns ``None`` when the heading or the bold question is missing.
    """
    found = re.search(
        r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*',
        content,
        re.DOTALL,
    )
    if found is None:
        return None
    question = found.group(1).strip()
    return re.sub(r'\s+', ' ', question)
# =============================================================================
# APPROACH 1: STRICT FILTERING
# =============================================================================
def apply_strict_filter(cards, min_length=100):
    """Return the cards whose ``'back'`` text is longer than ``min_length``.

    The comparison is strict: an answer of exactly ``min_length`` characters
    is dropped. Input order is preserved and a new list is returned.
    """
    kept = []
    for card in cards:
        if len(card['back']) > min_length:
            kept.append(card)
    return kept
# =============================================================================
# APPROACH 2: BETTER EXTRACTION
# =============================================================================
def extract_structured_content(body):
    """Improved extraction - multiple content types with better formatting.

    Fragments are gathered from a Markdown section in this order:

    1. the line following a ``#### Definicja`` heading,
    2. up to five bullet points that start with a bold term,
    3. up to four ``**key** - value`` pairs, only if fewer than two
       fragments were found so far,
    4. up to two paragraphs (each cut to 300 characters), only if nothing
       else was found; code fences, table rows and paragraphs of 30
       characters or fewer are skipped.

    Args:
        body: Markdown text of one section.

    Returns:
        The fragments, each passed through ``clean_text`` and joined with
        newlines, or ``None`` when nothing could be extracted.
    """
    parts = []

    # 1. Definitions
    def_match = re.search(r'#### Definicja[^\n]*\n([^\n#]+)', body)
    if def_match:
        parts.append(f"Definicja: {def_match.group(1).strip()}")

    # 2. Bullet points with bold terms
    # NOTE(review): \s in the separator class also matches newlines, so a
    # bullet without a description can capture the following line as its
    # description - confirm whether that is intended.
    bullets = re.findall(r'[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]*)', body)
    for term, desc in bullets[:5]:
        desc = desc.strip()
        parts.append(f"• {term}: {desc}" if desc else f"• {term}")

    # 3. Key-value patterns
    if len(parts) < 2:
        kvs = re.findall(r'\*\*([^*\n]+)\*\*\s*[-–:]\s*([^\n*]{10,150})', body)
        for k, v in kvs[:4]:
            entry = f"{k.strip()}: {v.strip()}"
            if entry not in parts:
                parts.append(entry)

    # 4. Paragraphs as fallback
    if not parts:
        # The prefix checks deliberately look at the unstripped paragraph,
        # as the original did; only the length check uses the stripped text.
        paras = [
            p.strip()
            for p in body.split('\n\n')
            if not p.startswith(('```', '|')) and len(p.strip()) > 30
        ]
        parts.extend(p[:300] for p in paras[:2])

    if not parts:
        return None
    # The separator literal used to be split across a physical line break,
    # which is an unterminated string (syntax error). It is written as an
    # explicit newline escape here.
    # NOTE(review): if the separator was meant to be an HTML line break for
    # Anki, swap it in here - confirm against the export format.
    return '\n'.join(clean_text(p) for p in parts)
def extract_cards_better(filepath):
    """Build Anki cards from one question file using structured extraction.

    Produces at most one ``main`` card (the exam question paired with the
    structured content of the ``## 📚 Odpowiedź główna`` section) followed by
    one ``detail`` card per ``###`` section that yields extractable content.

    Returns a list of dicts with the keys ``'front'``, ``'back'`` and
    ``'tags'``.
    """
    num, subject, content = get_file_metadata(filepath)
    tag_prefix = f"egzamin pyt{num} {subject}"
    deck = []

    # Main question card.
    question = get_main_question(content)
    if question:
        # NOTE(review): the character inside [^...] below is U+FFFD and may
        # be an encoding casualty - confirm the intended stop condition.
        section = re.search(
            r'## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## [^�]|\Z)',
            content,
            re.DOTALL,
        )
        main_back = extract_structured_content(section.group(1)) if section else None
        if main_back:
            deck.append({
                'front': clean_text(question),
                'back': main_back,
                'tags': f"{tag_prefix} main",
            })

    # Detail cards, one per "###" section.
    skip_markers = ('Przykład', '"', 'Mnemonic')
    section_re = r'^### (?:\d+\.\s*)?([^\n]+)\n((?:(?!^### ).)*)'
    for hit in re.finditer(section_re, content, re.MULTILINE | re.DOTALL):
        title = hit.group(1).strip()
        body = hit.group(2)
        # Skip example/mnemonic/quoted headings and near-empty sections.
        if len(body) < 50 or any(marker in title for marker in skip_markers):
            continue
        detail_back = extract_structured_content(body)
        if detail_back:
            deck.append({
                'front': f"Wyjaśnij: {clean_text(title)}",
                'back': detail_back,
                'tags': f"{tag_prefix} detail",
            })

    return deck
def extract_cards_basic(filepath):
"""Basic extraction - simpler algorithm."""
num, subject, content = get_file_metadata(filepath)
base_tags = f"egzamin pyt{num} {subject}"
cards = []
# Main question - just headers
main_q = get_main_question(content)
if main_q:
answer_match = re.search(r'## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\Z)', content, re.DOTALL)
if answer_match:
headers = re.findall(r'^### (?:\d+\.\s*)?(.+)$', answer_match.group(1), re.MULTILINE)
if headers:
answer = '