mirror of
https://github.com/kuhyx/praca_magisterska.git
synced 2026-07-04 13:23:05 +02:00
332 lines
13 KiB
Python
332 lines
13 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
Anki Generator - Modular approach with 3 combinable strategies.
|
|||
|
|
|
|||
|
|
Usage:
|
|||
|
|
python anki_generator.py [options]
|
|||
|
|
|
|||
|
|
Options:
|
|||
|
|
--filter Apply strict filtering (answers > 100 chars)
|
|||
|
|
--extract Use improved extraction algorithm
|
|||
|
|
--main-only Only generate main exam questions (45 comprehensive cards)
--all-combinations Generate all 7 option combinations in one run
|
|||
|
|
|
|||
|
|
Combinations:
|
|||
|
|
python anki_generator.py # Basic extraction, no filter
|
|||
|
|
python anki_generator.py --filter # Approach 1: Strict filter only
|
|||
|
|
python anki_generator.py --extract # Approach 2: Better extraction only
|
|||
|
|
python anki_generator.py --main-only # Approach 3: Main questions only
|
|||
|
|
python anki_generator.py --filter --extract # Approach 4: Filter + Better extraction
|
|||
|
|
python anki_generator.py --filter --main-only # Approach 5: Filter + Main only
|
|||
|
|
python anki_generator.py --extract --main-only # Approach 6: Better extraction + Main only
|
|||
|
|
python anki_generator.py --filter --extract --main-only # Approach 7: All three
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import re
|
|||
|
|
import argparse
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
# =============================================================================
|
|||
|
|
# SHARED UTILITIES
|
|||
|
|
# =============================================================================
|
|||
|
|
|
|||
|
|
def clean_text(text):
    """Convert a Markdown fragment into single-line HTML for the Anki export.

    The generator writes one note per line with tab-separated fields and
    ``#html:true``, so the result must contain no raw tabs or newlines.

    Args:
        text: Markdown text, or a falsy value (``None`` / ``""``).

    Returns:
        The cleaned string, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    # Bold must be converted first so the italic pattern only ever sees
    # single asterisks.
    text = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', text)
    text = re.sub(r'(?<!\*)\*([^*]+)\*(?!\*)', r'<i>\1</i>', text)

    # Tab is the field separator of the export file.
    text = text.replace('\t', ' ')

    # Escape double quotes as an HTML entity; a literal quote can be
    # misread as field quoting by delimited-text importers.
    text = text.replace('"', '&quot;')

    text = re.sub(r' +', ' ', text).strip()

    # A raw newline would terminate the note row early; keep the visual
    # line break as HTML instead.
    return re.sub(r' ?\r?\n ?', '<br>', text)
|
|||
|
|
|
|||
|
|
def get_file_metadata(filepath):
    """Read an answer file and derive its question number and subject.

    Args:
        filepath: Path to a Markdown file, expected to be named
            ``<digits>-<anything>.md``.

    Returns:
        A ``(num, subject, content)`` tuple: the leading digits of the
        filename (``"00"`` when the name does not match), the first word
        after ``Przedmiot:`` in the file (``"Ogólne"`` when absent), and the
        full file text.
    """
    name_match = re.match(r'(\d+)-(.+)\.md', os.path.basename(filepath))
    if name_match:
        num = name_match.group(1)
    else:
        num = "00"

    with open(filepath, 'r', encoding='utf-8') as handle:
        content = handle.read()

    subject = "Ogólne"
    subject_match = re.search(r'Przedmiot:\s*(\w+)', content)
    if subject_match:
        subject = subject_match.group(1)

    return num, subject, content
|
|||
|
|
|
|||
|
|
def get_main_question(content):
    """Return the bold question under the ``## Pytanie`` heading.

    Args:
        content: Full Markdown text of an answer file.

    Returns:
        The question with optional surrounding quotes removed and every
        whitespace run collapsed to one space, or ``None`` when the heading
        and bold text are not found.
    """
    found = re.search(
        r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*',
        content,
        re.DOTALL,
    )
    if found is None:
        return None

    question = found.group(1).strip()
    # The question may span several lines (DOTALL); flatten it.
    return re.sub(r'\s+', ' ', question)
|
|||
|
|
|
|||
|
|
# =============================================================================
|
|||
|
|
# APPROACH 1: STRICT FILTERING
|
|||
|
|
# =============================================================================
|
|||
|
|
|
|||
|
|
def apply_strict_filter(cards, min_length=100):
    """Keep only the cards whose ``'back'`` is longer than ``min_length``.

    Args:
        cards: Iterable of card dicts with a ``'back'`` string.
        min_length: Exclusive lower bound on the answer length.

    Returns:
        A new list with the surviving cards in their original order.
    """
    kept = []
    for card in cards:
        if len(card['back']) > min_length:
            kept.append(card)
    return kept
|
|||
|
|
|
|||
|
|
# =============================================================================
|
|||
|
|
# APPROACH 2: BETTER EXTRACTION
|
|||
|
|
# =============================================================================
|
|||
|
|
|
|||
|
|
def extract_structured_content(body):
    """Build an HTML answer from a Markdown section using layered heuristics.

    The heuristics are tried in this order:

    1. the line following a ``#### Definicja`` heading,
    2. up to five bullets that start with a bold term,
    3. up to four ``**key** - value`` pairs (only when fewer than two parts
       were found so far),
    4. up to two plain paragraphs (only when nothing else was found).

    Args:
        body: Markdown text of one section.

    Returns:
        The collected parts, each passed through ``clean_text`` and joined
        with ``<br>``, or ``None`` when nothing usable was found.
    """
    parts = []

    # 1. Definitions
    def_match = re.search(r'#### Definicja[^\n]*\n([^\n#]+)', body)
    if def_match:
        parts.append(f"<b>Definicja:</b> {def_match.group(1).strip()}")

    # 2. Bullet points with bold terms
    bullets = re.findall(r'[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]*)', body)
    for term, desc in bullets[:5]:
        desc = desc.strip()
        if desc:
            parts.append(f"• <b>{term}</b>: {desc}")
        else:
            parts.append(f"• <b>{term}</b>")

    # 3. Key-value patterns
    if len(parts) < 2:
        # Bullet entries carry a "• " prefix; compare without it so a bullet
        # that also matches the key-value pattern is not emitted twice.
        bullet_mark = "• "
        seen = {
            p[len(bullet_mark):] if p.startswith(bullet_mark) else p
            for p in parts
        }
        kvs = re.findall(r'\*\*([^*\n]+)\*\*\s*[-–:]\s*([^\n*]{10,150})', body)
        for k, v in kvs[:4]:
            entry = f"<b>{k.strip()}</b>: {v.strip()}"
            if entry not in seen:
                seen.add(entry)
                parts.append(entry)

    # 4. Paragraphs as fallback
    if not parts:
        paras = []
        for chunk in body.split('\n\n'):
            para = chunk.strip()
            # Test the stripped text: a chunk may begin with a newline or
            # spaces, which would hide a code fence or table row.
            if len(para) <= 30 or para.startswith(('```', '|')):
                continue
            paras.append(para)
        for para in paras[:2]:
            parts.append(para[:300])

    if not parts:
        return None
    return '<br>'.join(clean_text(p) for p in parts)
|
|||
|
|
|
|||
|
|
def extract_cards_better(filepath):
    """Extract cards from one answer file using the structured extractor.

    Produces at most one "main" card (exam question -> structured summary of
    the ``## 📚 Odpowiedź główna`` section) plus one "detail" card for each
    ``###`` section that yields structured content.

    Args:
        filepath: Path to a Markdown answer file.

    Returns:
        A list of card dicts with ``'front'``, ``'back'`` and ``'tags'``.
    """
    num, subject, content = get_file_metadata(filepath)
    base_tags = f"egzamin pyt{num} {subject}"
    cards = []

    # Main question
    main_q = get_main_question(content)
    if main_q:
        # The section ends at the next level-2 heading or at end of file.
        # The lookahead previously carried a garbled character class
        # ("[^<5E>]") that kept headings starting with '<', '5', 'E' or '>'
        # from ending the section; it now matches extract_cards_basic.
        answer_match = re.search(
            r'## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\Z)',
            content,
            re.DOTALL,
        )
        answer = (
            extract_structured_content(answer_match.group(1))
            if answer_match else None
        )
        if answer:
            cards.append({
                'front': clean_text(main_q),
                'back': answer,
                'tags': f"{base_tags} main",
            })

    # Detail sections: each "### " heading (optionally numbered) together
    # with the text up to the next "### " heading or the end of the content.
    sections = re.findall(
        r'^### (?:\d+\.\s*)?([^\n]+)\n((?:(?!^### ).)*)',
        content,
        re.MULTILINE | re.DOTALL,
    )
    for header, body in sections:
        header = header.strip()
        # Skip example, quoted and mnemonic headings, and near-empty sections.
        if ('Przykład' in header or '"' in header or 'Mnemonic' in header
                or len(body) < 50):
            continue

        answer = extract_structured_content(body)
        if answer:
            cards.append({
                'front': f"Wyjaśnij: {clean_text(header)}",
                'back': answer,
                'tags': f"{base_tags} detail",
            })

    return cards
|
|||
|
|
|
|||
|
|
def extract_cards_basic(filepath):
    """Extract cards from one answer file using the simple algorithm.

    The "main" card lists up to six ``###`` headings of the
    ``## 📚 Odpowiedź główna`` section as an HTML list. Each "detail" card
    uses the first paragraph of a ``###`` section that is not a code fence,
    truncated to 400 characters.

    Args:
        filepath: Path to a Markdown answer file.

    Returns:
        A list of card dicts with ``'front'``, ``'back'`` and ``'tags'``.
    """
    num, subject, content = get_file_metadata(filepath)
    base_tags = f"egzamin pyt{num} {subject}"
    cards = []

    # Main question - just headers
    main_q = get_main_question(content)
    if main_q:
        answer_match = re.search(
            r'## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\Z)',
            content,
            re.DOTALL,
        )
        if answer_match:
            headers = re.findall(
                r'^### (?:\d+\.\s*)?(.+)$',
                answer_match.group(1),
                re.MULTILINE,
            )
            if headers:
                items = ''.join(f'<li>{clean_text(h)}</li>' for h in headers[:6])
                cards.append({
                    'front': clean_text(main_q),
                    'back': f'<ul>{items}</ul>',
                    'tags': f"{base_tags} main",
                })

    # Detail sections - first paragraph only
    sections = re.findall(
        r'^### (?:\d+\.\s*)?([^\n]+)\n((?:(?!^### ).)*)',
        content,
        re.MULTILINE | re.DOTALL,
    )
    for header, body in sections:
        header = header.strip()
        body = body.strip()
        if len(body) < 50 or 'Przykład' in header:
            continue

        # Strip before testing for a code fence: chunks after the first may
        # start with a newline, which would hide the leading backticks.
        paras = [
            para
            for para in (chunk.strip() for chunk in body.split('\n\n'))
            if para and not para.startswith('```')
        ]
        if paras:
            cards.append({
                'front': f"Wyjaśnij: {clean_text(header)}",
                'back': clean_text(paras[0][:400]),
                'tags': f"{base_tags} detail",
            })

    return cards
|
|||
|
|
|
|||
|
|
# =============================================================================
|
|||
|
|
# APPROACH 3: MAIN QUESTIONS ONLY
|
|||
|
|
# =============================================================================
|
|||
|
|
|
|||
|
|
def extract_main_only(filepath):
    """Extract only the main exam question with a multi-part answer.

    The answer has one ``<b>heading</b>: key point`` entry for each of the
    first five ``###`` sub-sections of ``## 📚 Odpowiedź główna`` that is not
    an example, mnemonic or quoted heading and that yields a key point.

    Args:
        filepath: Path to a Markdown answer file.

    Returns:
        A one-element list with the card dict, or ``[]`` when the question,
        the answer section or every key point is missing.
    """
    num, subject, content = get_file_metadata(filepath)
    base_tags = f"egzamin pyt{num} {subject} main"

    main_q = get_main_question(content)
    if not main_q:
        return []

    # The section ends at the next level-2 heading or at end of file. The
    # lookahead previously carried a garbled character class ("[^<5E>]")
    # that kept headings starting with '<', '5', 'E' or '>' from ending
    # the section.
    answer_match = re.search(
        r'## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\Z)',
        content,
        re.DOTALL,
    )
    if not answer_match:
        return []

    # Every "### " heading with the text up to the next "### " heading.
    headers = re.findall(
        r'^### (?:\d+\.\s*)?([^\n]+)\n((?:(?!^### ).)*)',
        answer_match.group(1),
        re.MULTILINE | re.DOTALL,
    )

    answer_parts = []
    # The five-section limit is applied before skipped headings are
    # filtered out, so fewer than five entries may result.
    for header, body in headers[:5]:
        header = header.strip()
        if 'Przykład' in header or 'Mnemonic' in header or '"' in header:
            continue

        key_point = None

        # Preferred: the bold value under "Rozpoznawana klasa języków".
        def_match = re.search(
            r'Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*', body)
        if def_match:
            key_point = def_match.group(1).strip()

        # Next: the first bullet that starts with a bold term.
        if not key_point:
            bullet = re.search(r'[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]*)', body)
            if bullet:
                term, desc = bullet.groups()
                desc = desc.strip()
                key_point = f"{term}: {desc}" if desc else term

        # Last resort: a line after a blank line that is not a heading,
        # bullet, table row or code.
        if not key_point:
            para_match = re.search(r'\n\n([^#\n\-•|`][^\n]{20,150})', body)
            if para_match:
                key_point = para_match.group(1).strip()

        if key_point:
            answer_parts.append(f"<b>{header}</b>: {key_point}")

    if not answer_parts:
        return []

    answer = '<br><br>'.join(clean_text(p) for p in answer_parts)
    return [{'front': clean_text(main_q), 'back': answer, 'tags': base_tags}]
|
|||
|
|
|
|||
|
|
# =============================================================================
|
|||
|
|
# MAIN GENERATOR
|
|||
|
|
# =============================================================================
|
|||
|
|
|
|||
|
|
def generate_anki(use_filter=False, use_better_extract=False, main_only=False,
                  *, input_dir=None, output_dir=None):
    """Generate one Anki import file with the selected approaches.

    Args:
        use_filter: Drop cards whose answer is 100 characters or shorter.
        use_better_extract: Use ``extract_cards_better`` instead of
            ``extract_cards_basic``.
        main_only: Use ``extract_main_only``; takes precedence over
            ``use_better_extract`` when both are set.
        input_dir: Directory holding the ``*.md`` answer files. Defaults to
            the original hard-coded ``.../pytania/odpowiedzi`` location.
        output_dir: Directory for the generated file. Defaults to the
            original hard-coded ``.../pytania`` location.

    Returns:
        ``Path`` of the written ``anki_<suffix>.txt`` file.
    """
    base_dir = Path("/home/kuchy/praca_magisterska/pytania")
    odpowiedzi_dir = (
        Path(input_dir) if input_dir is not None else base_dir / "odpowiedzi"
    )
    out_dir = Path(output_dir) if output_dir is not None else base_dir

    # Determine output filename based on options
    suffix_parts = []
    if use_filter:
        suffix_parts.append("filter")
    if use_better_extract:
        suffix_parts.append("extract")
    if main_only:
        suffix_parts.append("main")
    suffix = "_".join(suffix_parts) if suffix_parts else "basic"

    output_file = out_dir / f"anki_{suffix}.txt"
    deck_name = f"Egzamin_{suffix.replace('_', '+')}"

    all_cards = []
    for md_file in sorted(odpowiedzi_dir.glob("*.md")):
        if main_only:
            # Approach 3: Only main questions
            cards = extract_main_only(md_file)
        elif use_better_extract:
            # Approach 2: Better extraction
            cards = extract_cards_better(md_file)
        else:
            # Basic extraction
            cards = extract_cards_basic(md_file)
        all_cards.extend(cards)

    # Approach 1: Apply filtering if requested
    if use_filter:
        all_cards = apply_strict_filter(all_cards, min_length=100)

    # Remove duplicates: cards sharing the first 80 characters of the front
    # are treated as the same card; the first occurrence wins.
    seen = set()
    unique = []
    for card in all_cards:
        key = card['front'][:80]
        if key not in seen:
            seen.add(key)
            unique.append(card)

    # Write output: header directives, then one tab-separated note per line.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(f"#separator:Tab\n#html:true\n#notetype:Basic\n#deck:{deck_name}\n\n")
        for card in unique:
            f.write(f"{card['front']}\t{card['back']}\t{card['tags']}\n")

    # Statistics
    lengths = [len(card['back']) for card in unique]
    short = sum(1 for n in lengths if n < 50)
    medium = sum(1 for n in lengths if 50 <= n < 150)
    good = sum(1 for n in lengths if n >= 150)

    print(f"✅ Generated: {output_file.name}")
    print(f" Cards: {len(unique)}")
    print(f" Quality: {short} short / {medium} medium / {good} good")
    print()

    return output_file
|
|||
|
|
|
|||
|
|
def main():
    """Parse the command line and generate one deck or all seven variants."""
    parser = argparse.ArgumentParser(description="Generate Anki flashcards with modular approaches")
    switches = (
        ('--filter', 'Approach 1: Strict filtering (>100 chars)'),
        ('--extract', 'Approach 2: Better extraction algorithm'),
        ('--main-only', 'Approach 3: Main exam questions only'),
        ('--all-combinations', 'Generate all 7 combinations'),
    )
    for switch, help_text in switches:
        parser.add_argument(switch, action='store_true', help=help_text)

    args = parser.parse_args()

    if not args.all_combinations:
        # Single run with exactly the switches given on the command line.
        generate_anki(
            use_filter=args.filter,
            use_better_extract=args.extract,
            main_only=args.main_only
        )
        return

    # Generate all 7 combinations
    rule = "=" * 60
    print(rule)
    print("Generating all 7 combinations...")
    print(rule + "\n")

    # (filter, extract, main) in the order the approaches are numbered.
    combinations = [
        (True, False, False),   # 1: Filter only
        (False, True, False),   # 2: Extract only
        (False, False, True),   # 3: Main only
        (True, True, False),    # 4: Filter + Extract
        (True, False, True),    # 5: Filter + Main
        (False, True, True),    # 6: Extract + Main
        (True, True, True),     # 7: All three
    ]

    for index, (flt, ext, mon) in enumerate(combinations, 1):
        print(f"--- Combination {index} (filter={flt}, extract={ext}, main={mon}) ---")
        generate_anki(use_filter=flt, use_better_extract=ext, main_only=mon)
|
|||
|
|
|
|||
|
|
# Script entry point: run the CLI only on direct execution, not on import.
if __name__ == "__main__":
    main()
|