testsAndMisc-archive/python_pkg/praca_magisterska_video/generate_images/generate_anki.py
Krzysztof kuhy Rudnicki be31e9abd7 refactor(praca_magisterska_video): fix ruff violations and remove noqa from diagram generators
- Add type annotations, docstrings, and constants
- Remove commented-out code and print statements
- Fix all lint issues in 11 generate_images files
2026-03-13 20:52:27 +01:00

317 lines
8.6 KiB
Python
Executable File

#!/usr/bin/env python3
"""Generate Anki flashcards from exam questions in odpowiedzi/ folder.
Creates a tab-separated file compatible with Anki import.
"""
from __future__ import annotations
import logging
from pathlib import Path
import re
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Subsection bodies shorter than this many characters yield no card.
MIN_BODY_LENGTH = 50
# Exclusive lower/upper bounds on the length of a definition that may be
# added to the main card's answer.
MIN_DEFINITION_LENGTH = 20
MAX_DEFINITION_LENGTH = 200
# NOTE: despite the name, this is used as the slice bound limiting how many
# bullets of a subsection are kept in an answer.
MIN_BULLET_COUNT = 5
# Subsection headers shorter than this many characters are skipped.
MIN_SUBSECTION_LENGTH = 5
# Formula text must be longer than this many characters to become a card.
MIN_FORMULA_LENGTH = 20
def _get_metadata(
filepath: str,
) -> tuple[str, str, str, str, str]:
"""Extract metadata from file."""
with Path(filepath).open(encoding="utf-8") as f:
content = f.read()
filename = Path(filepath).name
match = re.match(r"(\d+)-(.+)\.md", filename)
if match:
num = match.group(1)
topic = match.group(2).replace("-", "_")
else:
num = "00"
topic = "unknown"
title_match = re.search(r"^# (.+)$", content, re.MULTILINE)
title = title_match.group(1) if title_match else "Unknown"
question_match = re.search(
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*',
content,
re.DOTALL,
)
if question_match:
main_question = question_match.group(1).strip()
main_question = re.sub(r"\s+", " ", main_question)
else:
main_question = title
return num, topic, title, main_question, content
def _extract_main_card(
    content: str,
    main_question: str,
    subject: str,
    num: str,
    topic: str,
) -> list[dict[str, str]]:
    """Build the card for a file's main exam question.

    The answer is assembled from up to five ``###`` headings found in the
    ``## 📚 Odpowiedź główna`` section, followed by "**term** - definition"
    pairs drawn from the first three such pairs anywhere in ``content``
    whose definition length lies strictly between
    ``MIN_DEFINITION_LENGTH`` and ``MAX_DEFINITION_LENGTH``. At most eight
    parts are kept and joined with ``<br>``.

    Args:
        content: Full markdown text of the answer file.
        main_question: Text used as the card front.
        subject: Subject name, added to the tags.
        num: Question number, added to the tags as ``pytanie_<num>``.
        topic: Topic slug, added to the tags.

    Returns:
        A one-element list holding the card (``question``, ``answer`` and
        ``tags`` keys), or an empty list when nothing could be extracted.
    """
    answer_parts: list[str] = []

    main_answer = re.search(
        r"## 📚 Odpowiedź główna\s*\n(.+?)"
        r"(?=\n## |\n---\s*\n## |\Z)",
        content,
        re.DOTALL,
    )
    if main_answer:
        headers = re.findall(r"### (.+)", main_answer.group(1))
        answer_parts.extend(headers[:5])

    # The separator class lists hyphen, en dash, em dash and colon
    # explicitly. Writing "--:" inside the brackets would instead be parsed
    # as the range "-" through ":", which also admits ".", "/" and digits
    # as separators and produces bogus term/definition pairs.
    definitions = re.findall(
        r"\*\*([^*]+)\*\*\s*[\u2013\u2014:-]\s*([^*\n]+)", content
    )
    for term, definition in definitions[:3]:
        if MIN_DEFINITION_LENGTH < len(definition) < MAX_DEFINITION_LENGTH:
            answer_parts.append(f"{term}: {definition.strip()}")

    if not answer_parts:
        return []

    return [
        {
            "question": main_question,
            "answer": "<br>".join(answer_parts[:8]),
            "tags": (
                f"egzamin_magisterski pytanie_{num}"
                f" {subject} {topic}"
            ),
        }
    ]
def _extract_subsection_answer(body_clean: str) -> str | None:
    """Derive a short answer from the body of a markdown subsection.

    Bold-led bullets (``- **term**: description``) take priority: up to
    ``MIN_BULLET_COUNT`` of them are rendered as ``term: description`` (or
    just ``term`` when there is no description) and joined with ``<br>``.
    Otherwise the first paragraph that is neither a code fence nor a table
    is returned with ``**``/``*`` emphasis markers removed, truncated to
    400 characters.

    Args:
        body_clean: Subsection body with surrounding whitespace removed.

    Returns:
        The answer text, or ``None`` when the body offers neither bullets
        nor a usable paragraph.
    """
    # Only spaces, tabs and colons may separate the term from its
    # description. Allowing arbitrary whitespace here would let a bullet
    # without a description swallow the following line as its description.
    bullets = re.findall(
        r"[-•]\s*\*\*(.+?)\*\*[: \t]*([^\n]+)?", body_clean
    )
    if bullets:
        rendered: list[str] = []
        for term, description in bullets[:MIN_BULLET_COUNT]:
            description = description.strip()
            rendered.append(f"{term}: {description}" if description else term)
        return "<br>".join(rendered)

    # Strip each paragraph before testing its prefix so that an indented
    # code fence or table is still recognised and skipped.
    paragraphs = [
        stripped
        for stripped in (chunk.strip() for chunk in body_clean.split("\n\n"))
        if stripped and not stripped.startswith(("```", "|"))
    ]
    if not paragraphs:
        return None

    first_para = re.sub(r"\*\*(.+?)\*\*", r"\1", paragraphs[0])
    first_para = re.sub(r"\*(.+?)\*", r"\1", first_para)
    return first_para[:400]
def _extract_sub_cards(
    content: str,
    title: str,
    subject: str,
    num: str,
    topic: str,
) -> list[dict[str, str]]:
    """Build one detail card per usable ``###`` subsection.

    A subsection is a ``###`` heading (optionally numbered), a blank line,
    and the text up to the next ``###``/``##`` heading, ``---`` rule or end
    of input. Subsections are skipped when the heading is shorter than
    ``MIN_SUBSECTION_LENGTH``, starts with ``Przykład``, the body is shorter
    than ``MIN_BODY_LENGTH``, or no answer can be derived from the body.

    Args:
        content: Full markdown text of the answer file.
        title: File title; used to qualify generic headings.
        subject: Subject name, added to the tags.
        num: Question number, added to the tags as ``pytanie_<num>``.
        topic: Topic slug, added to the tags.

    Returns:
        A list of cards, each a dict with ``question``, ``answer`` and
        ``tags`` keys. May be empty.
    """
    # The heading is matched with [^\n]+ rather than a DOTALL-enabled ".",
    # so it can never run past its own line and drag body text into the
    # question when the blank line after the heading is missing.
    subsections = re.findall(
        r"### (\d+\.\s+)?([^\n]+)\n\n(.+?)"
        r"(?=\n### |\n## |\n---|\Z)",
        content,
        re.DOTALL,
    )
    tags = (
        f"egzamin_magisterski pytanie_{num}"
        f" {subject} {topic} szczegoly"
    )

    cards: list[dict[str, str]] = []
    for _, raw_header, body in subsections:
        # Trailing spaces would otherwise defeat the "?" suffix check below.
        header = raw_header.strip()
        if len(header) < MIN_SUBSECTION_LENGTH or header.startswith("Przykład"):
            continue

        body_clean = body.strip()
        if len(body_clean) < MIN_BODY_LENGTH:
            continue

        answer_text = _extract_subsection_answer(body_clean)
        if not answer_text:
            continue

        if any(
            kw in header
            for kw in ("Charakterystyka", "Definicja", "Właściwości")
        ):
            # Generic headings are meaningless on their own, so qualify
            # them with the file title minus its "Pytanie <n>:" prefix.
            parent = title.replace("Pytanie", "").strip(": 0123456789")
            sub_question = f"{header} - {parent}"
        elif header.endswith("?"):
            sub_question = header
        else:
            sub_question = f"Co to jest {header}?"

        cards.append(
            {
                "question": sub_question,
                "answer": answer_text,
                "tags": tags,
            }
        )
    return cards
def _extract_formula_cards(
    content: str,
    subject: str,
    num: str,
) -> list[dict[str, str]]:
    """Build cards for bold-labelled formulas, theorems and definitions.

    A candidate is a bold label made of letters and whitespace that ends
    in one of ``formuła``, ``wzór``, ``twierdzenie``, ``definicja`` or
    ``lemat`` (case-insensitive), followed by text running up to the next
    blank line, the next line starting with ``**``, or the end of input.
    Only candidates whose text is longer than ``MIN_FORMULA_LENGTH`` become
    cards; the answer is truncated to 300 characters.

    Args:
        content: Full markdown text of the answer file.
        subject: Subject name, added to the tags.
        num: Question number, added to the tags as ``pytanie_<num>``.

    Returns:
        A list of cards, each a dict with ``question``, ``answer`` and
        ``tags`` keys. May be empty.
    """
    # [^\W\d_] matches any Unicode letter. An ASCII-only class would reject
    # labels containing Polish diacritics, even though the keywords
    # themselves ("formuła", "wzór") already rely on them.
    formulas = re.findall(
        r"\*\*((?:[^\W\d_]|\s)+"
        r"(?:formuła|wzór|twierdzenie|definicja|lemat))"
        r"\*\*[:\s]*\n?(.+?)(?=\n\n|\n\*\*|\Z)",
        content,
        re.IGNORECASE | re.DOTALL,
    )
    tags = (
        f"egzamin_magisterski pytanie_{num}"
        f" {subject} formuly"
    )
    return [
        {
            "question": f"Podaj {formula_name.strip()}",
            "answer": formula_content.strip()[:300],
            "tags": tags,
        }
        for formula_name, formula_content in formulas
        if len(formula_content) > MIN_FORMULA_LENGTH
    ]
def extract_question_and_answer(
    filepath: str | Path,
) -> list[dict[str, str]]:
    """Extract every flashcard that can be derived from one markdown file.

    The result is the main-question card, followed by the per-subsection
    detail cards, followed by the formula/definition cards. The subject
    used in the tags is the first word after ``Przedmiot:`` in the file,
    or ``"Ogólne"`` when that marker is absent.

    Args:
        filepath: Location of the markdown file, as a string or ``Path``.

    Returns:
        A list of cards, each a dict with ``question``, ``answer`` and
        ``tags`` keys. May be empty.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Normalise to str so the call matches the helper's declared parameter
    # type regardless of what the caller handed in.
    num, topic, title, main_question, content = _get_metadata(str(filepath))

    subject_match = re.search(r"Przedmiot:\s*(\w+)", content)
    subject = subject_match.group(1) if subject_match else "Ogólne"

    return [
        *_extract_main_card(content, main_question, subject, num, topic),
        *_extract_sub_cards(content, title, subject, num, topic),
        *_extract_formula_cards(content, subject, num),
    ]
def clean_for_anki(text: str) -> str:
    """Prepare a card field for a tab-separated, HTML-enabled Anki import.

    The field is flattened to a single line: tabs become spaces, line
    breaks (LF, CRLF or CR) become ``<br>``, markdown ``**bold**`` and
    ``*italic*`` become ``<b>``/``<i>`` tags, and double quotes become
    ``&quot;``. Runs of ``<br>`` are collapsed to one, even when separated
    only by whitespace, and leading or trailing ``<br>`` and whitespace
    are removed. Existing HTML tags in the input are left untouched.

    Args:
        text: Raw question or answer text.

    Returns:
        The cleaned single-line field.
    """
    # Tabs are the column separator of the output file.
    text = text.replace("\t", " ")
    # Normalise CR/CRLF first so no carriage return survives in the field.
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Bold must be converted before italic: the italic pattern would
    # otherwise consume the asterisks of a "**bold**" span.
    text = re.sub(r"\*\*(.+?)\*\*", r"<b>\1</b>", text)
    text = re.sub(r"\*(.+?)\*", r"<i>\1</i>", text)

    text = text.replace("\n", "<br>")
    # Collapse consecutive breaks, treating whitespace-only lines between
    # them as part of the run, and drop padding around each break.
    text = re.sub(r"\s*(?:<br>\s*)+", "<br>", text)
    text = re.sub(r"^(?:<br>)+|(?:<br>)+$", "", text)

    text = text.replace('"', "&quot;")
    return text.strip()
def main(
    odpowiedzi_dir: str | Path | None = None,
    output_file: str | Path | None = None,
) -> None:
    """Convert every ``*.md`` answer file into one Anki import file.

    Files are processed in sorted order. A file that cannot be read or
    decoded is logged with its traceback and skipped; the remaining files
    are still processed. The output starts with Anki's ``#key:value``
    header lines, followed by one ``front<TAB>back<TAB>tags`` line per card.

    Args:
        odpowiedzi_dir: Directory holding the markdown answers. Defaults to
            ``/home/kuchy/praca_magisterska/pytania/odpowiedzi``.
        output_file: Destination of the generated file. Defaults to
            ``/home/kuchy/praca_magisterska/pytania/anki_egzamin_magisterski.txt``.

    Raises:
        OSError: If the output file cannot be written.
    """
    source_dir = Path(
        "/home/kuchy/praca_magisterska/pytania/odpowiedzi"
        if odpowiedzi_dir is None
        else odpowiedzi_dir
    )
    target = Path(
        "/home/kuchy/praca_magisterska/pytania/anki_egzamin_magisterski.txt"
        if output_file is None
        else output_file
    )

    all_cards: list[dict[str, str]] = []
    for md_file in sorted(source_dir.glob("*.md")):
        logger.info("Processing: %s", md_file.name)
        try:
            cards = extract_question_and_answer(str(md_file))
        except (ValueError, OSError):
            # Best effort: one unreadable file must not abort the export,
            # but the failure is reported at error level with a traceback.
            logger.exception(" -> Error while processing %s", md_file.name)
            continue
        all_cards.extend(cards)
        logger.info(" -> Extracted %d cards", len(cards))

    with target.open("w", encoding="utf-8") as f:
        # File-level directives read by Anki's text importer.
        f.write("#separator:tab\n")
        f.write("#html:true\n")
        f.write("#columns:Front\tBack\tTags\n")
        f.write("#deck:Egzamin Magisterski ISY\n")
        f.write("#notetype:Basic\n")
        f.write("\n")
        for card in all_cards:
            front = clean_for_anki(card["question"])
            back = clean_for_anki(card["answer"])
            tags = card["tags"]
            f.write(f"{front}\t{back}\t{tags}\n")

    logger.info("Created %d flashcards", len(all_cards))
    logger.info("Output: %s", target)
    logger.info("To import into Anki:")
    logger.info("1. Open Anki -> File -> Import")
    logger.info("2. Select the .txt file")
    logger.info("3. Verify 'Allow HTML' is checked")
    logger.info("4. Click Import")
if __name__ == "__main__":
    # With no logging configuration only WARNING and above reach the
    # console, so the INFO progress messages emitted by main() would be
    # silently dropped when the script is run directly.
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    main()