mirror of
https://github.com/kuhyx/testsAndMisc.git
synced 2026-07-04 15:03:01 +02:00
- Add type annotations, docstrings, and constants - Remove commented-out code and print statements - Fix all lint issues in 11 generate_images files
317 lines
8.6 KiB
Python
Executable File
317 lines
8.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Generate Anki flashcards from exam questions in odpowiedzi/ folder.
|
|
|
|
Creates a tab-separated file compatible with Anki import.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
import re
|
|
|
|
# Module-level logger; every progress and error message in this script is
# emitted through it.
logger = logging.getLogger(__name__)

# Sub-section bodies shorter than this many characters are skipped when
# building sub-concept cards.
MIN_BODY_LENGTH = 50
# A "**term** <separator> definition" text is added to the main card only when
# its length is strictly between these two bounds.
MIN_DEFINITION_LENGTH = 20
MAX_DEFINITION_LENGTH = 200
# NOTE(review): despite the MIN_ prefix this value is used as the *upper*
# limit on how many bullets are copied into a sub-section answer.
MIN_BULLET_COUNT = 5
# Sub-section headers shorter than this many characters are skipped.
MIN_SUBSECTION_LENGTH = 5
# Formula/definition text must be longer than this to become a card.
MIN_FORMULA_LENGTH = 20
|
|
|
|
|
|
def _get_metadata(
    filepath: str | Path,
) -> tuple[str, str, str, str, str]:
    """Read one answer file and extract its identifying metadata.

    Args:
        filepath: Location of the markdown file, as a string or a ``Path``.
            The file name is expected to look like ``<number>-<topic>.md``.

    Returns:
        A ``(num, topic, title, main_question, content)`` tuple where

        * ``num`` and ``topic`` come from the file name (dashes in the topic
          become underscores); they fall back to ``"00"`` / ``"unknown"``
          when the name does not match the expected pattern,
        * ``title`` is the first level-1 markdown heading, or ``"Unknown"``,
        * ``main_question`` is the bold text under ``## Pytanie`` with all
          whitespace runs collapsed to single spaces, or ``title`` when no
          such section exists,
        * ``content`` is the full text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    path = Path(filepath)
    content = path.read_text(encoding="utf-8")

    name_match = re.match(r"(\d+)-(.+)\.md", path.name)
    if name_match:
        num = name_match.group(1)
        topic = name_match.group(2).replace("-", "_")
    else:
        num, topic = "00", "unknown"

    title_match = re.search(r"^# (.+)$", content, re.MULTILINE)
    title = title_match.group(1) if title_match else "Unknown"

    # The question is the bold text that follows the "## Pytanie" heading;
    # optional surrounding quotes are dropped, and DOTALL lets it wrap lines.
    question_match = re.search(
        r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*',
        content,
        re.DOTALL,
    )
    if question_match:
        main_question = re.sub(r"\s+", " ", question_match.group(1).strip())
    else:
        main_question = title

    return num, topic, title, main_question, content
|
|
|
|
|
|
def _extract_main_card(
    content: str,
    main_question: str,
    subject: str,
    num: str,
    topic: str,
) -> list[dict[str, str]]:
    """Build the overview card for one exam question.

    The answer is assembled from up to five ``###`` headings found in the
    "Odpowiedź główna" section, followed by up to three
    ``**term** <separator> definition`` pairs found anywhere in ``content``
    whose definition length lies strictly between ``MIN_DEFINITION_LENGTH``
    and ``MAX_DEFINITION_LENGTH``. At most eight lines are kept in total.

    Args:
        content: Full markdown text of the answer file.
        main_question: Text used as the card front.
        subject: Subject name, written into the tags.
        num: Question number, written into the tags.
        topic: Topic slug, written into the tags.

    Returns:
        A one-element list holding the card (``question``, ``answer`` and
        ``tags`` keys), or an empty list when nothing usable was found.
    """
    answer_parts: list[str] = []

    main_answer = re.search(
        r"## 📚 Odpowiedź główna\s*\n(.+?)"
        r"(?=\n## |\n---\s*\n## |\Z)",
        content,
        re.DOTALL,
    )
    if main_answer:
        headers = re.findall(r"### (.+)", main_answer.group(1))
        answer_parts.extend(f"• {h}" for h in headers[:5])

    # The separator class lists hyphen, en dash, em dash and colon explicitly.
    # Writing it as ``[--:]`` would be parsed as the *range* "-" through ":",
    # which also accepts ".", "/" and every digit as a separator.
    definitions = re.findall(
        r"\*\*([^*]+)\*\*\s*[-\u2013\u2014:]\s*([^*\n]+)",
        content,
    )
    for term, definition in definitions[:3]:
        if MIN_DEFINITION_LENGTH < len(definition) < MAX_DEFINITION_LENGTH:
            answer_parts.append(f"• {term}: {definition.strip()}")

    if not answer_parts:
        return []

    return [
        {
            "question": main_question,
            "answer": "<br>".join(answer_parts[:8]),
            "tags": (
                f"egzamin_magisterski pytanie_{num}"
                f" {subject} {topic}"
            ),
        }
    ]
|
|
|
|
|
|
def _extract_subsection_answer(body_clean: str) -> str | None:
    """Derive a short answer from the body of one ``###`` sub-section.

    Bold-led bullet items (``- **name**: detail``) are preferred: the first
    ``MIN_BULLET_COUNT`` of them are joined with ``<br>``. When there are no
    such bullets, the first prose paragraph is used instead, with markdown
    bold/italic markers removed and the text cut to 400 characters.

    Args:
        body_clean: Sub-section body with surrounding whitespace removed.

    Returns:
        The answer text, or ``None`` when the body holds neither bullets nor
        a prose paragraph (for example only code fences or tables).
    """
    bullets = re.findall(
        r"[-•]\s*\*\*(.+?)\*\*[:\s]*([^\n]+)?", body_clean
    )
    if bullets:
        return "<br>".join(
            f"• {name}: {detail.strip()}" if detail else f"• {name}"
            for name, detail in bullets[:MIN_BULLET_COUNT]
        )

    for raw in body_clean.split("\n\n"):
        # Strip *before* testing the prefix: after splitting on blank lines a
        # chunk may still begin with a newline, which would otherwise let a
        # code fence or table row slip through the filter.
        paragraph = raw.strip()
        if not paragraph or paragraph.startswith(("```", "|")):
            continue
        paragraph = re.sub(r"\*\*(.+?)\*\*", r"\1", paragraph)
        paragraph = re.sub(r"\*(.+?)\*", r"\1", paragraph)
        return paragraph[:400]

    return None
|
|
|
|
|
|
def _extract_sub_cards(
    content: str,
    title: str,
    subject: str,
    num: str,
    topic: str,
) -> list[dict[str, str]]:
    """Build one card per usable ``###`` sub-section.

    A sub-section is a ``### `` heading (optionally numbered, e.g. ``1. ``)
    followed by a blank line and a body that runs until the next ``### ``,
    ``## ``, ``---`` or the end of the text. Sections are skipped when the
    heading is shorter than ``MIN_SUBSECTION_LENGTH``, starts with
    "Przykład", the body is shorter than ``MIN_BODY_LENGTH``, or no answer
    can be derived from the body.

    Args:
        content: Full markdown text of the answer file.
        title: Document title; used to give context to generic headings.
        subject: Subject name, written into the tags.
        num: Question number, written into the tags.
        topic: Topic slug, written into the tags.

    Returns:
        A list of card dicts with ``question``, ``answer`` and ``tags`` keys.
    """
    cards: list[dict[str, str]] = []

    # The heading is matched with ``[^\n]+`` so it can never extend past its
    # own line; under DOTALL a ``.+?`` here would swallow following lines
    # whenever the heading is not immediately followed by a blank line.
    subsections = re.findall(
        r"### (?:\d+\.\s+)?([^\n]+)\n\n(.+?)"
        r"(?=\n### |\n## |\n---|\Z)",
        content,
        re.DOTALL,
    )

    for header, body in subsections:
        if len(header) < MIN_SUBSECTION_LENGTH or header.startswith("Przykład"):
            continue

        body_clean = body.strip()
        if len(body_clean) < MIN_BODY_LENGTH:
            continue

        answer_text = _extract_subsection_answer(body_clean)
        if not answer_text:
            continue

        if any(
            kw in header
            for kw in ("Charakterystyka", "Definicja", "Właściwości")
        ):
            # Generic headings are meaningless alone, so append the document
            # title with "Pytanie" removed and leading/trailing digits,
            # colons and spaces trimmed.
            parent = title.replace("Pytanie", "").strip(": 0123456789")
            sub_question = f"{header} - {parent}"
        elif header.endswith("?"):
            sub_question = header
        else:
            sub_question = f"Co to jest {header}?"

        cards.append(
            {
                "question": sub_question,
                "answer": answer_text,
                "tags": (
                    f"egzamin_magisterski pytanie_{num}"
                    f" {subject} {topic} szczegoly"
                ),
            }
        )

    return cards
|
|
|
|
|
|
def _extract_formula_cards(
    content: str,
    subject: str,
    num: str,
) -> list[dict[str, str]]:
    """Build cards for named formulas, theorems, definitions and lemmas.

    A candidate is a bold name made of letters and whitespace that ends with
    one of the keywords "formuła", "wzór", "twierdzenie", "definicja" or
    "lemat" (case-insensitive). The text after it, up to the next blank line,
    the next line starting with ``**`` or the end of the input, becomes the
    answer when it is longer than ``MIN_FORMULA_LENGTH``; the answer is cut to
    300 characters.

    Args:
        content: Full markdown text of the answer file.
        subject: Subject name, written into the tags.
        num: Question number, written into the tags.

    Returns:
        A list of card dicts with ``question``, ``answer`` and ``tags`` keys.
    """
    # Letters are matched with ``[^\W\d_]`` (any Unicode letter) instead of
    # ``[A-Za-z]`` so that names containing Polish diacritics are accepted,
    # just like the keywords themselves.
    formulas = re.findall(
        r"\*\*((?:[^\W\d_]|\s)+"
        r"(?:formuła|wzór|twierdzenie|definicja|lemat))"
        r"\*\*[:\s]*\n?(.+?)(?=\n\n|\n\*\*|\Z)",
        content,
        re.IGNORECASE | re.DOTALL,
    )
    return [
        {
            "question": f"Podaj {formula_name.strip()}",
            "answer": formula_content.strip()[:300],
            "tags": (
                f"egzamin_magisterski pytanie_{num}"
                f" {subject} formuly"
            ),
        }
        for formula_name, formula_content in formulas
        if len(formula_content) > MIN_FORMULA_LENGTH
    ]
|
|
|
|
|
|
def extract_question_and_answer(
    filepath: str | Path,
) -> list[dict[str, str]]:
    """Extract every flashcard that can be derived from one markdown file.

    Args:
        filepath: Location of the answer file, as a string or a ``Path``.

    Returns:
        The main question card (if any), followed by the sub-section cards
        and the formula cards. Each card is a dict with ``question``,
        ``answer`` and ``tags`` keys.

    Raises:
        OSError: If the file cannot be read.
        ValueError: If the file is not valid UTF-8 (``UnicodeDecodeError``).
    """
    # Pass a plain string so the helper always receives the type it declares.
    num, topic, title, main_question, content = _get_metadata(str(filepath))

    # The subject is the first word after "Przedmiot:"; it becomes a tag.
    subject_match = re.search(r"Przedmiot:\s*(\w+)", content)
    subject = subject_match.group(1) if subject_match else "Ogólne"

    return [
        *_extract_main_card(content, main_question, subject, num, topic),
        *_extract_sub_cards(content, title, subject, num, topic),
        *_extract_formula_cards(content, subject, num),
    ]
|
|
|
|
|
|
def clean_for_anki(text: str) -> str:
    """Convert card text into a single HTML field safe for a tab-separated file.

    Steps, in order: tabs become spaces (a tab is the column separator),
    markdown ``**bold**`` / ``*italic*`` become ``<b>`` / ``<i>``, newlines
    become ``<br>``, runs of ``<br>`` are collapsed, one leading and one
    trailing ``<br>`` are dropped, and double quotes are written as the
    ``&quot;`` entity.

    Args:
        text: Raw question or answer text.

    Returns:
        The cleaned text with surrounding whitespace removed.
    """
    # A literal tab would split the field into two columns on import.
    text = text.replace("\t", " ")
    # Bold must be converted before italic, otherwise "**x**" would be
    # consumed by the single-asterisk pattern.
    text = re.sub(r"\*\*(.+?)\*\*", r"<b>\1</b>", text)
    text = re.sub(r"\*(.+?)\*", r"<i>\1</i>", text)
    # Each card has to stay on one physical line.
    text = text.replace("\n", "<br>")
    text = re.sub(r"(<br>)+", "<br>", text)
    text = re.sub(r"^<br>|<br>$", "", text)
    # Use the HTML entity so a raw double quote never reaches the importer.
    text = text.replace('"', "&quot;")
    return text.strip()
|
|
|
|
|
|
def main(
    odpowiedzi_dir: str | Path | None = None,
    output_file: str | Path | None = None,
) -> None:
    """Generate the Anki import file from every ``*.md`` answer file.

    Files are processed in sorted order. A file that cannot be read or
    decoded is reported and skipped; the remaining files are still processed.

    Args:
        odpowiedzi_dir: Directory holding the answer files. Defaults to the
            author's ``pytania/odpowiedzi`` folder.
        output_file: Destination of the tab-separated import file. Defaults
            to ``anki_egzamin_magisterski.txt`` in the ``pytania`` folder.

    Raises:
        OSError: If the output file cannot be written.
    """
    # Without a configured handler the INFO-level progress messages below are
    # dropped. basicConfig does nothing if the root logger already has
    # handlers, so an embedding application keeps its own setup.
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    source_dir = Path(
        "/home/kuchy/praca_magisterska/pytania/odpowiedzi"
        if odpowiedzi_dir is None
        else odpowiedzi_dir
    )
    target = Path(
        "/home/kuchy/praca_magisterska/pytania/anki_egzamin_magisterski.txt"
        if output_file is None
        else output_file
    )

    all_cards: list[dict[str, str]] = []

    for md_file in sorted(source_dir.glob("*.md")):
        logger.info("Processing: %s", md_file.name)
        try:
            cards = extract_question_and_answer(md_file)
        except (ValueError, OSError) as e:
            # Best effort: report the broken file and carry on with the rest.
            logger.error(" -> Error: %s", e)
        else:
            all_cards.extend(cards)
            logger.info(" -> Extracted %d cards", len(cards))

    # File headers understood by Anki's text importer, then a blank line.
    header = (
        "#separator:tab\n"
        "#html:true\n"
        "#columns:Front\tBack\tTags\n"
        "#deck:Egzamin Magisterski ISY\n"
        "#notetype:Basic\n"
        "\n"
    )
    with target.open("w", encoding="utf-8") as f:
        f.write(header)
        for card in all_cards:
            front = clean_for_anki(card["question"])
            back = clean_for_anki(card["answer"])
            tags = card["tags"]
            f.write(f"{front}\t{back}\t{tags}\n")

    logger.info("Created %d flashcards", len(all_cards))
    logger.info("Output: %s", target)
    logger.info("To import into Anki:")
    logger.info("1. Open Anki -> File -> Import")
    logger.info("2. Select the .txt file")
    logger.info("3. Verify 'Allow HTML' is checked")
    logger.info("4. Click Import")
|
|
|
|
|
|
# Run the generator only when the file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|