refactor(praca_magisterska_video): fix ruff violations and remove noqa from diagram generators

- Add type annotations, docstrings, and constants
- Remove commented-out code and print statements
- Fix all lint issues in 11 generate_images files
This commit is contained in:
Krzysztof kuhy Rudnicki 2026-03-13 20:52:27 +01:00
parent 1e108d1e3f
commit d488c87203
11 changed files with 2726 additions and 1346 deletions

View File

@ -7,11 +7,17 @@
from __future__ import annotations
import logging
from pathlib import Path
import re
logger = logging.getLogger(__name__)
def clean_text(text) -> str:
MIN_BODY_LENGTH = 50
MIN_ANSWER_LENGTH = 100
def clean_text(text: str) -> str:
"""Clean text."""
if not text:
return ""
@ -23,7 +29,7 @@ def clean_text(text) -> str:
return text.strip()
def extract_cards(filepath) -> list[dict[str, str]]:
def extract_cards(filepath: str) -> list[dict[str, str]]:
"""Extract cards."""
with Path(filepath).open(encoding="utf-8") as f:
content = f.read()
@ -68,10 +74,10 @@ def extract_cards(filepath) -> list[dict[str, str]]:
content,
re.MULTILINE | re.DOTALL,
)
for header, body in sections:
header = header.strip()
body = body.strip()
if len(body) < 50:
for raw_header, raw_body in sections:
header = raw_header.strip()
body = raw_body.strip()
if len(body) < MIN_BODY_LENGTH:
continue
# Get first paragraph
@ -102,8 +108,10 @@ def main() -> None:
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
all_cards.extend(extract_cards(md_file))
# APPROACH 1: Strict filtering - only cards with answer > 100 chars
filtered_cards = [c for c in all_cards if len(c["back"]) > 100]
# APPROACH 1: Strict filtering - only cards with answer > threshold
filtered_cards = [
c for c in all_cards if len(c["back"]) > MIN_ANSWER_LENGTH
]
# Remove duplicates
seen = set()
@ -120,7 +128,11 @@ def main() -> None:
for c in unique:
f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n")
print(f"✅ Approach 1 (Strict Filter): {len(unique)} cards -> {output_file.name}")
logger.info(
"Approach 1 (Strict Filter): %d cards -> %s",
len(unique),
output_file.name,
)
if __name__ == "__main__":

View File

@ -7,11 +7,17 @@
from __future__ import annotations
import logging
from pathlib import Path
import re
logger = logging.getLogger(__name__)
def clean_text(text) -> str:
MIN_PARA_LENGTH = 30
MIN_BODY_LENGTH = 50
def clean_text(text: str) -> str:
"""Clean text."""
if not text:
return ""
@ -23,7 +29,7 @@ def clean_text(text) -> str:
return text.strip()
def extract_structured_content(body) -> str | None:
def extract_structured_content(body: str) -> str | None:
"""Better extraction - look for multiple content types."""
parts = []
@ -54,15 +60,14 @@ def extract_structured_content(body) -> str | None:
if p.strip()
and not p.startswith("```")
and not p.startswith("|")
and len(p.strip()) > 30
and len(p.strip()) > MIN_PARA_LENGTH
]
for p in paras[:2]:
parts.append(p[:300])
parts.extend(p[:300] for p in paras[:2])
return "<br>".join([clean_text(p) for p in parts]) if parts else None
def extract_cards(filepath) -> list[dict[str, str]]:
def extract_cards(filepath: str) -> list[dict[str, str]]:
"""Extract cards."""
with Path(filepath).open(encoding="utf-8") as f:
content = f.read()
@ -99,9 +104,9 @@ def extract_cards(filepath) -> list[dict[str, str]]:
content,
re.MULTILINE | re.DOTALL,
)
for header, body in sections:
header = header.strip()
if "Przykład" in header or '"' in header or len(body) < 50:
for raw_header, body in sections:
header = raw_header.strip()
if "Przykład" in header or '"' in header or len(body) < MIN_BODY_LENGTH:
continue
answer = extract_structured_content(body)
@ -143,8 +148,10 @@ def main() -> None:
for c in unique:
f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n")
print(
f"✅ Approach 2 (Better Extraction): {len(unique)} cards -> {output_file.name}"
logger.info(
"Approach 2 (Better Extraction): %d cards -> %s",
len(unique),
output_file.name,
)

View File

@ -7,31 +7,41 @@ Usage:
Options:
--filter Apply strict filtering (answers > 100 chars)
--extract Use improved extraction algorithm
--main-only Only generate main exam questions (45 comprehensive cards)
--main-only Only generate main exam questions
Combinations:
python anki_generator.py # Basic extraction, no filter
python anki_generator.py --filter # Approach 1: Strict filter only
python anki_generator.py --extract # Approach 2: Better extraction only
python anki_generator.py --main-only # Approach 3: Main questions only
python anki_generator.py --filter --extract # Approach 4: Filter + Better extraction
python anki_generator.py --filter --main-only # Approach 5: Filter + Main only
python anki_generator.py --extract --main-only # Approach 6: Better extraction + Main only
python anki_generator.py --filter --extract --main-only # Approach 7: All three
python anki_generator.py
python anki_generator.py --filter
python anki_generator.py --extract
python anki_generator.py --main-only
python anki_generator.py --filter --extract
python anki_generator.py --filter --main-only
python anki_generator.py --extract --main-only
python anki_generator.py --filter --extract --main-only
"""
from __future__ import annotations
import argparse
import logging
from pathlib import Path
import re
logger = logging.getLogger(__name__)
MIN_PARTS_THRESHOLD = 2
MIN_BODY_LENGTH = 50
MIN_PARA_LENGTH = 30
SHORT_THRESHOLD = 50
MEDIUM_THRESHOLD = 150
DEFAULT_MIN_ANSWER_LENGTH = 100
# =============================================================================
# SHARED UTILITIES
# =============================================================================
def clean_text(text) -> str:
def clean_text(text: str) -> str:
"""Clean and format text for Anki."""
if not text:
return ""
@ -43,7 +53,7 @@ def clean_text(text) -> str:
return text.strip()
def get_file_metadata(filepath) -> tuple[str, str, str]:
def get_file_metadata(filepath: str) -> tuple[str, str, str]:
"""Extract question number and subject from filename."""
filename = Path(filepath).name
match = re.match(r"(\d+)-(.+)\.md", filename)
@ -58,7 +68,7 @@ def get_file_metadata(filepath) -> tuple[str, str, str]:
return num, subject, content
def get_main_question(content) -> str | None:
def get_main_question(content: str) -> str | None:
"""Extract the main exam question."""
q_match = re.search(
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
@ -73,7 +83,10 @@ def get_main_question(content) -> str | None:
# =============================================================================
def apply_strict_filter(cards, min_length=100) -> list[dict[str, str]]:
def apply_strict_filter(
cards: list[dict[str, str]],
min_length: int = DEFAULT_MIN_ANSWER_LENGTH,
) -> list[dict[str, str]]:
"""Filter cards to only include those with answers > min_length characters."""
return [c for c in cards if len(c["back"]) > min_length]
@ -83,7 +96,7 @@ def apply_strict_filter(cards, min_length=100) -> list[dict[str, str]]:
# =============================================================================
def extract_structured_content(body) -> str | None:
def extract_structured_content(body: str) -> str | None:
"""Improved extraction - multiple content types with better formatting."""
parts = []
@ -101,7 +114,7 @@ def extract_structured_content(body) -> str | None:
parts.append(f"• <b>{term}</b>")
# 3. Key-value patterns
if len(parts) < 2:
if len(parts) < MIN_PARTS_THRESHOLD:
kvs = re.findall(r"\*\*([^*\n]+)\*\*\s*[--:]\s*([^\n*]{10,150})", body)
for k, v in kvs[:4]:
entry = f"<b>{k.strip()}</b>: {v.strip()}"
@ -116,15 +129,14 @@ def extract_structured_content(body) -> str | None:
if p.strip()
and not p.startswith("```")
and not p.startswith("|")
and len(p.strip()) > 30
and len(p.strip()) > MIN_PARA_LENGTH
]
for p in paras[:2]:
parts.append(p[:300])
parts.extend(p[:300] for p in paras[:2])
return "<br>".join([clean_text(p) for p in parts]) if parts else None
def extract_cards_better(filepath) -> list[dict[str, str]]:
def extract_cards_better(filepath: str) -> list[dict[str, str]]:
"""Extract cards with improved algorithm."""
num, subject, content = get_file_metadata(filepath)
base_tags = f"egzamin pyt{num} {subject}"
@ -153,13 +165,13 @@ def extract_cards_better(filepath) -> list[dict[str, str]]:
content,
re.MULTILINE | re.DOTALL,
)
for header, body in sections:
header = header.strip()
for raw_header, body in sections:
header = raw_header.strip()
if (
"Przykład" in header
or '"' in header
or "Mnemonic" in header
or len(body) < 50
or len(body) < MIN_BODY_LENGTH
):
continue
@ -176,7 +188,7 @@ def extract_cards_better(filepath) -> list[dict[str, str]]:
return cards
def extract_cards_basic(filepath) -> list[dict[str, str]]:
def extract_cards_basic(filepath: str) -> list[dict[str, str]]:
"""Basic extraction - simpler algorithm."""
num, subject, content = get_file_metadata(filepath)
base_tags = f"egzamin pyt{num} {subject}"
@ -212,10 +224,10 @@ def extract_cards_basic(filepath) -> list[dict[str, str]]:
content,
re.MULTILINE | re.DOTALL,
)
for header, body in sections:
header = header.strip()
body = body.strip()
if len(body) < 50 or "Przykład" in header:
for raw_header, raw_body in sections:
header = raw_header.strip()
body = raw_body.strip()
if len(body) < MIN_BODY_LENGTH or "Przykład" in header:
continue
paras = [
@ -241,7 +253,28 @@ def extract_cards_basic(filepath) -> list[dict[str, str]]:
# =============================================================================
def extract_main_only(filepath) -> list[dict[str, str]]:
def _extract_key_point(body: str) -> str | None:
    """Return the first usable key point found in a section body.

    Three strategies are tried in order and the first one that yields
    non-empty text wins:

    1. The bold text on the line following "Rozpoznawana klasa języków".
    2. The first bullet with a bold term (``- **term**: description``);
       the description is appended when it is non-empty.
    3. The first line that follows a blank line and does not start with
       ``#``, ``-``, ``•``, ``|`` or a backtick (at most 151 characters
       of it are captured).

    Args:
        body: Markdown text of a single section.

    Returns:
        The extracted key point, or ``None`` when no strategy produces
        non-empty text.
    """
    # Strategy 1: explicit definition block.
    def_match = re.search(
        r"Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*", body
    )
    if def_match:
        definition = def_match.group(1).strip()
        # A whitespace-only match must not short-circuit the search:
        # fall through to the next strategy instead of returning "".
        if definition:
            return definition

    # Strategy 2: first bullet with a bold term.
    bullet = re.search(r"[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]*)", body)
    if bullet:
        term = bullet.group(1)
        desc = bullet.group(2).strip()
        return f"{term}: {desc}" if desc else term

    # Strategy 3: first plain paragraph line.
    para_match = re.search(r"\n\n([^#\n\-•|`][^\n]{20,150})", body)
    if para_match:
        # Honour the ``str | None`` contract for an all-whitespace line.
        return para_match.group(1).strip() or None

    return None
def extract_main_only(filepath: str) -> list[dict[str, str]]:
"""Extract only the main exam question with comprehensive answer."""
num, subject, content = get_file_metadata(filepath)
base_tags = f"egzamin pyt{num} {subject} main"
@ -255,7 +288,9 @@ def extract_main_only(filepath) -> list[dict[str, str]]:
# Get main answer section
answer_match = re.search(
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## [^<5E>]|\Z)", content, re.DOTALL
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## [^<5E>]|\Z)",
content,
re.DOTALL,
)
if answer_match:
section = answer_match.group(1)
@ -267,32 +302,16 @@ def extract_main_only(filepath) -> list[dict[str, str]]:
re.MULTILINE | re.DOTALL,
)
for header, body in headers[:5]:
header = header.strip()
if "Przykład" in header or "Mnemonic" in header or '"' in header:
for raw_header, body in headers[:5]:
header = raw_header.strip()
if (
"Przykład" in header
or "Mnemonic" in header
or '"' in header
):
continue
# Get key point from this section
key_point = None
# Try to get a definition or first bullet
def_match = re.search(
r"Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*", body
)
if def_match:
key_point = def_match.group(1).strip()
if not key_point:
bullets = re.findall(r"[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]*)", body)
if bullets:
term, desc = bullets[0]
key_point = f"{term}: {desc.strip()}" if desc.strip() else term
if not key_point:
para_match = re.search(r"\n\n([^#\n\-•|`][^\n]{20,150})", body)
if para_match:
key_point = para_match.group(1).strip()
key_point = _extract_key_point(body)
if key_point:
answer_parts.append(f"<b>{header}</b>: {key_point}")
@ -308,9 +327,58 @@ def extract_main_only(filepath) -> list[dict[str, str]]:
# =============================================================================
def generate_anki(use_filter=False, use_better_extract=False, main_only=False) -> Path:
def _collect_cards(
    odpowiedzi_dir: Path,
    *,
    use_better_extract: bool,
    main_only: bool,
) -> list[dict[str, str]]:
    """Gather cards from every ``*.md`` file directly in *odpowiedzi_dir*.

    Files are visited in sorted path order. ``main_only`` takes
    precedence over ``use_better_extract``; when neither flag is set the
    basic extractor is used.

    Returns:
        All extracted cards, concatenated in file order.
    """
    collected: list[dict[str, str]] = []
    for md_path in sorted(odpowiedzi_dir.glob("*.md")):
        # Precedence: main-only > better extraction > basic extraction.
        if main_only:
            extractor = extract_main_only
        elif use_better_extract:
            extractor = extract_cards_better
        else:
            extractor = extract_cards_basic
        collected += extractor(md_path)
    return collected
def _log_statistics(unique: list[dict[str, str]], output_file: Path) -> None:
    """Log the output file name, card count and an answer-length breakdown.

    Each card is bucketed by the length of its ``back`` field: "short"
    below ``SHORT_THRESHOLD``, "medium" from ``SHORT_THRESHOLD`` up to
    (but excluding) ``MEDIUM_THRESHOLD``, and "good" from
    ``MEDIUM_THRESHOLD`` upwards.
    """
    short = medium = good = 0
    for card in unique:
        size = len(card["back"])
        if size < SHORT_THRESHOLD:
            short += 1
        elif size < MEDIUM_THRESHOLD:
            medium += 1
        else:
            good += 1

    logger.info("Generated: %s", output_file.name)
    logger.info(" Cards: %d", len(unique))
    logger.info(
        " Quality: %d short / %d medium / %d good",
        short,
        medium,
        good,
    )
def generate_anki(
*,
use_filter: bool = False,
use_better_extract: bool = False,
main_only: bool = False,
) -> Path:
"""Generate Anki deck with specified approaches."""
odpowiedzi_dir = Path("/home/kuchy/praca_magisterska/pytania/odpowiedzi")
odpowiedzi_dir = Path(
"/home/kuchy/praca_magisterska/pytania/odpowiedzi"
)
# Determine output filename based on options
suffix_parts = []
@ -322,30 +390,25 @@ def generate_anki(use_filter=False, use_better_extract=False, main_only=False) -
suffix_parts.append("main")
suffix = "_".join(suffix_parts) if suffix_parts else "basic"
output_file = Path(f"/home/kuchy/praca_magisterska/pytania/anki_{suffix}.txt")
output_file = Path(
f"/home/kuchy/praca_magisterska/pytania/anki_{suffix}.txt"
)
deck_name = f"Egzamin_{suffix.replace('_', '+')}"
all_cards = []
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
if main_only:
# Approach 3: Only main questions
cards = extract_main_only(md_file)
elif use_better_extract:
# Approach 2: Better extraction
cards = extract_cards_better(md_file)
else:
# Basic extraction
cards = extract_cards_basic(md_file)
all_cards.extend(cards)
all_cards = _collect_cards(
odpowiedzi_dir,
use_better_extract=use_better_extract,
main_only=main_only,
)
# Approach 1: Apply filtering if requested
if use_filter:
all_cards = apply_strict_filter(all_cards, min_length=100)
all_cards = apply_strict_filter(
all_cards, min_length=DEFAULT_MIN_ANSWER_LENGTH
)
# Remove duplicates
seen = set()
seen: set[str] = set()
unique = []
for c in all_cards:
key = c["front"][:80]
@ -355,20 +418,14 @@ def generate_anki(use_filter=False, use_better_extract=False, main_only=False) -
# Write output
with Path(output_file).open("w", encoding="utf-8") as f:
f.write(f"#separator:Tab\n#html:true\n#notetype:Basic\n#deck:{deck_name}\n\n")
f.write(
"#separator:Tab\n#html:true\n"
f"#notetype:Basic\n#deck:{deck_name}\n\n"
)
for c in unique:
f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n")
# Statistics
lengths = [len(c["back"]) for c in unique]
short = sum(1 for l in lengths if l < 50)
medium = sum(1 for l in lengths if 50 <= l < 150)
good = sum(1 for l in lengths if l >= 150)
print(f"✅ Generated: {output_file.name}")
print(f" Cards: {len(unique)}")
print(f" Quality: {short} short / {medium} medium / {good} good")
print()
_log_statistics(unique, output_file)
return output_file
@ -397,9 +454,9 @@ def main() -> None:
if args.all_combinations:
# Generate all 7 combinations
print("=" * 60)
print("Generating all 7 combinations...")
print("=" * 60 + "\n")
logger.info("=" * 60)
logger.info("Generating all 7 combinations...")
logger.info("=" * 60)
combinations = [
(True, False, False), # 1: Filter only
@ -411,9 +468,22 @@ def main() -> None:
(True, True, True), # 7: All three
]
for i, (f, e, m) in enumerate(combinations, 1):
print(f"--- Combination {i} (filter={f}, extract={e}, main={m}) ---")
generate_anki(use_filter=f, use_better_extract=e, main_only=m)
for i, (f_flag, e_flag, m_flag) in enumerate(
combinations, 1
):
logger.info(
"--- Combination %d (filter=%s, extract=%s,"
" main=%s) ---",
i,
f_flag,
e_flag,
m_flag,
)
generate_anki(
use_filter=f_flag,
use_better_extract=e_flag,
main_only=m_flag,
)
else:
generate_anki(
use_filter=args.filter,

View File

@ -6,18 +6,27 @@ Creates a tab-separated file compatible with Anki import.
from __future__ import annotations
import logging
from pathlib import Path
import re
logger = logging.getLogger(__name__)
def extract_question_and_answer(filepath) -> list[dict[str, str]]:
"""Extract main question and key answer points from a markdown file."""
MIN_BODY_LENGTH = 50
MIN_DEFINITION_LENGTH = 20
MAX_DEFINITION_LENGTH = 200
MIN_BULLET_COUNT = 5
MIN_SUBSECTION_LENGTH = 5
MIN_FORMULA_LENGTH = 20
def _get_metadata(
filepath: str,
) -> tuple[str, str, str, str, str]:
"""Extract metadata from file."""
with Path(filepath).open(encoding="utf-8") as f:
content = f.read()
cards = []
# Extract file number for tagging
filename = Path(filepath).name
match = re.match(r"(\d+)-(.+)\.md", filename)
if match:
@ -27,13 +36,13 @@ def extract_question_and_answer(filepath) -> list[dict[str, str]]:
num = "00"
topic = "unknown"
# Extract main title (usually contains the question)
title_match = re.search(r"^# (.+)$", content, re.MULTILINE)
title = title_match.group(1) if title_match else "Unknown"
# Extract the main question from ## Pytanie section
question_match = re.search(
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*',
content,
re.DOTALL,
)
if question_match:
main_question = question_match.group(1).strip()
@ -41,124 +50,207 @@ def extract_question_and_answer(filepath) -> list[dict[str, str]]:
else:
main_question = title
# Extract subject/przedmiot
subject_match = re.search(r"Przedmiot:\s*(\w+)", content)
subject = subject_match.group(1) if subject_match else "Ogólne"
return num, topic, title, main_question, content
# Create main question card - extract key sections for answer
answer_parts = []
# Look for main answer section
def _extract_main_card(
content: str,
main_question: str,
subject: str,
num: str,
topic: str,
) -> list[dict[str, str]]:
"""Extract the main question card."""
answer_parts: list[str] = []
main_answer = re.search(
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\n---\s*\n## |\Z)",
r"## 📚 Odpowiedź główna\s*\n(.+?)"
r"(?=\n## |\n---\s*\n## |\Z)",
content,
re.DOTALL,
)
if main_answer:
answer_text = main_answer.group(1)
# Extract key points, definitions, headers
headers = re.findall(r"### (.+)", answer_text)
for h in headers[:5]: # Limit to first 5 headers
answer_parts.append(f"{h}")
answer_parts.extend(f"{h}" for h in headers[:5])
# Also extract key definitions if present
definitions = re.findall(r"\*\*([^*]+)\*\*\s*[--:]\s*([^*\n]+)", content)
definitions = re.findall(
r"\*\*([^*]+)\*\*\s*[--:]\s*([^*\n]+)", content
)
for term, definition in definitions[:3]:
if len(definition) > 20 and len(definition) < 200:
answer_parts.append(f"{term}: {definition.strip()}")
if (
len(definition) > MIN_DEFINITION_LENGTH
and len(definition) < MAX_DEFINITION_LENGTH
):
answer_parts.append(
f"{term}: {definition.strip()}"
)
# If we found answer parts, create main card
if answer_parts:
answer_html = "<br>".join(answer_parts[:8]) # Limit answer length
cards.append(
{
"question": main_question,
"answer": answer_html,
"tags": f"egzamin_magisterski pytanie_{num} {subject} {topic}",
}
if not answer_parts:
return []
answer_html = "<br>".join(answer_parts[:8])
return [
{
"question": main_question,
"answer": answer_html,
"tags": (
f"egzamin_magisterski pytanie_{num}"
f" {subject} {topic}"
),
}
]
def _extract_subsection_answer(body_clean: str) -> str | None:
"""Extract answer text from a subsection body."""
bullets = re.findall(
r"[-•]\s*\*\*(.+?)\*\*[:\s]*([^\n]+)?", body_clean
)
if bullets:
return "<br>".join(
f"{b[0]}: {b[1].strip()}" if b[1] else f"{b[0]}"
for b in bullets[:MIN_BULLET_COUNT]
)
# Extract sub-questions and key concepts as additional cards
# Look for ### headers with explanations
paragraphs = [
p.strip()
for p in body_clean.split("\n\n")
if p.strip()
and not p.startswith("```")
and not p.startswith("|")
]
if paragraphs:
first_para = paragraphs[0]
first_para = re.sub(r"\*\*(.+?)\*\*", r"\1", first_para)
first_para = re.sub(r"\*(.+?)\*", r"\1", first_para)
return first_para[:400]
return None
def _extract_sub_cards(
content: str,
title: str,
subject: str,
num: str,
topic: str,
) -> list[dict[str, str]]:
"""Extract sub-concept cards."""
cards: list[dict[str, str]] = []
subsections = re.findall(
r"### (\d+\.\s+)?(.+?)\n\n(.+?)(?=\n### |\n## |\n---|\Z)", content, re.DOTALL
r"### (\d+\.\s+)?(.+?)\n\n(.+?)"
r"(?=\n### |\n## |\n---|\Z)",
content,
re.DOTALL,
)
for _, header, body in subsections:
if len(header) < 5 or header.startswith("Przykład"):
continue
# Extract first substantive paragraph or key points
body_clean = body.strip()
# Skip very short or code-only sections
if len(body_clean) < 50:
continue
# Extract bullet points or first paragraph
bullets = re.findall(r"[-•]\s*\*\*(.+?)\*\*[:\s]*([^\n]+)?", body_clean)
if bullets:
answer_text = "<br>".join(
[
f"{b[0]}: {b[1].strip()}" if b[1] else f"{b[0]}"
for b in bullets[:5]
]
)
else:
# Get first meaningful paragraph
paragraphs = [
p.strip()
for p in body_clean.split("\n\n")
if p.strip() and not p.startswith("```") and not p.startswith("|")
]
if paragraphs:
first_para = paragraphs[0]
# Clean markdown
first_para = re.sub(r"\*\*(.+?)\*\*", r"\1", first_para)
first_para = re.sub(r"\*(.+?)\*", r"\1", first_para)
answer_text = first_para[:400]
else:
continue
# Create sub-concept card
sub_question = f"Co to jest {header}?" if not header.endswith("?") else header
if (
"Charakterystyka" in header
or "Definicja" in header
or "Właściwości" in header
len(header) < MIN_SUBSECTION_LENGTH
or header.startswith("Przykład")
):
# These are answer-type headers, reframe
parent_topic = title.replace("Pytanie", "").strip(": 0123456789")
sub_question = f"{header} - {parent_topic}"
continue
body_clean = body.strip()
if len(body_clean) < MIN_BODY_LENGTH:
continue
answer_text = _extract_subsection_answer(body_clean)
if not answer_text:
continue
sub_question = (
f"Co to jest {header}?"
if not header.endswith("?")
else header
)
if any(
kw in header
for kw in ("Charakterystyka", "Definicja", "Właściwości")
):
parent = title.replace("Pytanie", "").strip(
": 0123456789"
)
sub_question = f"{header} - {parent}"
cards.append(
{
"question": sub_question,
"answer": answer_text,
"tags": f"egzamin_magisterski pytanie_{num} {subject} {topic} szczegoly",
"tags": (
f"egzamin_magisterski pytanie_{num}"
f" {subject} {topic} szczegoly"
),
}
)
# Extract key formulas/definitions as separate cards
return cards
def _extract_formula_cards(
content: str,
subject: str,
num: str,
) -> list[dict[str, str]]:
"""Extract formula/definition cards."""
cards: list[dict[str, str]] = []
formulas = re.findall(
r"\*\*([A-Za-z\s]+(?:formuła|wzór|twierdzenie|definicja|lemat))\*\*[:\s]*\n?(.+?)(?=\n\n|\n\*\*|\Z)",
r"\*\*([A-Za-z\s]+"
r"(?:formuła|wzór|twierdzenie|definicja|lemat))"
r"\*\*[:\s]*\n?(.+?)(?=\n\n|\n\*\*|\Z)",
content,
re.IGNORECASE | re.DOTALL,
)
for formula_name, formula_content in formulas:
if len(formula_content) > 20:
if len(formula_content) > MIN_FORMULA_LENGTH:
cards.append(
{
"question": f"Podaj {formula_name.strip()}",
"answer": formula_content.strip()[:300],
"tags": f"egzamin_magisterski pytanie_{num} {subject} formuly",
"tags": (
f"egzamin_magisterski pytanie_{num}"
f" {subject} formuly"
),
}
)
return cards
def clean_for_anki(text) -> str:
def extract_question_and_answer(
    filepath: str,
) -> list[dict[str, str]]:
    """Build every flashcard for one markdown answer file.

    Reads the file's metadata, determines the subject from a
    ``Przedmiot: <word>`` line (defaulting to "Ogólne"), and then
    concatenates the main-question card, the sub-concept cards and the
    formula cards, in that order.
    """
    num, topic, title, main_question, content = _get_metadata(filepath)

    subject_match = re.search(r"Przedmiot:\s*(\w+)", content)
    if subject_match:
        subject = subject_match.group(1)
    else:
        subject = "Ogólne"

    return [
        *_extract_main_card(content, main_question, subject, num, topic),
        *_extract_sub_cards(content, title, subject, num, topic),
        *_extract_formula_cards(content, subject, num),
    ]
def clean_for_anki(text: str) -> str:
"""Clean text for Anki import - escape special characters."""
# Replace tabs with spaces
text = text.replace("\t", " ")
@ -187,13 +279,13 @@ def main() -> None:
# Process each file
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
print(f"Processing: {md_file.name}")
logger.info("Processing: %s", md_file.name)
try:
cards = extract_question_and_answer(md_file)
all_cards.extend(cards)
print(f" -> Extracted {len(cards)} cards")
except Exception as e:
print(f" -> Error: {e}")
logger.info(" -> Extracted %d cards", len(cards))
except (ValueError, OSError) as e:
logger.info(" -> Error: %s", e)
# Write Anki file with headers
with Path(output_file).open("w", encoding="utf-8") as f:
@ -211,13 +303,13 @@ def main() -> None:
tags = card["tags"]
f.write(f"{front}\t{back}\t{tags}\n")
print(f"\n✅ Created {len(all_cards)} flashcards")
print(f"📁 Output: {output_file}")
print("\nTo import into Anki:")
print("1. Open Anki → File → Import")
print("2. Select the .txt file")
print("3. Verify 'Allow HTML' is checked")
print("4. Click Import")
logger.info("Created %d flashcards", len(all_cards))
logger.info("Output: %s", output_file)
logger.info("To import into Anki:")
logger.info("1. Open Anki -> File -> Import")
logger.info("2. Select the .txt file")
logger.info("3. Verify 'Allow HTML' is checked")
logger.info("4. Click Import")
if __name__ == "__main__":

View File

@ -6,11 +6,22 @@ Creates tab-separated file for Anki import with proper HTML formatting.
from __future__ import annotations
import logging
from pathlib import Path
import re
logger = logging.getLogger(__name__)
def clean_text(text) -> str:
MIN_HEADER_LENGTH = 3
MIN_MATCH_LENGTH = 10
MIN_BODY_LENGTH = 50
MIN_QA_LENGTH = 30
MAX_CONTENT_LENGTH = 300
MAX_ANSWER_LENGTH = 400
MAX_COMPARISON_ITEMS = 6
def clean_text(text: str) -> str:
"""Clean and format text for Anki."""
if not text:
return ""
@ -28,7 +39,7 @@ def clean_text(text) -> str:
return text.strip()
def format_list(items, numbered=False) -> str:
def format_list(items: list[str], *, numbered: bool = False) -> str:
"""Format a list of items as HTML."""
if not items:
return ""
@ -43,119 +54,148 @@ def format_list(items, numbered=False) -> str:
return html
def extract_from_file(filepath) -> list[dict[str, str]]:
"""Extract flashcard data from a markdown file."""
def _get_file_metadata(
filepath: str,
) -> tuple[str, str, str]:
"""Extract metadata from file."""
with Path(filepath).open(encoding="utf-8") as f:
content = f.read()
cards = []
# Get file metadata
filename = Path(filepath).name
match = re.match(r"(\d+)-(.+)\.md", filename)
num = match.group(1) if match else "00"
match.group(2).replace("-", "_") if match else "unknown"
# Extract subject
subj_match = re.search(r"Przedmiot:\s*(\w+)", content)
subject = subj_match.group(1) if subj_match else "Ogólne"
# Base tags
base_tags = f"egzamin_magisterski pyt{num} {subject}"
return num, subject, content
# =====================================================
# CARD TYPE 1: Main Exam Question
# =====================================================
def _extract_main_question_card(
content: str, base_tags: str,
) -> list[dict[str, str]]:
"""Extract the main exam question card."""
q_match = re.search(
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*',
content,
re.DOTALL,
)
if q_match:
main_q = re.sub(r"\s+", " ", q_match.group(1).strip())
if not q_match:
return []
# Extract key topics from main answer
answer_match = re.search(
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## [<5B>🎯]|\n---\s*\n## |\Z)",
content,
re.DOTALL,
main_q = re.sub(r"\s+", " ", q_match.group(1).strip())
answer_match = re.search(
r"## 📚 Odpowiedź główna\s*\n(.+?)"
r"(?=\n## [📚🎯]|\n---\s*\n## |\Z)",
content,
re.DOTALL,
)
if not answer_match:
return []
answer_section = answer_match.group(1)
headers = re.findall(
r"^### (?:\d+\.\s*)?(.+)$",
answer_section,
re.MULTILINE,
)
headers = [
h.strip()
for h in headers
if len(h.strip()) > MIN_HEADER_LENGTH
][:6]
if not headers:
return []
answer_html = (
"<b>Kluczowe zagadnienia:</b>" + format_list(headers)
)
return [
{
"front": clean_text(main_q),
"back": answer_html,
"tags": f"{base_tags} pytanie_glowne",
}
]
def _make_question_text(header: str) -> str:
"""Generate a question from a section header."""
if "Definicja" in header or "Co to" in header:
return (
f"Co to jest:"
f" {header.replace('Definicja', '').strip()}?"
)
if answer_match:
answer_section = answer_match.group(1)
# Get main headers
headers = re.findall(
r"^### (?:\d+\.\s*)?(.+)$", answer_section, re.MULTILINE
if "Charakterystyka" in header:
stripped = header.replace("Charakterystyka", "").strip()
return f"Scharakteryzuj: {stripped}"
if header.endswith("?"):
return header
return f"Omów: {header}"
def _extract_body_parts(body: str) -> list[str]:
"""Extract structured answer parts from a section body."""
answer_parts: list[str] = []
subheaders = re.findall(r"^#### (.+)$", body, re.MULTILINE)
if subheaders:
answer_parts.extend(subheaders[:4])
bullets = re.findall(
r"[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]+)?", body
)
for term, desc in bullets[:5]:
if desc:
answer_parts.append(
f"<b>{term}</b>: {desc.strip()}"
)
headers = [h.strip() for h in headers if len(h.strip()) > 3][:6]
else:
answer_parts.append(f"<b>{term}</b>")
if headers:
answer_html = "<b>Kluczowe zagadnienia:</b>" + format_list(headers)
cards.append(
{
"front": clean_text(main_q),
"back": answer_html,
"tags": f"{base_tags} pytanie_glowne",
}
)
if not answer_parts:
paras = [
p.strip()
for p in body.split("\n\n")
if p.strip()
and not p.strip().startswith("```")
and not p.strip().startswith("|")
]
if paras:
first = paras[0]
if len(first) > MAX_CONTENT_LENGTH:
first = first[:MAX_CONTENT_LENGTH] + "..."
answer_parts.append(first)
# =====================================================
# CARD TYPE 2: Subsection Cards (detailed concepts)
# =====================================================
# Find all ### sections
return answer_parts
def _extract_subsection_cards(
content: str, base_tags: str,
) -> list[dict[str, str]]:
"""Extract subsection detail cards."""
cards: list[dict[str, str]] = []
sections = re.findall(
r"^### (?:\d+\.\s*)?(.+?)\n((?:(?!^###).)+)", content, re.MULTILINE | re.DOTALL
r"^### (?:\d+\.\s*)?(.+?)\n((?:(?!^###).)+)",
content,
re.MULTILINE | re.DOTALL,
)
for header, body in sections:
header = header.strip()
body = body.strip()
for raw_header, raw_body in sections:
header = raw_header.strip()
body = raw_body.strip()
# Skip very short sections or example sections
if len(body) < 50 or header.lower().startswith("przykład"):
if (
len(body) < MIN_BODY_LENGTH
or header.lower().startswith("przykład")
):
continue
# Extract key information from body
answer_parts = []
# Look for #### sub-headers
subheaders = re.findall(r"^#### (.+)$", body, re.MULTILINE)
if subheaders:
answer_parts.extend(subheaders[:4])
# Look for bullet points with bold terms
bullets = re.findall(r"[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]+)?", body)
for term, desc in bullets[:5]:
if desc:
answer_parts.append(f"<b>{term}</b>: {desc.strip()}")
else:
answer_parts.append(f"<b>{term}</b>")
# If no structured content, get first paragraph
if not answer_parts:
paras = [
p.strip()
for p in body.split("\n\n")
if p.strip()
and not p.strip().startswith("```")
and not p.strip().startswith("|")
]
if paras:
first = paras[0]
# Limit length
if len(first) > 300:
first = first[:300] + "..."
answer_parts.append(first)
answer_parts = _extract_body_parts(body)
if answer_parts:
# Determine card type
if "Definicja" in header or "Co to" in header:
q = f"Co to jest: {header.replace('Definicja', '').strip()}?"
elif "Charakterystyka" in header:
q = f"Scharakteryzuj: {header.replace('Charakterystyka', '').strip()}"
elif header.endswith("?"):
q = header
else:
q = f"Omów: {header}"
# Format answer
question = _make_question_text(header)
if len(answer_parts) > 1:
answer_html = format_list(answer_parts)
else:
@ -163,15 +203,20 @@ def extract_from_file(filepath) -> list[dict[str, str]]:
cards.append(
{
"front": clean_text(q),
"front": clean_text(question),
"back": answer_html,
"tags": f"{base_tags} szczegoly",
}
)
# =====================================================
# CARD TYPE 3: Algorithms/Formulas
# =====================================================
return cards
def _extract_algo_cards(
content: str, base_tags: str,
) -> list[dict[str, str]]:
"""Extract algorithm/formula cards."""
cards: list[dict[str, str]] = []
algo_patterns = [
r"#### Złożoność(?:\s+czasowa)?\s*\n(.+?)(?=\n####|\n###|\Z)",
r"Złożoność:\s*\*\*([^*]+)\*\*",
@ -179,85 +224,137 @@ def extract_from_file(filepath) -> list[dict[str, str]]:
for pattern in algo_patterns:
matches = re.findall(pattern, content, re.DOTALL)
for match in matches[:2]:
if len(match) > 10:
# Find context - which algorithm?
for algo_match in matches[:2]:
if len(algo_match) > MIN_MATCH_LENGTH:
algo_context = re.search(
r"### (\d+\.\s*)?(.+?)(?=\n)", content[: content.find(match)]
r"### (\d+\.\s*)?(.+?)(?=\n)",
content[: content.find(algo_match)],
)
if algo_context:
algo_name = algo_context.group(2).strip()
cards.append(
{
"front": f"Jaka jest złożoność algorytmu/metody: {algo_name}?",
"back": clean_text(match.strip()[:200]),
"front": (
"Jaka jest złożoność"
f" algorytmu/metody: {algo_name}?"
),
"back": clean_text(
algo_match.strip()[:200]
),
"tags": f"{base_tags} zlozonosc",
}
)
break
# =====================================================
# CARD TYPE 4: Comparisons (when file contains comparisons)
# =====================================================
return cards
def _extract_comparison_cards(
    content: str, base_tags: str, num: str,
) -> list[dict[str, str]]:
    """Build at most one comparison-table card for a file.

    Searches ``content`` for a ``## `` heading pattern mentioning
    "Porównanie", "Zestawienie" or "vs" (case-insensitive) and takes the
    text that follows it, up to the next ``\\n## `` or the end of the file.
    Table rows whose first cell is bold (``| **aspect** | value |``) are
    turned into rows of an HTML table used as the card back.

    Args:
        content: Full markdown text of the answer file.
        base_tags: Space-separated tags shared by every card of the file.
        num: Question number, interpolated into the card front.

    Returns:
        A one-element list holding the comparison card, or an empty list
        when no comparison section is found, the section has no matching
        table rows, or the matched text fails the title pattern.
    """
    # NOTE(review): with re.DOTALL the ".*" parts can cross line breaks, so
    # the keyword does not have to sit on the same line as the "## " marker
    # - confirm this is intended.
    compare_match = re.search(
        r"## .*(Porównanie|Zestawienie|vs).*\n(.+?)(?=\n## |\Z)",
        content,
        re.DOTALL | re.IGNORECASE,
    )
    if not compare_match:
        return []

    items = re.findall(
        r"\|\s*\*\*([^|*]+)\*\*\s*\|([^|]+)\|",
        compare_match.group(2),
    )
    if not items:
        return []

    # The title match only gates card creation; its groups are not used.
    # It is checked before the table is rendered so no work is wasted.
    title_match = re.search(
        r"## .*(Porównanie|Zestawienie)"
        r".*?(\w+.*?(?:vs|i|oraz).*?\w+)",
        compare_match.group(0),
        re.IGNORECASE,
    )
    if not title_match:
        return []

    table_rows = "".join(
        f"<tr><td>{clean_text(aspect)}</td>"
        f"<td>{clean_text(value)}</td></tr>"
        for aspect, value in items[:MAX_COMPARISON_ITEMS]
    )
    comparison_html = (
        "<table><tr><th>Aspekt</th><th>Wartość</th></tr>"
        f"{table_rows}</table>"
    )

    return [
        {
            "front": (
                "Porównaj kluczowe różnice"
                f" w temacie: pytanie {num}"
            ),
            "back": comparison_html,
            "tags": f"{base_tags} porownanie",
        }
    ]
def _extract_qa_cards(
    content: str, base_tags: str,
) -> list[dict[str, str]]:
    """Turn the practice-questions section into question/answer cards.

    Only the first three ``### Q<n>`` entries of the "## 🎓 Pytania"
    section are considered. An answer is kept when it is longer than
    ``MIN_QA_LENGTH`` characters; it is reduced to its first five lines
    and cut to ``MAX_ANSWER_LENGTH`` characters (with a trailing "...").

    Args:
        content: Full markdown text of the answer file.
        base_tags: Space-separated tags shared by every card of the file.

    Returns:
        The generated cards; empty when the section is missing or no
        answer is long enough.
    """
    section = re.search(
        r"## 🎓 Pytania.*?\n(.+?)(?=\n## |\Z)",
        content,
        re.DOTALL,
    )
    if section is None:
        return []

    pairs = re.findall(
        r"### Q\d+:?\s*[\"']?(.+?)[\"']?\s*\n"
        r".*?Odpowiedź:\s*\n?(.+?)(?=\n### |\Z)",
        section.group(1),
        re.DOTALL,
    )

    result: list[dict[str, str]] = []
    for question_raw, answer_raw in pairs[:3]:
        full_answer = answer_raw.strip()
        if len(full_answer) <= MIN_QA_LENGTH:
            continue

        # Keep the card compact: first five lines, then a hard length cap.
        excerpt = "\n".join(full_answer.split("\n")[:5])
        if len(excerpt) > MAX_ANSWER_LENGTH:
            excerpt = f"{excerpt[:MAX_ANSWER_LENGTH]}..."

        # Collapse whitespace runs so the front is a single line.
        one_line_question = re.sub(r"\s+", " ", question_raw.strip())
        result.append(
            {
                "front": clean_text(one_line_question),
                "back": clean_text(excerpt).replace("\n", "<br>"),
                "tags": f"{base_tags} egzamin_praktyka",
            }
        )
    return result
def extract_from_file(filepath: str) -> list[dict[str, str]]:
    """Collect every kind of flashcard generated from one markdown file.

    The file metadata (question number, subject, text) is read once and
    handed to each extractor; their results are concatenated in a fixed
    order: main question, subsections, algorithms, comparisons, Q&A.

    Args:
        filepath: Path of the markdown answer file.

    Returns:
        All cards extracted from the file, in extractor order.
    """
    num, subject, content = _get_file_metadata(filepath)
    base_tags = f"egzamin_magisterski pyt{num} {subject}"
    return [
        *_extract_main_question_card(content, base_tags),
        *_extract_subsection_cards(content, base_tags),
        *_extract_algo_cards(content, base_tags),
        *_extract_comparison_cards(content, base_tags, num),
        *_extract_qa_cards(content, base_tags),
    ]
@ -272,13 +369,13 @@ def main() -> None:
all_cards = []
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
print(f"Processing: {md_file.name}", end=" ")
logger.info("Processing: %s", md_file.name)
try:
cards = extract_from_file(md_file)
all_cards.extend(cards)
print(f"{len(cards)} cards")
except Exception as e:
print(f"→ ERROR: {e}")
logger.info(" -> %d cards", len(cards))
except (ValueError, OSError) as e:
logger.info(" -> ERROR: %s", e)
# Remove potential duplicates (same front)
seen = set()
@ -306,23 +403,25 @@ def main() -> None:
f.write(f"{front}\t{back}\t{tags}\n")
print(f"\n{'=' * 50}")
print(f"✅ Generated {len(unique_cards)} unique flashcards")
print(f"📁 Saved to: {output_file}")
print(f"{'=' * 50}")
print("\n📋 IMPORT INSTRUCTIONS:")
print("" * 40)
print("Anki Desktop:")
print(" 1. File → Import")
print(" 2. Select: anki_egzamin_magisterski.txt")
print(" 3. Verify: Fields separated by Tab")
print(" 4. Check: Allow HTML in fields")
print(" 5. Click Import")
print()
print("AnkiWeb / AnkiDroid:")
print(" 1. First import on Anki Desktop")
print(" 2. Click Sync to upload to AnkiWeb")
print(" 3. Sync on mobile to download")
logger.info("=" * 50)
logger.info(
"Generated %d unique flashcards", len(unique_cards)
)
logger.info("Saved to: %s", output_file)
logger.info("=" * 50)
logger.info("IMPORT INSTRUCTIONS:")
logger.info("-" * 40)
logger.info("Anki Desktop:")
logger.info(" 1. File -> Import")
logger.info(" 2. Select: anki_egzamin_magisterski.txt")
logger.info(" 3. Verify: Fields separated by Tab")
logger.info(" 4. Check: Allow HTML in fields")
logger.info(" 5. Click Import")
logger.info("")
logger.info("AnkiWeb / AnkiDroid:")
logger.info(" 1. First import on Anki Desktop")
logger.info(" 2. Click Sync to upload to AnkiWeb")
logger.info(" 3. Sync on mobile to download")
if __name__ == "__main__":

View File

@ -6,12 +6,16 @@ Creates a tab-separated file compatible with Anki import.
from __future__ import annotations
import logging
from pathlib import Path
import re
import traceback
logger = logging.getLogger(__name__)
MIN_HEADER_WORDS = 3
def extract_main_question(content, filename) -> str:
def extract_main_question(content: str, filename: str) -> str:
"""Extract the main exam question from the file."""
# Extract the main question from ## Pytanie section
question_match = re.search(
@ -26,13 +30,13 @@ def extract_main_question(content, filename) -> str:
return title_match.group(1) if title_match else filename
def extract_subject(content: str) -> str:
    """Return the subject code declared in the file text.

    Args:
        content: Full markdown text of the answer file.

    Returns:
        The word following the first ``Przedmiot:`` marker, or
        ``"Ogólne"`` when no such marker is present.
    """
    subject_match = re.search(r"Przedmiot:\s*(\w+)", content)
    if subject_match is None:
        return "Ogólne"
    return subject_match.group(1)
def extract_key_points(content) -> list[str]:
def extract_key_points(content: str) -> list[str]:
"""Extract key points from the main answer section."""
points = []
@ -51,14 +55,14 @@ def extract_key_points(content) -> list[str]:
headers = re.findall(r"^### (.+)$", answer_text, re.MULTILINE)
for h in headers[:6]:
# Clean header
h = re.sub(r"\d+\.\s*", "", h).strip()
if h and len(h) > 3:
points.append(h)
cleaned = re.sub(r"\d+\.\s*", "", h).strip()
if cleaned and len(cleaned) > MIN_HEADER_WORDS:
points.append(cleaned)
return points
def extract_definitions(content) -> list[tuple[str, str]]:
def extract_definitions(content: str) -> list[tuple[str, str]]:
"""Extract key definitions from the content."""
definitions = []
@ -66,9 +70,9 @@ def extract_definitions(content) -> list[tuple[str, str]]:
pattern = r"\*\*([^*\n]+)\*\*\s*[--:]\s*([^*\n]{20,150})"
matches = re.findall(pattern, content)
for term, definition in matches:
term = term.strip()
definition = definition.strip()
for raw_term, raw_def in matches:
term = raw_term.strip()
definition = raw_def.strip()
# Filter out non-definition patterns
if (
term
@ -81,7 +85,7 @@ def extract_definitions(content) -> list[tuple[str, str]]:
return definitions[:5]
def clean_html(text) -> str:
def clean_html(text: str) -> str:
"""Convert markdown to HTML and clean for Anki."""
if not text:
return ""
@ -101,7 +105,7 @@ def clean_html(text) -> str:
return text.strip()
def process_file(filepath) -> list[dict[str, str]]:
def process_file(filepath: str) -> list[dict[str, str]]:
"""Process a single file and return flashcards."""
with Path(filepath).open(encoding="utf-8") as f:
content = f.read()
@ -111,11 +115,7 @@ def process_file(filepath) -> list[dict[str, str]]:
# Extract metadata
filename = Path(filepath).name
match = re.match(r"(\d+)-(.+)\.md", filename)
if match:
num = match.group(1)
match.group(2).replace("-", "_")
else:
num = "00"
num = match.group(1) if match else "00"
subject = extract_subject(content)
main_question = extract_main_question(content, filename)
@ -156,14 +156,13 @@ def main() -> None:
# Process each file
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
print(f"Processing: {md_file.name}")
logger.info("Processing: %s", md_file.name)
try:
cards = process_file(md_file)
all_cards.extend(cards)
print(f" -> {len(cards)} cards")
except Exception as e:
print(f" -> Error: {e}")
traceback.print_exc()
logger.info(" -> %d cards", len(cards))
except (ValueError, OSError):
logger.exception(" -> Error processing file")
# Write Anki-compatible file
with Path(output_file).open("w", encoding="utf-8") as f:
@ -186,16 +185,22 @@ def main() -> None:
f.write(f"{front}\t{back}\t{tags}\n")
print(f"\n✅ Created {len(all_cards)} flashcards")
print(f"📁 Output: {output_file}")
print("\n=== Import Instructions ===")
print("1. Open Anki desktop → File → Import")
print("2. Select: anki_egzamin_magisterski.txt")
print("3. Set 'Fields separated by: Tab'")
print("4. Check 'Allow HTML in fields'")
print("5. Map: Field 1 → Front, Field 2 → Back, Field 3 → Tags")
print("6. Click Import")
print("\nFor AnkiWeb/AnkiDroid: Sync after importing on desktop")
logger.info("Created %d flashcards", len(all_cards))
logger.info("Output: %s", output_file)
logger.info("=== Import Instructions ===")
logger.info("1. Open Anki desktop -> File -> Import")
logger.info("2. Select: anki_egzamin_magisterski.txt")
logger.info("3. Set 'Fields separated by: Tab'")
logger.info("4. Check 'Allow HTML in fields'")
logger.info(
"5. Map: Field 1 -> Front, Field 2 -> Back,"
" Field 3 -> Tags"
)
logger.info("6. Click Import")
logger.info(
"For AnkiWeb/AnkiDroid:"
" Sync after importing on desktop"
)
if __name__ == "__main__":

View File

@ -3,11 +3,18 @@
from __future__ import annotations
import logging
from pathlib import Path
import re
logger = logging.getLogger(__name__)
def clean_text(text) -> str:
MIN_PARA_LENGTH = 20
MAX_PARA_LENGTH = 400
MIN_BODY_LENGTH = 80
def clean_text(text: str) -> str:
"""Clean text for Anki."""
if not text:
return ""
@ -19,7 +26,7 @@ def clean_text(text) -> str:
return text.strip()
def extract_real_answer(content, section_name) -> str | None:
def extract_real_answer(content: str, section_name: str) -> str | None:
"""Extract actual content from a section, not just headers."""
# Find the section
pattern = rf"### (?:\d+\.\s*)?{re.escape(section_name)}\s*\n((?:(?!^### ).)+)"
@ -52,19 +59,21 @@ def extract_real_answer(content, section_name) -> str | None:
for p in body.split("\n\n")
if p.strip() and not p.startswith("```") and not p.startswith("|")
]
for p in paras[:2]:
if len(p) > 20 and len(p) < 400:
lines.append(p)
lines.extend(
p for p in paras[:2]
if len(p) > MIN_PARA_LENGTH and len(p) < MAX_PARA_LENGTH
)
return "<br>".join(lines[:6]) if lines else None
def extract_cards(filepath) -> list[dict[str, str]]:
"""Extract flashcards from a file."""
def _read_file_metadata(
filepath: str | Path,
) -> tuple[str, str, str | None]:
"""Read file and extract metadata."""
with Path(filepath).open(encoding="utf-8") as f:
content = f.read()
cards = []
filename = Path(filepath).name
match = re.match(r"(\d+)-(.+)\.md", filename)
num = match.group(1) if match else "00"
@ -73,182 +82,228 @@ def extract_cards(filepath) -> list[dict[str, str]]:
subject = subj_match.group(1) if subj_match else "Ogólne"
base_tags = f"egzamin_magisterski pyt{num} {subject}"
# Get main question
q_match = re.search(
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*',
content,
re.DOTALL,
)
main_question = (
re.sub(r"\s+", " ", q_match.group(1).strip()) if q_match else None
)
main_question = re.sub(r"\s+", " ", q_match.group(1).strip()) if q_match else None
# ===============================================
# MAIN CARD: Question with REAL answer summary
# ===============================================
if main_question:
# Build a real answer from the main sections
answer_parts = []
return content, base_tags, main_question
# For automata question - extract key facts about each automaton
if "automat" in main_question.lower() or "maszyn" in main_question.lower():
# FA
fa_match = re.search(
r"Automat Skończony.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*",
content,
re.DOTALL,
def _extract_automata_facts(content: str) -> list[str]:
"""Extract automata-specific facts."""
parts: list[str] = []
automata = [
("Automat Skończony", "FA"),
("Automat ze Stosem", "PDA"),
("Maszyna Turinga", "TM"),
]
for name, abbrev in automata:
pattern = (
rf"{name}.*?Rozpoznawana klasa języków"
r"\s*\n\s*\*\*([^*]+)\*\*"
)
match = re.search(pattern, content, re.DOTALL)
if match:
parts.append(
f"<b>{name} ({abbrev})</b>: "
f"{match.group(1).strip()}"
)
if fa_match:
answer_parts.append(
f"<b>Automat Skończony (FA)</b>: {fa_match.group(1).strip()}"
)
return parts
# PDA
pda_match = re.search(
r"Automat ze Stosem.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*",
content,
re.DOTALL,
)
if pda_match:
answer_parts.append(
f"<b>Automat ze Stosem (PDA)</b>: {pda_match.group(1).strip()}"
)
# TM
tm_match = re.search(
r"Maszyna Turinga.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*",
content,
re.DOTALL,
)
if tm_match:
answer_parts.append(
f"<b>Maszyna Turinga (TM)</b>: {tm_match.group(1).strip()}"
)
def _extract_generic_facts(content: str) -> list[str]:
    """Collect definition and characterisation one-liners from the text.

    Three patterns are tried in order ("#### Definicja", "####
    Charakterystyka", bold "**Definicja**"). For each, at most the first
    three captures are considered, and a capture is kept (stripped) only
    when its raw length exceeds ``MIN_PARA_LENGTH``.

    Args:
        content: Full markdown text of the answer file.

    Returns:
        The kept captures, grouped by pattern in the order above.
    """
    facts: list[str] = []
    for key_pattern in (
        r"#### Definicja\s*\n([^\n#]+)",
        r"#### Charakterystyka\s*\n([^\n#]+)",
        r"\*\*Definicja[:\s]*\*\*\s*([^\n]+)",
    ):
        for hit in re.findall(key_pattern, content)[:3]:
            # The length check is on the unstripped capture.
            if len(hit) > MIN_PARA_LENGTH:
                facts.append(hit.strip())
    return facts
# Generic extraction if specific didn't work
if not answer_parts:
# Look for key definitions/summaries
key_patterns = [
r"#### Definicja\s*\n([^\n#]+)",
r"#### Charakterystyka\s*\n([^\n#]+)",
r"\*\*Definicja[:\s]*\*\*\s*([^\n]+)",
]
for pattern in key_patterns:
for match in re.findall(pattern, content)[:3]:
if len(match) > 20:
answer_parts.append(match.strip())
# Still nothing? Get first substantive paragraph from main answer
if not answer_parts:
main_answer = re.search(
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\Z)", content, re.DOTALL
)
if main_answer:
# Skip headers, get actual content
text = main_answer.group(1)
paras = re.findall(r"\n\n([^#\n][^\n]{50,300})", text)
answer_parts = paras[:3]
def _extract_first_paragraphs(content: str) -> list[str]:
"""Extract first substantive paragraphs from main answer."""
main_answer = re.search(
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\Z)",
content,
re.DOTALL,
)
if not main_answer:
return []
text = main_answer.group(1)
paras = re.findall(r"\n\n([^#\n][^\n]{50,300})", text)
return paras[:3]
if answer_parts:
answer = "<br><br>".join([clean_text(p) for p in answer_parts])
cards.append(
{
"front": clean_text(main_question),
"back": answer,
"tags": f"{base_tags} pytanie_glowne",
}
def _build_main_card(
    content: str,
    main_question: str | None,
    base_tags: str,
) -> dict[str, str] | None:
    """Assemble the card for the file's main exam question.

    Answer material is taken from the first source that yields anything:
    automata facts (only when the question mentions "automat" or
    "maszyn"), then generic definitions, then the first paragraphs of the
    main answer.

    Args:
        content: Full markdown text of the answer file.
        main_question: The main question text, or ``None`` if absent.
        base_tags: Space-separated tags shared by every card of the file.

    Returns:
        The card dictionary, or ``None`` when there is no question or no
        answer material could be extracted.
    """
    if not main_question:
        return None

    question_lc = main_question.lower()
    about_automata = "automat" in question_lc or "maszyn" in question_lc
    facts = _extract_automata_facts(content) if about_automata else []

    # Each fallback is evaluated only when the previous source was empty.
    facts = (
        facts
        or _extract_generic_facts(content)
        or _extract_first_paragraphs(content)
    )
    if not facts:
        return None

    back = "<br><br>".join(clean_text(fact) for fact in facts)
    return {
        "front": clean_text(main_question),
        "back": back,
        "tags": f"{base_tags} pytanie_glowne",
    }
def _extract_section_content(body: str) -> list[str]:
"""Extract content lines from a section body."""
answer_lines: list[str] = []
def_match = re.search(
r"#### Definicja[^\n]*\n([^\n#]+(?:\n[^\n#]+)?)", body,
)
if def_match:
answer_lines.append(def_match.group(1).strip())
char_match = re.search(
r"#### Charakterystyka\s*\n((?:[-•][^\n]+\n?)+)", body,
)
if char_match:
bullets = re.findall(
r"[-•]\s*\*\*([^*]+)\*\*[:\s]*([^\n]*)",
char_match.group(1),
)
for term, desc in bullets[:4]:
answer_lines.append(
f"• <b>{term}</b>: {desc.strip()}"
if desc
else f"• <b>{term}</b>"
)
# ===============================================
# CONCEPT CARDS: Specific topics with real content
# ===============================================
# Find all ### sections and extract their actual content
if not answer_lines:
bullets = re.findall(
r"[-•]\s*\*\*([^*]+)\*\*[:\s]*([^\n]*)", body,
)
for term, desc in bullets[:5]:
answer_lines.append(
f"• <b>{term}</b>: {desc.strip()}"
if desc
else f"• <b>{term}</b>"
)
if not answer_lines:
first_para = re.search(
r"^([^#\n\-•|`][^\n]{30,250})", body, re.MULTILINE,
)
if first_para:
answer_lines.append(first_para.group(1))
return answer_lines
def _build_concept_cards(
    content: str, base_tags: str,
) -> list[dict[str, str]]:
    """Build one "explain this concept" card per usable ``###`` section.

    A section is skipped when its stripped body is shorter than
    ``MIN_BODY_LENGTH``, when its header contains "Przykład", "Mnemonic"
    or a double quote, or when no answer lines can be extracted from it.

    Args:
        content: Full markdown text of the answer file.
        base_tags: Space-separated tags shared by every card of the file.

    Returns:
        The concept cards in document order.
    """
    cards: list[dict[str, str]] = []

    # Each match is (header without optional "N. " prefix, body up to the
    # next line starting with "### ").
    sections = re.findall(
        r"^### (?:\d+\.\s*)?([^\n]+)\n((?:(?!^### ).)*)",
        content,
        re.MULTILINE | re.DOTALL,
    )

    for raw_header, raw_body in sections:
        header = raw_header.strip()
        body = raw_body.strip()

        # Skip short sections, mnemonics, examples
        if (
            len(body) < MIN_BODY_LENGTH
            or "Przykład" in header
            or "Mnemonic" in header
            or '"' in header
        ):
            continue

        answer_lines = _extract_section_content(body)
        if not answer_lines:
            continue

        # Headers already phrased as questions are used verbatim.
        question = (
            header if header.endswith("?") else f"Wyjaśnij: {header}"
        )
        answer = "<br>".join(
            clean_text(line) for line in answer_lines
        )
        cards.append(
            {
                "front": clean_text(question),
                "back": answer,
                "tags": f"{base_tags} szczegoly",
            }
        )

    return cards
cards.append(
{
"front": clean_text(question),
"back": answer,
"tags": f"{base_tags} szczegoly",
}
)
# ===============================================
# Q&A CARDS: From practice questions section
# ===============================================
def _build_qa_cards(
content: str, base_tags: str,
) -> list[dict[str, str]]:
"""Build Q&A practice cards."""
cards: list[dict[str, str]] = []
qa_matches = re.findall(
r'### Q\d+:\s*["\']?([^"\'?\n]+)\?*["\']?\s*\n.*?Odpowiedź:\s*\n(.+?)(?=\n### |\n## |\Z)',
r'### Q\d+:\s*["\']?([^"\'?\n]+)\?*["\']?\s*\n'
r".*?Odpowiedź:\s*\n(.+?)(?=\n### |\n## |\Z)",
content,
re.DOTALL,
)
for question, answer in qa_matches[:5]:
question = question.strip()
answer = answer.strip()
for raw_question, raw_answer in qa_matches[:5]:
question = raw_question.strip()
answer_text = raw_answer.strip()
# Clean up answer - get first meaningful part
answer_lines = answer.split("\n")
clean_answer = []
for line in answer_lines[:6]:
line = line.strip()
if line and not line.startswith("```") and not line.startswith("|"):
clean_answer.append(line)
answer_lines = answer_text.split("\n")
clean_answer = [
stripped
for raw_line in answer_lines[:6]
if (stripped := raw_line.strip())
and not stripped.startswith("```")
and not stripped.startswith("|")
]
if clean_answer:
cards.append(
{
"front": clean_text(question + "?"),
"back": "<br>".join([clean_text(l) for l in clean_answer]),
"back": "<br>".join(
clean_text(line) for line in clean_answer
),
"tags": f"{base_tags} qa",
}
)
@ -256,6 +311,20 @@ def extract_cards(filepath) -> list[dict[str, str]]:
return cards
def extract_cards(filepath: str | Path) -> list[dict[str, str]]:
    """Produce all flashcards for one markdown answer file.

    The main-question card (when one can be built) comes first, followed
    by the concept cards and then the Q&A cards.

    Args:
        filepath: Path of the markdown answer file.

    Returns:
        The cards extracted from the file, in the order described above.
    """
    content, base_tags, main_question = _read_file_metadata(filepath)
    main_card = _build_main_card(content, main_question, base_tags)
    return [
        *([main_card] if main_card else []),
        *_build_concept_cards(content, base_tags),
        *_build_qa_cards(content, base_tags),
    ]
def main() -> None:
"""Main."""
odpowiedzi_dir = Path("/home/kuchy/praca_magisterska/pytania/odpowiedzi")
@ -266,13 +335,13 @@ def main() -> None:
all_cards = []
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
print(f"Processing: {md_file.name}", end=" ")
logger.info("Processing: %s", md_file.name)
try:
cards = extract_cards(md_file)
all_cards.extend(cards)
print(f"{len(cards)} cards")
except Exception as e:
print(f"→ ERROR: {e}")
logger.info(" -> %d cards", len(cards))
except (ValueError, OSError):
logger.exception(" -> Error processing file")
# Remove duplicates
seen = set()
@ -299,8 +368,12 @@ def main() -> None:
tags = card["tags"]
f.write(f"{front}\t{back}\t{tags}\n")
print(f"\n✅ Generated {len(unique_cards)} flashcards")
print(f"📁 Output: {output_file}")
logger.info(
"Generated %d unique cards from %d total",
len(unique_cards),
len(all_cards),
)
logger.info("Output: %s", output_file)
if __name__ == "__main__":

View File

@ -7,6 +7,8 @@ Designed for A4 laser printer output (300 DPI, black & white).
from __future__ import annotations
import logging
import matplotlib as mpl
mpl.use("Agg")
@ -20,6 +22,8 @@ if TYPE_CHECKING:
from matplotlib.axes import Axes
from matplotlib.figure import Figure
logger = logging.getLogger(__name__)
OUTPUT_DIR = str(Path(__file__).resolve().parent / "img")
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
@ -33,19 +37,35 @@ FIXED_COLOR = "#D0F0D0" # light green-ish gray for fixed
FD_ARROW_COLOR = "#444444"
def _compute_col_widths(
headers: list[str],
rows: list[list[str]],
) -> list[float]:
"""Auto-calculate column widths based on content."""
col_widths: list[float] = []
for c in range(len(headers)):
max_len = len(headers[c])
for r in rows:
if c < len(r):
max_len = max(max_len, len(str(r[c])))
col_widths.append(max(max_len * 0.08 + 0.1, 0.5))
return col_widths
def draw_table(
ax,
x,
y,
title,
headers,
rows,
col_widths=None,
highlight_cols=None,
highlight_rows=None,
highlight_cells=None,
strikethrough_cells=None,
title_fontsize=9,
ax: Axes,
x: float,
y: float,
title: str,
headers: list[str],
rows: list[list[str]],
*,
col_widths: list[float] | None = None,
highlight_cols: set[int] | None = None,
highlight_rows: set[int] | None = None,
highlight_cells: set[tuple[int, int]] | None = None,
strikethrough_cells: set[tuple[int, int]] | None = None,
title_fontsize: int = 9,
) -> tuple[float, float]:
"""Draw a single table on the axes at position (x, y).
@ -66,18 +86,10 @@ def draw_table(
Returns:
(width, height) of the drawn table
"""
n_cols = len(headers)
n_rows = len(rows)
if col_widths is None:
# Auto-calculate based on content
col_widths = []
for c in range(n_cols):
max_len = len(headers[c])
for r in rows:
if c < len(r):
max_len = max(max_len, len(str(r[c])))
col_widths.append(max(max_len * 0.08 + 0.1, 0.5))
col_widths = _compute_col_widths(headers, rows)
row_height = 0.22
total_width = sum(col_widths)
@ -172,7 +184,10 @@ def draw_table(
return total_width, total_height + 0.25 # extra for title
def create_figure(width_inches=11.69, height_inches=8.27) -> tuple[Figure, Axes]:
def create_figure(
width_inches: float = 11.69,
height_inches: float = 8.27,
) -> tuple[Figure, Axes]:
"""Create A4 landscape figure."""
fig, ax = plt.subplots(1, 1, figsize=(width_inches, height_inches), dpi=DPI)
ax.set_xlim(0, width_inches)
@ -182,7 +197,16 @@ def create_figure(width_inches=11.69, height_inches=8.27) -> tuple[Figure, Axes]
return fig, ax
def add_arrow(ax, x1, y1, x2, y2, label="", color="black") -> None:
def add_arrow(
ax: Axes,
x1: float,
y1: float,
x2: float,
y2: float,
label: str = "",
*,
color: str = "black",
) -> None:
"""Draw an arrow with optional label."""
ax.annotate(
"",
@ -205,7 +229,15 @@ def add_arrow(ax, x1, y1, x2, y2, label="", color="black") -> None:
def add_label(
ax, x, y, text, fontsize=8, color="black", ha="left", style="normal"
ax: Axes,
x: float,
y: float,
text: str,
*,
fontsize: int = 8,
color: str = "black",
ha: str = "left",
style: str = "normal",
) -> None:
"""Add a text label."""
ax.text(
@ -289,7 +321,10 @@ def draw_0nf() -> None:
ax,
0.8,
1.2,
"Zaleznosci funkcyjne: StID -> Imie, WydzialID | WydzialID -> NazwaWydzialu",
(
"Zaleznosci funkcyjne: StID -> Imie, WydzialID"
" | WydzialID -> NazwaWydzialu"
),
fontsize=8,
color="#333333",
)
@ -297,7 +332,10 @@ def draw_0nf() -> None:
ax,
0.8,
0.9,
" KursID -> NazwaKursu | (StID,KursID) -> Prowadzacy | Prowadzacy -> KursID",
(
" KursID -> NazwaKursu | (StID,KursID)"
" -> Prowadzacy | Prowadzacy -> KursID"
),
fontsize=8,
color="#333333",
)
@ -309,7 +347,7 @@ def draw_0nf() -> None:
pad_inches=0.2,
)
plt.close(fig)
print("Generated: nf_0nf_table.png")
logger.info("Generated: nf_0nf_table.png")
# ============================================================
@ -399,7 +437,10 @@ def draw_1nf() -> None:
ax,
0.5,
1.5,
" Imie, WydzialID, NazwaWydzialu zaleza TYLKO od StID (czesc klucza).",
(
" Imie, WydzialID, NazwaWydzialu"
" zaleza TYLKO od StID (czesc klucza)."
),
fontsize=9,
color="black",
)
@ -419,7 +460,7 @@ def draw_1nf() -> None:
pad_inches=0.2,
)
plt.close(fig)
print("Generated: nf_1nf_tables.png")
logger.info("Generated: nf_1nf_tables.png")
# ============================================================
@ -477,7 +518,10 @@ def draw_2nf() -> None:
ax,
0.3,
3.3,
"KROK: Rozbito czesc. zaleznosci — atrybuty zalezne od czesci klucza wydzielone.",
(
"KROK: Rozbito czesc. zaleznosci"
" — atrybuty zalezne od czesci klucza wydzielone."
),
fontsize=9,
)
add_label(
@ -528,7 +572,7 @@ def draw_2nf() -> None:
pad_inches=0.2,
)
plt.close(fig)
print("Generated: nf_2nf_tables.png")
logger.info("Generated: nf_2nf_tables.png")
# ============================================================