refactor(praca_magisterska_video): fix ruff violations and remove noqa from diagram generators

- Add type annotations, docstrings, and constants
- Remove commented-out code and print statements
- Fix all lint issues in 11 generate_images files
This commit is contained in:
Krzysztof kuhy Rudnicki 2026-03-13 20:52:27 +01:00
parent 1e108d1e3f
commit d488c87203
11 changed files with 2726 additions and 1346 deletions

View File

@ -7,11 +7,17 @@
from __future__ import annotations from __future__ import annotations
import logging
from pathlib import Path from pathlib import Path
import re import re
logger = logging.getLogger(__name__)
def clean_text(text) -> str: MIN_BODY_LENGTH = 50
MIN_ANSWER_LENGTH = 100
def clean_text(text: str) -> str:
"""Clean text.""" """Clean text."""
if not text: if not text:
return "" return ""
@ -23,7 +29,7 @@ def clean_text(text) -> str:
return text.strip() return text.strip()
def extract_cards(filepath) -> list[dict[str, str]]: def extract_cards(filepath: str) -> list[dict[str, str]]:
"""Extract cards.""" """Extract cards."""
with Path(filepath).open(encoding="utf-8") as f: with Path(filepath).open(encoding="utf-8") as f:
content = f.read() content = f.read()
@ -68,10 +74,10 @@ def extract_cards(filepath) -> list[dict[str, str]]:
content, content,
re.MULTILINE | re.DOTALL, re.MULTILINE | re.DOTALL,
) )
for header, body in sections: for raw_header, raw_body in sections:
header = header.strip() header = raw_header.strip()
body = body.strip() body = raw_body.strip()
if len(body) < 50: if len(body) < MIN_BODY_LENGTH:
continue continue
# Get first paragraph # Get first paragraph
@ -102,8 +108,10 @@ def main() -> None:
for md_file in sorted(odpowiedzi_dir.glob("*.md")): for md_file in sorted(odpowiedzi_dir.glob("*.md")):
all_cards.extend(extract_cards(md_file)) all_cards.extend(extract_cards(md_file))
# APPROACH 1: Strict filtering - only cards with answer > 100 chars # APPROACH 1: Strict filtering - only cards with answer > threshold
filtered_cards = [c for c in all_cards if len(c["back"]) > 100] filtered_cards = [
c for c in all_cards if len(c["back"]) > MIN_ANSWER_LENGTH
]
# Remove duplicates # Remove duplicates
seen = set() seen = set()
@ -120,7 +128,11 @@ def main() -> None:
for c in unique: for c in unique:
f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n") f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n")
print(f"✅ Approach 1 (Strict Filter): {len(unique)} cards -> {output_file.name}") logger.info(
"Approach 1 (Strict Filter): %d cards -> %s",
len(unique),
output_file.name,
)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -7,11 +7,17 @@
from __future__ import annotations from __future__ import annotations
import logging
from pathlib import Path from pathlib import Path
import re import re
logger = logging.getLogger(__name__)
def clean_text(text) -> str: MIN_PARA_LENGTH = 30
MIN_BODY_LENGTH = 50
def clean_text(text: str) -> str:
"""Clean text.""" """Clean text."""
if not text: if not text:
return "" return ""
@ -23,7 +29,7 @@ def clean_text(text) -> str:
return text.strip() return text.strip()
def extract_structured_content(body) -> str | None: def extract_structured_content(body: str) -> str | None:
"""Better extraction - look for multiple content types.""" """Better extraction - look for multiple content types."""
parts = [] parts = []
@ -54,15 +60,14 @@ def extract_structured_content(body) -> str | None:
if p.strip() if p.strip()
and not p.startswith("```") and not p.startswith("```")
and not p.startswith("|") and not p.startswith("|")
and len(p.strip()) > 30 and len(p.strip()) > MIN_PARA_LENGTH
] ]
for p in paras[:2]: parts.extend(p[:300] for p in paras[:2])
parts.append(p[:300])
return "<br>".join([clean_text(p) for p in parts]) if parts else None return "<br>".join([clean_text(p) for p in parts]) if parts else None
def extract_cards(filepath) -> list[dict[str, str]]: def extract_cards(filepath: str) -> list[dict[str, str]]:
"""Extract cards.""" """Extract cards."""
with Path(filepath).open(encoding="utf-8") as f: with Path(filepath).open(encoding="utf-8") as f:
content = f.read() content = f.read()
@ -99,9 +104,9 @@ def extract_cards(filepath) -> list[dict[str, str]]:
content, content,
re.MULTILINE | re.DOTALL, re.MULTILINE | re.DOTALL,
) )
for header, body in sections: for raw_header, body in sections:
header = header.strip() header = raw_header.strip()
if "Przykład" in header or '"' in header or len(body) < 50: if "Przykład" in header or '"' in header or len(body) < MIN_BODY_LENGTH:
continue continue
answer = extract_structured_content(body) answer = extract_structured_content(body)
@ -143,8 +148,10 @@ def main() -> None:
for c in unique: for c in unique:
f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n") f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n")
print( logger.info(
f"✅ Approach 2 (Better Extraction): {len(unique)} cards -> {output_file.name}" "Approach 2 (Better Extraction): %d cards -> %s",
len(unique),
output_file.name,
) )

View File

@ -7,31 +7,41 @@ Usage:
Options: Options:
--filter Apply strict filtering (answers > 100 chars) --filter Apply strict filtering (answers > 100 chars)
--extract Use improved extraction algorithm --extract Use improved extraction algorithm
--main-only Only generate main exam questions (45 comprehensive cards) --main-only Only generate main exam questions
Combinations: Combinations:
python anki_generator.py # Basic extraction, no filter python anki_generator.py
python anki_generator.py --filter # Approach 1: Strict filter only python anki_generator.py --filter
python anki_generator.py --extract # Approach 2: Better extraction only python anki_generator.py --extract
python anki_generator.py --main-only # Approach 3: Main questions only python anki_generator.py --main-only
python anki_generator.py --filter --extract # Approach 4: Filter + Better extraction python anki_generator.py --filter --extract
python anki_generator.py --filter --main-only # Approach 5: Filter + Main only python anki_generator.py --filter --main-only
python anki_generator.py --extract --main-only # Approach 6: Better extraction + Main only python anki_generator.py --extract --main-only
python anki_generator.py --filter --extract --main-only # Approach 7: All three python anki_generator.py --filter --extract --main-only
""" """
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import logging
from pathlib import Path from pathlib import Path
import re import re
logger = logging.getLogger(__name__)
MIN_PARTS_THRESHOLD = 2
MIN_BODY_LENGTH = 50
MIN_PARA_LENGTH = 30
SHORT_THRESHOLD = 50
MEDIUM_THRESHOLD = 150
DEFAULT_MIN_ANSWER_LENGTH = 100
# ============================================================================= # =============================================================================
# SHARED UTILITIES # SHARED UTILITIES
# ============================================================================= # =============================================================================
def clean_text(text) -> str: def clean_text(text: str) -> str:
"""Clean and format text for Anki.""" """Clean and format text for Anki."""
if not text: if not text:
return "" return ""
@ -43,7 +53,7 @@ def clean_text(text) -> str:
return text.strip() return text.strip()
def get_file_metadata(filepath) -> tuple[str, str, str]: def get_file_metadata(filepath: str) -> tuple[str, str, str]:
"""Extract question number and subject from filename.""" """Extract question number and subject from filename."""
filename = Path(filepath).name filename = Path(filepath).name
match = re.match(r"(\d+)-(.+)\.md", filename) match = re.match(r"(\d+)-(.+)\.md", filename)
@ -58,7 +68,7 @@ def get_file_metadata(filepath) -> tuple[str, str, str]:
return num, subject, content return num, subject, content
def get_main_question(content) -> str | None: def get_main_question(content: str) -> str | None:
"""Extract the main exam question.""" """Extract the main exam question."""
q_match = re.search( q_match = re.search(
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
@ -73,7 +83,10 @@ def get_main_question(content) -> str | None:
# ============================================================================= # =============================================================================
def apply_strict_filter(
    cards: list[dict[str, str]],
    min_length: int = DEFAULT_MIN_ANSWER_LENGTH,
) -> list[dict[str, str]]:
    """Return the cards whose ``"back"`` text is longer than ``min_length``.

    The comparison is strict: a card whose answer has exactly ``min_length``
    characters is dropped. The input list is left untouched and the surviving
    cards keep their relative order.
    """

    def answer_is_long_enough(card: dict[str, str]) -> bool:
        return len(card["back"]) > min_length

    return list(filter(answer_is_long_enough, cards))
@ -83,7 +96,7 @@ def apply_strict_filter(cards, min_length=100) -> list[dict[str, str]]:
# ============================================================================= # =============================================================================
def extract_structured_content(body) -> str | None: def extract_structured_content(body: str) -> str | None:
"""Improved extraction - multiple content types with better formatting.""" """Improved extraction - multiple content types with better formatting."""
parts = [] parts = []
@ -101,7 +114,7 @@ def extract_structured_content(body) -> str | None:
parts.append(f"• <b>{term}</b>") parts.append(f"• <b>{term}</b>")
# 3. Key-value patterns # 3. Key-value patterns
if len(parts) < 2: if len(parts) < MIN_PARTS_THRESHOLD:
kvs = re.findall(r"\*\*([^*\n]+)\*\*\s*[--:]\s*([^\n*]{10,150})", body) kvs = re.findall(r"\*\*([^*\n]+)\*\*\s*[--:]\s*([^\n*]{10,150})", body)
for k, v in kvs[:4]: for k, v in kvs[:4]:
entry = f"<b>{k.strip()}</b>: {v.strip()}" entry = f"<b>{k.strip()}</b>: {v.strip()}"
@ -116,15 +129,14 @@ def extract_structured_content(body) -> str | None:
if p.strip() if p.strip()
and not p.startswith("```") and not p.startswith("```")
and not p.startswith("|") and not p.startswith("|")
and len(p.strip()) > 30 and len(p.strip()) > MIN_PARA_LENGTH
] ]
for p in paras[:2]: parts.extend(p[:300] for p in paras[:2])
parts.append(p[:300])
return "<br>".join([clean_text(p) for p in parts]) if parts else None return "<br>".join([clean_text(p) for p in parts]) if parts else None
def extract_cards_better(filepath) -> list[dict[str, str]]: def extract_cards_better(filepath: str) -> list[dict[str, str]]:
"""Extract cards with improved algorithm.""" """Extract cards with improved algorithm."""
num, subject, content = get_file_metadata(filepath) num, subject, content = get_file_metadata(filepath)
base_tags = f"egzamin pyt{num} {subject}" base_tags = f"egzamin pyt{num} {subject}"
@ -153,13 +165,13 @@ def extract_cards_better(filepath) -> list[dict[str, str]]:
content, content,
re.MULTILINE | re.DOTALL, re.MULTILINE | re.DOTALL,
) )
for header, body in sections: for raw_header, body in sections:
header = header.strip() header = raw_header.strip()
if ( if (
"Przykład" in header "Przykład" in header
or '"' in header or '"' in header
or "Mnemonic" in header or "Mnemonic" in header
or len(body) < 50 or len(body) < MIN_BODY_LENGTH
): ):
continue continue
@ -176,7 +188,7 @@ def extract_cards_better(filepath) -> list[dict[str, str]]:
return cards return cards
def extract_cards_basic(filepath) -> list[dict[str, str]]: def extract_cards_basic(filepath: str) -> list[dict[str, str]]:
"""Basic extraction - simpler algorithm.""" """Basic extraction - simpler algorithm."""
num, subject, content = get_file_metadata(filepath) num, subject, content = get_file_metadata(filepath)
base_tags = f"egzamin pyt{num} {subject}" base_tags = f"egzamin pyt{num} {subject}"
@ -212,10 +224,10 @@ def extract_cards_basic(filepath) -> list[dict[str, str]]:
content, content,
re.MULTILINE | re.DOTALL, re.MULTILINE | re.DOTALL,
) )
for header, body in sections: for raw_header, raw_body in sections:
header = header.strip() header = raw_header.strip()
body = body.strip() body = raw_body.strip()
if len(body) < 50 or "Przykład" in header: if len(body) < MIN_BODY_LENGTH or "Przykład" in header:
continue continue
paras = [ paras = [
@ -241,7 +253,28 @@ def extract_cards_basic(filepath) -> list[dict[str, str]]:
# ============================================================================= # =============================================================================
def _extract_key_point(body: str) -> str | None:
    """Return one short key point for a section body, or ``None``.

    Three sources are tried in priority order; the first one found wins:

    1. The bold text on the line after a "Rozpoznawana klasa języków" label.
    2. The first bullet with a bold term (``- **term**: description``),
       rendered as ``"term: description"`` or just ``"term"`` when the
       description is blank.
    3. The first line that follows a blank line, does not start with a
       heading, list, table or code marker, and is at least 21 characters
       long (at most 151 characters are captured).
    """
    def_match = re.search(
        r"Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*", body
    )
    if def_match:
        return def_match.group(1).strip()

    # Only the first bullet is ever used, so stop at the first match instead
    # of collecting every bullet in the body.
    bullet = re.search(r"[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]*)", body)
    if bullet:
        term = bullet.group(1)
        desc = bullet.group(2).strip()
        return f"{term}: {desc}" if desc else term

    para_match = re.search(r"\n\n([^#\n\-•|`][^\n]{20,150})", body)
    if para_match:
        return para_match.group(1).strip()

    return None
def extract_main_only(filepath: str) -> list[dict[str, str]]:
"""Extract only the main exam question with comprehensive answer.""" """Extract only the main exam question with comprehensive answer."""
num, subject, content = get_file_metadata(filepath) num, subject, content = get_file_metadata(filepath)
base_tags = f"egzamin pyt{num} {subject} main" base_tags = f"egzamin pyt{num} {subject} main"
@ -255,7 +288,9 @@ def extract_main_only(filepath) -> list[dict[str, str]]:
# Get main answer section # Get main answer section
answer_match = re.search( answer_match = re.search(
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## [^<5E>]|\Z)", content, re.DOTALL r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## [^<5E>]|\Z)",
content,
re.DOTALL,
) )
if answer_match: if answer_match:
section = answer_match.group(1) section = answer_match.group(1)
@ -267,32 +302,16 @@ def extract_main_only(filepath) -> list[dict[str, str]]:
re.MULTILINE | re.DOTALL, re.MULTILINE | re.DOTALL,
) )
for header, body in headers[:5]: for raw_header, body in headers[:5]:
header = header.strip() header = raw_header.strip()
if "Przykład" in header or "Mnemonic" in header or '"' in header: if (
"Przykład" in header
or "Mnemonic" in header
or '"' in header
):
continue continue
# Get key point from this section key_point = _extract_key_point(body)
key_point = None
# Try to get a definition or first bullet
def_match = re.search(
r"Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*", body
)
if def_match:
key_point = def_match.group(1).strip()
if not key_point:
bullets = re.findall(r"[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]*)", body)
if bullets:
term, desc = bullets[0]
key_point = f"{term}: {desc.strip()}" if desc.strip() else term
if not key_point:
para_match = re.search(r"\n\n([^#\n\-•|`][^\n]{20,150})", body)
if para_match:
key_point = para_match.group(1).strip()
if key_point: if key_point:
answer_parts.append(f"<b>{header}</b>: {key_point}") answer_parts.append(f"<b>{header}</b>: {key_point}")
@ -308,9 +327,58 @@ def extract_main_only(filepath) -> list[dict[str, str]]:
# ============================================================================= # =============================================================================
def _collect_cards(
    odpowiedzi_dir: Path,
    *,
    use_better_extract: bool,
    main_only: bool,
) -> list[dict[str, str]]:
    """Run one extraction strategy over every ``*.md`` file in a directory.

    ``main_only`` takes precedence over ``use_better_extract``; when neither
    flag is set the basic extractor is used. Files are visited in sorted
    order and the cards of all files are returned as one flat list.
    """
    # The strategy does not depend on the file, so pick it once.
    if main_only:
        extract = extract_main_only
    elif use_better_extract:
        extract = extract_cards_better
    else:
        extract = extract_cards_basic

    collected: list[dict[str, str]] = []
    for md_path in sorted(odpowiedzi_dir.glob("*.md")):
        collected.extend(extract(md_path))
    return collected
def _log_statistics(unique: list[dict[str, str]], output_file: Path) -> None:
    """Log the card count and an answer-length breakdown for a generated deck.

    Every card is put into exactly one bucket by the length of its ``"back"``
    text: short (below ``SHORT_THRESHOLD``), medium (from ``SHORT_THRESHOLD``
    up to but excluding ``MEDIUM_THRESHOLD``) or good (``MEDIUM_THRESHOLD``
    and above). Nothing is returned; the figures go to the module logger at
    INFO level.
    """
    short = medium = good = 0
    for card in unique:
        answer_length = len(card["back"])
        # The elif chain partitions the lengths because SHORT_THRESHOLD is
        # smaller than MEDIUM_THRESHOLD.
        if answer_length < SHORT_THRESHOLD:
            short += 1
        elif answer_length < MEDIUM_THRESHOLD:
            medium += 1
        else:
            good += 1

    logger.info("Generated: %s", output_file.name)
    logger.info("  Cards: %d", len(unique))
    logger.info(
        "  Quality: %d short / %d medium / %d good",
        short,
        medium,
        good,
    )
def generate_anki(
*,
use_filter: bool = False,
use_better_extract: bool = False,
main_only: bool = False,
) -> Path:
"""Generate Anki deck with specified approaches.""" """Generate Anki deck with specified approaches."""
odpowiedzi_dir = Path("/home/kuchy/praca_magisterska/pytania/odpowiedzi") odpowiedzi_dir = Path(
"/home/kuchy/praca_magisterska/pytania/odpowiedzi"
)
# Determine output filename based on options # Determine output filename based on options
suffix_parts = [] suffix_parts = []
@ -322,30 +390,25 @@ def generate_anki(use_filter=False, use_better_extract=False, main_only=False) -
suffix_parts.append("main") suffix_parts.append("main")
suffix = "_".join(suffix_parts) if suffix_parts else "basic" suffix = "_".join(suffix_parts) if suffix_parts else "basic"
output_file = Path(f"/home/kuchy/praca_magisterska/pytania/anki_{suffix}.txt") output_file = Path(
f"/home/kuchy/praca_magisterska/pytania/anki_{suffix}.txt"
)
deck_name = f"Egzamin_{suffix.replace('_', '+')}" deck_name = f"Egzamin_{suffix.replace('_', '+')}"
all_cards = [] all_cards = _collect_cards(
odpowiedzi_dir,
for md_file in sorted(odpowiedzi_dir.glob("*.md")): use_better_extract=use_better_extract,
if main_only: main_only=main_only,
# Approach 3: Only main questions )
cards = extract_main_only(md_file)
elif use_better_extract:
# Approach 2: Better extraction
cards = extract_cards_better(md_file)
else:
# Basic extraction
cards = extract_cards_basic(md_file)
all_cards.extend(cards)
# Approach 1: Apply filtering if requested # Approach 1: Apply filtering if requested
if use_filter: if use_filter:
all_cards = apply_strict_filter(all_cards, min_length=100) all_cards = apply_strict_filter(
all_cards, min_length=DEFAULT_MIN_ANSWER_LENGTH
)
# Remove duplicates # Remove duplicates
seen = set() seen: set[str] = set()
unique = [] unique = []
for c in all_cards: for c in all_cards:
key = c["front"][:80] key = c["front"][:80]
@ -355,20 +418,14 @@ def generate_anki(use_filter=False, use_better_extract=False, main_only=False) -
# Write output # Write output
with Path(output_file).open("w", encoding="utf-8") as f: with Path(output_file).open("w", encoding="utf-8") as f:
f.write(f"#separator:Tab\n#html:true\n#notetype:Basic\n#deck:{deck_name}\n\n") f.write(
"#separator:Tab\n#html:true\n"
f"#notetype:Basic\n#deck:{deck_name}\n\n"
)
for c in unique: for c in unique:
f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n") f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n")
# Statistics _log_statistics(unique, output_file)
lengths = [len(c["back"]) for c in unique]
short = sum(1 for l in lengths if l < 50)
medium = sum(1 for l in lengths if 50 <= l < 150)
good = sum(1 for l in lengths if l >= 150)
print(f"✅ Generated: {output_file.name}")
print(f" Cards: {len(unique)}")
print(f" Quality: {short} short / {medium} medium / {good} good")
print()
return output_file return output_file
@ -397,9 +454,9 @@ def main() -> None:
if args.all_combinations: if args.all_combinations:
# Generate all 7 combinations # Generate all 7 combinations
print("=" * 60) logger.info("=" * 60)
print("Generating all 7 combinations...") logger.info("Generating all 7 combinations...")
print("=" * 60 + "\n") logger.info("=" * 60)
combinations = [ combinations = [
(True, False, False), # 1: Filter only (True, False, False), # 1: Filter only
@ -411,9 +468,22 @@ def main() -> None:
(True, True, True), # 7: All three (True, True, True), # 7: All three
] ]
for i, (f, e, m) in enumerate(combinations, 1): for i, (f_flag, e_flag, m_flag) in enumerate(
print(f"--- Combination {i} (filter={f}, extract={e}, main={m}) ---") combinations, 1
generate_anki(use_filter=f, use_better_extract=e, main_only=m) ):
logger.info(
"--- Combination %d (filter=%s, extract=%s,"
" main=%s) ---",
i,
f_flag,
e_flag,
m_flag,
)
generate_anki(
use_filter=f_flag,
use_better_extract=e_flag,
main_only=m_flag,
)
else: else:
generate_anki( generate_anki(
use_filter=args.filter, use_filter=args.filter,

View File

@ -6,18 +6,27 @@ Creates a tab-separated file compatible with Anki import.
from __future__ import annotations from __future__ import annotations
import logging
from pathlib import Path from pathlib import Path
import re import re
logger = logging.getLogger(__name__)
def extract_question_and_answer(filepath) -> list[dict[str, str]]: MIN_BODY_LENGTH = 50
"""Extract main question and key answer points from a markdown file.""" MIN_DEFINITION_LENGTH = 20
MAX_DEFINITION_LENGTH = 200
MIN_BULLET_COUNT = 5
MIN_SUBSECTION_LENGTH = 5
MIN_FORMULA_LENGTH = 20
def _get_metadata(
filepath: str,
) -> tuple[str, str, str, str, str]:
"""Extract metadata from file."""
with Path(filepath).open(encoding="utf-8") as f: with Path(filepath).open(encoding="utf-8") as f:
content = f.read() content = f.read()
cards = []
# Extract file number for tagging
filename = Path(filepath).name filename = Path(filepath).name
match = re.match(r"(\d+)-(.+)\.md", filename) match = re.match(r"(\d+)-(.+)\.md", filename)
if match: if match:
@ -27,13 +36,13 @@ def extract_question_and_answer(filepath) -> list[dict[str, str]]:
num = "00" num = "00"
topic = "unknown" topic = "unknown"
# Extract main title (usually contains the question)
title_match = re.search(r"^# (.+)$", content, re.MULTILINE) title_match = re.search(r"^# (.+)$", content, re.MULTILINE)
title = title_match.group(1) if title_match else "Unknown" title = title_match.group(1) if title_match else "Unknown"
# Extract the main question from ## Pytanie section
question_match = re.search( question_match = re.search(
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*',
content,
re.DOTALL,
) )
if question_match: if question_match:
main_question = question_match.group(1).strip() main_question = question_match.group(1).strip()
@ -41,124 +50,207 @@ def extract_question_and_answer(filepath) -> list[dict[str, str]]:
else: else:
main_question = title main_question = title
# Extract subject/przedmiot return num, topic, title, main_question, content
subject_match = re.search(r"Przedmiot:\s*(\w+)", content)
subject = subject_match.group(1) if subject_match else "Ogólne"
# Create main question card - extract key sections for answer
answer_parts = []
def _extract_main_card(
    content: str,
    main_question: str,
    subject: str,
    num: str,
    topic: str,
) -> list[dict[str, str]]:
    """Build the card for the main exam question of one file.

    The answer is assembled from up to five ``###`` headings found in the
    "Odpowiedź główna" section, followed by up to three ``**term** <sep> text``
    definitions taken from the whole document whose text length lies strictly
    between ``MIN_DEFINITION_LENGTH`` and ``MAX_DEFINITION_LENGTH``. At most
    eight lines are kept, joined with ``<br>``.

    Returns a one-element list, or an empty list when nothing was found.
    """
    answer_parts: list[str] = []

    main_answer = re.search(
        r"## 📚 Odpowiedź główna\s*\n(.+?)"
        r"(?=\n## |\n---\s*\n## |\Z)",
        content,
        re.DOTALL,
    )
    if main_answer is not None:
        section_headers = re.findall(r"### (.+)", main_answer.group(1))
        answer_parts += section_headers[:5]

    # NOTE(review): `[--:]` parses as the character range '-'..':' (so it
    # also matches '.', '/' and digits) — confirm this is the intended
    # separator set.
    definition_pairs = re.findall(
        r"\*\*([^*]+)\*\*\s*[--:]\s*([^*\n]+)", content
    )
    for term, definition in definition_pairs[:3]:
        if MIN_DEFINITION_LENGTH < len(definition) < MAX_DEFINITION_LENGTH:
            answer_parts.append(f"{term}: {definition.strip()}")

    if not answer_parts:
        return []

    main_card = {
        "question": main_question,
        "answer": "<br>".join(answer_parts[:8]),
        "tags": (
            f"egzamin_magisterski pytanie_{num}"
            f" {subject} {topic}"
        ),
    }
    return [main_card]
def _extract_subsection_answer(body_clean: str) -> str | None:
    """Turn a subsection body into answer text, or ``None`` if nothing fits.

    Bullets with a bold term are preferred: each becomes ``"term: text"`` (or
    just ``"term"`` when no text follows) and they are joined with ``<br>``.
    Without such bullets, the first blank-line-separated chunk that does not
    start with a code fence or a table pipe is used, with ``**bold**`` and
    ``*italic*`` markers removed and the result cut to 400 characters.
    """
    bold_bullets = re.findall(
        r"[-•]\s*\*\*(.+?)\*\*[:\s]*([^\n]+)?", body_clean
    )
    if bold_bullets:
        # NOTE(review): despite its name, MIN_BULLET_COUNT acts as an upper
        # limit on how many bullets are kept.
        rendered: list[str] = []
        for term, description in bold_bullets[:MIN_BULLET_COUNT]:
            if description:
                rendered.append(f"{term}: {description.strip()}")
            else:
                rendered.append(f"{term}")
        return "<br>".join(rendered)

    # The prefix checks look at the chunk before stripping, so an indented
    # fence or table row is not excluded.
    first_para = next(
        (
            chunk.strip()
            for chunk in body_clean.split("\n\n")
            if chunk.strip() and not chunk.startswith(("```", "|"))
        ),
        None,
    )
    if first_para is None:
        return None

    # Bold markers must go first; otherwise the italic pattern would eat
    # one asterisk of each pair.
    for emphasis_pattern in (r"\*\*(.+?)\*\*", r"\*(.+?)\*"):
        first_para = re.sub(emphasis_pattern, r"\1", first_para)
    return first_para[:400]
def _extract_sub_cards(
    content: str,
    title: str,
    subject: str,
    num: str,
    topic: str,
) -> list[dict[str, str]]:
    """Build one detail card per qualifying ``###`` subsection of ``content``.

    A subsection is skipped when its heading is shorter than
    ``MIN_SUBSECTION_LENGTH``, starts with "Przykład", has a stripped body
    shorter than ``MIN_BODY_LENGTH``, or yields no answer text.

    The question is the heading itself when it already ends with "?",
    otherwise "Co to jest <heading>?". Headings containing "Charakterystyka",
    "Definicja" or "Właściwości" are instead phrased as
    "<heading> - <parent topic>", where the parent topic is ``title`` with
    "Pytanie" removed and leading/trailing colons, spaces and digits stripped.
    """
    section_pattern = (
        r"### (\d+\.\s+)?(.+?)\n\n(.+?)"
        r"(?=\n### |\n## |\n---|\Z)"
    )
    answer_style_keywords = ("Charakterystyka", "Definicja", "Właściwości")
    # Identical for every card from this file, so built once.
    detail_tags = (
        f"egzamin_magisterski pytanie_{num}"
        f" {subject} {topic} szczegoly"
    )

    sub_cards: list[dict[str, str]] = []
    for _numbering, header, body in re.findall(
        section_pattern, content, re.DOTALL
    ):
        header_too_short = len(header) < MIN_SUBSECTION_LENGTH
        if header_too_short or header.startswith("Przykład"):
            continue

        stripped_body = body.strip()
        if len(stripped_body) < MIN_BODY_LENGTH:
            continue

        answer = _extract_subsection_answer(stripped_body)
        if not answer:
            continue

        if any(keyword in header for keyword in answer_style_keywords):
            parent = title.replace("Pytanie", "").strip(": 0123456789")
            question = f"{header} - {parent}"
        elif header.endswith("?"):
            question = header
        else:
            question = f"Co to jest {header}?"

        sub_cards.append(
            {
                "question": question,
                "answer": answer,
                "tags": detail_tags,
            }
        )

    return sub_cards
def _extract_formula_cards(
    content: str,
    subject: str,
    num: str,
) -> list[dict[str, str]]:
    """Build "Podaj <name>" cards for bold-labelled formulas and definitions.

    A label is bold text ending (case-insensitively) in "formuła", "wzór",
    "twierdzenie", "definicja" or "lemat". The text that follows it, up to the
    next blank line, the next line starting with ``**``, or the end of the
    document, becomes the answer, stripped and cut to 300 characters. Matches
    whose raw text is not longer than ``MIN_FORMULA_LENGTH`` are dropped.
    """
    # NOTE(review): the label prefix `[A-Za-z\s]+` does not cover Polish
    # diacritics — confirm whether labels such as "Główne twierdzenie"
    # should match.
    formula_pattern = (
        r"\*\*([A-Za-z\s]+"
        r"(?:formuła|wzór|twierdzenie|definicja|lemat))"
        r"\*\*[:\s]*\n?(.+?)(?=\n\n|\n\*\*|\Z)"
    )
    formula_tags = (
        f"egzamin_magisterski pytanie_{num}"
        f" {subject} formuly"
    )
    return [
        {
            "question": f"Podaj {label.strip()}",
            "answer": text.strip()[:300],
            "tags": formula_tags,
        }
        for label, text in re.findall(
            formula_pattern, content, re.IGNORECASE | re.DOTALL
        )
        if len(text) > MIN_FORMULA_LENGTH
    ]
def extract_question_and_answer(
    filepath: str,
) -> list[dict[str, str]]:
    """Extract every flashcard from one markdown answer file.

    The subject is the word following "Przedmiot:" in the file, or "Ogólne"
    when no such line exists. The returned list holds the main-question card
    first, then the subsection cards, then the formula cards.
    """
    num, topic, title, main_question, content = _get_metadata(filepath)

    subject_match = re.search(r"Przedmiot:\s*(\w+)", content)
    subject = "Ogólne" if subject_match is None else subject_match.group(1)

    return [
        *_extract_main_card(content, main_question, subject, num, topic),
        *_extract_sub_cards(content, title, subject, num, topic),
        *_extract_formula_cards(content, subject, num),
    ]
def clean_for_anki(text: str) -> str:
"""Clean text for Anki import - escape special characters.""" """Clean text for Anki import - escape special characters."""
# Replace tabs with spaces # Replace tabs with spaces
text = text.replace("\t", " ") text = text.replace("\t", " ")
@ -187,13 +279,13 @@ def main() -> None:
# Process each file # Process each file
for md_file in sorted(odpowiedzi_dir.glob("*.md")): for md_file in sorted(odpowiedzi_dir.glob("*.md")):
print(f"Processing: {md_file.name}") logger.info("Processing: %s", md_file.name)
try: try:
cards = extract_question_and_answer(md_file) cards = extract_question_and_answer(md_file)
all_cards.extend(cards) all_cards.extend(cards)
print(f" -> Extracted {len(cards)} cards") logger.info(" -> Extracted %d cards", len(cards))
except Exception as e: except (ValueError, OSError) as e:
print(f" -> Error: {e}") logger.info(" -> Error: %s", e)
# Write Anki file with headers # Write Anki file with headers
with Path(output_file).open("w", encoding="utf-8") as f: with Path(output_file).open("w", encoding="utf-8") as f:
@ -211,13 +303,13 @@ def main() -> None:
tags = card["tags"] tags = card["tags"]
f.write(f"{front}\t{back}\t{tags}\n") f.write(f"{front}\t{back}\t{tags}\n")
print(f"\n✅ Created {len(all_cards)} flashcards") logger.info("Created %d flashcards", len(all_cards))
print(f"📁 Output: {output_file}") logger.info("Output: %s", output_file)
print("\nTo import into Anki:") logger.info("To import into Anki:")
print("1. Open Anki → File → Import") logger.info("1. Open Anki -> File -> Import")
print("2. Select the .txt file") logger.info("2. Select the .txt file")
print("3. Verify 'Allow HTML' is checked") logger.info("3. Verify 'Allow HTML' is checked")
print("4. Click Import") logger.info("4. Click Import")
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -6,11 +6,22 @@ Creates tab-separated file for Anki import with proper HTML formatting.
from __future__ import annotations from __future__ import annotations
import logging
from pathlib import Path from pathlib import Path
import re import re
logger = logging.getLogger(__name__)
def clean_text(text) -> str: MIN_HEADER_LENGTH = 3
MIN_MATCH_LENGTH = 10
MIN_BODY_LENGTH = 50
MIN_QA_LENGTH = 30
MAX_CONTENT_LENGTH = 300
MAX_ANSWER_LENGTH = 400
MAX_COMPARISON_ITEMS = 6
def clean_text(text: str) -> str:
"""Clean and format text for Anki.""" """Clean and format text for Anki."""
if not text: if not text:
return "" return ""
@ -28,7 +39,7 @@ def clean_text(text) -> str:
return text.strip() return text.strip()
def format_list(items, numbered=False) -> str: def format_list(items: list[str], *, numbered: bool = False) -> str:
"""Format a list of items as HTML.""" """Format a list of items as HTML."""
if not items: if not items:
return "" return ""
@ -43,119 +54,148 @@ def format_list(items, numbered=False) -> str:
return html return html
def extract_from_file(filepath) -> list[dict[str, str]]: def _get_file_metadata(
"""Extract flashcard data from a markdown file.""" filepath: str,
) -> tuple[str, str, str]:
"""Extract metadata from file."""
with Path(filepath).open(encoding="utf-8") as f: with Path(filepath).open(encoding="utf-8") as f:
content = f.read() content = f.read()
cards = []
# Get file metadata
filename = Path(filepath).name filename = Path(filepath).name
match = re.match(r"(\d+)-(.+)\.md", filename) match = re.match(r"(\d+)-(.+)\.md", filename)
num = match.group(1) if match else "00" num = match.group(1) if match else "00"
match.group(2).replace("-", "_") if match else "unknown"
# Extract subject
subj_match = re.search(r"Przedmiot:\s*(\w+)", content) subj_match = re.search(r"Przedmiot:\s*(\w+)", content)
subject = subj_match.group(1) if subj_match else "Ogólne" subject = subj_match.group(1) if subj_match else "Ogólne"
# Base tags return num, subject, content
base_tags = f"egzamin_magisterski pyt{num} {subject}"
# =====================================================
# CARD TYPE 1: Main Exam Question def _extract_main_question_card(
# ===================================================== content: str, base_tags: str,
) -> list[dict[str, str]]:
"""Extract the main exam question card."""
q_match = re.search( q_match = re.search(
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*',
content,
re.DOTALL,
) )
if q_match: if not q_match:
main_q = re.sub(r"\s+", " ", q_match.group(1).strip()) return []
# Extract key topics from main answer main_q = re.sub(r"\s+", " ", q_match.group(1).strip())
answer_match = re.search( answer_match = re.search(
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## [<5B>🎯]|\n---\s*\n## |\Z)", r"## 📚 Odpowiedź główna\s*\n(.+?)"
content, r"(?=\n## [📚🎯]|\n---\s*\n## |\Z)",
re.DOTALL, content,
re.DOTALL,
)
if not answer_match:
return []
answer_section = answer_match.group(1)
headers = re.findall(
r"^### (?:\d+\.\s*)?(.+)$",
answer_section,
re.MULTILINE,
)
headers = [
h.strip()
for h in headers
if len(h.strip()) > MIN_HEADER_LENGTH
][:6]
if not headers:
return []
answer_html = (
"<b>Kluczowe zagadnienia:</b>" + format_list(headers)
)
return [
{
"front": clean_text(main_q),
"back": answer_html,
"tags": f"{base_tags} pytanie_glowne",
}
]
def _make_question_text(header: str) -> str:
"""Generate a question from a section header."""
if "Definicja" in header or "Co to" in header:
return (
f"Co to jest:"
f" {header.replace('Definicja', '').strip()}?"
) )
if answer_match: if "Charakterystyka" in header:
answer_section = answer_match.group(1) stripped = header.replace("Charakterystyka", "").strip()
# Get main headers return f"Scharakteryzuj: {stripped}"
headers = re.findall( if header.endswith("?"):
r"^### (?:\d+\.\s*)?(.+)$", answer_section, re.MULTILINE return header
return f"Omów: {header}"
def _extract_body_parts(body: str) -> list[str]:
"""Extract structured answer parts from a section body."""
answer_parts: list[str] = []
subheaders = re.findall(r"^#### (.+)$", body, re.MULTILINE)
if subheaders:
answer_parts.extend(subheaders[:4])
bullets = re.findall(
r"[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]+)?", body
)
for term, desc in bullets[:5]:
if desc:
answer_parts.append(
f"<b>{term}</b>: {desc.strip()}"
) )
headers = [h.strip() for h in headers if len(h.strip()) > 3][:6] else:
answer_parts.append(f"<b>{term}</b>")
if headers: if not answer_parts:
answer_html = "<b>Kluczowe zagadnienia:</b>" + format_list(headers) paras = [
cards.append( p.strip()
{ for p in body.split("\n\n")
"front": clean_text(main_q), if p.strip()
"back": answer_html, and not p.strip().startswith("```")
"tags": f"{base_tags} pytanie_glowne", and not p.strip().startswith("|")
} ]
) if paras:
first = paras[0]
if len(first) > MAX_CONTENT_LENGTH:
first = first[:MAX_CONTENT_LENGTH] + "..."
answer_parts.append(first)
# ===================================================== return answer_parts
# CARD TYPE 2: Subsection Cards (detailed concepts)
# =====================================================
# Find all ### sections def _extract_subsection_cards(
content: str, base_tags: str,
) -> list[dict[str, str]]:
"""Extract subsection detail cards."""
cards: list[dict[str, str]] = []
sections = re.findall( sections = re.findall(
r"^### (?:\d+\.\s*)?(.+?)\n((?:(?!^###).)+)", content, re.MULTILINE | re.DOTALL r"^### (?:\d+\.\s*)?(.+?)\n((?:(?!^###).)+)",
content,
re.MULTILINE | re.DOTALL,
) )
for header, body in sections: for raw_header, raw_body in sections:
header = header.strip() header = raw_header.strip()
body = body.strip() body = raw_body.strip()
# Skip very short sections or example sections if (
if len(body) < 50 or header.lower().startswith("przykład"): len(body) < MIN_BODY_LENGTH
or header.lower().startswith("przykład")
):
continue continue
# Extract key information from body answer_parts = _extract_body_parts(body)
answer_parts = []
# Look for #### sub-headers
subheaders = re.findall(r"^#### (.+)$", body, re.MULTILINE)
if subheaders:
answer_parts.extend(subheaders[:4])
# Look for bullet points with bold terms
bullets = re.findall(r"[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]+)?", body)
for term, desc in bullets[:5]:
if desc:
answer_parts.append(f"<b>{term}</b>: {desc.strip()}")
else:
answer_parts.append(f"<b>{term}</b>")
# If no structured content, get first paragraph
if not answer_parts:
paras = [
p.strip()
for p in body.split("\n\n")
if p.strip()
and not p.strip().startswith("```")
and not p.strip().startswith("|")
]
if paras:
first = paras[0]
# Limit length
if len(first) > 300:
first = first[:300] + "..."
answer_parts.append(first)
if answer_parts: if answer_parts:
# Determine card type question = _make_question_text(header)
if "Definicja" in header or "Co to" in header:
q = f"Co to jest: {header.replace('Definicja', '').strip()}?"
elif "Charakterystyka" in header:
q = f"Scharakteryzuj: {header.replace('Charakterystyka', '').strip()}"
elif header.endswith("?"):
q = header
else:
q = f"Omów: {header}"
# Format answer
if len(answer_parts) > 1: if len(answer_parts) > 1:
answer_html = format_list(answer_parts) answer_html = format_list(answer_parts)
else: else:
@ -163,15 +203,20 @@ def extract_from_file(filepath) -> list[dict[str, str]]:
cards.append( cards.append(
{ {
"front": clean_text(q), "front": clean_text(question),
"back": answer_html, "back": answer_html,
"tags": f"{base_tags} szczegoly", "tags": f"{base_tags} szczegoly",
} }
) )
# ===================================================== return cards
# CARD TYPE 3: Algorithms/Formulas
# =====================================================
def _extract_algo_cards(
content: str, base_tags: str,
) -> list[dict[str, str]]:
"""Extract algorithm/formula cards."""
cards: list[dict[str, str]] = []
algo_patterns = [ algo_patterns = [
r"#### Złożoność(?:\s+czasowa)?\s*\n(.+?)(?=\n####|\n###|\Z)", r"#### Złożoność(?:\s+czasowa)?\s*\n(.+?)(?=\n####|\n###|\Z)",
r"Złożoność:\s*\*\*([^*]+)\*\*", r"Złożoność:\s*\*\*([^*]+)\*\*",
@ -179,85 +224,137 @@ def extract_from_file(filepath) -> list[dict[str, str]]:
for pattern in algo_patterns: for pattern in algo_patterns:
matches = re.findall(pattern, content, re.DOTALL) matches = re.findall(pattern, content, re.DOTALL)
for match in matches[:2]: for algo_match in matches[:2]:
if len(match) > 10: if len(algo_match) > MIN_MATCH_LENGTH:
# Find context - which algorithm?
algo_context = re.search( algo_context = re.search(
r"### (\d+\.\s*)?(.+?)(?=\n)", content[: content.find(match)] r"### (\d+\.\s*)?(.+?)(?=\n)",
content[: content.find(algo_match)],
) )
if algo_context: if algo_context:
algo_name = algo_context.group(2).strip() algo_name = algo_context.group(2).strip()
cards.append( cards.append(
{ {
"front": f"Jaka jest złożoność algorytmu/metody: {algo_name}?", "front": (
"back": clean_text(match.strip()[:200]), "Jaka jest złożoność"
f" algorytmu/metody: {algo_name}?"
),
"back": clean_text(
algo_match.strip()[:200]
),
"tags": f"{base_tags} zlozonosc", "tags": f"{base_tags} zlozonosc",
} }
) )
break break
# ===================================================== return cards
# CARD TYPE 4: Comparisons (when file contains comparisons)
# =====================================================
def _extract_comparison_cards(
content: str, base_tags: str, num: str,
) -> list[dict[str, str]]:
"""Extract comparison cards."""
compare_match = re.search( compare_match = re.search(
r"## .*(Porównanie|Zestawienie|vs).*\n(.+?)(?=\n## |\Z)", r"## .*(Porównanie|Zestawienie|vs).*\n(.+?)(?=\n## |\Z)",
content, content,
re.DOTALL | re.IGNORECASE, re.DOTALL | re.IGNORECASE,
) )
if compare_match: if not compare_match:
compare_section = compare_match.group(2) return []
# Extract comparison items
items = re.findall(r"\|\s*\*\*([^|*]+)\*\*\s*\|([^|]+)\|", compare_section)
if items:
comparison_html = "<table><tr><th>Aspekt</th><th>Wartość</th></tr>"
for aspect, value in items[:6]:
comparison_html += f"<tr><td>{clean_text(aspect)}</td><td>{clean_text(value)}</td></tr>"
comparison_html += "</table>"
# Get comparison title compare_section = compare_match.group(2)
title_match = re.search( items = re.findall(
r"## .*(Porównanie|Zestawienie).*?(\w+.*?(?:vs|i|oraz).*?\w+)", r"\|\s*\*\*([^|*]+)\*\*\s*\|([^|]+)\|",
compare_match.group(0), compare_section,
re.IGNORECASE, )
) if not items:
if title_match: return []
cards.append(
{
"front": f"Porównaj kluczowe różnice w temacie: pytanie {num}",
"back": comparison_html,
"tags": f"{base_tags} porownanie",
}
)
# ===================================================== comparison_html = (
# CARD TYPE 5: Q&A from practice questions section "<table><tr><th>Aspekt</th><th>Wartość</th></tr>"
# ===================================================== )
qa_section = re.search(r"## 🎓 Pytania.*?\n(.+?)(?=\n## |\Z)", content, re.DOTALL) for aspect, value in items[:MAX_COMPARISON_ITEMS]:
if qa_section: comparison_html += (
qa_content = qa_section.group(1) f"<tr><td>{clean_text(aspect)}</td>"
# Find Q&A pairs f"<td>{clean_text(value)}</td></tr>"
qas = re.findall(
r'### Q\d+:?\s*["\']?(.+?)["\']?\s*\n.*?Odpowiedź:\s*\n?(.+?)(?=\n### |\Z)',
qa_content,
re.DOTALL,
) )
for q, a in qas[:3]: comparison_html += "</table>"
q = re.sub(r"\s+", " ", q.strip())
a = a.strip()
if len(a) > 30:
# Limit answer length
a_lines = a.split("\n")
a_short = "\n".join(a_lines[:5])
if len(a_short) > 400:
a_short = a_short[:400] + "..."
cards.append( title_match = re.search(
{ r"## .*(Porównanie|Zestawienie)"
"front": clean_text(q), r".*?(\w+.*?(?:vs|i|oraz).*?\w+)",
"back": clean_text(a_short).replace("\n", "<br>"), compare_match.group(0),
"tags": f"{base_tags} egzamin_praktyka", re.IGNORECASE,
} )
) if not title_match:
return []
return [
{
"front": (
"Porównaj kluczowe różnice"
f" w temacie: pytanie {num}"
),
"back": comparison_html,
"tags": f"{base_tags} porownanie",
}
]
def _extract_qa_cards(
content: str, base_tags: str,
) -> list[dict[str, str]]:
"""Extract Q&A practice cards."""
cards: list[dict[str, str]] = []
qa_section = re.search(
r"## 🎓 Pytania.*?\n(.+?)(?=\n## |\Z)",
content,
re.DOTALL,
)
if not qa_section:
return cards
qa_content = qa_section.group(1)
qas = re.findall(
r"### Q\d+:?\s*[\"']?(.+?)[\"']?\s*\n"
r".*?Odpowiedź:\s*\n?(.+?)(?=\n### |\Z)",
qa_content,
re.DOTALL,
)
for raw_q, raw_a in qas[:3]:
question = re.sub(r"\s+", " ", raw_q.strip())
answer = raw_a.strip()
if len(answer) > MIN_QA_LENGTH:
a_lines = answer.split("\n")
a_short = "\n".join(a_lines[:5])
if len(a_short) > MAX_ANSWER_LENGTH:
a_short = a_short[:MAX_ANSWER_LENGTH] + "..."
cards.append(
{
"front": clean_text(question),
"back": clean_text(a_short).replace(
"\n", "<br>"
),
"tags": f"{base_tags} egzamin_praktyka",
}
)
return cards
def extract_from_file(filepath: str) -> list[dict[str, str]]:
"""Extract flashcard data from a markdown file."""
num, subject, content = _get_file_metadata(filepath)
base_tags = f"egzamin_magisterski pyt{num} {subject}"
cards: list[dict[str, str]] = []
cards.extend(_extract_main_question_card(content, base_tags))
cards.extend(_extract_subsection_cards(content, base_tags))
cards.extend(_extract_algo_cards(content, base_tags))
cards.extend(
_extract_comparison_cards(content, base_tags, num)
)
cards.extend(_extract_qa_cards(content, base_tags))
return cards return cards
@ -272,13 +369,13 @@ def main() -> None:
all_cards = [] all_cards = []
for md_file in sorted(odpowiedzi_dir.glob("*.md")): for md_file in sorted(odpowiedzi_dir.glob("*.md")):
print(f"Processing: {md_file.name}", end=" ") logger.info("Processing: %s", md_file.name)
try: try:
cards = extract_from_file(md_file) cards = extract_from_file(md_file)
all_cards.extend(cards) all_cards.extend(cards)
print(f"{len(cards)} cards") logger.info(" -> %d cards", len(cards))
except Exception as e: except (ValueError, OSError) as e:
print(f"→ ERROR: {e}") logger.info(" -> ERROR: %s", e)
# Remove potential duplicates (same front) # Remove potential duplicates (same front)
seen = set() seen = set()
@ -306,23 +403,25 @@ def main() -> None:
f.write(f"{front}\t{back}\t{tags}\n") f.write(f"{front}\t{back}\t{tags}\n")
print(f"\n{'=' * 50}") logger.info("=" * 50)
print(f"✅ Generated {len(unique_cards)} unique flashcards") logger.info(
print(f"📁 Saved to: {output_file}") "Generated %d unique flashcards", len(unique_cards)
print(f"{'=' * 50}") )
print("\n📋 IMPORT INSTRUCTIONS:") logger.info("Saved to: %s", output_file)
print("" * 40) logger.info("=" * 50)
print("Anki Desktop:") logger.info("IMPORT INSTRUCTIONS:")
print(" 1. File → Import") logger.info("-" * 40)
print(" 2. Select: anki_egzamin_magisterski.txt") logger.info("Anki Desktop:")
print(" 3. Verify: Fields separated by Tab") logger.info(" 1. File -> Import")
print(" 4. Check: Allow HTML in fields") logger.info(" 2. Select: anki_egzamin_magisterski.txt")
print(" 5. Click Import") logger.info(" 3. Verify: Fields separated by Tab")
print() logger.info(" 4. Check: Allow HTML in fields")
print("AnkiWeb / AnkiDroid:") logger.info(" 5. Click Import")
print(" 1. First import on Anki Desktop") logger.info("")
print(" 2. Click Sync to upload to AnkiWeb") logger.info("AnkiWeb / AnkiDroid:")
print(" 3. Sync on mobile to download") logger.info(" 1. First import on Anki Desktop")
logger.info(" 2. Click Sync to upload to AnkiWeb")
logger.info(" 3. Sync on mobile to download")
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -6,12 +6,16 @@ Creates a tab-separated file compatible with Anki import.
from __future__ import annotations from __future__ import annotations
import logging
from pathlib import Path from pathlib import Path
import re import re
import traceback
logger = logging.getLogger(__name__)
MIN_HEADER_WORDS = 3
def extract_main_question(content, filename) -> str: def extract_main_question(content: str, filename: str) -> str:
"""Extract the main exam question from the file.""" """Extract the main exam question from the file."""
# Extract the main question from ## Pytanie section # Extract the main question from ## Pytanie section
question_match = re.search( question_match = re.search(
@ -26,13 +30,13 @@ def extract_main_question(content, filename) -> str:
return title_match.group(1) if title_match else filename return title_match.group(1) if title_match else filename
def extract_subject(content) -> str: def extract_subject(content: str) -> str:
"""Extract the subject code.""" """Extract the subject code."""
subject_match = re.search(r"Przedmiot:\s*(\w+)", content) subject_match = re.search(r"Przedmiot:\s*(\w+)", content)
return subject_match.group(1) if subject_match else "Ogólne" return subject_match.group(1) if subject_match else "Ogólne"
def extract_key_points(content) -> list[str]: def extract_key_points(content: str) -> list[str]:
"""Extract key points from the main answer section.""" """Extract key points from the main answer section."""
points = [] points = []
@ -51,14 +55,14 @@ def extract_key_points(content) -> list[str]:
headers = re.findall(r"^### (.+)$", answer_text, re.MULTILINE) headers = re.findall(r"^### (.+)$", answer_text, re.MULTILINE)
for h in headers[:6]: for h in headers[:6]:
# Clean header # Clean header
h = re.sub(r"\d+\.\s*", "", h).strip() cleaned = re.sub(r"\d+\.\s*", "", h).strip()
if h and len(h) > 3: if cleaned and len(cleaned) > MIN_HEADER_WORDS:
points.append(h) points.append(cleaned)
return points return points
def extract_definitions(content) -> list[tuple[str, str]]: def extract_definitions(content: str) -> list[tuple[str, str]]:
"""Extract key definitions from the content.""" """Extract key definitions from the content."""
definitions = [] definitions = []
@ -66,9 +70,9 @@ def extract_definitions(content) -> list[tuple[str, str]]:
pattern = r"\*\*([^*\n]+)\*\*\s*[--:]\s*([^*\n]{20,150})" pattern = r"\*\*([^*\n]+)\*\*\s*[--:]\s*([^*\n]{20,150})"
matches = re.findall(pattern, content) matches = re.findall(pattern, content)
for term, definition in matches: for raw_term, raw_def in matches:
term = term.strip() term = raw_term.strip()
definition = definition.strip() definition = raw_def.strip()
# Filter out non-definition patterns # Filter out non-definition patterns
if ( if (
term term
@ -81,7 +85,7 @@ def extract_definitions(content) -> list[tuple[str, str]]:
return definitions[:5] return definitions[:5]
def clean_html(text) -> str: def clean_html(text: str) -> str:
"""Convert markdown to HTML and clean for Anki.""" """Convert markdown to HTML and clean for Anki."""
if not text: if not text:
return "" return ""
@ -101,7 +105,7 @@ def clean_html(text) -> str:
return text.strip() return text.strip()
def process_file(filepath) -> list[dict[str, str]]: def process_file(filepath: str) -> list[dict[str, str]]:
"""Process a single file and return flashcards.""" """Process a single file and return flashcards."""
with Path(filepath).open(encoding="utf-8") as f: with Path(filepath).open(encoding="utf-8") as f:
content = f.read() content = f.read()
@ -111,11 +115,7 @@ def process_file(filepath) -> list[dict[str, str]]:
# Extract metadata # Extract metadata
filename = Path(filepath).name filename = Path(filepath).name
match = re.match(r"(\d+)-(.+)\.md", filename) match = re.match(r"(\d+)-(.+)\.md", filename)
if match: num = match.group(1) if match else "00"
num = match.group(1)
match.group(2).replace("-", "_")
else:
num = "00"
subject = extract_subject(content) subject = extract_subject(content)
main_question = extract_main_question(content, filename) main_question = extract_main_question(content, filename)
@ -156,14 +156,13 @@ def main() -> None:
# Process each file # Process each file
for md_file in sorted(odpowiedzi_dir.glob("*.md")): for md_file in sorted(odpowiedzi_dir.glob("*.md")):
print(f"Processing: {md_file.name}") logger.info("Processing: %s", md_file.name)
try: try:
cards = process_file(md_file) cards = process_file(md_file)
all_cards.extend(cards) all_cards.extend(cards)
print(f" -> {len(cards)} cards") logger.info(" -> %d cards", len(cards))
except Exception as e: except (ValueError, OSError):
print(f" -> Error: {e}") logger.exception(" -> Error processing file")
traceback.print_exc()
# Write Anki-compatible file # Write Anki-compatible file
with Path(output_file).open("w", encoding="utf-8") as f: with Path(output_file).open("w", encoding="utf-8") as f:
@ -186,16 +185,22 @@ def main() -> None:
f.write(f"{front}\t{back}\t{tags}\n") f.write(f"{front}\t{back}\t{tags}\n")
print(f"\n✅ Created {len(all_cards)} flashcards") logger.info("Created %d flashcards", len(all_cards))
print(f"📁 Output: {output_file}") logger.info("Output: %s", output_file)
print("\n=== Import Instructions ===") logger.info("=== Import Instructions ===")
print("1. Open Anki desktop → File → Import") logger.info("1. Open Anki desktop -> File -> Import")
print("2. Select: anki_egzamin_magisterski.txt") logger.info("2. Select: anki_egzamin_magisterski.txt")
print("3. Set 'Fields separated by: Tab'") logger.info("3. Set 'Fields separated by: Tab'")
print("4. Check 'Allow HTML in fields'") logger.info("4. Check 'Allow HTML in fields'")
print("5. Map: Field 1 → Front, Field 2 → Back, Field 3 → Tags") logger.info(
print("6. Click Import") "5. Map: Field 1 -> Front, Field 2 -> Back,"
print("\nFor AnkiWeb/AnkiDroid: Sync after importing on desktop") " Field 3 -> Tags"
)
logger.info("6. Click Import")
logger.info(
"For AnkiWeb/AnkiDroid:"
" Sync after importing on desktop"
)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -3,11 +3,18 @@
from __future__ import annotations from __future__ import annotations
import logging
from pathlib import Path from pathlib import Path
import re import re
logger = logging.getLogger(__name__)
def clean_text(text) -> str: MIN_PARA_LENGTH = 20
MAX_PARA_LENGTH = 400
MIN_BODY_LENGTH = 80
def clean_text(text: str) -> str:
"""Clean text for Anki.""" """Clean text for Anki."""
if not text: if not text:
return "" return ""
@ -19,7 +26,7 @@ def clean_text(text) -> str:
return text.strip() return text.strip()
def extract_real_answer(content, section_name) -> str | None: def extract_real_answer(content: str, section_name: str) -> str | None:
"""Extract actual content from a section, not just headers.""" """Extract actual content from a section, not just headers."""
# Find the section # Find the section
pattern = rf"### (?:\d+\.\s*)?{re.escape(section_name)}\s*\n((?:(?!^### ).)+)" pattern = rf"### (?:\d+\.\s*)?{re.escape(section_name)}\s*\n((?:(?!^### ).)+)"
@ -52,19 +59,21 @@ def extract_real_answer(content, section_name) -> str | None:
for p in body.split("\n\n") for p in body.split("\n\n")
if p.strip() and not p.startswith("```") and not p.startswith("|") if p.strip() and not p.startswith("```") and not p.startswith("|")
] ]
for p in paras[:2]: lines.extend(
if len(p) > 20 and len(p) < 400: p for p in paras[:2]
lines.append(p) if len(p) > MIN_PARA_LENGTH and len(p) < MAX_PARA_LENGTH
)
return "<br>".join(lines[:6]) if lines else None return "<br>".join(lines[:6]) if lines else None
def extract_cards(filepath) -> list[dict[str, str]]: def _read_file_metadata(
"""Extract flashcards from a file.""" filepath: str | Path,
) -> tuple[str, str, str | None]:
"""Read file and extract metadata."""
with Path(filepath).open(encoding="utf-8") as f: with Path(filepath).open(encoding="utf-8") as f:
content = f.read() content = f.read()
cards = []
filename = Path(filepath).name filename = Path(filepath).name
match = re.match(r"(\d+)-(.+)\.md", filename) match = re.match(r"(\d+)-(.+)\.md", filename)
num = match.group(1) if match else "00" num = match.group(1) if match else "00"
@ -73,182 +82,228 @@ def extract_cards(filepath) -> list[dict[str, str]]:
subject = subj_match.group(1) if subj_match else "Ogólne" subject = subj_match.group(1) if subj_match else "Ogólne"
base_tags = f"egzamin_magisterski pyt{num} {subject}" base_tags = f"egzamin_magisterski pyt{num} {subject}"
# Get main question
q_match = re.search( q_match = re.search(
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*',
content,
re.DOTALL,
)
main_question = (
re.sub(r"\s+", " ", q_match.group(1).strip()) if q_match else None
) )
main_question = re.sub(r"\s+", " ", q_match.group(1).strip()) if q_match else None
# =============================================== return content, base_tags, main_question
# MAIN CARD: Question with REAL answer summary
# ===============================================
if main_question:
# Build a real answer from the main sections
answer_parts = []
# For automata question - extract key facts about each automaton
if "automat" in main_question.lower() or "maszyn" in main_question.lower(): def _extract_automata_facts(content: str) -> list[str]:
# FA """Extract automata-specific facts."""
fa_match = re.search( parts: list[str] = []
r"Automat Skończony.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*", automata = [
content, ("Automat Skończony", "FA"),
re.DOTALL, ("Automat ze Stosem", "PDA"),
("Maszyna Turinga", "TM"),
]
for name, abbrev in automata:
pattern = (
rf"{name}.*?Rozpoznawana klasa języków"
r"\s*\n\s*\*\*([^*]+)\*\*"
)
match = re.search(pattern, content, re.DOTALL)
if match:
parts.append(
f"<b>{name} ({abbrev})</b>: "
f"{match.group(1).strip()}"
) )
if fa_match: return parts
answer_parts.append(
f"<b>Automat Skończony (FA)</b>: {fa_match.group(1).strip()}"
)
# PDA
pda_match = re.search(
r"Automat ze Stosem.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*",
content,
re.DOTALL,
)
if pda_match:
answer_parts.append(
f"<b>Automat ze Stosem (PDA)</b>: {pda_match.group(1).strip()}"
)
# TM def _extract_generic_facts(content: str) -> list[str]:
tm_match = re.search( """Extract generic definitions and summaries."""
r"Maszyna Turinga.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*", parts: list[str] = []
content, key_patterns = [
re.DOTALL, r"#### Definicja\s*\n([^\n#]+)",
) r"#### Charakterystyka\s*\n([^\n#]+)",
if tm_match: r"\*\*Definicja[:\s]*\*\*\s*([^\n]+)",
answer_parts.append( ]
f"<b>Maszyna Turinga (TM)</b>: {tm_match.group(1).strip()}" for pattern in key_patterns:
) parts.extend(
found.strip()
for found in re.findall(pattern, content)[:3]
if len(found) > MIN_PARA_LENGTH
)
return parts
# Generic extraction if specific didn't work
if not answer_parts:
# Look for key definitions/summaries
key_patterns = [
r"#### Definicja\s*\n([^\n#]+)",
r"#### Charakterystyka\s*\n([^\n#]+)",
r"\*\*Definicja[:\s]*\*\*\s*([^\n]+)",
]
for pattern in key_patterns:
for match in re.findall(pattern, content)[:3]:
if len(match) > 20:
answer_parts.append(match.strip())
# Still nothing? Get first substantive paragraph from main answer def _extract_first_paragraphs(content: str) -> list[str]:
if not answer_parts: """Extract first substantive paragraphs from main answer."""
main_answer = re.search( main_answer = re.search(
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\Z)", content, re.DOTALL r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\Z)",
) content,
if main_answer: re.DOTALL,
# Skip headers, get actual content )
text = main_answer.group(1) if not main_answer:
paras = re.findall(r"\n\n([^#\n][^\n]{50,300})", text) return []
answer_parts = paras[:3] text = main_answer.group(1)
paras = re.findall(r"\n\n([^#\n][^\n]{50,300})", text)
return paras[:3]
if answer_parts:
answer = "<br><br>".join([clean_text(p) for p in answer_parts]) def _build_main_card(
cards.append( content: str,
{ main_question: str | None,
"front": clean_text(main_question), base_tags: str,
"back": answer, ) -> dict[str, str] | None:
"tags": f"{base_tags} pytanie_glowne", """Build the main question card."""
} if not main_question:
return None
answer_parts: list[str] = []
if (
"automat" in main_question.lower()
or "maszyn" in main_question.lower()
):
answer_parts = _extract_automata_facts(content)
if not answer_parts:
answer_parts = _extract_generic_facts(content)
if not answer_parts:
answer_parts = _extract_first_paragraphs(content)
if not answer_parts:
return None
answer = "<br><br>".join(
clean_text(p) for p in answer_parts
)
return {
"front": clean_text(main_question),
"back": answer,
"tags": f"{base_tags} pytanie_glowne",
}
def _extract_section_content(body: str) -> list[str]:
"""Extract content lines from a section body."""
answer_lines: list[str] = []
def_match = re.search(
r"#### Definicja[^\n]*\n([^\n#]+(?:\n[^\n#]+)?)", body,
)
if def_match:
answer_lines.append(def_match.group(1).strip())
char_match = re.search(
r"#### Charakterystyka\s*\n((?:[-•][^\n]+\n?)+)", body,
)
if char_match:
bullets = re.findall(
r"[-•]\s*\*\*([^*]+)\*\*[:\s]*([^\n]*)",
char_match.group(1),
)
for term, desc in bullets[:4]:
answer_lines.append(
f"• <b>{term}</b>: {desc.strip()}"
if desc
else f"• <b>{term}</b>"
) )
# =============================================== if not answer_lines:
# CONCEPT CARDS: Specific topics with real content bullets = re.findall(
# =============================================== r"[-•]\s*\*\*([^*]+)\*\*[:\s]*([^\n]*)", body,
# Find all ### sections and extract their actual content )
for term, desc in bullets[:5]:
answer_lines.append(
f"• <b>{term}</b>: {desc.strip()}"
if desc
else f"• <b>{term}</b>"
)
if not answer_lines:
first_para = re.search(
r"^([^#\n\-•|`][^\n]{30,250})", body, re.MULTILINE,
)
if first_para:
answer_lines.append(first_para.group(1))
return answer_lines
def _build_concept_cards(
content: str, base_tags: str,
) -> list[dict[str, str]]:
"""Build concept cards from ### sections."""
cards: list[dict[str, str]] = []
sections = re.findall( sections = re.findall(
r"^### (?:\d+\.\s*)?([^\n]+)\n((?:(?!^### ).)*)", r"^### (?:\d+\.\s*)?([^\n]+)\n((?:(?!^### ).)*)",
content, content,
re.MULTILINE | re.DOTALL, re.MULTILINE | re.DOTALL,
) )
for header, body in sections: for raw_header, raw_body in sections:
header = header.strip() header = raw_header.strip()
body = body.strip() body = raw_body.strip()
# Skip short sections, mnemonics, examples
if ( if (
len(body) < 80 len(body) < MIN_BODY_LENGTH
or "Przykład" in header or "Przykład" in header
or "Mnemonic" in header or "Mnemonic" in header
or '"' in header or '"' in header
): ):
continue continue
# Extract real content answer_lines = _extract_section_content(body)
answer_lines = []
# Get definition if present
def_match = re.search(r"#### Definicja[^\n]*\n([^\n#]+(?:\n[^\n#]+)?)", body)
if def_match:
answer_lines.append(def_match.group(1).strip())
# Get characterization
char_match = re.search(r"#### Charakterystyka\s*\n((?:[-•][^\n]+\n?)+)", body)
if char_match:
bullets = re.findall(
r"[-•]\s*\*\*([^*]+)\*\*[:\s]*([^\n]*)", char_match.group(1)
)
for term, desc in bullets[:4]:
answer_lines.append(
f"• <b>{term}</b>: {desc.strip()}" if desc else f"• <b>{term}</b>"
)
# Get bullet points if no structured content yet
if not answer_lines: if not answer_lines:
bullets = re.findall(r"[-•]\s*\*\*([^*]+)\*\*[:\s]*([^\n]*)", body) continue
for term, desc in bullets[:5]:
answer_lines.append(
f"• <b>{term}</b>: {desc.strip()}" if desc else f"• <b>{term}</b>"
)
# Get first paragraph if still nothing question = (
if not answer_lines: header if header.endswith("?") else f"Wyjaśnij: {header}"
first_para = re.search(r"^([^#\n\-•|`][^\n]{30,250})", body, re.MULTILINE) )
if first_para: answer = "<br>".join(
answer_lines.append(first_para.group(1)) clean_text(line) for line in answer_lines
)
cards.append(
{
"front": clean_text(question),
"back": answer,
"tags": f"{base_tags} szczegoly",
}
)
if answer_lines: return cards
question = f"Wyjaśnij: {header}" if not header.endswith("?") else header
answer = "<br>".join([clean_text(l) for l in answer_lines])
cards.append(
{
"front": clean_text(question),
"back": answer,
"tags": f"{base_tags} szczegoly",
}
)
# =============================================== def _build_qa_cards(
# Q&A CARDS: From practice questions section content: str, base_tags: str,
# =============================================== ) -> list[dict[str, str]]:
"""Build Q&A practice cards."""
cards: list[dict[str, str]] = []
qa_matches = re.findall( qa_matches = re.findall(
r'### Q\d+:\s*["\']?([^"\'?\n]+)\?*["\']?\s*\n.*?Odpowiedź:\s*\n(.+?)(?=\n### |\n## |\Z)', r'### Q\d+:\s*["\']?([^"\'?\n]+)\?*["\']?\s*\n'
r".*?Odpowiedź:\s*\n(.+?)(?=\n### |\n## |\Z)",
content, content,
re.DOTALL, re.DOTALL,
) )
for question, answer in qa_matches[:5]: for raw_question, raw_answer in qa_matches[:5]:
question = question.strip() question = raw_question.strip()
answer = answer.strip() answer_text = raw_answer.strip()
# Clean up answer - get first meaningful part answer_lines = answer_text.split("\n")
answer_lines = answer.split("\n") clean_answer = [
clean_answer = [] stripped
for line in answer_lines[:6]: for raw_line in answer_lines[:6]
line = line.strip() if (stripped := raw_line.strip())
if line and not line.startswith("```") and not line.startswith("|"): and not stripped.startswith("```")
clean_answer.append(line) and not stripped.startswith("|")
]
if clean_answer: if clean_answer:
cards.append( cards.append(
{ {
"front": clean_text(question + "?"), "front": clean_text(question + "?"),
"back": "<br>".join([clean_text(l) for l in clean_answer]), "back": "<br>".join(
clean_text(line) for line in clean_answer
),
"tags": f"{base_tags} qa", "tags": f"{base_tags} qa",
} }
) )
@ -256,6 +311,20 @@ def extract_cards(filepath) -> list[dict[str, str]]:
return cards return cards
def extract_cards(filepath: str | Path) -> list[dict[str, str]]:
    """Collect every flashcard that can be built from one file.

    The file is loaded through ``_read_file_metadata``, which yields the
    content, the base tags and the main question. Cards are then gathered
    in a fixed order: the main card (only when the builder returns a truthy
    value), followed by the concept cards and finally the Q&A cards.

    Args:
        filepath: Location of the file to process.

    Returns:
        The cards produced by the three builders, in the order above.
    """
    content, base_tags, main_question = _read_file_metadata(filepath)

    # The main card is optional; a falsy result contributes nothing.
    main_card = _build_main_card(content, main_question, base_tags)
    cards: list[dict[str, str]] = [main_card] if main_card else []

    cards += _build_concept_cards(content, base_tags)
    cards += _build_qa_cards(content, base_tags)
    return cards
def main() -> None: def main() -> None:
"""Main.""" """Main."""
odpowiedzi_dir = Path("/home/kuchy/praca_magisterska/pytania/odpowiedzi") odpowiedzi_dir = Path("/home/kuchy/praca_magisterska/pytania/odpowiedzi")
@ -266,13 +335,13 @@ def main() -> None:
all_cards = [] all_cards = []
for md_file in sorted(odpowiedzi_dir.glob("*.md")): for md_file in sorted(odpowiedzi_dir.glob("*.md")):
print(f"Processing: {md_file.name}", end=" ") logger.info("Processing: %s", md_file.name)
try: try:
cards = extract_cards(md_file) cards = extract_cards(md_file)
all_cards.extend(cards) all_cards.extend(cards)
print(f"{len(cards)} cards") logger.info(" -> %d cards", len(cards))
except Exception as e: except (ValueError, OSError):
print(f"→ ERROR: {e}") logger.exception(" -> Error processing file")
# Remove duplicates # Remove duplicates
seen = set() seen = set()
@ -299,8 +368,12 @@ def main() -> None:
tags = card["tags"] tags = card["tags"]
f.write(f"{front}\t{back}\t{tags}\n") f.write(f"{front}\t{back}\t{tags}\n")
print(f"\n✅ Generated {len(unique_cards)} flashcards") logger.info(
print(f"📁 Output: {output_file}") "Generated %d unique cards from %d total",
len(unique_cards),
len(all_cards),
)
logger.info("Output: %s", output_file)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -7,6 +7,8 @@ Designed for A4 laser printer output (300 DPI, black & white).
from __future__ import annotations from __future__ import annotations
import logging
import matplotlib as mpl import matplotlib as mpl
mpl.use("Agg") mpl.use("Agg")
@ -20,6 +22,8 @@ if TYPE_CHECKING:
from matplotlib.axes import Axes from matplotlib.axes import Axes
from matplotlib.figure import Figure from matplotlib.figure import Figure
logger = logging.getLogger(__name__)
OUTPUT_DIR = str(Path(__file__).resolve().parent / "img") OUTPUT_DIR = str(Path(__file__).resolve().parent / "img")
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True) Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
@ -33,19 +37,35 @@ FIXED_COLOR = "#D0F0D0" # light green-ish gray for fixed
FD_ARROW_COLOR = "#444444" FD_ARROW_COLOR = "#444444"
def _compute_col_widths(
headers: list[str],
rows: list[list[str]],
) -> list[float]:
"""Auto-calculate column widths based on content."""
col_widths: list[float] = []
for c in range(len(headers)):
max_len = len(headers[c])
for r in rows:
if c < len(r):
max_len = max(max_len, len(str(r[c])))
col_widths.append(max(max_len * 0.08 + 0.1, 0.5))
return col_widths
def draw_table( def draw_table(
ax, ax: Axes,
x, x: float,
y, y: float,
title, title: str,
headers, headers: list[str],
rows, rows: list[list[str]],
col_widths=None, *,
highlight_cols=None, col_widths: list[float] | None = None,
highlight_rows=None, highlight_cols: set[int] | None = None,
highlight_cells=None, highlight_rows: set[int] | None = None,
strikethrough_cells=None, highlight_cells: set[tuple[int, int]] | None = None,
title_fontsize=9, strikethrough_cells: set[tuple[int, int]] | None = None,
title_fontsize: int = 9,
) -> tuple[float, float]: ) -> tuple[float, float]:
"""Draw a single table on the axes at position (x, y). """Draw a single table on the axes at position (x, y).
@ -66,18 +86,10 @@ def draw_table(
Returns: Returns:
(width, height) of the drawn table (width, height) of the drawn table
""" """
n_cols = len(headers)
n_rows = len(rows) n_rows = len(rows)
if col_widths is None: if col_widths is None:
# Auto-calculate based on content col_widths = _compute_col_widths(headers, rows)
col_widths = []
for c in range(n_cols):
max_len = len(headers[c])
for r in rows:
if c < len(r):
max_len = max(max_len, len(str(r[c])))
col_widths.append(max(max_len * 0.08 + 0.1, 0.5))
row_height = 0.22 row_height = 0.22
total_width = sum(col_widths) total_width = sum(col_widths)
@ -172,7 +184,10 @@ def draw_table(
return total_width, total_height + 0.25 # extra for title return total_width, total_height + 0.25 # extra for title
def create_figure(width_inches=11.69, height_inches=8.27) -> tuple[Figure, Axes]: def create_figure(
width_inches: float = 11.69,
height_inches: float = 8.27,
) -> tuple[Figure, Axes]:
"""Create A4 landscape figure.""" """Create A4 landscape figure."""
fig, ax = plt.subplots(1, 1, figsize=(width_inches, height_inches), dpi=DPI) fig, ax = plt.subplots(1, 1, figsize=(width_inches, height_inches), dpi=DPI)
ax.set_xlim(0, width_inches) ax.set_xlim(0, width_inches)
@ -182,7 +197,16 @@ def create_figure(width_inches=11.69, height_inches=8.27) -> tuple[Figure, Axes]
return fig, ax return fig, ax
def add_arrow(ax, x1, y1, x2, y2, label="", color="black") -> None: def add_arrow(
ax: Axes,
x1: float,
y1: float,
x2: float,
y2: float,
label: str = "",
*,
color: str = "black",
) -> None:
"""Draw an arrow with optional label.""" """Draw an arrow with optional label."""
ax.annotate( ax.annotate(
"", "",
@ -205,7 +229,15 @@ def add_arrow(ax, x1, y1, x2, y2, label="", color="black") -> None:
def add_label( def add_label(
ax, x, y, text, fontsize=8, color="black", ha="left", style="normal" ax: Axes,
x: float,
y: float,
text: str,
*,
fontsize: int = 8,
color: str = "black",
ha: str = "left",
style: str = "normal",
) -> None: ) -> None:
"""Add a text label.""" """Add a text label."""
ax.text( ax.text(
@ -289,7 +321,10 @@ def draw_0nf() -> None:
ax, ax,
0.8, 0.8,
1.2, 1.2,
"Zaleznosci funkcyjne: StID -> Imie, WydzialID | WydzialID -> NazwaWydzialu", (
"Zaleznosci funkcyjne: StID -> Imie, WydzialID"
" | WydzialID -> NazwaWydzialu"
),
fontsize=8, fontsize=8,
color="#333333", color="#333333",
) )
@ -297,7 +332,10 @@ def draw_0nf() -> None:
ax, ax,
0.8, 0.8,
0.9, 0.9,
" KursID -> NazwaKursu | (StID,KursID) -> Prowadzacy | Prowadzacy -> KursID", (
" KursID -> NazwaKursu | (StID,KursID)"
" -> Prowadzacy | Prowadzacy -> KursID"
),
fontsize=8, fontsize=8,
color="#333333", color="#333333",
) )
@ -309,7 +347,7 @@ def draw_0nf() -> None:
pad_inches=0.2, pad_inches=0.2,
) )
plt.close(fig) plt.close(fig)
print("Generated: nf_0nf_table.png") logger.info("Generated: nf_0nf_table.png")
# ============================================================ # ============================================================
@ -399,7 +437,10 @@ def draw_1nf() -> None:
ax, ax,
0.5, 0.5,
1.5, 1.5,
" Imie, WydzialID, NazwaWydzialu zaleza TYLKO od StID (czesc klucza).", (
" Imie, WydzialID, NazwaWydzialu"
" zaleza TYLKO od StID (czesc klucza)."
),
fontsize=9, fontsize=9,
color="black", color="black",
) )
@ -419,7 +460,7 @@ def draw_1nf() -> None:
pad_inches=0.2, pad_inches=0.2,
) )
plt.close(fig) plt.close(fig)
print("Generated: nf_1nf_tables.png") logger.info("Generated: nf_1nf_tables.png")
# ============================================================ # ============================================================
@ -477,7 +518,10 @@ def draw_2nf() -> None:
ax, ax,
0.3, 0.3,
3.3, 3.3,
"KROK: Rozbito czesc. zaleznosci — atrybuty zalezne od czesci klucza wydzielone.", (
"KROK: Rozbito czesc. zaleznosci"
" — atrybuty zalezne od czesci klucza wydzielone."
),
fontsize=9, fontsize=9,
) )
add_label( add_label(
@ -528,7 +572,7 @@ def draw_2nf() -> None:
pad_inches=0.2, pad_inches=0.2,
) )
plt.close(fig) plt.close(fig)
print("Generated: nf_2nf_tables.png") logger.info("Generated: nf_2nf_tables.png")
# ============================================================ # ============================================================