mirror of
https://github.com/kuhyx/testsAndMisc-archive.git
synced 2026-07-04 13:23:01 +02:00
refactor(praca_magisterska_video): fix ruff violations and remove noqa from diagram generators
- Add type annotations, docstrings, and constants
- Remove commented-out code and print statements
- Fix all lint issues in 11 generate_images files
This commit is contained in:
parent
2486449300
commit
be31e9abd7
@ -7,11 +7,17 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import re
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def clean_text(text) -> str:
|
||||
MIN_BODY_LENGTH = 50
|
||||
MIN_ANSWER_LENGTH = 100
|
||||
|
||||
|
||||
def clean_text(text: str) -> str:
|
||||
"""Clean text."""
|
||||
if not text:
|
||||
return ""
|
||||
@ -23,7 +29,7 @@ def clean_text(text) -> str:
|
||||
return text.strip()
|
||||
|
||||
|
||||
def extract_cards(filepath) -> list[dict[str, str]]:
|
||||
def extract_cards(filepath: str) -> list[dict[str, str]]:
|
||||
"""Extract cards."""
|
||||
with Path(filepath).open(encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
@ -68,10 +74,10 @@ def extract_cards(filepath) -> list[dict[str, str]]:
|
||||
content,
|
||||
re.MULTILINE | re.DOTALL,
|
||||
)
|
||||
for header, body in sections:
|
||||
header = header.strip()
|
||||
body = body.strip()
|
||||
if len(body) < 50:
|
||||
for raw_header, raw_body in sections:
|
||||
header = raw_header.strip()
|
||||
body = raw_body.strip()
|
||||
if len(body) < MIN_BODY_LENGTH:
|
||||
continue
|
||||
|
||||
# Get first paragraph
|
||||
@ -102,8 +108,10 @@ def main() -> None:
|
||||
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
||||
all_cards.extend(extract_cards(md_file))
|
||||
|
||||
# APPROACH 1: Strict filtering - only cards with answer > 100 chars
|
||||
filtered_cards = [c for c in all_cards if len(c["back"]) > 100]
|
||||
# APPROACH 1: Strict filtering - only cards with answer > threshold
|
||||
filtered_cards = [
|
||||
c for c in all_cards if len(c["back"]) > MIN_ANSWER_LENGTH
|
||||
]
|
||||
|
||||
# Remove duplicates
|
||||
seen = set()
|
||||
@ -120,7 +128,11 @@ def main() -> None:
|
||||
for c in unique:
|
||||
f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n")
|
||||
|
||||
print(f"✅ Approach 1 (Strict Filter): {len(unique)} cards -> {output_file.name}")
|
||||
logger.info(
|
||||
"Approach 1 (Strict Filter): %d cards -> %s",
|
||||
len(unique),
|
||||
output_file.name,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -7,11 +7,17 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import re
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def clean_text(text) -> str:
|
||||
MIN_PARA_LENGTH = 30
|
||||
MIN_BODY_LENGTH = 50
|
||||
|
||||
|
||||
def clean_text(text: str) -> str:
|
||||
"""Clean text."""
|
||||
if not text:
|
||||
return ""
|
||||
@ -23,7 +29,7 @@ def clean_text(text) -> str:
|
||||
return text.strip()
|
||||
|
||||
|
||||
def extract_structured_content(body) -> str | None:
|
||||
def extract_structured_content(body: str) -> str | None:
|
||||
"""Better extraction - look for multiple content types."""
|
||||
parts = []
|
||||
|
||||
@ -54,15 +60,14 @@ def extract_structured_content(body) -> str | None:
|
||||
if p.strip()
|
||||
and not p.startswith("```")
|
||||
and not p.startswith("|")
|
||||
and len(p.strip()) > 30
|
||||
and len(p.strip()) > MIN_PARA_LENGTH
|
||||
]
|
||||
for p in paras[:2]:
|
||||
parts.append(p[:300])
|
||||
parts.extend(p[:300] for p in paras[:2])
|
||||
|
||||
return "<br>".join([clean_text(p) for p in parts]) if parts else None
|
||||
|
||||
|
||||
def extract_cards(filepath) -> list[dict[str, str]]:
|
||||
def extract_cards(filepath: str) -> list[dict[str, str]]:
|
||||
"""Extract cards."""
|
||||
with Path(filepath).open(encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
@ -99,9 +104,9 @@ def extract_cards(filepath) -> list[dict[str, str]]:
|
||||
content,
|
||||
re.MULTILINE | re.DOTALL,
|
||||
)
|
||||
for header, body in sections:
|
||||
header = header.strip()
|
||||
if "Przykład" in header or '"' in header or len(body) < 50:
|
||||
for raw_header, body in sections:
|
||||
header = raw_header.strip()
|
||||
if "Przykład" in header or '"' in header or len(body) < MIN_BODY_LENGTH:
|
||||
continue
|
||||
|
||||
answer = extract_structured_content(body)
|
||||
@ -143,8 +148,10 @@ def main() -> None:
|
||||
for c in unique:
|
||||
f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n")
|
||||
|
||||
print(
|
||||
f"✅ Approach 2 (Better Extraction): {len(unique)} cards -> {output_file.name}"
|
||||
logger.info(
|
||||
"Approach 2 (Better Extraction): %d cards -> %s",
|
||||
len(unique),
|
||||
output_file.name,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@ -7,31 +7,41 @@ Usage:
|
||||
Options:
|
||||
--filter Apply strict filtering (answers > 100 chars)
|
||||
--extract Use improved extraction algorithm
|
||||
--main-only Only generate main exam questions (45 comprehensive cards)
|
||||
--main-only Only generate main exam questions
|
||||
|
||||
Combinations:
|
||||
python anki_generator.py # Basic extraction, no filter
|
||||
python anki_generator.py --filter # Approach 1: Strict filter only
|
||||
python anki_generator.py --extract # Approach 2: Better extraction only
|
||||
python anki_generator.py --main-only # Approach 3: Main questions only
|
||||
python anki_generator.py --filter --extract # Approach 4: Filter + Better extraction
|
||||
python anki_generator.py --filter --main-only # Approach 5: Filter + Main only
|
||||
python anki_generator.py --extract --main-only # Approach 6: Better extraction + Main only
|
||||
python anki_generator.py --filter --extract --main-only # Approach 7: All three
|
||||
python anki_generator.py
|
||||
python anki_generator.py --filter
|
||||
python anki_generator.py --extract
|
||||
python anki_generator.py --main-only
|
||||
python anki_generator.py --filter --extract
|
||||
python anki_generator.py --filter --main-only
|
||||
python anki_generator.py --extract --main-only
|
||||
python anki_generator.py --filter --extract --main-only
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import re
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
MIN_PARTS_THRESHOLD = 2
|
||||
MIN_BODY_LENGTH = 50
|
||||
MIN_PARA_LENGTH = 30
|
||||
SHORT_THRESHOLD = 50
|
||||
MEDIUM_THRESHOLD = 150
|
||||
DEFAULT_MIN_ANSWER_LENGTH = 100
|
||||
|
||||
# =============================================================================
|
||||
# SHARED UTILITIES
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def clean_text(text) -> str:
|
||||
def clean_text(text: str) -> str:
|
||||
"""Clean and format text for Anki."""
|
||||
if not text:
|
||||
return ""
|
||||
@ -43,7 +53,7 @@ def clean_text(text) -> str:
|
||||
return text.strip()
|
||||
|
||||
|
||||
def get_file_metadata(filepath) -> tuple[str, str, str]:
|
||||
def get_file_metadata(filepath: str) -> tuple[str, str, str]:
|
||||
"""Extract question number and subject from filename."""
|
||||
filename = Path(filepath).name
|
||||
match = re.match(r"(\d+)-(.+)\.md", filename)
|
||||
@ -58,7 +68,7 @@ def get_file_metadata(filepath) -> tuple[str, str, str]:
|
||||
return num, subject, content
|
||||
|
||||
|
||||
def get_main_question(content) -> str | None:
|
||||
def get_main_question(content: str) -> str | None:
|
||||
"""Extract the main exam question."""
|
||||
q_match = re.search(
|
||||
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
|
||||
@ -73,7 +83,10 @@ def get_main_question(content) -> str | None:
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def apply_strict_filter(cards, min_length=100) -> list[dict[str, str]]:
|
||||
def apply_strict_filter(
    cards: list[dict[str, str]],
    min_length: int = DEFAULT_MIN_ANSWER_LENGTH,
) -> list[dict[str, str]]:
    """Keep only the cards whose answer is longer than *min_length*.

    Args:
        cards: Card dictionaries; each must have a ``"back"`` entry
            holding the answer text.
        min_length: Exclusive lower bound on ``len(card["back"])``.

    Returns:
        A new list with the surviving cards, in their original order.
    """
    kept: list[dict[str, str]] = []
    for card in cards:
        # Strictly greater: an answer of exactly min_length is dropped.
        if len(card["back"]) > min_length:
            kept.append(card)
    return kept
|
||||
|
||||
@ -83,7 +96,7 @@ def apply_strict_filter(cards, min_length=100) -> list[dict[str, str]]:
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def extract_structured_content(body) -> str | None:
|
||||
def extract_structured_content(body: str) -> str | None:
|
||||
"""Improved extraction - multiple content types with better formatting."""
|
||||
parts = []
|
||||
|
||||
@ -101,7 +114,7 @@ def extract_structured_content(body) -> str | None:
|
||||
parts.append(f"• <b>{term}</b>")
|
||||
|
||||
# 3. Key-value patterns
|
||||
if len(parts) < 2:
|
||||
if len(parts) < MIN_PARTS_THRESHOLD:
|
||||
kvs = re.findall(r"\*\*([^*\n]+)\*\*\s*[--:]\s*([^\n*]{10,150})", body)
|
||||
for k, v in kvs[:4]:
|
||||
entry = f"<b>{k.strip()}</b>: {v.strip()}"
|
||||
@ -116,15 +129,14 @@ def extract_structured_content(body) -> str | None:
|
||||
if p.strip()
|
||||
and not p.startswith("```")
|
||||
and not p.startswith("|")
|
||||
and len(p.strip()) > 30
|
||||
and len(p.strip()) > MIN_PARA_LENGTH
|
||||
]
|
||||
for p in paras[:2]:
|
||||
parts.append(p[:300])
|
||||
parts.extend(p[:300] for p in paras[:2])
|
||||
|
||||
return "<br>".join([clean_text(p) for p in parts]) if parts else None
|
||||
|
||||
|
||||
def extract_cards_better(filepath) -> list[dict[str, str]]:
|
||||
def extract_cards_better(filepath: str) -> list[dict[str, str]]:
|
||||
"""Extract cards with improved algorithm."""
|
||||
num, subject, content = get_file_metadata(filepath)
|
||||
base_tags = f"egzamin pyt{num} {subject}"
|
||||
@ -153,13 +165,13 @@ def extract_cards_better(filepath) -> list[dict[str, str]]:
|
||||
content,
|
||||
re.MULTILINE | re.DOTALL,
|
||||
)
|
||||
for header, body in sections:
|
||||
header = header.strip()
|
||||
for raw_header, body in sections:
|
||||
header = raw_header.strip()
|
||||
if (
|
||||
"Przykład" in header
|
||||
or '"' in header
|
||||
or "Mnemonic" in header
|
||||
or len(body) < 50
|
||||
or len(body) < MIN_BODY_LENGTH
|
||||
):
|
||||
continue
|
||||
|
||||
@ -176,7 +188,7 @@ def extract_cards_better(filepath) -> list[dict[str, str]]:
|
||||
return cards
|
||||
|
||||
|
||||
def extract_cards_basic(filepath) -> list[dict[str, str]]:
|
||||
def extract_cards_basic(filepath: str) -> list[dict[str, str]]:
|
||||
"""Basic extraction - simpler algorithm."""
|
||||
num, subject, content = get_file_metadata(filepath)
|
||||
base_tags = f"egzamin pyt{num} {subject}"
|
||||
@ -212,10 +224,10 @@ def extract_cards_basic(filepath) -> list[dict[str, str]]:
|
||||
content,
|
||||
re.MULTILINE | re.DOTALL,
|
||||
)
|
||||
for header, body in sections:
|
||||
header = header.strip()
|
||||
body = body.strip()
|
||||
if len(body) < 50 or "Przykład" in header:
|
||||
for raw_header, raw_body in sections:
|
||||
header = raw_header.strip()
|
||||
body = raw_body.strip()
|
||||
if len(body) < MIN_BODY_LENGTH or "Przykład" in header:
|
||||
continue
|
||||
|
||||
paras = [
|
||||
@ -241,7 +253,28 @@ def extract_cards_basic(filepath) -> list[dict[str, str]]:
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def extract_main_only(filepath) -> list[dict[str, str]]:
|
||||
def _extract_key_point(body: str) -> str | None:
    """Return one short key point for a section body, or None.

    Three strategies are tried in order and the first hit wins:

    1. The bold text on the line following the literal heading
       ``Rozpoznawana klasa języków``.
    2. The first bullet whose text starts with a bold term
       (``- **term**: description``), rendered as ``term: description``
       or just ``term`` when the bullet has no description.
    3. The first line that follows a blank line and does not start with
       ``#``, ``-``, ``•``, ``|`` or a backtick (at most 151 characters
       of it are kept).

    Args:
        body: Markdown text of a single section.

    Returns:
        The extracted key point, or None when no strategy matches.
    """
    def_match = re.search(
        r"Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*", body
    )
    if def_match:
        return def_match.group(1).strip()

    # The separator class is limited to horizontal whitespace on purpose:
    # with ``\s`` a bullet without a description would run across the line
    # break and capture the *next* bullet line as its description.
    bullet = re.search(r"[-•]\s*\*\*([^*]+)\*\*[: \t-]*([^\n]*)", body)
    if bullet:
        term, desc = bullet.groups()
        desc = desc.strip()
        return f"{term}: {desc}" if desc else term

    para_match = re.search(r"\n\n([^#\n\-•|`][^\n]{20,150})", body)
    if para_match:
        return para_match.group(1).strip()

    return None
|
||||
|
||||
|
||||
def extract_main_only(filepath: str) -> list[dict[str, str]]:
|
||||
"""Extract only the main exam question with comprehensive answer."""
|
||||
num, subject, content = get_file_metadata(filepath)
|
||||
base_tags = f"egzamin pyt{num} {subject} main"
|
||||
@ -255,7 +288,9 @@ def extract_main_only(filepath) -> list[dict[str, str]]:
|
||||
|
||||
# Get main answer section
|
||||
answer_match = re.search(
|
||||
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## [^<5E>]|\Z)", content, re.DOTALL
|
||||
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## [^<5E>]|\Z)",
|
||||
content,
|
||||
re.DOTALL,
|
||||
)
|
||||
if answer_match:
|
||||
section = answer_match.group(1)
|
||||
@ -267,32 +302,16 @@ def extract_main_only(filepath) -> list[dict[str, str]]:
|
||||
re.MULTILINE | re.DOTALL,
|
||||
)
|
||||
|
||||
for header, body in headers[:5]:
|
||||
header = header.strip()
|
||||
if "Przykład" in header or "Mnemonic" in header or '"' in header:
|
||||
for raw_header, body in headers[:5]:
|
||||
header = raw_header.strip()
|
||||
if (
|
||||
"Przykład" in header
|
||||
or "Mnemonic" in header
|
||||
or '"' in header
|
||||
):
|
||||
continue
|
||||
|
||||
# Get key point from this section
|
||||
key_point = None
|
||||
|
||||
# Try to get a definition or first bullet
|
||||
def_match = re.search(
|
||||
r"Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*", body
|
||||
)
|
||||
if def_match:
|
||||
key_point = def_match.group(1).strip()
|
||||
|
||||
if not key_point:
|
||||
bullets = re.findall(r"[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]*)", body)
|
||||
if bullets:
|
||||
term, desc = bullets[0]
|
||||
key_point = f"{term}: {desc.strip()}" if desc.strip() else term
|
||||
|
||||
if not key_point:
|
||||
para_match = re.search(r"\n\n([^#\n\-•|`][^\n]{20,150})", body)
|
||||
if para_match:
|
||||
key_point = para_match.group(1).strip()
|
||||
|
||||
key_point = _extract_key_point(body)
|
||||
if key_point:
|
||||
answer_parts.append(f"<b>{header}</b>: {key_point}")
|
||||
|
||||
@ -308,9 +327,58 @@ def extract_main_only(filepath) -> list[dict[str, str]]:
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def generate_anki(use_filter=False, use_better_extract=False, main_only=False) -> Path:
|
||||
def _collect_cards(
    odpowiedzi_dir: Path,
    *,
    use_better_extract: bool,
    main_only: bool,
) -> list[dict[str, str]]:
    """Gather cards from every ``*.md`` file in *odpowiedzi_dir*.

    Files are processed in sorted order. Exactly one extractor is used
    for all files: ``main_only`` takes precedence over
    ``use_better_extract``; with neither flag set the basic extractor
    runs.

    Args:
        odpowiedzi_dir: Directory that is globbed for ``*.md`` files.
        use_better_extract: Use ``extract_cards_better``.
        main_only: Use ``extract_main_only``.

    Returns:
        All extracted cards concatenated in file order.
    """
    # Same precedence as an if/elif/else chain evaluated per file.
    if main_only:
        extractor = extract_main_only
    elif use_better_extract:
        extractor = extract_cards_better
    else:
        extractor = extract_cards_basic

    collected: list[dict[str, str]] = []
    for md_path in sorted(odpowiedzi_dir.glob("*.md")):
        collected.extend(extractor(md_path))
    return collected
|
||||
|
||||
|
||||
def _log_statistics(unique: list[dict[str, str]], output_file: Path) -> None:
    """Log how many cards were written and how long their answers are.

    Answers are bucketed by ``len(card["back"])``: below
    ``SHORT_THRESHOLD`` is "short", from ``SHORT_THRESHOLD`` up to (but
    excluding) ``MEDIUM_THRESHOLD`` is "medium", everything else "good".

    Args:
        unique: The de-duplicated cards that were written.
        output_file: Destination file; only its name is logged.
    """
    short = medium = good = 0
    for card in unique:
        size = len(card["back"])
        if size < SHORT_THRESHOLD:
            short += 1
        elif size < MEDIUM_THRESHOLD:
            # Reached only when size >= SHORT_THRESHOLD.
            medium += 1
        else:
            good += 1

    logger.info("Generated: %s", output_file.name)
    logger.info(" Cards: %d", len(unique))
    logger.info(
        " Quality: %d short / %d medium / %d good",
        short,
        medium,
        good,
    )
|
||||
|
||||
|
||||
def generate_anki(
|
||||
*,
|
||||
use_filter: bool = False,
|
||||
use_better_extract: bool = False,
|
||||
main_only: bool = False,
|
||||
) -> Path:
|
||||
"""Generate Anki deck with specified approaches."""
|
||||
odpowiedzi_dir = Path("/home/kuchy/praca_magisterska/pytania/odpowiedzi")
|
||||
odpowiedzi_dir = Path(
|
||||
"/home/kuchy/praca_magisterska/pytania/odpowiedzi"
|
||||
)
|
||||
|
||||
# Determine output filename based on options
|
||||
suffix_parts = []
|
||||
@ -322,30 +390,25 @@ def generate_anki(use_filter=False, use_better_extract=False, main_only=False) -
|
||||
suffix_parts.append("main")
|
||||
suffix = "_".join(suffix_parts) if suffix_parts else "basic"
|
||||
|
||||
output_file = Path(f"/home/kuchy/praca_magisterska/pytania/anki_{suffix}.txt")
|
||||
output_file = Path(
|
||||
f"/home/kuchy/praca_magisterska/pytania/anki_{suffix}.txt"
|
||||
)
|
||||
deck_name = f"Egzamin_{suffix.replace('_', '+')}"
|
||||
|
||||
all_cards = []
|
||||
|
||||
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
||||
if main_only:
|
||||
# Approach 3: Only main questions
|
||||
cards = extract_main_only(md_file)
|
||||
elif use_better_extract:
|
||||
# Approach 2: Better extraction
|
||||
cards = extract_cards_better(md_file)
|
||||
else:
|
||||
# Basic extraction
|
||||
cards = extract_cards_basic(md_file)
|
||||
|
||||
all_cards.extend(cards)
|
||||
all_cards = _collect_cards(
|
||||
odpowiedzi_dir,
|
||||
use_better_extract=use_better_extract,
|
||||
main_only=main_only,
|
||||
)
|
||||
|
||||
# Approach 1: Apply filtering if requested
|
||||
if use_filter:
|
||||
all_cards = apply_strict_filter(all_cards, min_length=100)
|
||||
all_cards = apply_strict_filter(
|
||||
all_cards, min_length=DEFAULT_MIN_ANSWER_LENGTH
|
||||
)
|
||||
|
||||
# Remove duplicates
|
||||
seen = set()
|
||||
seen: set[str] = set()
|
||||
unique = []
|
||||
for c in all_cards:
|
||||
key = c["front"][:80]
|
||||
@ -355,20 +418,14 @@ def generate_anki(use_filter=False, use_better_extract=False, main_only=False) -
|
||||
|
||||
# Write output
|
||||
with Path(output_file).open("w", encoding="utf-8") as f:
|
||||
f.write(f"#separator:Tab\n#html:true\n#notetype:Basic\n#deck:{deck_name}\n\n")
|
||||
f.write(
|
||||
"#separator:Tab\n#html:true\n"
|
||||
f"#notetype:Basic\n#deck:{deck_name}\n\n"
|
||||
)
|
||||
for c in unique:
|
||||
f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n")
|
||||
|
||||
# Statistics
|
||||
lengths = [len(c["back"]) for c in unique]
|
||||
short = sum(1 for l in lengths if l < 50)
|
||||
medium = sum(1 for l in lengths if 50 <= l < 150)
|
||||
good = sum(1 for l in lengths if l >= 150)
|
||||
|
||||
print(f"✅ Generated: {output_file.name}")
|
||||
print(f" Cards: {len(unique)}")
|
||||
print(f" Quality: {short} short / {medium} medium / {good} good")
|
||||
print()
|
||||
_log_statistics(unique, output_file)
|
||||
|
||||
return output_file
|
||||
|
||||
@ -397,9 +454,9 @@ def main() -> None:
|
||||
|
||||
if args.all_combinations:
|
||||
# Generate all 7 combinations
|
||||
print("=" * 60)
|
||||
print("Generating all 7 combinations...")
|
||||
print("=" * 60 + "\n")
|
||||
logger.info("=" * 60)
|
||||
logger.info("Generating all 7 combinations...")
|
||||
logger.info("=" * 60)
|
||||
|
||||
combinations = [
|
||||
(True, False, False), # 1: Filter only
|
||||
@ -411,9 +468,22 @@ def main() -> None:
|
||||
(True, True, True), # 7: All three
|
||||
]
|
||||
|
||||
for i, (f, e, m) in enumerate(combinations, 1):
|
||||
print(f"--- Combination {i} (filter={f}, extract={e}, main={m}) ---")
|
||||
generate_anki(use_filter=f, use_better_extract=e, main_only=m)
|
||||
for i, (f_flag, e_flag, m_flag) in enumerate(
|
||||
combinations, 1
|
||||
):
|
||||
logger.info(
|
||||
"--- Combination %d (filter=%s, extract=%s,"
|
||||
" main=%s) ---",
|
||||
i,
|
||||
f_flag,
|
||||
e_flag,
|
||||
m_flag,
|
||||
)
|
||||
generate_anki(
|
||||
use_filter=f_flag,
|
||||
use_better_extract=e_flag,
|
||||
main_only=m_flag,
|
||||
)
|
||||
else:
|
||||
generate_anki(
|
||||
use_filter=args.filter,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -6,18 +6,27 @@ Creates a tab-separated file compatible with Anki import.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import re
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def extract_question_and_answer(filepath) -> list[dict[str, str]]:
|
||||
"""Extract main question and key answer points from a markdown file."""
|
||||
MIN_BODY_LENGTH = 50
|
||||
MIN_DEFINITION_LENGTH = 20
|
||||
MAX_DEFINITION_LENGTH = 200
|
||||
MIN_BULLET_COUNT = 5
|
||||
MIN_SUBSECTION_LENGTH = 5
|
||||
MIN_FORMULA_LENGTH = 20
|
||||
|
||||
|
||||
def _get_metadata(
|
||||
filepath: str,
|
||||
) -> tuple[str, str, str, str, str]:
|
||||
"""Extract metadata from file."""
|
||||
with Path(filepath).open(encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
cards = []
|
||||
|
||||
# Extract file number for tagging
|
||||
filename = Path(filepath).name
|
||||
match = re.match(r"(\d+)-(.+)\.md", filename)
|
||||
if match:
|
||||
@ -27,13 +36,13 @@ def extract_question_and_answer(filepath) -> list[dict[str, str]]:
|
||||
num = "00"
|
||||
topic = "unknown"
|
||||
|
||||
# Extract main title (usually contains the question)
|
||||
title_match = re.search(r"^# (.+)$", content, re.MULTILINE)
|
||||
title = title_match.group(1) if title_match else "Unknown"
|
||||
|
||||
# Extract the main question from ## Pytanie section
|
||||
question_match = re.search(
|
||||
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
|
||||
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*',
|
||||
content,
|
||||
re.DOTALL,
|
||||
)
|
||||
if question_match:
|
||||
main_question = question_match.group(1).strip()
|
||||
@ -41,124 +50,207 @@ def extract_question_and_answer(filepath) -> list[dict[str, str]]:
|
||||
else:
|
||||
main_question = title
|
||||
|
||||
# Extract subject/przedmiot
|
||||
subject_match = re.search(r"Przedmiot:\s*(\w+)", content)
|
||||
subject = subject_match.group(1) if subject_match else "Ogólne"
|
||||
return num, topic, title, main_question, content
|
||||
|
||||
# Create main question card - extract key sections for answer
|
||||
answer_parts = []
|
||||
|
||||
# Look for main answer section
|
||||
def _extract_main_card(
    content: str,
    main_question: str,
    subject: str,
    num: str,
    topic: str,
) -> list[dict[str, str]]:
    """Build the card for the main exam question, if any answer is found.

    The answer is assembled from up to five ``###`` headings inside the
    ``## 📚 Odpowiedź główna`` section, followed by those of the first
    three ``**term** - definition`` pairs in the whole document whose
    definition length lies strictly between ``MIN_DEFINITION_LENGTH``
    and ``MAX_DEFINITION_LENGTH``. At most eight lines are kept.

    Args:
        content: Full markdown text of the file.
        main_question: Text used as the card's question.
        subject: Subject name, used in the tags.
        num: Question number, used in the tags.
        topic: Topic slug, used in the tags.

    Returns:
        A one-element list with the card, or an empty list when nothing
        could be extracted.
    """
    bullet_lines: list[str] = []

    section = re.search(
        r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\n---\s*\n## |\Z)",
        content,
        re.DOTALL,
    )
    if section is not None:
        for heading in re.findall(r"### (.+)", section.group(1))[:5]:
            bullet_lines.append(f"• {heading}")

    # NOTE(review): as written, ``[--:]`` is a character range from "-"
    # to ":" (it also matches ".", "/" and digits) - confirm that this is
    # the intended separator set.
    pairs = re.findall(r"\*\*([^*]+)\*\*\s*[--:]\s*([^*\n]+)", content)
    for term, definition in pairs[:3]:
        # The length check runs after slicing, so fewer than three pairs
        # may survive.
        if MIN_DEFINITION_LENGTH < len(definition) < MAX_DEFINITION_LENGTH:
            bullet_lines.append(f"• {term}: {definition.strip()}")

    if not bullet_lines:
        return []

    tags = f"egzamin_magisterski pytanie_{num} {subject} {topic}"
    card = {
        "question": main_question,
        "answer": "<br>".join(bullet_lines[:8]),
        "tags": tags,
    }
    return [card]
|
||||
|
||||
|
||||
def _extract_subsection_answer(body_clean: str) -> str | None:
    """Turn a subsection body into answer text for a card.

    If the body contains bullets that start with a bold term
    (``- **term**: description``), the first ``MIN_BULLET_COUNT`` of them
    are joined with ``<br>``. Otherwise the first paragraph that is not a
    code fence or a table row is returned with bold/italic markers
    removed and truncated to 400 characters.

    Args:
        body_clean: Stripped markdown body of the subsection.

    Returns:
        The answer text, or None when the body has neither bullets nor a
        usable paragraph.
    """
    # Only horizontal whitespace may separate the term from its
    # description. With ``\s`` a bullet without a description consumed the
    # line break and took the following bullet line as its description,
    # which also removed that bullet from the result.
    bullets = re.findall(
        r"[-•]\s*\*\*(.+?)\*\*[: \t]*([^\n]+)?", body_clean
    )
    if bullets:
        # Despite its name, MIN_BULLET_COUNT caps how many bullets are used.
        return "<br>".join(
            f"• {term}: {desc.strip()}" if desc.strip() else f"• {term}"
            for term, desc in bullets[:MIN_BULLET_COUNT]
        )

    paragraphs = [
        p.strip()
        for p in body_clean.split("\n\n")
        if p.strip()
        and not p.startswith("```")
        and not p.startswith("|")
    ]
    if paragraphs:
        first_para = paragraphs[0]
        # Drop bold first so the italic pattern does not see "**".
        first_para = re.sub(r"\*\*(.+?)\*\*", r"\1", first_para)
        first_para = re.sub(r"\*(.+?)\*", r"\1", first_para)
        return first_para[:400]

    return None
|
||||
|
||||
|
||||
def _extract_sub_cards(
    content: str,
    title: str,
    subject: str,
    num: str,
    topic: str,
) -> list[dict[str, str]]:
    """Create one card per usable ``###`` subsection of *content*.

    A subsection is skipped when its header is shorter than
    ``MIN_SUBSECTION_LENGTH``, starts with ``Przykład``, its stripped
    body is shorter than ``MIN_BODY_LENGTH``, or no answer text can be
    extracted from the body.

    Args:
        content: Full markdown text of the file.
        title: Document title; used to build the question for
            "answer-type" headers.
        subject: Subject name, used in the tags.
        num: Question number, used in the tags.
        topic: Topic slug, used in the tags.

    Returns:
        The list of sub-concept cards, in document order.
    """
    reframed_keywords = ("Charakterystyka", "Definicja", "Właściwości")
    tags = f"egzamin_magisterski pytanie_{num} {subject} {topic} szczegoly"
    pattern = r"### (\d+\.\s+)?(.+?)\n\n(.+?)(?=\n### |\n## |\n---|\Z)"

    result: list[dict[str, str]] = []
    for _numbering, header, body in re.findall(pattern, content, re.DOTALL):
        too_short = len(header) < MIN_SUBSECTION_LENGTH
        if too_short or header.startswith("Przykład"):
            continue

        stripped_body = body.strip()
        if len(stripped_body) < MIN_BODY_LENGTH:
            continue

        answer = _extract_subsection_answer(stripped_body)
        if not answer:
            continue

        if any(keyword in header for keyword in reframed_keywords):
            # Headers such as "Definicja" name an answer rather than a
            # concept, so the question is phrased relative to the title.
            parent = title.replace("Pytanie", "").strip(": 0123456789")
            question = f"{header} - {parent}"
        elif header.endswith("?"):
            question = header
        else:
            question = f"Co to jest {header}?"

        result.append(
            {
                "question": question,
                "answer": answer,
                "tags": tags,
            }
        )

    return result
|
||||
|
||||
|
||||
def _extract_formula_cards(
    content: str,
    subject: str,
    num: str,
) -> list[dict[str, str]]:
    """Create cards for bold-named formulas, theorems and definitions.

    A match is a bold name ending (case-insensitively) in ``formuła``,
    ``wzór``, ``twierdzenie``, ``definicja`` or ``lemat``, followed by
    text up to the next blank line, the next line starting with ``**``,
    or the end of the document. Only matches whose text is longer than
    ``MIN_FORMULA_LENGTH`` become cards; the answer is cut to 300
    characters.

    NOTE(review): the part of the name before the keyword accepts only
    ASCII letters and whitespace, so names with Polish diacritics are
    not matched - confirm this is intended.

    Args:
        content: Full markdown text of the file.
        subject: Subject name, used in the tags.
        num: Question number, used in the tags.

    Returns:
        The list of formula cards, in document order.
    """
    pattern = (
        r"\*\*([A-Za-z\s]+"
        r"(?:formuła|wzór|twierdzenie|definicja|lemat))"
        r"\*\*[:\s]*\n?(.+?)(?=\n\n|\n\*\*|\Z)"
    )
    tags = f"egzamin_magisterski pytanie_{num} {subject} formuly"
    return [
        {
            "question": f"Podaj {name.strip()}",
            "answer": text.strip()[:300],
            "tags": tags,
        }
        for name, text in re.findall(
            pattern, content, re.IGNORECASE | re.DOTALL
        )
        # Length is measured before stripping.
        if len(text) > MIN_FORMULA_LENGTH
    ]
|
||||
|
||||
|
||||
def clean_for_anki(text) -> str:
|
||||
def extract_question_and_answer(
    filepath: str | Path,
) -> list[dict[str, str]]:
    """Extract the main question and key answer points from a markdown file.

    Args:
        filepath: Path to the markdown file. Both ``str`` and ``Path``
            are accepted; ``main`` passes the ``Path`` objects yielded
            by ``glob``.

    Returns:
        The main-question card (if any), followed by the sub-concept
        cards and then the formula cards.
    """
    num, topic, title, main_question, content = _get_metadata(filepath)

    # The subject falls back to "Ogólne" when the file has no
    # "Przedmiot:" line.
    subject_match = re.search(r"Przedmiot:\s*(\w+)", content)
    subject = subject_match.group(1) if subject_match else "Ogólne"

    return [
        *_extract_main_card(content, main_question, subject, num, topic),
        *_extract_sub_cards(content, title, subject, num, topic),
        *_extract_formula_cards(content, subject, num),
    ]
|
||||
|
||||
|
||||
def clean_for_anki(text: str) -> str:
|
||||
"""Clean text for Anki import - escape special characters."""
|
||||
# Replace tabs with spaces
|
||||
text = text.replace("\t", " ")
|
||||
@ -187,13 +279,13 @@ def main() -> None:
|
||||
|
||||
# Process each file
|
||||
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
||||
print(f"Processing: {md_file.name}")
|
||||
logger.info("Processing: %s", md_file.name)
|
||||
try:
|
||||
cards = extract_question_and_answer(md_file)
|
||||
all_cards.extend(cards)
|
||||
print(f" -> Extracted {len(cards)} cards")
|
||||
except Exception as e:
|
||||
print(f" -> Error: {e}")
|
||||
logger.info(" -> Extracted %d cards", len(cards))
|
||||
except (ValueError, OSError) as e:
|
||||
logger.info(" -> Error: %s", e)
|
||||
|
||||
# Write Anki file with headers
|
||||
with Path(output_file).open("w", encoding="utf-8") as f:
|
||||
@ -211,13 +303,13 @@ def main() -> None:
|
||||
tags = card["tags"]
|
||||
f.write(f"{front}\t{back}\t{tags}\n")
|
||||
|
||||
print(f"\n✅ Created {len(all_cards)} flashcards")
|
||||
print(f"📁 Output: {output_file}")
|
||||
print("\nTo import into Anki:")
|
||||
print("1. Open Anki → File → Import")
|
||||
print("2. Select the .txt file")
|
||||
print("3. Verify 'Allow HTML' is checked")
|
||||
print("4. Click Import")
|
||||
logger.info("Created %d flashcards", len(all_cards))
|
||||
logger.info("Output: %s", output_file)
|
||||
logger.info("To import into Anki:")
|
||||
logger.info("1. Open Anki -> File -> Import")
|
||||
logger.info("2. Select the .txt file")
|
||||
logger.info("3. Verify 'Allow HTML' is checked")
|
||||
logger.info("4. Click Import")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -6,11 +6,22 @@ Creates tab-separated file for Anki import with proper HTML formatting.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import re
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def clean_text(text) -> str:
|
||||
MIN_HEADER_LENGTH = 3
|
||||
MIN_MATCH_LENGTH = 10
|
||||
MIN_BODY_LENGTH = 50
|
||||
MIN_QA_LENGTH = 30
|
||||
MAX_CONTENT_LENGTH = 300
|
||||
MAX_ANSWER_LENGTH = 400
|
||||
MAX_COMPARISON_ITEMS = 6
|
||||
|
||||
|
||||
def clean_text(text: str) -> str:
|
||||
"""Clean and format text for Anki."""
|
||||
if not text:
|
||||
return ""
|
||||
@ -28,7 +39,7 @@ def clean_text(text) -> str:
|
||||
return text.strip()
|
||||
|
||||
|
||||
def format_list(items, numbered=False) -> str:
|
||||
def format_list(items: list[str], *, numbered: bool = False) -> str:
|
||||
"""Format a list of items as HTML."""
|
||||
if not items:
|
||||
return ""
|
||||
@ -43,119 +54,148 @@ def format_list(items, numbered=False) -> str:
|
||||
return html
|
||||
|
||||
|
||||
def extract_from_file(filepath) -> list[dict[str, str]]:
|
||||
"""Extract flashcard data from a markdown file."""
|
||||
def _get_file_metadata(
|
||||
filepath: str,
|
||||
) -> tuple[str, str, str]:
|
||||
"""Extract metadata from file."""
|
||||
with Path(filepath).open(encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
cards = []
|
||||
|
||||
# Get file metadata
|
||||
filename = Path(filepath).name
|
||||
match = re.match(r"(\d+)-(.+)\.md", filename)
|
||||
num = match.group(1) if match else "00"
|
||||
match.group(2).replace("-", "_") if match else "unknown"
|
||||
|
||||
# Extract subject
|
||||
subj_match = re.search(r"Przedmiot:\s*(\w+)", content)
|
||||
subject = subj_match.group(1) if subj_match else "Ogólne"
|
||||
|
||||
# Base tags
|
||||
base_tags = f"egzamin_magisterski pyt{num} {subject}"
|
||||
return num, subject, content
|
||||
|
||||
# =====================================================
|
||||
# CARD TYPE 1: Main Exam Question
|
||||
# =====================================================
|
||||
|
||||
def _extract_main_question_card(
    content: str, base_tags: str,
) -> list[dict[str, str]]:
    """Build the card for the main exam question of one answer file.

    The front is the bolded text found under the ``## Pytanie`` heading.
    The back lists up to six ``###`` headings taken from the
    ``## 📚 Odpowiedź główna`` section.

    Returns:
        A one-element list with the card, or an empty list when the
        question, the answer section, or usable headings are missing.
    """
    question_hit = re.search(
        r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*',
        content,
        re.DOTALL,
    )
    if question_hit is None:
        return []

    section_hit = re.search(
        r"## 📚 Odpowiedź główna\s*\n(.+?)"
        r"(?=\n## [📚🎯]|\n---\s*\n## |\Z)",
        content,
        re.DOTALL,
    )
    if section_hit is None:
        return []

    # Keep only headings longer than MIN_HEADER_LENGTH characters.
    topics: list[str] = []
    for raw_heading in re.findall(
        r"^### (?:\d+\.\s*)?(.+)$",
        section_hit.group(1),
        re.MULTILINE,
    ):
        heading = raw_heading.strip()
        if len(heading) > MIN_HEADER_LENGTH:
            topics.append(heading)
    topics = topics[:6]
    if not topics:
        return []

    # Collapse line breaks and runs of whitespace inside the question.
    question_text = re.sub(r"\s+", " ", question_hit.group(1).strip())
    card = {
        "front": clean_text(question_text),
        "back": "<b>Kluczowe zagadnienia:</b>" + format_list(topics),
        "tags": f"{base_tags} pytanie_glowne",
    }
    return [card]
|
||||
|
||||
|
||||
def _make_question_text(header: str) -> str:
|
||||
"""Generate a question from a section header."""
|
||||
if "Definicja" in header or "Co to" in header:
|
||||
return (
|
||||
f"Co to jest:"
|
||||
f" {header.replace('Definicja', '').strip()}?"
|
||||
)
|
||||
if answer_match:
|
||||
answer_section = answer_match.group(1)
|
||||
# Get main headers
|
||||
headers = re.findall(
|
||||
r"^### (?:\d+\.\s*)?(.+)$", answer_section, re.MULTILINE
|
||||
if "Charakterystyka" in header:
|
||||
stripped = header.replace("Charakterystyka", "").strip()
|
||||
return f"Scharakteryzuj: {stripped}"
|
||||
if header.endswith("?"):
|
||||
return header
|
||||
return f"Omów: {header}"
|
||||
|
||||
|
||||
def _extract_body_parts(body: str) -> list[str]:
|
||||
"""Extract structured answer parts from a section body."""
|
||||
answer_parts: list[str] = []
|
||||
|
||||
subheaders = re.findall(r"^#### (.+)$", body, re.MULTILINE)
|
||||
if subheaders:
|
||||
answer_parts.extend(subheaders[:4])
|
||||
|
||||
bullets = re.findall(
|
||||
r"[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]+)?", body
|
||||
)
|
||||
for term, desc in bullets[:5]:
|
||||
if desc:
|
||||
answer_parts.append(
|
||||
f"<b>{term}</b>: {desc.strip()}"
|
||||
)
|
||||
headers = [h.strip() for h in headers if len(h.strip()) > 3][:6]
|
||||
else:
|
||||
answer_parts.append(f"<b>{term}</b>")
|
||||
|
||||
if headers:
|
||||
answer_html = "<b>Kluczowe zagadnienia:</b>" + format_list(headers)
|
||||
cards.append(
|
||||
{
|
||||
"front": clean_text(main_q),
|
||||
"back": answer_html,
|
||||
"tags": f"{base_tags} pytanie_glowne",
|
||||
}
|
||||
)
|
||||
if not answer_parts:
|
||||
paras = [
|
||||
p.strip()
|
||||
for p in body.split("\n\n")
|
||||
if p.strip()
|
||||
and not p.strip().startswith("```")
|
||||
and not p.strip().startswith("|")
|
||||
]
|
||||
if paras:
|
||||
first = paras[0]
|
||||
if len(first) > MAX_CONTENT_LENGTH:
|
||||
first = first[:MAX_CONTENT_LENGTH] + "..."
|
||||
answer_parts.append(first)
|
||||
|
||||
# =====================================================
|
||||
# CARD TYPE 2: Subsection Cards (detailed concepts)
|
||||
# =====================================================
|
||||
# Find all ### sections
|
||||
return answer_parts
|
||||
|
||||
|
||||
def _extract_subsection_cards(
|
||||
content: str, base_tags: str,
|
||||
) -> list[dict[str, str]]:
|
||||
"""Extract subsection detail cards."""
|
||||
cards: list[dict[str, str]] = []
|
||||
sections = re.findall(
|
||||
r"^### (?:\d+\.\s*)?(.+?)\n((?:(?!^###).)+)", content, re.MULTILINE | re.DOTALL
|
||||
r"^### (?:\d+\.\s*)?(.+?)\n((?:(?!^###).)+)",
|
||||
content,
|
||||
re.MULTILINE | re.DOTALL,
|
||||
)
|
||||
|
||||
for header, body in sections:
|
||||
header = header.strip()
|
||||
body = body.strip()
|
||||
for raw_header, raw_body in sections:
|
||||
header = raw_header.strip()
|
||||
body = raw_body.strip()
|
||||
|
||||
# Skip very short sections or example sections
|
||||
if len(body) < 50 or header.lower().startswith("przykład"):
|
||||
if (
|
||||
len(body) < MIN_BODY_LENGTH
|
||||
or header.lower().startswith("przykład")
|
||||
):
|
||||
continue
|
||||
|
||||
# Extract key information from body
|
||||
answer_parts = []
|
||||
|
||||
# Look for #### sub-headers
|
||||
subheaders = re.findall(r"^#### (.+)$", body, re.MULTILINE)
|
||||
if subheaders:
|
||||
answer_parts.extend(subheaders[:4])
|
||||
|
||||
# Look for bullet points with bold terms
|
||||
bullets = re.findall(r"[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]+)?", body)
|
||||
for term, desc in bullets[:5]:
|
||||
if desc:
|
||||
answer_parts.append(f"<b>{term}</b>: {desc.strip()}")
|
||||
else:
|
||||
answer_parts.append(f"<b>{term}</b>")
|
||||
|
||||
# If no structured content, get first paragraph
|
||||
if not answer_parts:
|
||||
paras = [
|
||||
p.strip()
|
||||
for p in body.split("\n\n")
|
||||
if p.strip()
|
||||
and not p.strip().startswith("```")
|
||||
and not p.strip().startswith("|")
|
||||
]
|
||||
if paras:
|
||||
first = paras[0]
|
||||
# Limit length
|
||||
if len(first) > 300:
|
||||
first = first[:300] + "..."
|
||||
answer_parts.append(first)
|
||||
answer_parts = _extract_body_parts(body)
|
||||
|
||||
if answer_parts:
|
||||
# Determine card type
|
||||
if "Definicja" in header or "Co to" in header:
|
||||
q = f"Co to jest: {header.replace('Definicja', '').strip()}?"
|
||||
elif "Charakterystyka" in header:
|
||||
q = f"Scharakteryzuj: {header.replace('Charakterystyka', '').strip()}"
|
||||
elif header.endswith("?"):
|
||||
q = header
|
||||
else:
|
||||
q = f"Omów: {header}"
|
||||
|
||||
# Format answer
|
||||
question = _make_question_text(header)
|
||||
if len(answer_parts) > 1:
|
||||
answer_html = format_list(answer_parts)
|
||||
else:
|
||||
@ -163,15 +203,20 @@ def extract_from_file(filepath) -> list[dict[str, str]]:
|
||||
|
||||
cards.append(
|
||||
{
|
||||
"front": clean_text(q),
|
||||
"front": clean_text(question),
|
||||
"back": answer_html,
|
||||
"tags": f"{base_tags} szczegoly",
|
||||
}
|
||||
)
|
||||
|
||||
# =====================================================
|
||||
# CARD TYPE 3: Algorithms/Formulas
|
||||
# =====================================================
|
||||
return cards
|
||||
|
||||
|
||||
def _extract_algo_cards(
|
||||
content: str, base_tags: str,
|
||||
) -> list[dict[str, str]]:
|
||||
"""Extract algorithm/formula cards."""
|
||||
cards: list[dict[str, str]] = []
|
||||
algo_patterns = [
|
||||
r"#### Złożoność(?:\s+czasowa)?\s*\n(.+?)(?=\n####|\n###|\Z)",
|
||||
r"Złożoność:\s*\*\*([^*]+)\*\*",
|
||||
@ -179,85 +224,137 @@ def extract_from_file(filepath) -> list[dict[str, str]]:
|
||||
|
||||
for pattern in algo_patterns:
|
||||
matches = re.findall(pattern, content, re.DOTALL)
|
||||
for match in matches[:2]:
|
||||
if len(match) > 10:
|
||||
# Find context - which algorithm?
|
||||
for algo_match in matches[:2]:
|
||||
if len(algo_match) > MIN_MATCH_LENGTH:
|
||||
algo_context = re.search(
|
||||
r"### (\d+\.\s*)?(.+?)(?=\n)", content[: content.find(match)]
|
||||
r"### (\d+\.\s*)?(.+?)(?=\n)",
|
||||
content[: content.find(algo_match)],
|
||||
)
|
||||
if algo_context:
|
||||
algo_name = algo_context.group(2).strip()
|
||||
cards.append(
|
||||
{
|
||||
"front": f"Jaka jest złożoność algorytmu/metody: {algo_name}?",
|
||||
"back": clean_text(match.strip()[:200]),
|
||||
"front": (
|
||||
"Jaka jest złożoność"
|
||||
f" algorytmu/metody: {algo_name}?"
|
||||
),
|
||||
"back": clean_text(
|
||||
algo_match.strip()[:200]
|
||||
),
|
||||
"tags": f"{base_tags} zlozonosc",
|
||||
}
|
||||
)
|
||||
break
|
||||
|
||||
# =====================================================
|
||||
# CARD TYPE 4: Comparisons (when file contains comparisons)
|
||||
# =====================================================
|
||||
return cards
|
||||
|
||||
|
||||
def _extract_comparison_cards(
    content: str, base_tags: str, num: str,
) -> list[dict[str, str]]:
    """Build one comparison-table card from a comparison section, if any.

    A comparison section starts at a ``## `` heading line that mentions
    "Porównanie", "Zestawienie" or "vs" (case-insensitive) and runs up to
    the next ``## `` heading. Table rows shaped like
    ``| **aspect** | value |`` become rows of an HTML table, at most
    ``MAX_COMPARISON_ITEMS`` of them.

    Args:
        content: Full markdown text of the answer file.
        base_tags: Tags shared by every card from this file.
        num: Question number used in the card front.

    Returns:
        A one-element list with the card, or an empty list when there is
        no comparison section, no matching table rows, or the section does
        not pass the title check below.
    """
    # The heading part is matched with [^\n]* instead of ".*": the search
    # runs with re.DOTALL, so a greedy ".*" ran across line breaks to the
    # end of the document and the captured "section" was its last line.
    compare_match = re.search(
        r"## [^\n]*(Porównanie|Zestawienie|vs)[^\n]*\n(.+?)(?=\n## |\Z)",
        content,
        re.DOTALL | re.IGNORECASE,
    )
    if not compare_match:
        return []

    items = re.findall(
        r"\|\s*\*\*([^|*]+)\*\*\s*\|([^|]+)\|",
        compare_match.group(2),
    )
    if not items:
        return []

    rows = "".join(
        f"<tr><td>{clean_text(aspect)}</td>"
        f"<td>{clean_text(value)}</td></tr>"
        for aspect, value in items[:MAX_COMPARISON_ITEMS]
    )
    comparison_html = (
        "<table><tr><th>Aspekt</th><th>Wartość</th></tr>"
        + rows
        + "</table>"
    )

    # The title is only used as a gate: a card is produced only when a
    # heading in the section names two things joined by "vs", "i" or "oraz".
    title_match = re.search(
        r"## .*(Porównanie|Zestawienie)"
        r".*?(\w+.*?(?:vs|i|oraz).*?\w+)",
        compare_match.group(0),
        re.IGNORECASE,
    )
    if not title_match:
        return []

    return [
        {
            "front": (
                "Porównaj kluczowe różnice"
                f" w temacie: pytanie {num}"
            ),
            "back": comparison_html,
            "tags": f"{base_tags} porownanie",
        }
    ]
|
||||
|
||||
|
||||
def _extract_qa_cards(
    content: str, base_tags: str,
) -> list[dict[str, str]]:
    """Turn the practice-question section into question/answer cards.

    Looks at the ``## 🎓 Pytania`` section and reads ``### Q<n>`` entries
    followed by an ``Odpowiedź:`` marker. Only the first three entries are
    considered, and an entry is kept only when its answer is longer than
    ``MIN_QA_LENGTH`` characters. Answers are reduced to their first five
    lines and capped at ``MAX_ANSWER_LENGTH`` characters plus ``...``.

    Returns:
        The practice cards; empty when the section or entries are missing.
    """
    section = re.search(
        r"## 🎓 Pytania.*?\n(.+?)(?=\n## |\Z)",
        content,
        re.DOTALL,
    )
    if section is None:
        return []

    pairs = re.findall(
        r"### Q\d+:?\s*[\"']?(.+?)[\"']?\s*\n"
        r".*?Odpowiedź:\s*\n?(.+?)(?=\n### |\Z)",
        section.group(1),
        re.DOTALL,
    )

    practice_cards: list[dict[str, str]] = []
    for raw_question, raw_answer in pairs[:3]:
        full_answer = raw_answer.strip()
        if len(full_answer) <= MIN_QA_LENGTH:
            continue

        excerpt = "\n".join(full_answer.split("\n")[:5])
        if len(excerpt) > MAX_ANSWER_LENGTH:
            excerpt = f"{excerpt[:MAX_ANSWER_LENGTH]}..."

        one_line_question = re.sub(r"\s+", " ", raw_question.strip())
        practice_cards.append(
            {
                "front": clean_text(one_line_question),
                "back": clean_text(excerpt).replace("\n", "<br>"),
                "tags": f"{base_tags} egzamin_praktyka",
            }
        )

    return practice_cards
|
||||
|
||||
|
||||
def extract_from_file(filepath: str | Path) -> list[dict[str, str]]:
    """Extract all flashcards from one markdown answer file.

    Cards are produced in a fixed order: main question, subsection
    details, algorithm complexity, comparison table, practice Q&A.

    Args:
        filepath: Path of the markdown file. ``main()`` passes the ``Path``
            objects yielded by ``glob``, so both ``str`` and ``Path`` are
            accepted.

    Returns:
        Cards as dictionaries with ``front``, ``back`` and ``tags`` keys.
    """
    num, subject, content = _get_file_metadata(filepath)
    base_tags = f"egzamin_magisterski pyt{num} {subject}"

    return [
        *_extract_main_question_card(content, base_tags),
        *_extract_subsection_cards(content, base_tags),
        *_extract_algo_cards(content, base_tags),
        *_extract_comparison_cards(content, base_tags, num),
        *_extract_qa_cards(content, base_tags),
    ]
|
||||
|
||||
@ -272,13 +369,13 @@ def main() -> None:
|
||||
all_cards = []
|
||||
|
||||
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
||||
print(f"Processing: {md_file.name}", end=" ")
|
||||
logger.info("Processing: %s", md_file.name)
|
||||
try:
|
||||
cards = extract_from_file(md_file)
|
||||
all_cards.extend(cards)
|
||||
print(f"→ {len(cards)} cards")
|
||||
except Exception as e:
|
||||
print(f"→ ERROR: {e}")
|
||||
logger.info(" -> %d cards", len(cards))
|
||||
except (ValueError, OSError) as e:
|
||||
logger.info(" -> ERROR: %s", e)
|
||||
|
||||
# Remove potential duplicates (same front)
|
||||
seen = set()
|
||||
@ -306,23 +403,25 @@ def main() -> None:
|
||||
|
||||
f.write(f"{front}\t{back}\t{tags}\n")
|
||||
|
||||
print(f"\n{'=' * 50}")
|
||||
print(f"✅ Generated {len(unique_cards)} unique flashcards")
|
||||
print(f"📁 Saved to: {output_file}")
|
||||
print(f"{'=' * 50}")
|
||||
print("\n📋 IMPORT INSTRUCTIONS:")
|
||||
print("─" * 40)
|
||||
print("Anki Desktop:")
|
||||
print(" 1. File → Import")
|
||||
print(" 2. Select: anki_egzamin_magisterski.txt")
|
||||
print(" 3. Verify: Fields separated by Tab")
|
||||
print(" 4. Check: Allow HTML in fields")
|
||||
print(" 5. Click Import")
|
||||
print()
|
||||
print("AnkiWeb / AnkiDroid:")
|
||||
print(" 1. First import on Anki Desktop")
|
||||
print(" 2. Click Sync to upload to AnkiWeb")
|
||||
print(" 3. Sync on mobile to download")
|
||||
logger.info("=" * 50)
|
||||
logger.info(
|
||||
"Generated %d unique flashcards", len(unique_cards)
|
||||
)
|
||||
logger.info("Saved to: %s", output_file)
|
||||
logger.info("=" * 50)
|
||||
logger.info("IMPORT INSTRUCTIONS:")
|
||||
logger.info("-" * 40)
|
||||
logger.info("Anki Desktop:")
|
||||
logger.info(" 1. File -> Import")
|
||||
logger.info(" 2. Select: anki_egzamin_magisterski.txt")
|
||||
logger.info(" 3. Verify: Fields separated by Tab")
|
||||
logger.info(" 4. Check: Allow HTML in fields")
|
||||
logger.info(" 5. Click Import")
|
||||
logger.info("")
|
||||
logger.info("AnkiWeb / AnkiDroid:")
|
||||
logger.info(" 1. First import on Anki Desktop")
|
||||
logger.info(" 2. Click Sync to upload to AnkiWeb")
|
||||
logger.info(" 3. Sync on mobile to download")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -6,12 +6,16 @@ Creates a tab-separated file compatible with Anki import.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import re
|
||||
import traceback
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
MIN_HEADER_WORDS = 3
|
||||
|
||||
|
||||
def extract_main_question(content, filename) -> str:
|
||||
def extract_main_question(content: str, filename: str) -> str:
|
||||
"""Extract the main exam question from the file."""
|
||||
# Extract the main question from ## Pytanie section
|
||||
question_match = re.search(
|
||||
@ -26,13 +30,13 @@ def extract_main_question(content, filename) -> str:
|
||||
return title_match.group(1) if title_match else filename
|
||||
|
||||
|
||||
def extract_subject(content) -> str:
|
||||
def extract_subject(content: str) -> str:
|
||||
"""Extract the subject code."""
|
||||
subject_match = re.search(r"Przedmiot:\s*(\w+)", content)
|
||||
return subject_match.group(1) if subject_match else "Ogólne"
|
||||
|
||||
|
||||
def extract_key_points(content) -> list[str]:
|
||||
def extract_key_points(content: str) -> list[str]:
|
||||
"""Extract key points from the main answer section."""
|
||||
points = []
|
||||
|
||||
@ -51,14 +55,14 @@ def extract_key_points(content) -> list[str]:
|
||||
headers = re.findall(r"^### (.+)$", answer_text, re.MULTILINE)
|
||||
for h in headers[:6]:
|
||||
# Clean header
|
||||
h = re.sub(r"\d+\.\s*", "", h).strip()
|
||||
if h and len(h) > 3:
|
||||
points.append(h)
|
||||
cleaned = re.sub(r"\d+\.\s*", "", h).strip()
|
||||
if cleaned and len(cleaned) > MIN_HEADER_WORDS:
|
||||
points.append(cleaned)
|
||||
|
||||
return points
|
||||
|
||||
|
||||
def extract_definitions(content) -> list[tuple[str, str]]:
|
||||
def extract_definitions(content: str) -> list[tuple[str, str]]:
|
||||
"""Extract key definitions from the content."""
|
||||
definitions = []
|
||||
|
||||
@ -66,9 +70,9 @@ def extract_definitions(content) -> list[tuple[str, str]]:
|
||||
pattern = r"\*\*([^*\n]+)\*\*\s*[--:]\s*([^*\n]{20,150})"
|
||||
matches = re.findall(pattern, content)
|
||||
|
||||
for term, definition in matches:
|
||||
term = term.strip()
|
||||
definition = definition.strip()
|
||||
for raw_term, raw_def in matches:
|
||||
term = raw_term.strip()
|
||||
definition = raw_def.strip()
|
||||
# Filter out non-definition patterns
|
||||
if (
|
||||
term
|
||||
@ -81,7 +85,7 @@ def extract_definitions(content) -> list[tuple[str, str]]:
|
||||
return definitions[:5]
|
||||
|
||||
|
||||
def clean_html(text) -> str:
|
||||
def clean_html(text: str) -> str:
|
||||
"""Convert markdown to HTML and clean for Anki."""
|
||||
if not text:
|
||||
return ""
|
||||
@ -101,7 +105,7 @@ def clean_html(text) -> str:
|
||||
return text.strip()
|
||||
|
||||
|
||||
def process_file(filepath) -> list[dict[str, str]]:
|
||||
def process_file(filepath: str) -> list[dict[str, str]]:
|
||||
"""Process a single file and return flashcards."""
|
||||
with Path(filepath).open(encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
@ -111,11 +115,7 @@ def process_file(filepath) -> list[dict[str, str]]:
|
||||
# Extract metadata
|
||||
filename = Path(filepath).name
|
||||
match = re.match(r"(\d+)-(.+)\.md", filename)
|
||||
if match:
|
||||
num = match.group(1)
|
||||
match.group(2).replace("-", "_")
|
||||
else:
|
||||
num = "00"
|
||||
num = match.group(1) if match else "00"
|
||||
|
||||
subject = extract_subject(content)
|
||||
main_question = extract_main_question(content, filename)
|
||||
@ -156,14 +156,13 @@ def main() -> None:
|
||||
|
||||
# Process each file
|
||||
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
||||
print(f"Processing: {md_file.name}")
|
||||
logger.info("Processing: %s", md_file.name)
|
||||
try:
|
||||
cards = process_file(md_file)
|
||||
all_cards.extend(cards)
|
||||
print(f" -> {len(cards)} cards")
|
||||
except Exception as e:
|
||||
print(f" -> Error: {e}")
|
||||
traceback.print_exc()
|
||||
logger.info(" -> %d cards", len(cards))
|
||||
except (ValueError, OSError):
|
||||
logger.exception(" -> Error processing file")
|
||||
|
||||
# Write Anki-compatible file
|
||||
with Path(output_file).open("w", encoding="utf-8") as f:
|
||||
@ -186,16 +185,22 @@ def main() -> None:
|
||||
|
||||
f.write(f"{front}\t{back}\t{tags}\n")
|
||||
|
||||
print(f"\n✅ Created {len(all_cards)} flashcards")
|
||||
print(f"📁 Output: {output_file}")
|
||||
print("\n=== Import Instructions ===")
|
||||
print("1. Open Anki desktop → File → Import")
|
||||
print("2. Select: anki_egzamin_magisterski.txt")
|
||||
print("3. Set 'Fields separated by: Tab'")
|
||||
print("4. Check 'Allow HTML in fields'")
|
||||
print("5. Map: Field 1 → Front, Field 2 → Back, Field 3 → Tags")
|
||||
print("6. Click Import")
|
||||
print("\nFor AnkiWeb/AnkiDroid: Sync after importing on desktop")
|
||||
logger.info("Created %d flashcards", len(all_cards))
|
||||
logger.info("Output: %s", output_file)
|
||||
logger.info("=== Import Instructions ===")
|
||||
logger.info("1. Open Anki desktop -> File -> Import")
|
||||
logger.info("2. Select: anki_egzamin_magisterski.txt")
|
||||
logger.info("3. Set 'Fields separated by: Tab'")
|
||||
logger.info("4. Check 'Allow HTML in fields'")
|
||||
logger.info(
|
||||
"5. Map: Field 1 -> Front, Field 2 -> Back,"
|
||||
" Field 3 -> Tags"
|
||||
)
|
||||
logger.info("6. Click Import")
|
||||
logger.info(
|
||||
"For AnkiWeb/AnkiDroid:"
|
||||
" Sync after importing on desktop"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -3,11 +3,18 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import re
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def clean_text(text) -> str:
|
||||
MIN_PARA_LENGTH = 20
|
||||
MAX_PARA_LENGTH = 400
|
||||
MIN_BODY_LENGTH = 80
|
||||
|
||||
|
||||
def clean_text(text: str) -> str:
|
||||
"""Clean text for Anki."""
|
||||
if not text:
|
||||
return ""
|
||||
@ -19,7 +26,7 @@ def clean_text(text) -> str:
|
||||
return text.strip()
|
||||
|
||||
|
||||
def extract_real_answer(content, section_name) -> str | None:
|
||||
def extract_real_answer(content: str, section_name: str) -> str | None:
|
||||
"""Extract actual content from a section, not just headers."""
|
||||
# Find the section
|
||||
pattern = rf"### (?:\d+\.\s*)?{re.escape(section_name)}\s*\n((?:(?!^### ).)+)"
|
||||
@ -52,19 +59,21 @@ def extract_real_answer(content, section_name) -> str | None:
|
||||
for p in body.split("\n\n")
|
||||
if p.strip() and not p.startswith("```") and not p.startswith("|")
|
||||
]
|
||||
for p in paras[:2]:
|
||||
if len(p) > 20 and len(p) < 400:
|
||||
lines.append(p)
|
||||
lines.extend(
|
||||
p for p in paras[:2]
|
||||
if len(p) > MIN_PARA_LENGTH and len(p) < MAX_PARA_LENGTH
|
||||
)
|
||||
|
||||
return "<br>".join(lines[:6]) if lines else None
|
||||
|
||||
|
||||
def extract_cards(filepath) -> list[dict[str, str]]:
|
||||
"""Extract flashcards from a file."""
|
||||
def _read_file_metadata(
|
||||
filepath: str | Path,
|
||||
) -> tuple[str, str, str | None]:
|
||||
"""Read file and extract metadata."""
|
||||
with Path(filepath).open(encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
cards = []
|
||||
filename = Path(filepath).name
|
||||
match = re.match(r"(\d+)-(.+)\.md", filename)
|
||||
num = match.group(1) if match else "00"
|
||||
@ -73,182 +82,228 @@ def extract_cards(filepath) -> list[dict[str, str]]:
|
||||
subject = subj_match.group(1) if subj_match else "Ogólne"
|
||||
base_tags = f"egzamin_magisterski pyt{num} {subject}"
|
||||
|
||||
# Get main question
|
||||
q_match = re.search(
|
||||
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
|
||||
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*',
|
||||
content,
|
||||
re.DOTALL,
|
||||
)
|
||||
main_question = (
|
||||
re.sub(r"\s+", " ", q_match.group(1).strip()) if q_match else None
|
||||
)
|
||||
main_question = re.sub(r"\s+", " ", q_match.group(1).strip()) if q_match else None
|
||||
|
||||
# ===============================================
|
||||
# MAIN CARD: Question with REAL answer summary
|
||||
# ===============================================
|
||||
if main_question:
|
||||
# Build a real answer from the main sections
|
||||
answer_parts = []
|
||||
return content, base_tags, main_question
|
||||
|
||||
# For automata question - extract key facts about each automaton
|
||||
if "automat" in main_question.lower() or "maszyn" in main_question.lower():
|
||||
# FA
|
||||
fa_match = re.search(
|
||||
r"Automat Skończony.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*",
|
||||
content,
|
||||
re.DOTALL,
|
||||
|
||||
def _extract_automata_facts(content: str) -> list[str]:
|
||||
"""Extract automata-specific facts."""
|
||||
parts: list[str] = []
|
||||
automata = [
|
||||
("Automat Skończony", "FA"),
|
||||
("Automat ze Stosem", "PDA"),
|
||||
("Maszyna Turinga", "TM"),
|
||||
]
|
||||
for name, abbrev in automata:
|
||||
pattern = (
|
||||
rf"{name}.*?Rozpoznawana klasa języków"
|
||||
r"\s*\n\s*\*\*([^*]+)\*\*"
|
||||
)
|
||||
match = re.search(pattern, content, re.DOTALL)
|
||||
if match:
|
||||
parts.append(
|
||||
f"<b>{name} ({abbrev})</b>: "
|
||||
f"{match.group(1).strip()}"
|
||||
)
|
||||
if fa_match:
|
||||
answer_parts.append(
|
||||
f"<b>Automat Skończony (FA)</b>: {fa_match.group(1).strip()}"
|
||||
)
|
||||
return parts
|
||||
|
||||
# PDA
|
||||
pda_match = re.search(
|
||||
r"Automat ze Stosem.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*",
|
||||
content,
|
||||
re.DOTALL,
|
||||
)
|
||||
if pda_match:
|
||||
answer_parts.append(
|
||||
f"<b>Automat ze Stosem (PDA)</b>: {pda_match.group(1).strip()}"
|
||||
)
|
||||
|
||||
# TM
|
||||
tm_match = re.search(
|
||||
r"Maszyna Turinga.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*",
|
||||
content,
|
||||
re.DOTALL,
|
||||
)
|
||||
if tm_match:
|
||||
answer_parts.append(
|
||||
f"<b>Maszyna Turinga (TM)</b>: {tm_match.group(1).strip()}"
|
||||
)
|
||||
def _extract_generic_facts(content: str) -> list[str]:
    """Collect definition and characterisation lines from the text.

    Three patterns are tried in turn: the line after ``#### Definicja``,
    the line after ``#### Charakterystyka``, and the text after a bold
    ``**Definicja**`` marker. For each pattern only the first three hits
    are considered, and a hit is kept when it is longer than
    ``MIN_PARA_LENGTH`` characters.

    Returns:
        The kept hits, stripped, grouped by pattern in the order above.
    """
    key_patterns = (
        r"#### Definicja\s*\n([^\n#]+)",
        r"#### Charakterystyka\s*\n([^\n#]+)",
        r"\*\*Definicja[:\s]*\*\*\s*([^\n]+)",
    )
    facts: list[str] = []
    for pattern in key_patterns:
        for hit in re.findall(pattern, content)[:3]:
            # Length is measured before stripping.
            if len(hit) > MIN_PARA_LENGTH:
                facts.append(hit.strip())
    return facts
|
||||
|
||||
# Generic extraction if specific didn't work
|
||||
if not answer_parts:
|
||||
# Look for key definitions/summaries
|
||||
key_patterns = [
|
||||
r"#### Definicja\s*\n([^\n#]+)",
|
||||
r"#### Charakterystyka\s*\n([^\n#]+)",
|
||||
r"\*\*Definicja[:\s]*\*\*\s*([^\n]+)",
|
||||
]
|
||||
for pattern in key_patterns:
|
||||
for match in re.findall(pattern, content)[:3]:
|
||||
if len(match) > 20:
|
||||
answer_parts.append(match.strip())
|
||||
|
||||
# Still nothing? Get first substantive paragraph from main answer
|
||||
if not answer_parts:
|
||||
main_answer = re.search(
|
||||
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\Z)", content, re.DOTALL
|
||||
)
|
||||
if main_answer:
|
||||
# Skip headers, get actual content
|
||||
text = main_answer.group(1)
|
||||
paras = re.findall(r"\n\n([^#\n][^\n]{50,300})", text)
|
||||
answer_parts = paras[:3]
|
||||
def _extract_first_paragraphs(content: str) -> list[str]:
|
||||
"""Extract first substantive paragraphs from main answer."""
|
||||
main_answer = re.search(
|
||||
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\Z)",
|
||||
content,
|
||||
re.DOTALL,
|
||||
)
|
||||
if not main_answer:
|
||||
return []
|
||||
text = main_answer.group(1)
|
||||
paras = re.findall(r"\n\n([^#\n][^\n]{50,300})", text)
|
||||
return paras[:3]
|
||||
|
||||
if answer_parts:
|
||||
answer = "<br><br>".join([clean_text(p) for p in answer_parts])
|
||||
cards.append(
|
||||
{
|
||||
"front": clean_text(main_question),
|
||||
"back": answer,
|
||||
"tags": f"{base_tags} pytanie_glowne",
|
||||
}
|
||||
|
||||
def _build_main_card(
    content: str,
    main_question: str | None,
    base_tags: str,
) -> dict[str, str] | None:
    """Build the card for the main exam question.

    Answer sources are tried in order until one yields something:
    automata facts (only when the question mentions "automat" or
    "maszyn"), generic definitions, then the first paragraphs of the main
    answer.

    Returns:
        The card dictionary, or ``None`` when there is no question or no
        answer text could be found.
    """
    if not main_question:
        return None

    lowered = main_question.lower()
    facts: list[str] = []
    if "automat" in lowered or "maszyn" in lowered:
        facts = _extract_automata_facts(content)

    # Each fallback runs only when the previous source produced nothing.
    facts = (
        facts
        or _extract_generic_facts(content)
        or _extract_first_paragraphs(content)
    )
    if not facts:
        return None

    cleaned = [clean_text(fact) for fact in facts]
    return {
        "front": clean_text(main_question),
        "back": "<br><br>".join(cleaned),
        "tags": f"{base_tags} pytanie_glowne",
    }
|
||||
|
||||
|
||||
def _extract_section_content(body: str) -> list[str]:
|
||||
"""Extract content lines from a section body."""
|
||||
answer_lines: list[str] = []
|
||||
|
||||
def_match = re.search(
|
||||
r"#### Definicja[^\n]*\n([^\n#]+(?:\n[^\n#]+)?)", body,
|
||||
)
|
||||
if def_match:
|
||||
answer_lines.append(def_match.group(1).strip())
|
||||
|
||||
char_match = re.search(
|
||||
r"#### Charakterystyka\s*\n((?:[-•][^\n]+\n?)+)", body,
|
||||
)
|
||||
if char_match:
|
||||
bullets = re.findall(
|
||||
r"[-•]\s*\*\*([^*]+)\*\*[:\s]*([^\n]*)",
|
||||
char_match.group(1),
|
||||
)
|
||||
for term, desc in bullets[:4]:
|
||||
answer_lines.append(
|
||||
f"• <b>{term}</b>: {desc.strip()}"
|
||||
if desc
|
||||
else f"• <b>{term}</b>"
|
||||
)
|
||||
|
||||
# ===============================================
|
||||
# CONCEPT CARDS: Specific topics with real content
|
||||
# ===============================================
|
||||
# Find all ### sections and extract their actual content
|
||||
if not answer_lines:
|
||||
bullets = re.findall(
|
||||
r"[-•]\s*\*\*([^*]+)\*\*[:\s]*([^\n]*)", body,
|
||||
)
|
||||
for term, desc in bullets[:5]:
|
||||
answer_lines.append(
|
||||
f"• <b>{term}</b>: {desc.strip()}"
|
||||
if desc
|
||||
else f"• <b>{term}</b>"
|
||||
)
|
||||
|
||||
if not answer_lines:
|
||||
first_para = re.search(
|
||||
r"^([^#\n\-•|`][^\n]{30,250})", body, re.MULTILINE,
|
||||
)
|
||||
if first_para:
|
||||
answer_lines.append(first_para.group(1))
|
||||
|
||||
return answer_lines
|
||||
|
||||
|
||||
def _build_concept_cards(
    content: str,
    base_tags: str,
) -> list[dict[str, str]]:
    """Build concept cards from ``###`` sections.

    Each level-3 heading (an optional leading ``N.`` number is dropped)
    is a candidate card. A section is skipped when its stripped body is
    shorter than ``MIN_BODY_LENGTH``, when its header contains "Przykład",
    "Mnemonic" or a double quote, or when ``_extract_section_content``
    finds nothing to use as an answer.

    Args:
        content: Full markdown text of one answer file.
        base_tags: Tag prefix shared by every card built from this file.

    Returns:
        One card dict per kept section with keys ``front``, ``back`` and
        ``tags``. The front is the header itself when it already ends with
        "?" and is otherwise prefixed with "Wyjaśnij: ".
    """
    cards: list[dict[str, str]] = []
    # A section body runs until the next line that starts with "### ";
    # "#### " sub-headings do not match that lookahead and stay in the body.
    sections = re.findall(
        r"^### (?:\d+\.\s*)?([^\n]+)\n((?:(?!^### ).)*)",
        content,
        re.MULTILINE | re.DOTALL,
    )

    for raw_header, raw_body in sections:
        header = raw_header.strip()
        body = raw_body.strip()

        # Skip short sections, mnemonics, examples
        if (
            len(body) < MIN_BODY_LENGTH
            or "Przykład" in header
            or "Mnemonic" in header
            or '"' in header
        ):
            continue

        answer_lines = _extract_section_content(body)
        if not answer_lines:
            continue

        question = header if header.endswith("?") else f"Wyjaśnij: {header}"
        answer = "<br>".join(clean_text(line) for line in answer_lines)
        cards.append(
            {
                "front": clean_text(question),
                "back": answer,
                "tags": f"{base_tags} szczegoly",
            }
        )

    return cards
|
||||
|
||||
# ===============================================
|
||||
# Q&A CARDS: From practice questions section
|
||||
# ===============================================
|
||||
def _build_qa_cards(
|
||||
content: str, base_tags: str,
|
||||
) -> list[dict[str, str]]:
|
||||
"""Build Q&A practice cards."""
|
||||
cards: list[dict[str, str]] = []
|
||||
qa_matches = re.findall(
|
||||
r'### Q\d+:\s*["\']?([^"\'?\n]+)\?*["\']?\s*\n.*?Odpowiedź:\s*\n(.+?)(?=\n### |\n## |\Z)',
|
||||
r'### Q\d+:\s*["\']?([^"\'?\n]+)\?*["\']?\s*\n'
|
||||
r".*?Odpowiedź:\s*\n(.+?)(?=\n### |\n## |\Z)",
|
||||
content,
|
||||
re.DOTALL,
|
||||
)
|
||||
|
||||
for question, answer in qa_matches[:5]:
|
||||
question = question.strip()
|
||||
answer = answer.strip()
|
||||
for raw_question, raw_answer in qa_matches[:5]:
|
||||
question = raw_question.strip()
|
||||
answer_text = raw_answer.strip()
|
||||
|
||||
# Clean up answer - get first meaningful part
|
||||
answer_lines = answer.split("\n")
|
||||
clean_answer = []
|
||||
for line in answer_lines[:6]:
|
||||
line = line.strip()
|
||||
if line and not line.startswith("```") and not line.startswith("|"):
|
||||
clean_answer.append(line)
|
||||
answer_lines = answer_text.split("\n")
|
||||
clean_answer = [
|
||||
stripped
|
||||
for raw_line in answer_lines[:6]
|
||||
if (stripped := raw_line.strip())
|
||||
and not stripped.startswith("```")
|
||||
and not stripped.startswith("|")
|
||||
]
|
||||
|
||||
if clean_answer:
|
||||
cards.append(
|
||||
{
|
||||
"front": clean_text(question + "?"),
|
||||
"back": "<br>".join([clean_text(l) for l in clean_answer]),
|
||||
"back": "<br>".join(
|
||||
clean_text(line) for line in clean_answer
|
||||
),
|
||||
"tags": f"{base_tags} qa",
|
||||
}
|
||||
)
|
||||
@ -256,6 +311,20 @@ def extract_cards(filepath) -> list[dict[str, str]]:
|
||||
return cards
|
||||
|
||||
|
||||
def extract_cards(filepath: str | Path) -> list[dict[str, str]]:
    """Extract flashcards from a file.

    Reads the file's content, base tags and main question, then collects
    the optional main card followed by the concept cards and the Q&A cards.

    Args:
        filepath: Location of the file to turn into cards.

    Returns:
        All cards built from the file: main card first (if one was built),
        then concept cards, then Q&A cards.
    """
    content, base_tags, main_question = _read_file_metadata(filepath)

    main_card = _build_main_card(content, main_question, base_tags)
    deck: list[dict[str, str]] = [main_card] if main_card else []

    deck += _build_concept_cards(content, base_tags)
    deck += _build_qa_cards(content, base_tags)
    return deck
|
||||
|
||||
|
||||
def main() -> None:
|
||||
"""Main."""
|
||||
odpowiedzi_dir = Path("/home/kuchy/praca_magisterska/pytania/odpowiedzi")
|
||||
@ -266,13 +335,13 @@ def main() -> None:
|
||||
all_cards = []
|
||||
|
||||
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
||||
print(f"Processing: {md_file.name}", end=" ")
|
||||
logger.info("Processing: %s", md_file.name)
|
||||
try:
|
||||
cards = extract_cards(md_file)
|
||||
all_cards.extend(cards)
|
||||
print(f"→ {len(cards)} cards")
|
||||
except Exception as e:
|
||||
print(f"→ ERROR: {e}")
|
||||
logger.info(" -> %d cards", len(cards))
|
||||
except (ValueError, OSError):
|
||||
logger.exception(" -> Error processing file")
|
||||
|
||||
# Remove duplicates
|
||||
seen = set()
|
||||
@ -299,8 +368,12 @@ def main() -> None:
|
||||
tags = card["tags"]
|
||||
f.write(f"{front}\t{back}\t{tags}\n")
|
||||
|
||||
print(f"\n✅ Generated {len(unique_cards)} flashcards")
|
||||
print(f"📁 Output: {output_file}")
|
||||
logger.info(
|
||||
"Generated %d unique cards from %d total",
|
||||
len(unique_cards),
|
||||
len(all_cards),
|
||||
)
|
||||
logger.info("Output: %s", output_file)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -7,6 +7,8 @@ Designed for A4 laser printer output (300 DPI, black & white).
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
import matplotlib as mpl
|
||||
|
||||
mpl.use("Agg")
|
||||
@ -20,6 +22,8 @@ if TYPE_CHECKING:
|
||||
from matplotlib.axes import Axes
|
||||
from matplotlib.figure import Figure
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
OUTPUT_DIR = str(Path(__file__).resolve().parent / "img")
|
||||
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
@ -33,19 +37,35 @@ FIXED_COLOR = "#D0F0D0" # light green-ish gray for fixed
|
||||
FD_ARROW_COLOR = "#444444"
|
||||
|
||||
|
||||
def _compute_col_widths(
|
||||
headers: list[str],
|
||||
rows: list[list[str]],
|
||||
) -> list[float]:
|
||||
"""Auto-calculate column widths based on content."""
|
||||
col_widths: list[float] = []
|
||||
for c in range(len(headers)):
|
||||
max_len = len(headers[c])
|
||||
for r in rows:
|
||||
if c < len(r):
|
||||
max_len = max(max_len, len(str(r[c])))
|
||||
col_widths.append(max(max_len * 0.08 + 0.1, 0.5))
|
||||
return col_widths
|
||||
|
||||
|
||||
def draw_table(
|
||||
ax,
|
||||
x,
|
||||
y,
|
||||
title,
|
||||
headers,
|
||||
rows,
|
||||
col_widths=None,
|
||||
highlight_cols=None,
|
||||
highlight_rows=None,
|
||||
highlight_cells=None,
|
||||
strikethrough_cells=None,
|
||||
title_fontsize=9,
|
||||
ax: Axes,
|
||||
x: float,
|
||||
y: float,
|
||||
title: str,
|
||||
headers: list[str],
|
||||
rows: list[list[str]],
|
||||
*,
|
||||
col_widths: list[float] | None = None,
|
||||
highlight_cols: set[int] | None = None,
|
||||
highlight_rows: set[int] | None = None,
|
||||
highlight_cells: set[tuple[int, int]] | None = None,
|
||||
strikethrough_cells: set[tuple[int, int]] | None = None,
|
||||
title_fontsize: int = 9,
|
||||
) -> tuple[float, float]:
|
||||
"""Draw a single table on the axes at position (x, y).
|
||||
|
||||
@ -66,18 +86,10 @@ def draw_table(
|
||||
Returns:
|
||||
(width, height) of the drawn table
|
||||
"""
|
||||
n_cols = len(headers)
|
||||
n_rows = len(rows)
|
||||
|
||||
if col_widths is None:
|
||||
# Auto-calculate based on content
|
||||
col_widths = []
|
||||
for c in range(n_cols):
|
||||
max_len = len(headers[c])
|
||||
for r in rows:
|
||||
if c < len(r):
|
||||
max_len = max(max_len, len(str(r[c])))
|
||||
col_widths.append(max(max_len * 0.08 + 0.1, 0.5))
|
||||
col_widths = _compute_col_widths(headers, rows)
|
||||
|
||||
row_height = 0.22
|
||||
total_width = sum(col_widths)
|
||||
@ -172,7 +184,10 @@ def draw_table(
|
||||
return total_width, total_height + 0.25 # extra for title
|
||||
|
||||
|
||||
def create_figure(width_inches=11.69, height_inches=8.27) -> tuple[Figure, Axes]:
|
||||
def create_figure(
|
||||
width_inches: float = 11.69,
|
||||
height_inches: float = 8.27,
|
||||
) -> tuple[Figure, Axes]:
|
||||
"""Create A4 landscape figure."""
|
||||
fig, ax = plt.subplots(1, 1, figsize=(width_inches, height_inches), dpi=DPI)
|
||||
ax.set_xlim(0, width_inches)
|
||||
@ -182,7 +197,16 @@ def create_figure(width_inches=11.69, height_inches=8.27) -> tuple[Figure, Axes]
|
||||
return fig, ax
|
||||
|
||||
|
||||
def add_arrow(ax, x1, y1, x2, y2, label="", color="black") -> None:
|
||||
def add_arrow(
|
||||
ax: Axes,
|
||||
x1: float,
|
||||
y1: float,
|
||||
x2: float,
|
||||
y2: float,
|
||||
label: str = "",
|
||||
*,
|
||||
color: str = "black",
|
||||
) -> None:
|
||||
"""Draw an arrow with optional label."""
|
||||
ax.annotate(
|
||||
"",
|
||||
@ -205,7 +229,15 @@ def add_arrow(ax, x1, y1, x2, y2, label="", color="black") -> None:
|
||||
|
||||
|
||||
def add_label(
|
||||
ax, x, y, text, fontsize=8, color="black", ha="left", style="normal"
|
||||
ax: Axes,
|
||||
x: float,
|
||||
y: float,
|
||||
text: str,
|
||||
*,
|
||||
fontsize: int = 8,
|
||||
color: str = "black",
|
||||
ha: str = "left",
|
||||
style: str = "normal",
|
||||
) -> None:
|
||||
"""Add a text label."""
|
||||
ax.text(
|
||||
@ -289,7 +321,10 @@ def draw_0nf() -> None:
|
||||
ax,
|
||||
0.8,
|
||||
1.2,
|
||||
"Zaleznosci funkcyjne: StID -> Imie, WydzialID | WydzialID -> NazwaWydzialu",
|
||||
(
|
||||
"Zaleznosci funkcyjne: StID -> Imie, WydzialID"
|
||||
" | WydzialID -> NazwaWydzialu"
|
||||
),
|
||||
fontsize=8,
|
||||
color="#333333",
|
||||
)
|
||||
@ -297,7 +332,10 @@ def draw_0nf() -> None:
|
||||
ax,
|
||||
0.8,
|
||||
0.9,
|
||||
" KursID -> NazwaKursu | (StID,KursID) -> Prowadzacy | Prowadzacy -> KursID",
|
||||
(
|
||||
" KursID -> NazwaKursu | (StID,KursID)"
|
||||
" -> Prowadzacy | Prowadzacy -> KursID"
|
||||
),
|
||||
fontsize=8,
|
||||
color="#333333",
|
||||
)
|
||||
@ -309,7 +347,7 @@ def draw_0nf() -> None:
|
||||
pad_inches=0.2,
|
||||
)
|
||||
plt.close(fig)
|
||||
print("Generated: nf_0nf_table.png")
|
||||
logger.info("Generated: nf_0nf_table.png")
|
||||
|
||||
|
||||
# ============================================================
|
||||
@ -399,7 +437,10 @@ def draw_1nf() -> None:
|
||||
ax,
|
||||
0.5,
|
||||
1.5,
|
||||
" Imie, WydzialID, NazwaWydzialu zaleza TYLKO od StID (czesc klucza).",
|
||||
(
|
||||
" Imie, WydzialID, NazwaWydzialu"
|
||||
" zaleza TYLKO od StID (czesc klucza)."
|
||||
),
|
||||
fontsize=9,
|
||||
color="black",
|
||||
)
|
||||
@ -419,7 +460,7 @@ def draw_1nf() -> None:
|
||||
pad_inches=0.2,
|
||||
)
|
||||
plt.close(fig)
|
||||
print("Generated: nf_1nf_tables.png")
|
||||
logger.info("Generated: nf_1nf_tables.png")
|
||||
|
||||
|
||||
# ============================================================
|
||||
@ -477,7 +518,10 @@ def draw_2nf() -> None:
|
||||
ax,
|
||||
0.3,
|
||||
3.3,
|
||||
"KROK: Rozbito czesc. zaleznosci — atrybuty zalezne od czesci klucza wydzielone.",
|
||||
(
|
||||
"KROK: Rozbito czesc. zaleznosci"
|
||||
" — atrybuty zalezne od czesci klucza wydzielone."
|
||||
),
|
||||
fontsize=9,
|
||||
)
|
||||
add_label(
|
||||
@ -528,7 +572,7 @@ def draw_2nf() -> None:
|
||||
pad_inches=0.2,
|
||||
)
|
||||
plt.close(fig)
|
||||
print("Generated: nf_2nf_tables.png")
|
||||
logger.info("Generated: nf_2nf_tables.png")
|
||||
|
||||
|
||||
# ============================================================
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user