mirror of
https://github.com/kuhyx/testsAndMisc.git
synced 2026-07-04 15:43:06 +02:00
refactor(praca_magisterska_video): fix ruff violations and remove noqa from diagram generators
- Add type annotations, docstrings, and constants - Remove commented-out code and print statements - Fix all lint issues in 11 generate_images files
This commit is contained in:
parent
1e108d1e3f
commit
d488c87203
@ -7,11 +7,17 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def clean_text(text) -> str:
|
MIN_BODY_LENGTH = 50
|
||||||
|
MIN_ANSWER_LENGTH = 100
|
||||||
|
|
||||||
|
|
||||||
|
def clean_text(text: str) -> str:
|
||||||
"""Clean text."""
|
"""Clean text."""
|
||||||
if not text:
|
if not text:
|
||||||
return ""
|
return ""
|
||||||
@ -23,7 +29,7 @@ def clean_text(text) -> str:
|
|||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
def extract_cards(filepath) -> list[dict[str, str]]:
|
def extract_cards(filepath: str) -> list[dict[str, str]]:
|
||||||
"""Extract cards."""
|
"""Extract cards."""
|
||||||
with Path(filepath).open(encoding="utf-8") as f:
|
with Path(filepath).open(encoding="utf-8") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
@ -68,10 +74,10 @@ def extract_cards(filepath) -> list[dict[str, str]]:
|
|||||||
content,
|
content,
|
||||||
re.MULTILINE | re.DOTALL,
|
re.MULTILINE | re.DOTALL,
|
||||||
)
|
)
|
||||||
for header, body in sections:
|
for raw_header, raw_body in sections:
|
||||||
header = header.strip()
|
header = raw_header.strip()
|
||||||
body = body.strip()
|
body = raw_body.strip()
|
||||||
if len(body) < 50:
|
if len(body) < MIN_BODY_LENGTH:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Get first paragraph
|
# Get first paragraph
|
||||||
@ -102,8 +108,10 @@ def main() -> None:
|
|||||||
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
||||||
all_cards.extend(extract_cards(md_file))
|
all_cards.extend(extract_cards(md_file))
|
||||||
|
|
||||||
# APPROACH 1: Strict filtering - only cards with answer > 100 chars
|
# APPROACH 1: Strict filtering - only cards with answer > threshold
|
||||||
filtered_cards = [c for c in all_cards if len(c["back"]) > 100]
|
filtered_cards = [
|
||||||
|
c for c in all_cards if len(c["back"]) > MIN_ANSWER_LENGTH
|
||||||
|
]
|
||||||
|
|
||||||
# Remove duplicates
|
# Remove duplicates
|
||||||
seen = set()
|
seen = set()
|
||||||
@ -120,7 +128,11 @@ def main() -> None:
|
|||||||
for c in unique:
|
for c in unique:
|
||||||
f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n")
|
f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n")
|
||||||
|
|
||||||
print(f"✅ Approach 1 (Strict Filter): {len(unique)} cards -> {output_file.name}")
|
logger.info(
|
||||||
|
"Approach 1 (Strict Filter): %d cards -> %s",
|
||||||
|
len(unique),
|
||||||
|
output_file.name,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@ -7,11 +7,17 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def clean_text(text) -> str:
|
MIN_PARA_LENGTH = 30
|
||||||
|
MIN_BODY_LENGTH = 50
|
||||||
|
|
||||||
|
|
||||||
|
def clean_text(text: str) -> str:
|
||||||
"""Clean text."""
|
"""Clean text."""
|
||||||
if not text:
|
if not text:
|
||||||
return ""
|
return ""
|
||||||
@ -23,7 +29,7 @@ def clean_text(text) -> str:
|
|||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
def extract_structured_content(body) -> str | None:
|
def extract_structured_content(body: str) -> str | None:
|
||||||
"""Better extraction - look for multiple content types."""
|
"""Better extraction - look for multiple content types."""
|
||||||
parts = []
|
parts = []
|
||||||
|
|
||||||
@ -54,15 +60,14 @@ def extract_structured_content(body) -> str | None:
|
|||||||
if p.strip()
|
if p.strip()
|
||||||
and not p.startswith("```")
|
and not p.startswith("```")
|
||||||
and not p.startswith("|")
|
and not p.startswith("|")
|
||||||
and len(p.strip()) > 30
|
and len(p.strip()) > MIN_PARA_LENGTH
|
||||||
]
|
]
|
||||||
for p in paras[:2]:
|
parts.extend(p[:300] for p in paras[:2])
|
||||||
parts.append(p[:300])
|
|
||||||
|
|
||||||
return "<br>".join([clean_text(p) for p in parts]) if parts else None
|
return "<br>".join([clean_text(p) for p in parts]) if parts else None
|
||||||
|
|
||||||
|
|
||||||
def extract_cards(filepath) -> list[dict[str, str]]:
|
def extract_cards(filepath: str) -> list[dict[str, str]]:
|
||||||
"""Extract cards."""
|
"""Extract cards."""
|
||||||
with Path(filepath).open(encoding="utf-8") as f:
|
with Path(filepath).open(encoding="utf-8") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
@ -99,9 +104,9 @@ def extract_cards(filepath) -> list[dict[str, str]]:
|
|||||||
content,
|
content,
|
||||||
re.MULTILINE | re.DOTALL,
|
re.MULTILINE | re.DOTALL,
|
||||||
)
|
)
|
||||||
for header, body in sections:
|
for raw_header, body in sections:
|
||||||
header = header.strip()
|
header = raw_header.strip()
|
||||||
if "Przykład" in header or '"' in header or len(body) < 50:
|
if "Przykład" in header or '"' in header or len(body) < MIN_BODY_LENGTH:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
answer = extract_structured_content(body)
|
answer = extract_structured_content(body)
|
||||||
@ -143,8 +148,10 @@ def main() -> None:
|
|||||||
for c in unique:
|
for c in unique:
|
||||||
f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n")
|
f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n")
|
||||||
|
|
||||||
print(
|
logger.info(
|
||||||
f"✅ Approach 2 (Better Extraction): {len(unique)} cards -> {output_file.name}"
|
"Approach 2 (Better Extraction): %d cards -> %s",
|
||||||
|
len(unique),
|
||||||
|
output_file.name,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -7,31 +7,41 @@ Usage:
|
|||||||
Options:
|
Options:
|
||||||
--filter Apply strict filtering (answers > 100 chars)
|
--filter Apply strict filtering (answers > 100 chars)
|
||||||
--extract Use improved extraction algorithm
|
--extract Use improved extraction algorithm
|
||||||
--main-only Only generate main exam questions (45 comprehensive cards)
|
--main-only Only generate main exam questions
|
||||||
|
|
||||||
Combinations:
|
Combinations:
|
||||||
python anki_generator.py # Basic extraction, no filter
|
python anki_generator.py
|
||||||
python anki_generator.py --filter # Approach 1: Strict filter only
|
python anki_generator.py --filter
|
||||||
python anki_generator.py --extract # Approach 2: Better extraction only
|
python anki_generator.py --extract
|
||||||
python anki_generator.py --main-only # Approach 3: Main questions only
|
python anki_generator.py --main-only
|
||||||
python anki_generator.py --filter --extract # Approach 4: Filter + Better extraction
|
python anki_generator.py --filter --extract
|
||||||
python anki_generator.py --filter --main-only # Approach 5: Filter + Main only
|
python anki_generator.py --filter --main-only
|
||||||
python anki_generator.py --extract --main-only # Approach 6: Better extraction + Main only
|
python anki_generator.py --extract --main-only
|
||||||
python anki_generator.py --filter --extract --main-only # Approach 7: All three
|
python anki_generator.py --filter --extract --main-only
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
MIN_PARTS_THRESHOLD = 2
|
||||||
|
MIN_BODY_LENGTH = 50
|
||||||
|
MIN_PARA_LENGTH = 30
|
||||||
|
SHORT_THRESHOLD = 50
|
||||||
|
MEDIUM_THRESHOLD = 150
|
||||||
|
DEFAULT_MIN_ANSWER_LENGTH = 100
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# SHARED UTILITIES
|
# SHARED UTILITIES
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
def clean_text(text) -> str:
|
def clean_text(text: str) -> str:
|
||||||
"""Clean and format text for Anki."""
|
"""Clean and format text for Anki."""
|
||||||
if not text:
|
if not text:
|
||||||
return ""
|
return ""
|
||||||
@ -43,7 +53,7 @@ def clean_text(text) -> str:
|
|||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
def get_file_metadata(filepath) -> tuple[str, str, str]:
|
def get_file_metadata(filepath: str) -> tuple[str, str, str]:
|
||||||
"""Extract question number and subject from filename."""
|
"""Extract question number and subject from filename."""
|
||||||
filename = Path(filepath).name
|
filename = Path(filepath).name
|
||||||
match = re.match(r"(\d+)-(.+)\.md", filename)
|
match = re.match(r"(\d+)-(.+)\.md", filename)
|
||||||
@ -58,7 +68,7 @@ def get_file_metadata(filepath) -> tuple[str, str, str]:
|
|||||||
return num, subject, content
|
return num, subject, content
|
||||||
|
|
||||||
|
|
||||||
def get_main_question(content) -> str | None:
|
def get_main_question(content: str) -> str | None:
|
||||||
"""Extract the main exam question."""
|
"""Extract the main exam question."""
|
||||||
q_match = re.search(
|
q_match = re.search(
|
||||||
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
|
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
|
||||||
@ -73,7 +83,10 @@ def get_main_question(content) -> str | None:
|
|||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
def apply_strict_filter(cards, min_length=100) -> list[dict[str, str]]:
|
def apply_strict_filter(
|
||||||
|
cards: list[dict[str, str]],
|
||||||
|
min_length: int = DEFAULT_MIN_ANSWER_LENGTH,
|
||||||
|
) -> list[dict[str, str]]:
|
||||||
"""Filter cards to only include those with answers > min_length characters."""
|
"""Filter cards to only include those with answers > min_length characters."""
|
||||||
return [c for c in cards if len(c["back"]) > min_length]
|
return [c for c in cards if len(c["back"]) > min_length]
|
||||||
|
|
||||||
@ -83,7 +96,7 @@ def apply_strict_filter(cards, min_length=100) -> list[dict[str, str]]:
|
|||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
def extract_structured_content(body) -> str | None:
|
def extract_structured_content(body: str) -> str | None:
|
||||||
"""Improved extraction - multiple content types with better formatting."""
|
"""Improved extraction - multiple content types with better formatting."""
|
||||||
parts = []
|
parts = []
|
||||||
|
|
||||||
@ -101,7 +114,7 @@ def extract_structured_content(body) -> str | None:
|
|||||||
parts.append(f"• <b>{term}</b>")
|
parts.append(f"• <b>{term}</b>")
|
||||||
|
|
||||||
# 3. Key-value patterns
|
# 3. Key-value patterns
|
||||||
if len(parts) < 2:
|
if len(parts) < MIN_PARTS_THRESHOLD:
|
||||||
kvs = re.findall(r"\*\*([^*\n]+)\*\*\s*[--:]\s*([^\n*]{10,150})", body)
|
kvs = re.findall(r"\*\*([^*\n]+)\*\*\s*[--:]\s*([^\n*]{10,150})", body)
|
||||||
for k, v in kvs[:4]:
|
for k, v in kvs[:4]:
|
||||||
entry = f"<b>{k.strip()}</b>: {v.strip()}"
|
entry = f"<b>{k.strip()}</b>: {v.strip()}"
|
||||||
@ -116,15 +129,14 @@ def extract_structured_content(body) -> str | None:
|
|||||||
if p.strip()
|
if p.strip()
|
||||||
and not p.startswith("```")
|
and not p.startswith("```")
|
||||||
and not p.startswith("|")
|
and not p.startswith("|")
|
||||||
and len(p.strip()) > 30
|
and len(p.strip()) > MIN_PARA_LENGTH
|
||||||
]
|
]
|
||||||
for p in paras[:2]:
|
parts.extend(p[:300] for p in paras[:2])
|
||||||
parts.append(p[:300])
|
|
||||||
|
|
||||||
return "<br>".join([clean_text(p) for p in parts]) if parts else None
|
return "<br>".join([clean_text(p) for p in parts]) if parts else None
|
||||||
|
|
||||||
|
|
||||||
def extract_cards_better(filepath) -> list[dict[str, str]]:
|
def extract_cards_better(filepath: str) -> list[dict[str, str]]:
|
||||||
"""Extract cards with improved algorithm."""
|
"""Extract cards with improved algorithm."""
|
||||||
num, subject, content = get_file_metadata(filepath)
|
num, subject, content = get_file_metadata(filepath)
|
||||||
base_tags = f"egzamin pyt{num} {subject}"
|
base_tags = f"egzamin pyt{num} {subject}"
|
||||||
@ -153,13 +165,13 @@ def extract_cards_better(filepath) -> list[dict[str, str]]:
|
|||||||
content,
|
content,
|
||||||
re.MULTILINE | re.DOTALL,
|
re.MULTILINE | re.DOTALL,
|
||||||
)
|
)
|
||||||
for header, body in sections:
|
for raw_header, body in sections:
|
||||||
header = header.strip()
|
header = raw_header.strip()
|
||||||
if (
|
if (
|
||||||
"Przykład" in header
|
"Przykład" in header
|
||||||
or '"' in header
|
or '"' in header
|
||||||
or "Mnemonic" in header
|
or "Mnemonic" in header
|
||||||
or len(body) < 50
|
or len(body) < MIN_BODY_LENGTH
|
||||||
):
|
):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -176,7 +188,7 @@ def extract_cards_better(filepath) -> list[dict[str, str]]:
|
|||||||
return cards
|
return cards
|
||||||
|
|
||||||
|
|
||||||
def extract_cards_basic(filepath) -> list[dict[str, str]]:
|
def extract_cards_basic(filepath: str) -> list[dict[str, str]]:
|
||||||
"""Basic extraction - simpler algorithm."""
|
"""Basic extraction - simpler algorithm."""
|
||||||
num, subject, content = get_file_metadata(filepath)
|
num, subject, content = get_file_metadata(filepath)
|
||||||
base_tags = f"egzamin pyt{num} {subject}"
|
base_tags = f"egzamin pyt{num} {subject}"
|
||||||
@ -212,10 +224,10 @@ def extract_cards_basic(filepath) -> list[dict[str, str]]:
|
|||||||
content,
|
content,
|
||||||
re.MULTILINE | re.DOTALL,
|
re.MULTILINE | re.DOTALL,
|
||||||
)
|
)
|
||||||
for header, body in sections:
|
for raw_header, raw_body in sections:
|
||||||
header = header.strip()
|
header = raw_header.strip()
|
||||||
body = body.strip()
|
body = raw_body.strip()
|
||||||
if len(body) < 50 or "Przykład" in header:
|
if len(body) < MIN_BODY_LENGTH or "Przykład" in header:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
paras = [
|
paras = [
|
||||||
@ -241,7 +253,28 @@ def extract_cards_basic(filepath) -> list[dict[str, str]]:
|
|||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
def extract_main_only(filepath) -> list[dict[str, str]]:
|
def _extract_key_point(body: str) -> str | None:
|
||||||
|
"""Extract a key point from a section body."""
|
||||||
|
# Try to get a definition or first bullet
|
||||||
|
def_match = re.search(
|
||||||
|
r"Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*", body
|
||||||
|
)
|
||||||
|
if def_match:
|
||||||
|
return def_match.group(1).strip()
|
||||||
|
|
||||||
|
bullets = re.findall(r"[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]*)", body)
|
||||||
|
if bullets:
|
||||||
|
term, desc = bullets[0]
|
||||||
|
return f"{term}: {desc.strip()}" if desc.strip() else term
|
||||||
|
|
||||||
|
para_match = re.search(r"\n\n([^#\n\-•|`][^\n]{20,150})", body)
|
||||||
|
if para_match:
|
||||||
|
return para_match.group(1).strip()
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_main_only(filepath: str) -> list[dict[str, str]]:
|
||||||
"""Extract only the main exam question with comprehensive answer."""
|
"""Extract only the main exam question with comprehensive answer."""
|
||||||
num, subject, content = get_file_metadata(filepath)
|
num, subject, content = get_file_metadata(filepath)
|
||||||
base_tags = f"egzamin pyt{num} {subject} main"
|
base_tags = f"egzamin pyt{num} {subject} main"
|
||||||
@ -255,7 +288,9 @@ def extract_main_only(filepath) -> list[dict[str, str]]:
|
|||||||
|
|
||||||
# Get main answer section
|
# Get main answer section
|
||||||
answer_match = re.search(
|
answer_match = re.search(
|
||||||
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## [^<5E>]|\Z)", content, re.DOTALL
|
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## [^<5E>]|\Z)",
|
||||||
|
content,
|
||||||
|
re.DOTALL,
|
||||||
)
|
)
|
||||||
if answer_match:
|
if answer_match:
|
||||||
section = answer_match.group(1)
|
section = answer_match.group(1)
|
||||||
@ -267,32 +302,16 @@ def extract_main_only(filepath) -> list[dict[str, str]]:
|
|||||||
re.MULTILINE | re.DOTALL,
|
re.MULTILINE | re.DOTALL,
|
||||||
)
|
)
|
||||||
|
|
||||||
for header, body in headers[:5]:
|
for raw_header, body in headers[:5]:
|
||||||
header = header.strip()
|
header = raw_header.strip()
|
||||||
if "Przykład" in header or "Mnemonic" in header or '"' in header:
|
if (
|
||||||
|
"Przykład" in header
|
||||||
|
or "Mnemonic" in header
|
||||||
|
or '"' in header
|
||||||
|
):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Get key point from this section
|
key_point = _extract_key_point(body)
|
||||||
key_point = None
|
|
||||||
|
|
||||||
# Try to get a definition or first bullet
|
|
||||||
def_match = re.search(
|
|
||||||
r"Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*", body
|
|
||||||
)
|
|
||||||
if def_match:
|
|
||||||
key_point = def_match.group(1).strip()
|
|
||||||
|
|
||||||
if not key_point:
|
|
||||||
bullets = re.findall(r"[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]*)", body)
|
|
||||||
if bullets:
|
|
||||||
term, desc = bullets[0]
|
|
||||||
key_point = f"{term}: {desc.strip()}" if desc.strip() else term
|
|
||||||
|
|
||||||
if not key_point:
|
|
||||||
para_match = re.search(r"\n\n([^#\n\-•|`][^\n]{20,150})", body)
|
|
||||||
if para_match:
|
|
||||||
key_point = para_match.group(1).strip()
|
|
||||||
|
|
||||||
if key_point:
|
if key_point:
|
||||||
answer_parts.append(f"<b>{header}</b>: {key_point}")
|
answer_parts.append(f"<b>{header}</b>: {key_point}")
|
||||||
|
|
||||||
@ -308,9 +327,58 @@ def extract_main_only(filepath) -> list[dict[str, str]]:
|
|||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
def generate_anki(use_filter=False, use_better_extract=False, main_only=False) -> Path:
|
def _collect_cards(
|
||||||
|
odpowiedzi_dir: Path,
|
||||||
|
*,
|
||||||
|
use_better_extract: bool,
|
||||||
|
main_only: bool,
|
||||||
|
) -> list[dict[str, str]]:
|
||||||
|
"""Collect cards from all files using the specified approach."""
|
||||||
|
all_cards = []
|
||||||
|
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
||||||
|
if main_only:
|
||||||
|
cards = extract_main_only(md_file)
|
||||||
|
elif use_better_extract:
|
||||||
|
cards = extract_cards_better(md_file)
|
||||||
|
else:
|
||||||
|
cards = extract_cards_basic(md_file)
|
||||||
|
all_cards.extend(cards)
|
||||||
|
return all_cards
|
||||||
|
|
||||||
|
|
||||||
|
def _log_statistics(unique: list[dict[str, str]], output_file: Path) -> None:
|
||||||
|
"""Log quality statistics for the generated cards."""
|
||||||
|
lengths = [len(c["back"]) for c in unique]
|
||||||
|
short = sum(1 for length in lengths if length < SHORT_THRESHOLD)
|
||||||
|
medium = sum(
|
||||||
|
1
|
||||||
|
for length in lengths
|
||||||
|
if SHORT_THRESHOLD <= length < MEDIUM_THRESHOLD
|
||||||
|
)
|
||||||
|
good = sum(
|
||||||
|
1 for length in lengths if length >= MEDIUM_THRESHOLD
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info("Generated: %s", output_file.name)
|
||||||
|
logger.info(" Cards: %d", len(unique))
|
||||||
|
logger.info(
|
||||||
|
" Quality: %d short / %d medium / %d good",
|
||||||
|
short,
|
||||||
|
medium,
|
||||||
|
good,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_anki(
|
||||||
|
*,
|
||||||
|
use_filter: bool = False,
|
||||||
|
use_better_extract: bool = False,
|
||||||
|
main_only: bool = False,
|
||||||
|
) -> Path:
|
||||||
"""Generate Anki deck with specified approaches."""
|
"""Generate Anki deck with specified approaches."""
|
||||||
odpowiedzi_dir = Path("/home/kuchy/praca_magisterska/pytania/odpowiedzi")
|
odpowiedzi_dir = Path(
|
||||||
|
"/home/kuchy/praca_magisterska/pytania/odpowiedzi"
|
||||||
|
)
|
||||||
|
|
||||||
# Determine output filename based on options
|
# Determine output filename based on options
|
||||||
suffix_parts = []
|
suffix_parts = []
|
||||||
@ -322,30 +390,25 @@ def generate_anki(use_filter=False, use_better_extract=False, main_only=False) -
|
|||||||
suffix_parts.append("main")
|
suffix_parts.append("main")
|
||||||
suffix = "_".join(suffix_parts) if suffix_parts else "basic"
|
suffix = "_".join(suffix_parts) if suffix_parts else "basic"
|
||||||
|
|
||||||
output_file = Path(f"/home/kuchy/praca_magisterska/pytania/anki_{suffix}.txt")
|
output_file = Path(
|
||||||
|
f"/home/kuchy/praca_magisterska/pytania/anki_{suffix}.txt"
|
||||||
|
)
|
||||||
deck_name = f"Egzamin_{suffix.replace('_', '+')}"
|
deck_name = f"Egzamin_{suffix.replace('_', '+')}"
|
||||||
|
|
||||||
all_cards = []
|
all_cards = _collect_cards(
|
||||||
|
odpowiedzi_dir,
|
||||||
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
use_better_extract=use_better_extract,
|
||||||
if main_only:
|
main_only=main_only,
|
||||||
# Approach 3: Only main questions
|
)
|
||||||
cards = extract_main_only(md_file)
|
|
||||||
elif use_better_extract:
|
|
||||||
# Approach 2: Better extraction
|
|
||||||
cards = extract_cards_better(md_file)
|
|
||||||
else:
|
|
||||||
# Basic extraction
|
|
||||||
cards = extract_cards_basic(md_file)
|
|
||||||
|
|
||||||
all_cards.extend(cards)
|
|
||||||
|
|
||||||
# Approach 1: Apply filtering if requested
|
# Approach 1: Apply filtering if requested
|
||||||
if use_filter:
|
if use_filter:
|
||||||
all_cards = apply_strict_filter(all_cards, min_length=100)
|
all_cards = apply_strict_filter(
|
||||||
|
all_cards, min_length=DEFAULT_MIN_ANSWER_LENGTH
|
||||||
|
)
|
||||||
|
|
||||||
# Remove duplicates
|
# Remove duplicates
|
||||||
seen = set()
|
seen: set[str] = set()
|
||||||
unique = []
|
unique = []
|
||||||
for c in all_cards:
|
for c in all_cards:
|
||||||
key = c["front"][:80]
|
key = c["front"][:80]
|
||||||
@ -355,20 +418,14 @@ def generate_anki(use_filter=False, use_better_extract=False, main_only=False) -
|
|||||||
|
|
||||||
# Write output
|
# Write output
|
||||||
with Path(output_file).open("w", encoding="utf-8") as f:
|
with Path(output_file).open("w", encoding="utf-8") as f:
|
||||||
f.write(f"#separator:Tab\n#html:true\n#notetype:Basic\n#deck:{deck_name}\n\n")
|
f.write(
|
||||||
|
"#separator:Tab\n#html:true\n"
|
||||||
|
f"#notetype:Basic\n#deck:{deck_name}\n\n"
|
||||||
|
)
|
||||||
for c in unique:
|
for c in unique:
|
||||||
f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n")
|
f.write(f"{c['front']}\t{c['back']}\t{c['tags']}\n")
|
||||||
|
|
||||||
# Statistics
|
_log_statistics(unique, output_file)
|
||||||
lengths = [len(c["back"]) for c in unique]
|
|
||||||
short = sum(1 for l in lengths if l < 50)
|
|
||||||
medium = sum(1 for l in lengths if 50 <= l < 150)
|
|
||||||
good = sum(1 for l in lengths if l >= 150)
|
|
||||||
|
|
||||||
print(f"✅ Generated: {output_file.name}")
|
|
||||||
print(f" Cards: {len(unique)}")
|
|
||||||
print(f" Quality: {short} short / {medium} medium / {good} good")
|
|
||||||
print()
|
|
||||||
|
|
||||||
return output_file
|
return output_file
|
||||||
|
|
||||||
@ -397,9 +454,9 @@ def main() -> None:
|
|||||||
|
|
||||||
if args.all_combinations:
|
if args.all_combinations:
|
||||||
# Generate all 7 combinations
|
# Generate all 7 combinations
|
||||||
print("=" * 60)
|
logger.info("=" * 60)
|
||||||
print("Generating all 7 combinations...")
|
logger.info("Generating all 7 combinations...")
|
||||||
print("=" * 60 + "\n")
|
logger.info("=" * 60)
|
||||||
|
|
||||||
combinations = [
|
combinations = [
|
||||||
(True, False, False), # 1: Filter only
|
(True, False, False), # 1: Filter only
|
||||||
@ -411,9 +468,22 @@ def main() -> None:
|
|||||||
(True, True, True), # 7: All three
|
(True, True, True), # 7: All three
|
||||||
]
|
]
|
||||||
|
|
||||||
for i, (f, e, m) in enumerate(combinations, 1):
|
for i, (f_flag, e_flag, m_flag) in enumerate(
|
||||||
print(f"--- Combination {i} (filter={f}, extract={e}, main={m}) ---")
|
combinations, 1
|
||||||
generate_anki(use_filter=f, use_better_extract=e, main_only=m)
|
):
|
||||||
|
logger.info(
|
||||||
|
"--- Combination %d (filter=%s, extract=%s,"
|
||||||
|
" main=%s) ---",
|
||||||
|
i,
|
||||||
|
f_flag,
|
||||||
|
e_flag,
|
||||||
|
m_flag,
|
||||||
|
)
|
||||||
|
generate_anki(
|
||||||
|
use_filter=f_flag,
|
||||||
|
use_better_extract=e_flag,
|
||||||
|
main_only=m_flag,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
generate_anki(
|
generate_anki(
|
||||||
use_filter=args.filter,
|
use_filter=args.filter,
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -6,18 +6,27 @@ Creates a tab-separated file compatible with Anki import.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def extract_question_and_answer(filepath) -> list[dict[str, str]]:
|
MIN_BODY_LENGTH = 50
|
||||||
"""Extract main question and key answer points from a markdown file."""
|
MIN_DEFINITION_LENGTH = 20
|
||||||
|
MAX_DEFINITION_LENGTH = 200
|
||||||
|
MIN_BULLET_COUNT = 5
|
||||||
|
MIN_SUBSECTION_LENGTH = 5
|
||||||
|
MIN_FORMULA_LENGTH = 20
|
||||||
|
|
||||||
|
|
||||||
|
def _get_metadata(
|
||||||
|
filepath: str,
|
||||||
|
) -> tuple[str, str, str, str, str]:
|
||||||
|
"""Extract metadata from file."""
|
||||||
with Path(filepath).open(encoding="utf-8") as f:
|
with Path(filepath).open(encoding="utf-8") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
|
|
||||||
cards = []
|
|
||||||
|
|
||||||
# Extract file number for tagging
|
|
||||||
filename = Path(filepath).name
|
filename = Path(filepath).name
|
||||||
match = re.match(r"(\d+)-(.+)\.md", filename)
|
match = re.match(r"(\d+)-(.+)\.md", filename)
|
||||||
if match:
|
if match:
|
||||||
@ -27,13 +36,13 @@ def extract_question_and_answer(filepath) -> list[dict[str, str]]:
|
|||||||
num = "00"
|
num = "00"
|
||||||
topic = "unknown"
|
topic = "unknown"
|
||||||
|
|
||||||
# Extract main title (usually contains the question)
|
|
||||||
title_match = re.search(r"^# (.+)$", content, re.MULTILINE)
|
title_match = re.search(r"^# (.+)$", content, re.MULTILINE)
|
||||||
title = title_match.group(1) if title_match else "Unknown"
|
title = title_match.group(1) if title_match else "Unknown"
|
||||||
|
|
||||||
# Extract the main question from ## Pytanie section
|
|
||||||
question_match = re.search(
|
question_match = re.search(
|
||||||
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
|
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*',
|
||||||
|
content,
|
||||||
|
re.DOTALL,
|
||||||
)
|
)
|
||||||
if question_match:
|
if question_match:
|
||||||
main_question = question_match.group(1).strip()
|
main_question = question_match.group(1).strip()
|
||||||
@ -41,124 +50,207 @@ def extract_question_and_answer(filepath) -> list[dict[str, str]]:
|
|||||||
else:
|
else:
|
||||||
main_question = title
|
main_question = title
|
||||||
|
|
||||||
# Extract subject/przedmiot
|
return num, topic, title, main_question, content
|
||||||
subject_match = re.search(r"Przedmiot:\s*(\w+)", content)
|
|
||||||
subject = subject_match.group(1) if subject_match else "Ogólne"
|
|
||||||
|
|
||||||
# Create main question card - extract key sections for answer
|
|
||||||
answer_parts = []
|
|
||||||
|
|
||||||
# Look for main answer section
|
def _extract_main_card(
|
||||||
|
content: str,
|
||||||
|
main_question: str,
|
||||||
|
subject: str,
|
||||||
|
num: str,
|
||||||
|
topic: str,
|
||||||
|
) -> list[dict[str, str]]:
|
||||||
|
"""Extract the main question card."""
|
||||||
|
answer_parts: list[str] = []
|
||||||
|
|
||||||
main_answer = re.search(
|
main_answer = re.search(
|
||||||
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\n---\s*\n## |\Z)",
|
r"## 📚 Odpowiedź główna\s*\n(.+?)"
|
||||||
|
r"(?=\n## |\n---\s*\n## |\Z)",
|
||||||
content,
|
content,
|
||||||
re.DOTALL,
|
re.DOTALL,
|
||||||
)
|
)
|
||||||
if main_answer:
|
if main_answer:
|
||||||
answer_text = main_answer.group(1)
|
answer_text = main_answer.group(1)
|
||||||
# Extract key points, definitions, headers
|
|
||||||
headers = re.findall(r"### (.+)", answer_text)
|
headers = re.findall(r"### (.+)", answer_text)
|
||||||
for h in headers[:5]: # Limit to first 5 headers
|
answer_parts.extend(f"• {h}" for h in headers[:5])
|
||||||
answer_parts.append(f"• {h}")
|
|
||||||
|
|
||||||
# Also extract key definitions if present
|
definitions = re.findall(
|
||||||
definitions = re.findall(r"\*\*([^*]+)\*\*\s*[--:]\s*([^*\n]+)", content)
|
r"\*\*([^*]+)\*\*\s*[--:]\s*([^*\n]+)", content
|
||||||
|
)
|
||||||
for term, definition in definitions[:3]:
|
for term, definition in definitions[:3]:
|
||||||
if len(definition) > 20 and len(definition) < 200:
|
if (
|
||||||
answer_parts.append(f"• {term}: {definition.strip()}")
|
len(definition) > MIN_DEFINITION_LENGTH
|
||||||
|
and len(definition) < MAX_DEFINITION_LENGTH
|
||||||
|
):
|
||||||
|
answer_parts.append(
|
||||||
|
f"• {term}: {definition.strip()}"
|
||||||
|
)
|
||||||
|
|
||||||
# If we found answer parts, create main card
|
if not answer_parts:
|
||||||
if answer_parts:
|
return []
|
||||||
answer_html = "<br>".join(answer_parts[:8]) # Limit answer length
|
|
||||||
cards.append(
|
answer_html = "<br>".join(answer_parts[:8])
|
||||||
{
|
return [
|
||||||
"question": main_question,
|
{
|
||||||
"answer": answer_html,
|
"question": main_question,
|
||||||
"tags": f"egzamin_magisterski pytanie_{num} {subject} {topic}",
|
"answer": answer_html,
|
||||||
}
|
"tags": (
|
||||||
|
f"egzamin_magisterski pytanie_{num}"
|
||||||
|
f" {subject} {topic}"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_subsection_answer(body_clean: str) -> str | None:
|
||||||
|
"""Extract answer text from a subsection body."""
|
||||||
|
bullets = re.findall(
|
||||||
|
r"[-•]\s*\*\*(.+?)\*\*[:\s]*([^\n]+)?", body_clean
|
||||||
|
)
|
||||||
|
if bullets:
|
||||||
|
return "<br>".join(
|
||||||
|
f"• {b[0]}: {b[1].strip()}" if b[1] else f"• {b[0]}"
|
||||||
|
for b in bullets[:MIN_BULLET_COUNT]
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract sub-questions and key concepts as additional cards
|
paragraphs = [
|
||||||
# Look for ### headers with explanations
|
p.strip()
|
||||||
|
for p in body_clean.split("\n\n")
|
||||||
|
if p.strip()
|
||||||
|
and not p.startswith("```")
|
||||||
|
and not p.startswith("|")
|
||||||
|
]
|
||||||
|
if paragraphs:
|
||||||
|
first_para = paragraphs[0]
|
||||||
|
first_para = re.sub(r"\*\*(.+?)\*\*", r"\1", first_para)
|
||||||
|
first_para = re.sub(r"\*(.+?)\*", r"\1", first_para)
|
||||||
|
return first_para[:400]
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_sub_cards(
|
||||||
|
content: str,
|
||||||
|
title: str,
|
||||||
|
subject: str,
|
||||||
|
num: str,
|
||||||
|
topic: str,
|
||||||
|
) -> list[dict[str, str]]:
|
||||||
|
"""Extract sub-concept cards."""
|
||||||
|
cards: list[dict[str, str]] = []
|
||||||
subsections = re.findall(
|
subsections = re.findall(
|
||||||
r"### (\d+\.\s+)?(.+?)\n\n(.+?)(?=\n### |\n## |\n---|\Z)", content, re.DOTALL
|
r"### (\d+\.\s+)?(.+?)\n\n(.+?)"
|
||||||
|
r"(?=\n### |\n## |\n---|\Z)",
|
||||||
|
content,
|
||||||
|
re.DOTALL,
|
||||||
)
|
)
|
||||||
|
|
||||||
for _, header, body in subsections:
|
for _, header, body in subsections:
|
||||||
if len(header) < 5 or header.startswith("Przykład"):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Extract first substantive paragraph or key points
|
|
||||||
body_clean = body.strip()
|
|
||||||
|
|
||||||
# Skip very short or code-only sections
|
|
||||||
if len(body_clean) < 50:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Extract bullet points or first paragraph
|
|
||||||
bullets = re.findall(r"[-•]\s*\*\*(.+?)\*\*[:\s]*([^\n]+)?", body_clean)
|
|
||||||
if bullets:
|
|
||||||
answer_text = "<br>".join(
|
|
||||||
[
|
|
||||||
f"• {b[0]}: {b[1].strip()}" if b[1] else f"• {b[0]}"
|
|
||||||
for b in bullets[:5]
|
|
||||||
]
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# Get first meaningful paragraph
|
|
||||||
paragraphs = [
|
|
||||||
p.strip()
|
|
||||||
for p in body_clean.split("\n\n")
|
|
||||||
if p.strip() and not p.startswith("```") and not p.startswith("|")
|
|
||||||
]
|
|
||||||
if paragraphs:
|
|
||||||
first_para = paragraphs[0]
|
|
||||||
# Clean markdown
|
|
||||||
first_para = re.sub(r"\*\*(.+?)\*\*", r"\1", first_para)
|
|
||||||
first_para = re.sub(r"\*(.+?)\*", r"\1", first_para)
|
|
||||||
answer_text = first_para[:400]
|
|
||||||
else:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Create sub-concept card
|
|
||||||
sub_question = f"Co to jest {header}?" if not header.endswith("?") else header
|
|
||||||
if (
|
if (
|
||||||
"Charakterystyka" in header
|
len(header) < MIN_SUBSECTION_LENGTH
|
||||||
or "Definicja" in header
|
or header.startswith("Przykład")
|
||||||
or "Właściwości" in header
|
|
||||||
):
|
):
|
||||||
# These are answer-type headers, reframe
|
continue
|
||||||
parent_topic = title.replace("Pytanie", "").strip(": 0123456789")
|
|
||||||
sub_question = f"{header} - {parent_topic}"
|
body_clean = body.strip()
|
||||||
|
if len(body_clean) < MIN_BODY_LENGTH:
|
||||||
|
continue
|
||||||
|
|
||||||
|
answer_text = _extract_subsection_answer(body_clean)
|
||||||
|
if not answer_text:
|
||||||
|
continue
|
||||||
|
|
||||||
|
sub_question = (
|
||||||
|
f"Co to jest {header}?"
|
||||||
|
if not header.endswith("?")
|
||||||
|
else header
|
||||||
|
)
|
||||||
|
|
||||||
|
if any(
|
||||||
|
kw in header
|
||||||
|
for kw in ("Charakterystyka", "Definicja", "Właściwości")
|
||||||
|
):
|
||||||
|
parent = title.replace("Pytanie", "").strip(
|
||||||
|
": 0123456789"
|
||||||
|
)
|
||||||
|
sub_question = f"{header} - {parent}"
|
||||||
|
|
||||||
cards.append(
|
cards.append(
|
||||||
{
|
{
|
||||||
"question": sub_question,
|
"question": sub_question,
|
||||||
"answer": answer_text,
|
"answer": answer_text,
|
||||||
"tags": f"egzamin_magisterski pytanie_{num} {subject} {topic} szczegoly",
|
"tags": (
|
||||||
|
f"egzamin_magisterski pytanie_{num}"
|
||||||
|
f" {subject} {topic} szczegoly"
|
||||||
|
),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract key formulas/definitions as separate cards
|
return cards
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_formula_cards(
|
||||||
|
content: str,
|
||||||
|
subject: str,
|
||||||
|
num: str,
|
||||||
|
) -> list[dict[str, str]]:
|
||||||
|
"""Extract formula/definition cards."""
|
||||||
|
cards: list[dict[str, str]] = []
|
||||||
formulas = re.findall(
|
formulas = re.findall(
|
||||||
r"\*\*([A-Za-z\s]+(?:formuła|wzór|twierdzenie|definicja|lemat))\*\*[:\s]*\n?(.+?)(?=\n\n|\n\*\*|\Z)",
|
r"\*\*([A-Za-z\s]+"
|
||||||
|
r"(?:formuła|wzór|twierdzenie|definicja|lemat))"
|
||||||
|
r"\*\*[:\s]*\n?(.+?)(?=\n\n|\n\*\*|\Z)",
|
||||||
content,
|
content,
|
||||||
re.IGNORECASE | re.DOTALL,
|
re.IGNORECASE | re.DOTALL,
|
||||||
)
|
)
|
||||||
for formula_name, formula_content in formulas:
|
for formula_name, formula_content in formulas:
|
||||||
if len(formula_content) > 20:
|
if len(formula_content) > MIN_FORMULA_LENGTH:
|
||||||
cards.append(
|
cards.append(
|
||||||
{
|
{
|
||||||
"question": f"Podaj {formula_name.strip()}",
|
"question": f"Podaj {formula_name.strip()}",
|
||||||
"answer": formula_content.strip()[:300],
|
"answer": formula_content.strip()[:300],
|
||||||
"tags": f"egzamin_magisterski pytanie_{num} {subject} formuly",
|
"tags": (
|
||||||
|
f"egzamin_magisterski pytanie_{num}"
|
||||||
|
f" {subject} formuly"
|
||||||
|
),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
return cards
|
return cards
|
||||||
|
|
||||||
|
|
||||||
def clean_for_anki(text) -> str:
|
def extract_question_and_answer(
|
||||||
|
filepath: str,
|
||||||
|
) -> list[dict[str, str]]:
|
||||||
|
"""Extract main question and key answer points from a markdown file."""
|
||||||
|
num, topic, title, main_question, content = _get_metadata(
|
||||||
|
filepath
|
||||||
|
)
|
||||||
|
|
||||||
|
subject_match = re.search(r"Przedmiot:\s*(\w+)", content)
|
||||||
|
subject = (
|
||||||
|
subject_match.group(1) if subject_match else "Ogólne"
|
||||||
|
)
|
||||||
|
|
||||||
|
cards: list[dict[str, str]] = []
|
||||||
|
cards.extend(
|
||||||
|
_extract_main_card(
|
||||||
|
content, main_question, subject, num, topic
|
||||||
|
)
|
||||||
|
)
|
||||||
|
cards.extend(
|
||||||
|
_extract_sub_cards(
|
||||||
|
content, title, subject, num, topic
|
||||||
|
)
|
||||||
|
)
|
||||||
|
cards.extend(
|
||||||
|
_extract_formula_cards(content, subject, num)
|
||||||
|
)
|
||||||
|
|
||||||
|
return cards
|
||||||
|
|
||||||
|
|
||||||
|
def clean_for_anki(text: str) -> str:
|
||||||
"""Clean text for Anki import - escape special characters."""
|
"""Clean text for Anki import - escape special characters."""
|
||||||
# Replace tabs with spaces
|
# Replace tabs with spaces
|
||||||
text = text.replace("\t", " ")
|
text = text.replace("\t", " ")
|
||||||
@ -187,13 +279,13 @@ def main() -> None:
|
|||||||
|
|
||||||
# Process each file
|
# Process each file
|
||||||
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
||||||
print(f"Processing: {md_file.name}")
|
logger.info("Processing: %s", md_file.name)
|
||||||
try:
|
try:
|
||||||
cards = extract_question_and_answer(md_file)
|
cards = extract_question_and_answer(md_file)
|
||||||
all_cards.extend(cards)
|
all_cards.extend(cards)
|
||||||
print(f" -> Extracted {len(cards)} cards")
|
logger.info(" -> Extracted %d cards", len(cards))
|
||||||
except Exception as e:
|
except (ValueError, OSError) as e:
|
||||||
print(f" -> Error: {e}")
|
logger.info(" -> Error: %s", e)
|
||||||
|
|
||||||
# Write Anki file with headers
|
# Write Anki file with headers
|
||||||
with Path(output_file).open("w", encoding="utf-8") as f:
|
with Path(output_file).open("w", encoding="utf-8") as f:
|
||||||
@ -211,13 +303,13 @@ def main() -> None:
|
|||||||
tags = card["tags"]
|
tags = card["tags"]
|
||||||
f.write(f"{front}\t{back}\t{tags}\n")
|
f.write(f"{front}\t{back}\t{tags}\n")
|
||||||
|
|
||||||
print(f"\n✅ Created {len(all_cards)} flashcards")
|
logger.info("Created %d flashcards", len(all_cards))
|
||||||
print(f"📁 Output: {output_file}")
|
logger.info("Output: %s", output_file)
|
||||||
print("\nTo import into Anki:")
|
logger.info("To import into Anki:")
|
||||||
print("1. Open Anki → File → Import")
|
logger.info("1. Open Anki -> File -> Import")
|
||||||
print("2. Select the .txt file")
|
logger.info("2. Select the .txt file")
|
||||||
print("3. Verify 'Allow HTML' is checked")
|
logger.info("3. Verify 'Allow HTML' is checked")
|
||||||
print("4. Click Import")
|
logger.info("4. Click Import")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@ -6,11 +6,22 @@ Creates tab-separated file for Anki import with proper HTML formatting.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def clean_text(text) -> str:
|
MIN_HEADER_LENGTH = 3
|
||||||
|
MIN_MATCH_LENGTH = 10
|
||||||
|
MIN_BODY_LENGTH = 50
|
||||||
|
MIN_QA_LENGTH = 30
|
||||||
|
MAX_CONTENT_LENGTH = 300
|
||||||
|
MAX_ANSWER_LENGTH = 400
|
||||||
|
MAX_COMPARISON_ITEMS = 6
|
||||||
|
|
||||||
|
|
||||||
|
def clean_text(text: str) -> str:
|
||||||
"""Clean and format text for Anki."""
|
"""Clean and format text for Anki."""
|
||||||
if not text:
|
if not text:
|
||||||
return ""
|
return ""
|
||||||
@ -28,7 +39,7 @@ def clean_text(text) -> str:
|
|||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
def format_list(items, numbered=False) -> str:
|
def format_list(items: list[str], *, numbered: bool = False) -> str:
|
||||||
"""Format a list of items as HTML."""
|
"""Format a list of items as HTML."""
|
||||||
if not items:
|
if not items:
|
||||||
return ""
|
return ""
|
||||||
@ -43,119 +54,148 @@ def format_list(items, numbered=False) -> str:
|
|||||||
return html
|
return html
|
||||||
|
|
||||||
|
|
||||||
def extract_from_file(filepath) -> list[dict[str, str]]:
|
def _get_file_metadata(
|
||||||
"""Extract flashcard data from a markdown file."""
|
filepath: str,
|
||||||
|
) -> tuple[str, str, str]:
|
||||||
|
"""Extract metadata from file."""
|
||||||
with Path(filepath).open(encoding="utf-8") as f:
|
with Path(filepath).open(encoding="utf-8") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
|
|
||||||
cards = []
|
|
||||||
|
|
||||||
# Get file metadata
|
|
||||||
filename = Path(filepath).name
|
filename = Path(filepath).name
|
||||||
match = re.match(r"(\d+)-(.+)\.md", filename)
|
match = re.match(r"(\d+)-(.+)\.md", filename)
|
||||||
num = match.group(1) if match else "00"
|
num = match.group(1) if match else "00"
|
||||||
match.group(2).replace("-", "_") if match else "unknown"
|
|
||||||
|
|
||||||
# Extract subject
|
|
||||||
subj_match = re.search(r"Przedmiot:\s*(\w+)", content)
|
subj_match = re.search(r"Przedmiot:\s*(\w+)", content)
|
||||||
subject = subj_match.group(1) if subj_match else "Ogólne"
|
subject = subj_match.group(1) if subj_match else "Ogólne"
|
||||||
|
|
||||||
# Base tags
|
return num, subject, content
|
||||||
base_tags = f"egzamin_magisterski pyt{num} {subject}"
|
|
||||||
|
|
||||||
# =====================================================
|
|
||||||
# CARD TYPE 1: Main Exam Question
|
def _extract_main_question_card(
|
||||||
# =====================================================
|
content: str, base_tags: str,
|
||||||
|
) -> list[dict[str, str]]:
|
||||||
|
"""Extract the main exam question card."""
|
||||||
q_match = re.search(
|
q_match = re.search(
|
||||||
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
|
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*',
|
||||||
|
content,
|
||||||
|
re.DOTALL,
|
||||||
)
|
)
|
||||||
if q_match:
|
if not q_match:
|
||||||
main_q = re.sub(r"\s+", " ", q_match.group(1).strip())
|
return []
|
||||||
|
|
||||||
# Extract key topics from main answer
|
main_q = re.sub(r"\s+", " ", q_match.group(1).strip())
|
||||||
answer_match = re.search(
|
answer_match = re.search(
|
||||||
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## [<5B>🎯]|\n---\s*\n## |\Z)",
|
r"## 📚 Odpowiedź główna\s*\n(.+?)"
|
||||||
content,
|
r"(?=\n## [📚🎯]|\n---\s*\n## |\Z)",
|
||||||
re.DOTALL,
|
content,
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
if not answer_match:
|
||||||
|
return []
|
||||||
|
|
||||||
|
answer_section = answer_match.group(1)
|
||||||
|
headers = re.findall(
|
||||||
|
r"^### (?:\d+\.\s*)?(.+)$",
|
||||||
|
answer_section,
|
||||||
|
re.MULTILINE,
|
||||||
|
)
|
||||||
|
headers = [
|
||||||
|
h.strip()
|
||||||
|
for h in headers
|
||||||
|
if len(h.strip()) > MIN_HEADER_LENGTH
|
||||||
|
][:6]
|
||||||
|
|
||||||
|
if not headers:
|
||||||
|
return []
|
||||||
|
|
||||||
|
answer_html = (
|
||||||
|
"<b>Kluczowe zagadnienia:</b>" + format_list(headers)
|
||||||
|
)
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"front": clean_text(main_q),
|
||||||
|
"back": answer_html,
|
||||||
|
"tags": f"{base_tags} pytanie_glowne",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _make_question_text(header: str) -> str:
|
||||||
|
"""Generate a question from a section header."""
|
||||||
|
if "Definicja" in header or "Co to" in header:
|
||||||
|
return (
|
||||||
|
f"Co to jest:"
|
||||||
|
f" {header.replace('Definicja', '').strip()}?"
|
||||||
)
|
)
|
||||||
if answer_match:
|
if "Charakterystyka" in header:
|
||||||
answer_section = answer_match.group(1)
|
stripped = header.replace("Charakterystyka", "").strip()
|
||||||
# Get main headers
|
return f"Scharakteryzuj: {stripped}"
|
||||||
headers = re.findall(
|
if header.endswith("?"):
|
||||||
r"^### (?:\d+\.\s*)?(.+)$", answer_section, re.MULTILINE
|
return header
|
||||||
|
return f"Omów: {header}"
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_body_parts(body: str) -> list[str]:
|
||||||
|
"""Extract structured answer parts from a section body."""
|
||||||
|
answer_parts: list[str] = []
|
||||||
|
|
||||||
|
subheaders = re.findall(r"^#### (.+)$", body, re.MULTILINE)
|
||||||
|
if subheaders:
|
||||||
|
answer_parts.extend(subheaders[:4])
|
||||||
|
|
||||||
|
bullets = re.findall(
|
||||||
|
r"[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]+)?", body
|
||||||
|
)
|
||||||
|
for term, desc in bullets[:5]:
|
||||||
|
if desc:
|
||||||
|
answer_parts.append(
|
||||||
|
f"<b>{term}</b>: {desc.strip()}"
|
||||||
)
|
)
|
||||||
headers = [h.strip() for h in headers if len(h.strip()) > 3][:6]
|
else:
|
||||||
|
answer_parts.append(f"<b>{term}</b>")
|
||||||
|
|
||||||
if headers:
|
if not answer_parts:
|
||||||
answer_html = "<b>Kluczowe zagadnienia:</b>" + format_list(headers)
|
paras = [
|
||||||
cards.append(
|
p.strip()
|
||||||
{
|
for p in body.split("\n\n")
|
||||||
"front": clean_text(main_q),
|
if p.strip()
|
||||||
"back": answer_html,
|
and not p.strip().startswith("```")
|
||||||
"tags": f"{base_tags} pytanie_glowne",
|
and not p.strip().startswith("|")
|
||||||
}
|
]
|
||||||
)
|
if paras:
|
||||||
|
first = paras[0]
|
||||||
|
if len(first) > MAX_CONTENT_LENGTH:
|
||||||
|
first = first[:MAX_CONTENT_LENGTH] + "..."
|
||||||
|
answer_parts.append(first)
|
||||||
|
|
||||||
# =====================================================
|
return answer_parts
|
||||||
# CARD TYPE 2: Subsection Cards (detailed concepts)
|
|
||||||
# =====================================================
|
|
||||||
# Find all ### sections
|
def _extract_subsection_cards(
|
||||||
|
content: str, base_tags: str,
|
||||||
|
) -> list[dict[str, str]]:
|
||||||
|
"""Extract subsection detail cards."""
|
||||||
|
cards: list[dict[str, str]] = []
|
||||||
sections = re.findall(
|
sections = re.findall(
|
||||||
r"^### (?:\d+\.\s*)?(.+?)\n((?:(?!^###).)+)", content, re.MULTILINE | re.DOTALL
|
r"^### (?:\d+\.\s*)?(.+?)\n((?:(?!^###).)+)",
|
||||||
|
content,
|
||||||
|
re.MULTILINE | re.DOTALL,
|
||||||
)
|
)
|
||||||
|
|
||||||
for header, body in sections:
|
for raw_header, raw_body in sections:
|
||||||
header = header.strip()
|
header = raw_header.strip()
|
||||||
body = body.strip()
|
body = raw_body.strip()
|
||||||
|
|
||||||
# Skip very short sections or example sections
|
if (
|
||||||
if len(body) < 50 or header.lower().startswith("przykład"):
|
len(body) < MIN_BODY_LENGTH
|
||||||
|
or header.lower().startswith("przykład")
|
||||||
|
):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Extract key information from body
|
answer_parts = _extract_body_parts(body)
|
||||||
answer_parts = []
|
|
||||||
|
|
||||||
# Look for #### sub-headers
|
|
||||||
subheaders = re.findall(r"^#### (.+)$", body, re.MULTILINE)
|
|
||||||
if subheaders:
|
|
||||||
answer_parts.extend(subheaders[:4])
|
|
||||||
|
|
||||||
# Look for bullet points with bold terms
|
|
||||||
bullets = re.findall(r"[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]+)?", body)
|
|
||||||
for term, desc in bullets[:5]:
|
|
||||||
if desc:
|
|
||||||
answer_parts.append(f"<b>{term}</b>: {desc.strip()}")
|
|
||||||
else:
|
|
||||||
answer_parts.append(f"<b>{term}</b>")
|
|
||||||
|
|
||||||
# If no structured content, get first paragraph
|
|
||||||
if not answer_parts:
|
|
||||||
paras = [
|
|
||||||
p.strip()
|
|
||||||
for p in body.split("\n\n")
|
|
||||||
if p.strip()
|
|
||||||
and not p.strip().startswith("```")
|
|
||||||
and not p.strip().startswith("|")
|
|
||||||
]
|
|
||||||
if paras:
|
|
||||||
first = paras[0]
|
|
||||||
# Limit length
|
|
||||||
if len(first) > 300:
|
|
||||||
first = first[:300] + "..."
|
|
||||||
answer_parts.append(first)
|
|
||||||
|
|
||||||
if answer_parts:
|
if answer_parts:
|
||||||
# Determine card type
|
question = _make_question_text(header)
|
||||||
if "Definicja" in header or "Co to" in header:
|
|
||||||
q = f"Co to jest: {header.replace('Definicja', '').strip()}?"
|
|
||||||
elif "Charakterystyka" in header:
|
|
||||||
q = f"Scharakteryzuj: {header.replace('Charakterystyka', '').strip()}"
|
|
||||||
elif header.endswith("?"):
|
|
||||||
q = header
|
|
||||||
else:
|
|
||||||
q = f"Omów: {header}"
|
|
||||||
|
|
||||||
# Format answer
|
|
||||||
if len(answer_parts) > 1:
|
if len(answer_parts) > 1:
|
||||||
answer_html = format_list(answer_parts)
|
answer_html = format_list(answer_parts)
|
||||||
else:
|
else:
|
||||||
@ -163,15 +203,20 @@ def extract_from_file(filepath) -> list[dict[str, str]]:
|
|||||||
|
|
||||||
cards.append(
|
cards.append(
|
||||||
{
|
{
|
||||||
"front": clean_text(q),
|
"front": clean_text(question),
|
||||||
"back": answer_html,
|
"back": answer_html,
|
||||||
"tags": f"{base_tags} szczegoly",
|
"tags": f"{base_tags} szczegoly",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# =====================================================
|
return cards
|
||||||
# CARD TYPE 3: Algorithms/Formulas
|
|
||||||
# =====================================================
|
|
||||||
|
def _extract_algo_cards(
|
||||||
|
content: str, base_tags: str,
|
||||||
|
) -> list[dict[str, str]]:
|
||||||
|
"""Extract algorithm/formula cards."""
|
||||||
|
cards: list[dict[str, str]] = []
|
||||||
algo_patterns = [
|
algo_patterns = [
|
||||||
r"#### Złożoność(?:\s+czasowa)?\s*\n(.+?)(?=\n####|\n###|\Z)",
|
r"#### Złożoność(?:\s+czasowa)?\s*\n(.+?)(?=\n####|\n###|\Z)",
|
||||||
r"Złożoność:\s*\*\*([^*]+)\*\*",
|
r"Złożoność:\s*\*\*([^*]+)\*\*",
|
||||||
@ -179,85 +224,137 @@ def extract_from_file(filepath) -> list[dict[str, str]]:
|
|||||||
|
|
||||||
for pattern in algo_patterns:
|
for pattern in algo_patterns:
|
||||||
matches = re.findall(pattern, content, re.DOTALL)
|
matches = re.findall(pattern, content, re.DOTALL)
|
||||||
for match in matches[:2]:
|
for algo_match in matches[:2]:
|
||||||
if len(match) > 10:
|
if len(algo_match) > MIN_MATCH_LENGTH:
|
||||||
# Find context - which algorithm?
|
|
||||||
algo_context = re.search(
|
algo_context = re.search(
|
||||||
r"### (\d+\.\s*)?(.+?)(?=\n)", content[: content.find(match)]
|
r"### (\d+\.\s*)?(.+?)(?=\n)",
|
||||||
|
content[: content.find(algo_match)],
|
||||||
)
|
)
|
||||||
if algo_context:
|
if algo_context:
|
||||||
algo_name = algo_context.group(2).strip()
|
algo_name = algo_context.group(2).strip()
|
||||||
cards.append(
|
cards.append(
|
||||||
{
|
{
|
||||||
"front": f"Jaka jest złożoność algorytmu/metody: {algo_name}?",
|
"front": (
|
||||||
"back": clean_text(match.strip()[:200]),
|
"Jaka jest złożoność"
|
||||||
|
f" algorytmu/metody: {algo_name}?"
|
||||||
|
),
|
||||||
|
"back": clean_text(
|
||||||
|
algo_match.strip()[:200]
|
||||||
|
),
|
||||||
"tags": f"{base_tags} zlozonosc",
|
"tags": f"{base_tags} zlozonosc",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
|
|
||||||
# =====================================================
|
return cards
|
||||||
# CARD TYPE 4: Comparisons (when file contains comparisons)
|
|
||||||
# =====================================================
|
|
||||||
|
def _extract_comparison_cards(
|
||||||
|
content: str, base_tags: str, num: str,
|
||||||
|
) -> list[dict[str, str]]:
|
||||||
|
"""Extract comparison cards."""
|
||||||
compare_match = re.search(
|
compare_match = re.search(
|
||||||
r"## .*(Porównanie|Zestawienie|vs).*\n(.+?)(?=\n## |\Z)",
|
r"## .*(Porównanie|Zestawienie|vs).*\n(.+?)(?=\n## |\Z)",
|
||||||
content,
|
content,
|
||||||
re.DOTALL | re.IGNORECASE,
|
re.DOTALL | re.IGNORECASE,
|
||||||
)
|
)
|
||||||
if compare_match:
|
if not compare_match:
|
||||||
compare_section = compare_match.group(2)
|
return []
|
||||||
# Extract comparison items
|
|
||||||
items = re.findall(r"\|\s*\*\*([^|*]+)\*\*\s*\|([^|]+)\|", compare_section)
|
|
||||||
if items:
|
|
||||||
comparison_html = "<table><tr><th>Aspekt</th><th>Wartość</th></tr>"
|
|
||||||
for aspect, value in items[:6]:
|
|
||||||
comparison_html += f"<tr><td>{clean_text(aspect)}</td><td>{clean_text(value)}</td></tr>"
|
|
||||||
comparison_html += "</table>"
|
|
||||||
|
|
||||||
# Get comparison title
|
compare_section = compare_match.group(2)
|
||||||
title_match = re.search(
|
items = re.findall(
|
||||||
r"## .*(Porównanie|Zestawienie).*?(\w+.*?(?:vs|i|oraz).*?\w+)",
|
r"\|\s*\*\*([^|*]+)\*\*\s*\|([^|]+)\|",
|
||||||
compare_match.group(0),
|
compare_section,
|
||||||
re.IGNORECASE,
|
)
|
||||||
)
|
if not items:
|
||||||
if title_match:
|
return []
|
||||||
cards.append(
|
|
||||||
{
|
|
||||||
"front": f"Porównaj kluczowe różnice w temacie: pytanie {num}",
|
|
||||||
"back": comparison_html,
|
|
||||||
"tags": f"{base_tags} porownanie",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
# =====================================================
|
comparison_html = (
|
||||||
# CARD TYPE 5: Q&A from practice questions section
|
"<table><tr><th>Aspekt</th><th>Wartość</th></tr>"
|
||||||
# =====================================================
|
)
|
||||||
qa_section = re.search(r"## 🎓 Pytania.*?\n(.+?)(?=\n## |\Z)", content, re.DOTALL)
|
for aspect, value in items[:MAX_COMPARISON_ITEMS]:
|
||||||
if qa_section:
|
comparison_html += (
|
||||||
qa_content = qa_section.group(1)
|
f"<tr><td>{clean_text(aspect)}</td>"
|
||||||
# Find Q&A pairs
|
f"<td>{clean_text(value)}</td></tr>"
|
||||||
qas = re.findall(
|
|
||||||
r'### Q\d+:?\s*["\']?(.+?)["\']?\s*\n.*?Odpowiedź:\s*\n?(.+?)(?=\n### |\Z)',
|
|
||||||
qa_content,
|
|
||||||
re.DOTALL,
|
|
||||||
)
|
)
|
||||||
for q, a in qas[:3]:
|
comparison_html += "</table>"
|
||||||
q = re.sub(r"\s+", " ", q.strip())
|
|
||||||
a = a.strip()
|
|
||||||
if len(a) > 30:
|
|
||||||
# Limit answer length
|
|
||||||
a_lines = a.split("\n")
|
|
||||||
a_short = "\n".join(a_lines[:5])
|
|
||||||
if len(a_short) > 400:
|
|
||||||
a_short = a_short[:400] + "..."
|
|
||||||
|
|
||||||
cards.append(
|
title_match = re.search(
|
||||||
{
|
r"## .*(Porównanie|Zestawienie)"
|
||||||
"front": clean_text(q),
|
r".*?(\w+.*?(?:vs|i|oraz).*?\w+)",
|
||||||
"back": clean_text(a_short).replace("\n", "<br>"),
|
compare_match.group(0),
|
||||||
"tags": f"{base_tags} egzamin_praktyka",
|
re.IGNORECASE,
|
||||||
}
|
)
|
||||||
)
|
if not title_match:
|
||||||
|
return []
|
||||||
|
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
"front": (
|
||||||
|
"Porównaj kluczowe różnice"
|
||||||
|
f" w temacie: pytanie {num}"
|
||||||
|
),
|
||||||
|
"back": comparison_html,
|
||||||
|
"tags": f"{base_tags} porownanie",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_qa_cards(
|
||||||
|
content: str, base_tags: str,
|
||||||
|
) -> list[dict[str, str]]:
|
||||||
|
"""Extract Q&A practice cards."""
|
||||||
|
cards: list[dict[str, str]] = []
|
||||||
|
qa_section = re.search(
|
||||||
|
r"## 🎓 Pytania.*?\n(.+?)(?=\n## |\Z)",
|
||||||
|
content,
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
if not qa_section:
|
||||||
|
return cards
|
||||||
|
|
||||||
|
qa_content = qa_section.group(1)
|
||||||
|
qas = re.findall(
|
||||||
|
r"### Q\d+:?\s*[\"']?(.+?)[\"']?\s*\n"
|
||||||
|
r".*?Odpowiedź:\s*\n?(.+?)(?=\n### |\Z)",
|
||||||
|
qa_content,
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
for raw_q, raw_a in qas[:3]:
|
||||||
|
question = re.sub(r"\s+", " ", raw_q.strip())
|
||||||
|
answer = raw_a.strip()
|
||||||
|
if len(answer) > MIN_QA_LENGTH:
|
||||||
|
a_lines = answer.split("\n")
|
||||||
|
a_short = "\n".join(a_lines[:5])
|
||||||
|
if len(a_short) > MAX_ANSWER_LENGTH:
|
||||||
|
a_short = a_short[:MAX_ANSWER_LENGTH] + "..."
|
||||||
|
|
||||||
|
cards.append(
|
||||||
|
{
|
||||||
|
"front": clean_text(question),
|
||||||
|
"back": clean_text(a_short).replace(
|
||||||
|
"\n", "<br>"
|
||||||
|
),
|
||||||
|
"tags": f"{base_tags} egzamin_praktyka",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return cards
|
||||||
|
|
||||||
|
|
||||||
|
def extract_from_file(filepath: str) -> list[dict[str, str]]:
|
||||||
|
"""Extract flashcard data from a markdown file."""
|
||||||
|
num, subject, content = _get_file_metadata(filepath)
|
||||||
|
base_tags = f"egzamin_magisterski pyt{num} {subject}"
|
||||||
|
|
||||||
|
cards: list[dict[str, str]] = []
|
||||||
|
cards.extend(_extract_main_question_card(content, base_tags))
|
||||||
|
cards.extend(_extract_subsection_cards(content, base_tags))
|
||||||
|
cards.extend(_extract_algo_cards(content, base_tags))
|
||||||
|
cards.extend(
|
||||||
|
_extract_comparison_cards(content, base_tags, num)
|
||||||
|
)
|
||||||
|
cards.extend(_extract_qa_cards(content, base_tags))
|
||||||
|
|
||||||
return cards
|
return cards
|
||||||
|
|
||||||
@ -272,13 +369,13 @@ def main() -> None:
|
|||||||
all_cards = []
|
all_cards = []
|
||||||
|
|
||||||
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
||||||
print(f"Processing: {md_file.name}", end=" ")
|
logger.info("Processing: %s", md_file.name)
|
||||||
try:
|
try:
|
||||||
cards = extract_from_file(md_file)
|
cards = extract_from_file(md_file)
|
||||||
all_cards.extend(cards)
|
all_cards.extend(cards)
|
||||||
print(f"→ {len(cards)} cards")
|
logger.info(" -> %d cards", len(cards))
|
||||||
except Exception as e:
|
except (ValueError, OSError) as e:
|
||||||
print(f"→ ERROR: {e}")
|
logger.info(" -> ERROR: %s", e)
|
||||||
|
|
||||||
# Remove potential duplicates (same front)
|
# Remove potential duplicates (same front)
|
||||||
seen = set()
|
seen = set()
|
||||||
@ -306,23 +403,25 @@ def main() -> None:
|
|||||||
|
|
||||||
f.write(f"{front}\t{back}\t{tags}\n")
|
f.write(f"{front}\t{back}\t{tags}\n")
|
||||||
|
|
||||||
print(f"\n{'=' * 50}")
|
logger.info("=" * 50)
|
||||||
print(f"✅ Generated {len(unique_cards)} unique flashcards")
|
logger.info(
|
||||||
print(f"📁 Saved to: {output_file}")
|
"Generated %d unique flashcards", len(unique_cards)
|
||||||
print(f"{'=' * 50}")
|
)
|
||||||
print("\n📋 IMPORT INSTRUCTIONS:")
|
logger.info("Saved to: %s", output_file)
|
||||||
print("─" * 40)
|
logger.info("=" * 50)
|
||||||
print("Anki Desktop:")
|
logger.info("IMPORT INSTRUCTIONS:")
|
||||||
print(" 1. File → Import")
|
logger.info("-" * 40)
|
||||||
print(" 2. Select: anki_egzamin_magisterski.txt")
|
logger.info("Anki Desktop:")
|
||||||
print(" 3. Verify: Fields separated by Tab")
|
logger.info(" 1. File -> Import")
|
||||||
print(" 4. Check: Allow HTML in fields")
|
logger.info(" 2. Select: anki_egzamin_magisterski.txt")
|
||||||
print(" 5. Click Import")
|
logger.info(" 3. Verify: Fields separated by Tab")
|
||||||
print()
|
logger.info(" 4. Check: Allow HTML in fields")
|
||||||
print("AnkiWeb / AnkiDroid:")
|
logger.info(" 5. Click Import")
|
||||||
print(" 1. First import on Anki Desktop")
|
logger.info("")
|
||||||
print(" 2. Click Sync to upload to AnkiWeb")
|
logger.info("AnkiWeb / AnkiDroid:")
|
||||||
print(" 3. Sync on mobile to download")
|
logger.info(" 1. First import on Anki Desktop")
|
||||||
|
logger.info(" 2. Click Sync to upload to AnkiWeb")
|
||||||
|
logger.info(" 3. Sync on mobile to download")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@ -6,12 +6,16 @@ Creates a tab-separated file compatible with Anki import.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
import traceback
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
MIN_HEADER_WORDS = 3
|
||||||
|
|
||||||
|
|
||||||
def extract_main_question(content, filename) -> str:
|
def extract_main_question(content: str, filename: str) -> str:
|
||||||
"""Extract the main exam question from the file."""
|
"""Extract the main exam question from the file."""
|
||||||
# Extract the main question from ## Pytanie section
|
# Extract the main question from ## Pytanie section
|
||||||
question_match = re.search(
|
question_match = re.search(
|
||||||
@ -26,13 +30,13 @@ def extract_main_question(content, filename) -> str:
|
|||||||
return title_match.group(1) if title_match else filename
|
return title_match.group(1) if title_match else filename
|
||||||
|
|
||||||
|
|
||||||
def extract_subject(content) -> str:
|
def extract_subject(content: str) -> str:
|
||||||
"""Extract the subject code."""
|
"""Extract the subject code."""
|
||||||
subject_match = re.search(r"Przedmiot:\s*(\w+)", content)
|
subject_match = re.search(r"Przedmiot:\s*(\w+)", content)
|
||||||
return subject_match.group(1) if subject_match else "Ogólne"
|
return subject_match.group(1) if subject_match else "Ogólne"
|
||||||
|
|
||||||
|
|
||||||
def extract_key_points(content) -> list[str]:
|
def extract_key_points(content: str) -> list[str]:
|
||||||
"""Extract key points from the main answer section."""
|
"""Extract key points from the main answer section."""
|
||||||
points = []
|
points = []
|
||||||
|
|
||||||
@ -51,14 +55,14 @@ def extract_key_points(content) -> list[str]:
|
|||||||
headers = re.findall(r"^### (.+)$", answer_text, re.MULTILINE)
|
headers = re.findall(r"^### (.+)$", answer_text, re.MULTILINE)
|
||||||
for h in headers[:6]:
|
for h in headers[:6]:
|
||||||
# Clean header
|
# Clean header
|
||||||
h = re.sub(r"\d+\.\s*", "", h).strip()
|
cleaned = re.sub(r"\d+\.\s*", "", h).strip()
|
||||||
if h and len(h) > 3:
|
if cleaned and len(cleaned) > MIN_HEADER_WORDS:
|
||||||
points.append(h)
|
points.append(cleaned)
|
||||||
|
|
||||||
return points
|
return points
|
||||||
|
|
||||||
|
|
||||||
def extract_definitions(content) -> list[tuple[str, str]]:
|
def extract_definitions(content: str) -> list[tuple[str, str]]:
|
||||||
"""Extract key definitions from the content."""
|
"""Extract key definitions from the content."""
|
||||||
definitions = []
|
definitions = []
|
||||||
|
|
||||||
@ -66,9 +70,9 @@ def extract_definitions(content) -> list[tuple[str, str]]:
|
|||||||
pattern = r"\*\*([^*\n]+)\*\*\s*[--:]\s*([^*\n]{20,150})"
|
pattern = r"\*\*([^*\n]+)\*\*\s*[--:]\s*([^*\n]{20,150})"
|
||||||
matches = re.findall(pattern, content)
|
matches = re.findall(pattern, content)
|
||||||
|
|
||||||
for term, definition in matches:
|
for raw_term, raw_def in matches:
|
||||||
term = term.strip()
|
term = raw_term.strip()
|
||||||
definition = definition.strip()
|
definition = raw_def.strip()
|
||||||
# Filter out non-definition patterns
|
# Filter out non-definition patterns
|
||||||
if (
|
if (
|
||||||
term
|
term
|
||||||
@ -81,7 +85,7 @@ def extract_definitions(content) -> list[tuple[str, str]]:
|
|||||||
return definitions[:5]
|
return definitions[:5]
|
||||||
|
|
||||||
|
|
||||||
def clean_html(text) -> str:
|
def clean_html(text: str) -> str:
|
||||||
"""Convert markdown to HTML and clean for Anki."""
|
"""Convert markdown to HTML and clean for Anki."""
|
||||||
if not text:
|
if not text:
|
||||||
return ""
|
return ""
|
||||||
@ -101,7 +105,7 @@ def clean_html(text) -> str:
|
|||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
def process_file(filepath) -> list[dict[str, str]]:
|
def process_file(filepath: str) -> list[dict[str, str]]:
|
||||||
"""Process a single file and return flashcards."""
|
"""Process a single file and return flashcards."""
|
||||||
with Path(filepath).open(encoding="utf-8") as f:
|
with Path(filepath).open(encoding="utf-8") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
@ -111,11 +115,7 @@ def process_file(filepath) -> list[dict[str, str]]:
|
|||||||
# Extract metadata
|
# Extract metadata
|
||||||
filename = Path(filepath).name
|
filename = Path(filepath).name
|
||||||
match = re.match(r"(\d+)-(.+)\.md", filename)
|
match = re.match(r"(\d+)-(.+)\.md", filename)
|
||||||
if match:
|
num = match.group(1) if match else "00"
|
||||||
num = match.group(1)
|
|
||||||
match.group(2).replace("-", "_")
|
|
||||||
else:
|
|
||||||
num = "00"
|
|
||||||
|
|
||||||
subject = extract_subject(content)
|
subject = extract_subject(content)
|
||||||
main_question = extract_main_question(content, filename)
|
main_question = extract_main_question(content, filename)
|
||||||
@ -156,14 +156,13 @@ def main() -> None:
|
|||||||
|
|
||||||
# Process each file
|
# Process each file
|
||||||
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
||||||
print(f"Processing: {md_file.name}")
|
logger.info("Processing: %s", md_file.name)
|
||||||
try:
|
try:
|
||||||
cards = process_file(md_file)
|
cards = process_file(md_file)
|
||||||
all_cards.extend(cards)
|
all_cards.extend(cards)
|
||||||
print(f" -> {len(cards)} cards")
|
logger.info(" -> %d cards", len(cards))
|
||||||
except Exception as e:
|
except (ValueError, OSError):
|
||||||
print(f" -> Error: {e}")
|
logger.exception(" -> Error processing file")
|
||||||
traceback.print_exc()
|
|
||||||
|
|
||||||
# Write Anki-compatible file
|
# Write Anki-compatible file
|
||||||
with Path(output_file).open("w", encoding="utf-8") as f:
|
with Path(output_file).open("w", encoding="utf-8") as f:
|
||||||
@ -186,16 +185,22 @@ def main() -> None:
|
|||||||
|
|
||||||
f.write(f"{front}\t{back}\t{tags}\n")
|
f.write(f"{front}\t{back}\t{tags}\n")
|
||||||
|
|
||||||
print(f"\n✅ Created {len(all_cards)} flashcards")
|
logger.info("Created %d flashcards", len(all_cards))
|
||||||
print(f"📁 Output: {output_file}")
|
logger.info("Output: %s", output_file)
|
||||||
print("\n=== Import Instructions ===")
|
logger.info("=== Import Instructions ===")
|
||||||
print("1. Open Anki desktop → File → Import")
|
logger.info("1. Open Anki desktop -> File -> Import")
|
||||||
print("2. Select: anki_egzamin_magisterski.txt")
|
logger.info("2. Select: anki_egzamin_magisterski.txt")
|
||||||
print("3. Set 'Fields separated by: Tab'")
|
logger.info("3. Set 'Fields separated by: Tab'")
|
||||||
print("4. Check 'Allow HTML in fields'")
|
logger.info("4. Check 'Allow HTML in fields'")
|
||||||
print("5. Map: Field 1 → Front, Field 2 → Back, Field 3 → Tags")
|
logger.info(
|
||||||
print("6. Click Import")
|
"5. Map: Field 1 -> Front, Field 2 -> Back,"
|
||||||
print("\nFor AnkiWeb/AnkiDroid: Sync after importing on desktop")
|
" Field 3 -> Tags"
|
||||||
|
)
|
||||||
|
logger.info("6. Click Import")
|
||||||
|
logger.info(
|
||||||
|
"For AnkiWeb/AnkiDroid:"
|
||||||
|
" Sync after importing on desktop"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@ -3,11 +3,18 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def clean_text(text) -> str:
|
MIN_PARA_LENGTH = 20
|
||||||
|
MAX_PARA_LENGTH = 400
|
||||||
|
MIN_BODY_LENGTH = 80
|
||||||
|
|
||||||
|
|
||||||
|
def clean_text(text: str) -> str:
|
||||||
"""Clean text for Anki."""
|
"""Clean text for Anki."""
|
||||||
if not text:
|
if not text:
|
||||||
return ""
|
return ""
|
||||||
@ -19,7 +26,7 @@ def clean_text(text) -> str:
|
|||||||
return text.strip()
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
def extract_real_answer(content, section_name) -> str | None:
|
def extract_real_answer(content: str, section_name: str) -> str | None:
|
||||||
"""Extract actual content from a section, not just headers."""
|
"""Extract actual content from a section, not just headers."""
|
||||||
# Find the section
|
# Find the section
|
||||||
pattern = rf"### (?:\d+\.\s*)?{re.escape(section_name)}\s*\n((?:(?!^### ).)+)"
|
pattern = rf"### (?:\d+\.\s*)?{re.escape(section_name)}\s*\n((?:(?!^### ).)+)"
|
||||||
@ -52,19 +59,21 @@ def extract_real_answer(content, section_name) -> str | None:
|
|||||||
for p in body.split("\n\n")
|
for p in body.split("\n\n")
|
||||||
if p.strip() and not p.startswith("```") and not p.startswith("|")
|
if p.strip() and not p.startswith("```") and not p.startswith("|")
|
||||||
]
|
]
|
||||||
for p in paras[:2]:
|
lines.extend(
|
||||||
if len(p) > 20 and len(p) < 400:
|
p for p in paras[:2]
|
||||||
lines.append(p)
|
if len(p) > MIN_PARA_LENGTH and len(p) < MAX_PARA_LENGTH
|
||||||
|
)
|
||||||
|
|
||||||
return "<br>".join(lines[:6]) if lines else None
|
return "<br>".join(lines[:6]) if lines else None
|
||||||
|
|
||||||
|
|
||||||
def extract_cards(filepath) -> list[dict[str, str]]:
|
def _read_file_metadata(
|
||||||
"""Extract flashcards from a file."""
|
filepath: str | Path,
|
||||||
|
) -> tuple[str, str, str | None]:
|
||||||
|
"""Read file and extract metadata."""
|
||||||
with Path(filepath).open(encoding="utf-8") as f:
|
with Path(filepath).open(encoding="utf-8") as f:
|
||||||
content = f.read()
|
content = f.read()
|
||||||
|
|
||||||
cards = []
|
|
||||||
filename = Path(filepath).name
|
filename = Path(filepath).name
|
||||||
match = re.match(r"(\d+)-(.+)\.md", filename)
|
match = re.match(r"(\d+)-(.+)\.md", filename)
|
||||||
num = match.group(1) if match else "00"
|
num = match.group(1) if match else "00"
|
||||||
@ -73,182 +82,228 @@ def extract_cards(filepath) -> list[dict[str, str]]:
|
|||||||
subject = subj_match.group(1) if subj_match else "Ogólne"
|
subject = subj_match.group(1) if subj_match else "Ogólne"
|
||||||
base_tags = f"egzamin_magisterski pyt{num} {subject}"
|
base_tags = f"egzamin_magisterski pyt{num} {subject}"
|
||||||
|
|
||||||
# Get main question
|
|
||||||
q_match = re.search(
|
q_match = re.search(
|
||||||
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
|
r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*',
|
||||||
|
content,
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
main_question = (
|
||||||
|
re.sub(r"\s+", " ", q_match.group(1).strip()) if q_match else None
|
||||||
)
|
)
|
||||||
main_question = re.sub(r"\s+", " ", q_match.group(1).strip()) if q_match else None
|
|
||||||
|
|
||||||
# ===============================================
|
return content, base_tags, main_question
|
||||||
# MAIN CARD: Question with REAL answer summary
|
|
||||||
# ===============================================
|
|
||||||
if main_question:
|
|
||||||
# Build a real answer from the main sections
|
|
||||||
answer_parts = []
|
|
||||||
|
|
||||||
# For automata question - extract key facts about each automaton
|
|
||||||
if "automat" in main_question.lower() or "maszyn" in main_question.lower():
|
def _extract_automata_facts(content: str) -> list[str]:
|
||||||
# FA
|
"""Extract automata-specific facts."""
|
||||||
fa_match = re.search(
|
parts: list[str] = []
|
||||||
r"Automat Skończony.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*",
|
automata = [
|
||||||
content,
|
("Automat Skończony", "FA"),
|
||||||
re.DOTALL,
|
("Automat ze Stosem", "PDA"),
|
||||||
|
("Maszyna Turinga", "TM"),
|
||||||
|
]
|
||||||
|
for name, abbrev in automata:
|
||||||
|
pattern = (
|
||||||
|
rf"{name}.*?Rozpoznawana klasa języków"
|
||||||
|
r"\s*\n\s*\*\*([^*]+)\*\*"
|
||||||
|
)
|
||||||
|
match = re.search(pattern, content, re.DOTALL)
|
||||||
|
if match:
|
||||||
|
parts.append(
|
||||||
|
f"<b>{name} ({abbrev})</b>: "
|
||||||
|
f"{match.group(1).strip()}"
|
||||||
)
|
)
|
||||||
if fa_match:
|
return parts
|
||||||
answer_parts.append(
|
|
||||||
f"<b>Automat Skończony (FA)</b>: {fa_match.group(1).strip()}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# PDA
|
|
||||||
pda_match = re.search(
|
|
||||||
r"Automat ze Stosem.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*",
|
|
||||||
content,
|
|
||||||
re.DOTALL,
|
|
||||||
)
|
|
||||||
if pda_match:
|
|
||||||
answer_parts.append(
|
|
||||||
f"<b>Automat ze Stosem (PDA)</b>: {pda_match.group(1).strip()}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# TM
|
def _extract_generic_facts(content: str) -> list[str]:
|
||||||
tm_match = re.search(
|
"""Extract generic definitions and summaries."""
|
||||||
r"Maszyna Turinga.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*",
|
parts: list[str] = []
|
||||||
content,
|
key_patterns = [
|
||||||
re.DOTALL,
|
r"#### Definicja\s*\n([^\n#]+)",
|
||||||
)
|
r"#### Charakterystyka\s*\n([^\n#]+)",
|
||||||
if tm_match:
|
r"\*\*Definicja[:\s]*\*\*\s*([^\n]+)",
|
||||||
answer_parts.append(
|
]
|
||||||
f"<b>Maszyna Turinga (TM)</b>: {tm_match.group(1).strip()}"
|
for pattern in key_patterns:
|
||||||
)
|
parts.extend(
|
||||||
|
found.strip()
|
||||||
|
for found in re.findall(pattern, content)[:3]
|
||||||
|
if len(found) > MIN_PARA_LENGTH
|
||||||
|
)
|
||||||
|
return parts
|
||||||
|
|
||||||
# Generic extraction if specific didn't work
|
|
||||||
if not answer_parts:
|
|
||||||
# Look for key definitions/summaries
|
|
||||||
key_patterns = [
|
|
||||||
r"#### Definicja\s*\n([^\n#]+)",
|
|
||||||
r"#### Charakterystyka\s*\n([^\n#]+)",
|
|
||||||
r"\*\*Definicja[:\s]*\*\*\s*([^\n]+)",
|
|
||||||
]
|
|
||||||
for pattern in key_patterns:
|
|
||||||
for match in re.findall(pattern, content)[:3]:
|
|
||||||
if len(match) > 20:
|
|
||||||
answer_parts.append(match.strip())
|
|
||||||
|
|
||||||
# Still nothing? Get first substantive paragraph from main answer
|
def _extract_first_paragraphs(content: str) -> list[str]:
|
||||||
if not answer_parts:
|
"""Extract first substantive paragraphs from main answer."""
|
||||||
main_answer = re.search(
|
main_answer = re.search(
|
||||||
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\Z)", content, re.DOTALL
|
r"## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\Z)",
|
||||||
)
|
content,
|
||||||
if main_answer:
|
re.DOTALL,
|
||||||
# Skip headers, get actual content
|
)
|
||||||
text = main_answer.group(1)
|
if not main_answer:
|
||||||
paras = re.findall(r"\n\n([^#\n][^\n]{50,300})", text)
|
return []
|
||||||
answer_parts = paras[:3]
|
text = main_answer.group(1)
|
||||||
|
paras = re.findall(r"\n\n([^#\n][^\n]{50,300})", text)
|
||||||
|
return paras[:3]
|
||||||
|
|
||||||
if answer_parts:
|
|
||||||
answer = "<br><br>".join([clean_text(p) for p in answer_parts])
|
def _build_main_card(
|
||||||
cards.append(
|
content: str,
|
||||||
{
|
main_question: str | None,
|
||||||
"front": clean_text(main_question),
|
base_tags: str,
|
||||||
"back": answer,
|
) -> dict[str, str] | None:
|
||||||
"tags": f"{base_tags} pytanie_glowne",
|
"""Build the main question card."""
|
||||||
}
|
if not main_question:
|
||||||
|
return None
|
||||||
|
|
||||||
|
answer_parts: list[str] = []
|
||||||
|
if (
|
||||||
|
"automat" in main_question.lower()
|
||||||
|
or "maszyn" in main_question.lower()
|
||||||
|
):
|
||||||
|
answer_parts = _extract_automata_facts(content)
|
||||||
|
|
||||||
|
if not answer_parts:
|
||||||
|
answer_parts = _extract_generic_facts(content)
|
||||||
|
|
||||||
|
if not answer_parts:
|
||||||
|
answer_parts = _extract_first_paragraphs(content)
|
||||||
|
|
||||||
|
if not answer_parts:
|
||||||
|
return None
|
||||||
|
|
||||||
|
answer = "<br><br>".join(
|
||||||
|
clean_text(p) for p in answer_parts
|
||||||
|
)
|
||||||
|
return {
|
||||||
|
"front": clean_text(main_question),
|
||||||
|
"back": answer,
|
||||||
|
"tags": f"{base_tags} pytanie_glowne",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_section_content(body: str) -> list[str]:
|
||||||
|
"""Extract content lines from a section body."""
|
||||||
|
answer_lines: list[str] = []
|
||||||
|
|
||||||
|
def_match = re.search(
|
||||||
|
r"#### Definicja[^\n]*\n([^\n#]+(?:\n[^\n#]+)?)", body,
|
||||||
|
)
|
||||||
|
if def_match:
|
||||||
|
answer_lines.append(def_match.group(1).strip())
|
||||||
|
|
||||||
|
char_match = re.search(
|
||||||
|
r"#### Charakterystyka\s*\n((?:[-•][^\n]+\n?)+)", body,
|
||||||
|
)
|
||||||
|
if char_match:
|
||||||
|
bullets = re.findall(
|
||||||
|
r"[-•]\s*\*\*([^*]+)\*\*[:\s]*([^\n]*)",
|
||||||
|
char_match.group(1),
|
||||||
|
)
|
||||||
|
for term, desc in bullets[:4]:
|
||||||
|
answer_lines.append(
|
||||||
|
f"• <b>{term}</b>: {desc.strip()}"
|
||||||
|
if desc
|
||||||
|
else f"• <b>{term}</b>"
|
||||||
)
|
)
|
||||||
|
|
||||||
# ===============================================
|
if not answer_lines:
|
||||||
# CONCEPT CARDS: Specific topics with real content
|
bullets = re.findall(
|
||||||
# ===============================================
|
r"[-•]\s*\*\*([^*]+)\*\*[:\s]*([^\n]*)", body,
|
||||||
# Find all ### sections and extract their actual content
|
)
|
||||||
|
for term, desc in bullets[:5]:
|
||||||
|
answer_lines.append(
|
||||||
|
f"• <b>{term}</b>: {desc.strip()}"
|
||||||
|
if desc
|
||||||
|
else f"• <b>{term}</b>"
|
||||||
|
)
|
||||||
|
|
||||||
|
if not answer_lines:
|
||||||
|
first_para = re.search(
|
||||||
|
r"^([^#\n\-•|`][^\n]{30,250})", body, re.MULTILINE,
|
||||||
|
)
|
||||||
|
if first_para:
|
||||||
|
answer_lines.append(first_para.group(1))
|
||||||
|
|
||||||
|
return answer_lines
|
||||||
|
|
||||||
|
|
||||||
|
def _build_concept_cards(
|
||||||
|
content: str, base_tags: str,
|
||||||
|
) -> list[dict[str, str]]:
|
||||||
|
"""Build concept cards from ### sections."""
|
||||||
|
cards: list[dict[str, str]] = []
|
||||||
sections = re.findall(
|
sections = re.findall(
|
||||||
r"^### (?:\d+\.\s*)?([^\n]+)\n((?:(?!^### ).)*)",
|
r"^### (?:\d+\.\s*)?([^\n]+)\n((?:(?!^### ).)*)",
|
||||||
content,
|
content,
|
||||||
re.MULTILINE | re.DOTALL,
|
re.MULTILINE | re.DOTALL,
|
||||||
)
|
)
|
||||||
|
|
||||||
for header, body in sections:
|
for raw_header, raw_body in sections:
|
||||||
header = header.strip()
|
header = raw_header.strip()
|
||||||
body = body.strip()
|
body = raw_body.strip()
|
||||||
|
|
||||||
# Skip short sections, mnemonics, examples
|
|
||||||
if (
|
if (
|
||||||
len(body) < 80
|
len(body) < MIN_BODY_LENGTH
|
||||||
or "Przykład" in header
|
or "Przykład" in header
|
||||||
or "Mnemonic" in header
|
or "Mnemonic" in header
|
||||||
or '"' in header
|
or '"' in header
|
||||||
):
|
):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Extract real content
|
answer_lines = _extract_section_content(body)
|
||||||
answer_lines = []
|
|
||||||
|
|
||||||
# Get definition if present
|
|
||||||
def_match = re.search(r"#### Definicja[^\n]*\n([^\n#]+(?:\n[^\n#]+)?)", body)
|
|
||||||
if def_match:
|
|
||||||
answer_lines.append(def_match.group(1).strip())
|
|
||||||
|
|
||||||
# Get characterization
|
|
||||||
char_match = re.search(r"#### Charakterystyka\s*\n((?:[-•][^\n]+\n?)+)", body)
|
|
||||||
if char_match:
|
|
||||||
bullets = re.findall(
|
|
||||||
r"[-•]\s*\*\*([^*]+)\*\*[:\s]*([^\n]*)", char_match.group(1)
|
|
||||||
)
|
|
||||||
for term, desc in bullets[:4]:
|
|
||||||
answer_lines.append(
|
|
||||||
f"• <b>{term}</b>: {desc.strip()}" if desc else f"• <b>{term}</b>"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get bullet points if no structured content yet
|
|
||||||
if not answer_lines:
|
if not answer_lines:
|
||||||
bullets = re.findall(r"[-•]\s*\*\*([^*]+)\*\*[:\s]*([^\n]*)", body)
|
continue
|
||||||
for term, desc in bullets[:5]:
|
|
||||||
answer_lines.append(
|
|
||||||
f"• <b>{term}</b>: {desc.strip()}" if desc else f"• <b>{term}</b>"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get first paragraph if still nothing
|
question = (
|
||||||
if not answer_lines:
|
header if header.endswith("?") else f"Wyjaśnij: {header}"
|
||||||
first_para = re.search(r"^([^#\n\-•|`][^\n]{30,250})", body, re.MULTILINE)
|
)
|
||||||
if first_para:
|
answer = "<br>".join(
|
||||||
answer_lines.append(first_para.group(1))
|
clean_text(line) for line in answer_lines
|
||||||
|
)
|
||||||
|
cards.append(
|
||||||
|
{
|
||||||
|
"front": clean_text(question),
|
||||||
|
"back": answer,
|
||||||
|
"tags": f"{base_tags} szczegoly",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
if answer_lines:
|
return cards
|
||||||
question = f"Wyjaśnij: {header}" if not header.endswith("?") else header
|
|
||||||
answer = "<br>".join([clean_text(l) for l in answer_lines])
|
|
||||||
|
|
||||||
cards.append(
|
|
||||||
{
|
|
||||||
"front": clean_text(question),
|
|
||||||
"back": answer,
|
|
||||||
"tags": f"{base_tags} szczegoly",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
# ===============================================
|
def _build_qa_cards(
|
||||||
# Q&A CARDS: From practice questions section
|
content: str, base_tags: str,
|
||||||
# ===============================================
|
) -> list[dict[str, str]]:
|
||||||
|
"""Build Q&A practice cards."""
|
||||||
|
cards: list[dict[str, str]] = []
|
||||||
qa_matches = re.findall(
|
qa_matches = re.findall(
|
||||||
r'### Q\d+:\s*["\']?([^"\'?\n]+)\?*["\']?\s*\n.*?Odpowiedź:\s*\n(.+?)(?=\n### |\n## |\Z)',
|
r'### Q\d+:\s*["\']?([^"\'?\n]+)\?*["\']?\s*\n'
|
||||||
|
r".*?Odpowiedź:\s*\n(.+?)(?=\n### |\n## |\Z)",
|
||||||
content,
|
content,
|
||||||
re.DOTALL,
|
re.DOTALL,
|
||||||
)
|
)
|
||||||
|
|
||||||
for question, answer in qa_matches[:5]:
|
for raw_question, raw_answer in qa_matches[:5]:
|
||||||
question = question.strip()
|
question = raw_question.strip()
|
||||||
answer = answer.strip()
|
answer_text = raw_answer.strip()
|
||||||
|
|
||||||
# Clean up answer - get first meaningful part
|
answer_lines = answer_text.split("\n")
|
||||||
answer_lines = answer.split("\n")
|
clean_answer = [
|
||||||
clean_answer = []
|
stripped
|
||||||
for line in answer_lines[:6]:
|
for raw_line in answer_lines[:6]
|
||||||
line = line.strip()
|
if (stripped := raw_line.strip())
|
||||||
if line and not line.startswith("```") and not line.startswith("|"):
|
and not stripped.startswith("```")
|
||||||
clean_answer.append(line)
|
and not stripped.startswith("|")
|
||||||
|
]
|
||||||
|
|
||||||
if clean_answer:
|
if clean_answer:
|
||||||
cards.append(
|
cards.append(
|
||||||
{
|
{
|
||||||
"front": clean_text(question + "?"),
|
"front": clean_text(question + "?"),
|
||||||
"back": "<br>".join([clean_text(l) for l in clean_answer]),
|
"back": "<br>".join(
|
||||||
|
clean_text(line) for line in clean_answer
|
||||||
|
),
|
||||||
"tags": f"{base_tags} qa",
|
"tags": f"{base_tags} qa",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@ -256,6 +311,20 @@ def extract_cards(filepath) -> list[dict[str, str]]:
|
|||||||
return cards
|
return cards
|
||||||
|
|
||||||
|
|
||||||
|
def extract_cards(filepath: str | Path) -> list[dict[str, str]]:
|
||||||
|
"""Extract flashcards from a file."""
|
||||||
|
content, base_tags, main_question = _read_file_metadata(filepath)
|
||||||
|
|
||||||
|
cards: list[dict[str, str]] = []
|
||||||
|
main_card = _build_main_card(content, main_question, base_tags)
|
||||||
|
if main_card:
|
||||||
|
cards.append(main_card)
|
||||||
|
|
||||||
|
cards.extend(_build_concept_cards(content, base_tags))
|
||||||
|
cards.extend(_build_qa_cards(content, base_tags))
|
||||||
|
return cards
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
"""Main."""
|
"""Main."""
|
||||||
odpowiedzi_dir = Path("/home/kuchy/praca_magisterska/pytania/odpowiedzi")
|
odpowiedzi_dir = Path("/home/kuchy/praca_magisterska/pytania/odpowiedzi")
|
||||||
@ -266,13 +335,13 @@ def main() -> None:
|
|||||||
all_cards = []
|
all_cards = []
|
||||||
|
|
||||||
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
for md_file in sorted(odpowiedzi_dir.glob("*.md")):
|
||||||
print(f"Processing: {md_file.name}", end=" ")
|
logger.info("Processing: %s", md_file.name)
|
||||||
try:
|
try:
|
||||||
cards = extract_cards(md_file)
|
cards = extract_cards(md_file)
|
||||||
all_cards.extend(cards)
|
all_cards.extend(cards)
|
||||||
print(f"→ {len(cards)} cards")
|
logger.info(" -> %d cards", len(cards))
|
||||||
except Exception as e:
|
except (ValueError, OSError):
|
||||||
print(f"→ ERROR: {e}")
|
logger.exception(" -> Error processing file")
|
||||||
|
|
||||||
# Remove duplicates
|
# Remove duplicates
|
||||||
seen = set()
|
seen = set()
|
||||||
@ -299,8 +368,12 @@ def main() -> None:
|
|||||||
tags = card["tags"]
|
tags = card["tags"]
|
||||||
f.write(f"{front}\t{back}\t{tags}\n")
|
f.write(f"{front}\t{back}\t{tags}\n")
|
||||||
|
|
||||||
print(f"\n✅ Generated {len(unique_cards)} flashcards")
|
logger.info(
|
||||||
print(f"📁 Output: {output_file}")
|
"Generated %d unique cards from %d total",
|
||||||
|
len(unique_cards),
|
||||||
|
len(all_cards),
|
||||||
|
)
|
||||||
|
logger.info("Output: %s", output_file)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -7,6 +7,8 @@ Designed for A4 laser printer output (300 DPI, black & white).
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
import matplotlib as mpl
|
import matplotlib as mpl
|
||||||
|
|
||||||
mpl.use("Agg")
|
mpl.use("Agg")
|
||||||
@ -20,6 +22,8 @@ if TYPE_CHECKING:
|
|||||||
from matplotlib.axes import Axes
|
from matplotlib.axes import Axes
|
||||||
from matplotlib.figure import Figure
|
from matplotlib.figure import Figure
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
OUTPUT_DIR = str(Path(__file__).resolve().parent / "img")
|
OUTPUT_DIR = str(Path(__file__).resolve().parent / "img")
|
||||||
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
|
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
@ -33,19 +37,35 @@ FIXED_COLOR = "#D0F0D0" # light green-ish gray for fixed
|
|||||||
FD_ARROW_COLOR = "#444444"
|
FD_ARROW_COLOR = "#444444"
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_col_widths(
|
||||||
|
headers: list[str],
|
||||||
|
rows: list[list[str]],
|
||||||
|
) -> list[float]:
|
||||||
|
"""Auto-calculate column widths based on content."""
|
||||||
|
col_widths: list[float] = []
|
||||||
|
for c in range(len(headers)):
|
||||||
|
max_len = len(headers[c])
|
||||||
|
for r in rows:
|
||||||
|
if c < len(r):
|
||||||
|
max_len = max(max_len, len(str(r[c])))
|
||||||
|
col_widths.append(max(max_len * 0.08 + 0.1, 0.5))
|
||||||
|
return col_widths
|
||||||
|
|
||||||
|
|
||||||
def draw_table(
|
def draw_table(
|
||||||
ax,
|
ax: Axes,
|
||||||
x,
|
x: float,
|
||||||
y,
|
y: float,
|
||||||
title,
|
title: str,
|
||||||
headers,
|
headers: list[str],
|
||||||
rows,
|
rows: list[list[str]],
|
||||||
col_widths=None,
|
*,
|
||||||
highlight_cols=None,
|
col_widths: list[float] | None = None,
|
||||||
highlight_rows=None,
|
highlight_cols: set[int] | None = None,
|
||||||
highlight_cells=None,
|
highlight_rows: set[int] | None = None,
|
||||||
strikethrough_cells=None,
|
highlight_cells: set[tuple[int, int]] | None = None,
|
||||||
title_fontsize=9,
|
strikethrough_cells: set[tuple[int, int]] | None = None,
|
||||||
|
title_fontsize: int = 9,
|
||||||
) -> tuple[float, float]:
|
) -> tuple[float, float]:
|
||||||
"""Draw a single table on the axes at position (x, y).
|
"""Draw a single table on the axes at position (x, y).
|
||||||
|
|
||||||
@ -66,18 +86,10 @@ def draw_table(
|
|||||||
Returns:
|
Returns:
|
||||||
(width, height) of the drawn table
|
(width, height) of the drawn table
|
||||||
"""
|
"""
|
||||||
n_cols = len(headers)
|
|
||||||
n_rows = len(rows)
|
n_rows = len(rows)
|
||||||
|
|
||||||
if col_widths is None:
|
if col_widths is None:
|
||||||
# Auto-calculate based on content
|
col_widths = _compute_col_widths(headers, rows)
|
||||||
col_widths = []
|
|
||||||
for c in range(n_cols):
|
|
||||||
max_len = len(headers[c])
|
|
||||||
for r in rows:
|
|
||||||
if c < len(r):
|
|
||||||
max_len = max(max_len, len(str(r[c])))
|
|
||||||
col_widths.append(max(max_len * 0.08 + 0.1, 0.5))
|
|
||||||
|
|
||||||
row_height = 0.22
|
row_height = 0.22
|
||||||
total_width = sum(col_widths)
|
total_width = sum(col_widths)
|
||||||
@ -172,7 +184,10 @@ def draw_table(
|
|||||||
return total_width, total_height + 0.25 # extra for title
|
return total_width, total_height + 0.25 # extra for title
|
||||||
|
|
||||||
|
|
||||||
def create_figure(width_inches=11.69, height_inches=8.27) -> tuple[Figure, Axes]:
|
def create_figure(
|
||||||
|
width_inches: float = 11.69,
|
||||||
|
height_inches: float = 8.27,
|
||||||
|
) -> tuple[Figure, Axes]:
|
||||||
"""Create A4 landscape figure."""
|
"""Create A4 landscape figure."""
|
||||||
fig, ax = plt.subplots(1, 1, figsize=(width_inches, height_inches), dpi=DPI)
|
fig, ax = plt.subplots(1, 1, figsize=(width_inches, height_inches), dpi=DPI)
|
||||||
ax.set_xlim(0, width_inches)
|
ax.set_xlim(0, width_inches)
|
||||||
@ -182,7 +197,16 @@ def create_figure(width_inches=11.69, height_inches=8.27) -> tuple[Figure, Axes]
|
|||||||
return fig, ax
|
return fig, ax
|
||||||
|
|
||||||
|
|
||||||
def add_arrow(ax, x1, y1, x2, y2, label="", color="black") -> None:
|
def add_arrow(
|
||||||
|
ax: Axes,
|
||||||
|
x1: float,
|
||||||
|
y1: float,
|
||||||
|
x2: float,
|
||||||
|
y2: float,
|
||||||
|
label: str = "",
|
||||||
|
*,
|
||||||
|
color: str = "black",
|
||||||
|
) -> None:
|
||||||
"""Draw an arrow with optional label."""
|
"""Draw an arrow with optional label."""
|
||||||
ax.annotate(
|
ax.annotate(
|
||||||
"",
|
"",
|
||||||
@ -205,7 +229,15 @@ def add_arrow(ax, x1, y1, x2, y2, label="", color="black") -> None:
|
|||||||
|
|
||||||
|
|
||||||
def add_label(
|
def add_label(
|
||||||
ax, x, y, text, fontsize=8, color="black", ha="left", style="normal"
|
ax: Axes,
|
||||||
|
x: float,
|
||||||
|
y: float,
|
||||||
|
text: str,
|
||||||
|
*,
|
||||||
|
fontsize: int = 8,
|
||||||
|
color: str = "black",
|
||||||
|
ha: str = "left",
|
||||||
|
style: str = "normal",
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Add a text label."""
|
"""Add a text label."""
|
||||||
ax.text(
|
ax.text(
|
||||||
@ -289,7 +321,10 @@ def draw_0nf() -> None:
|
|||||||
ax,
|
ax,
|
||||||
0.8,
|
0.8,
|
||||||
1.2,
|
1.2,
|
||||||
"Zaleznosci funkcyjne: StID -> Imie, WydzialID | WydzialID -> NazwaWydzialu",
|
(
|
||||||
|
"Zaleznosci funkcyjne: StID -> Imie, WydzialID"
|
||||||
|
" | WydzialID -> NazwaWydzialu"
|
||||||
|
),
|
||||||
fontsize=8,
|
fontsize=8,
|
||||||
color="#333333",
|
color="#333333",
|
||||||
)
|
)
|
||||||
@ -297,7 +332,10 @@ def draw_0nf() -> None:
|
|||||||
ax,
|
ax,
|
||||||
0.8,
|
0.8,
|
||||||
0.9,
|
0.9,
|
||||||
" KursID -> NazwaKursu | (StID,KursID) -> Prowadzacy | Prowadzacy -> KursID",
|
(
|
||||||
|
" KursID -> NazwaKursu | (StID,KursID)"
|
||||||
|
" -> Prowadzacy | Prowadzacy -> KursID"
|
||||||
|
),
|
||||||
fontsize=8,
|
fontsize=8,
|
||||||
color="#333333",
|
color="#333333",
|
||||||
)
|
)
|
||||||
@ -309,7 +347,7 @@ def draw_0nf() -> None:
|
|||||||
pad_inches=0.2,
|
pad_inches=0.2,
|
||||||
)
|
)
|
||||||
plt.close(fig)
|
plt.close(fig)
|
||||||
print("Generated: nf_0nf_table.png")
|
logger.info("Generated: nf_0nf_table.png")
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
@ -399,7 +437,10 @@ def draw_1nf() -> None:
|
|||||||
ax,
|
ax,
|
||||||
0.5,
|
0.5,
|
||||||
1.5,
|
1.5,
|
||||||
" Imie, WydzialID, NazwaWydzialu zaleza TYLKO od StID (czesc klucza).",
|
(
|
||||||
|
" Imie, WydzialID, NazwaWydzialu"
|
||||||
|
" zaleza TYLKO od StID (czesc klucza)."
|
||||||
|
),
|
||||||
fontsize=9,
|
fontsize=9,
|
||||||
color="black",
|
color="black",
|
||||||
)
|
)
|
||||||
@ -419,7 +460,7 @@ def draw_1nf() -> None:
|
|||||||
pad_inches=0.2,
|
pad_inches=0.2,
|
||||||
)
|
)
|
||||||
plt.close(fig)
|
plt.close(fig)
|
||||||
print("Generated: nf_1nf_tables.png")
|
logger.info("Generated: nf_1nf_tables.png")
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
@ -477,7 +518,10 @@ def draw_2nf() -> None:
|
|||||||
ax,
|
ax,
|
||||||
0.3,
|
0.3,
|
||||||
3.3,
|
3.3,
|
||||||
"KROK: Rozbito czesc. zaleznosci — atrybuty zalezne od czesci klucza wydzielone.",
|
(
|
||||||
|
"KROK: Rozbito czesc. zaleznosci"
|
||||||
|
" — atrybuty zalezne od czesci klucza wydzielone."
|
||||||
|
),
|
||||||
fontsize=9,
|
fontsize=9,
|
||||||
)
|
)
|
||||||
add_label(
|
add_label(
|
||||||
@ -528,7 +572,7 @@ def draw_2nf() -> None:
|
|||||||
pad_inches=0.2,
|
pad_inches=0.2,
|
||||||
)
|
)
|
||||||
plt.close(fig)
|
plt.close(fig)
|
||||||
print("Generated: nf_2nf_tables.png")
|
logger.info("Generated: nf_2nf_tables.png")
|
||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user