mirror of
https://github.com/kuhyx/praca_magisterska.git
synced 2026-07-04 15:23:10 +02:00
258 lines
9.7 KiB
Python
258 lines
9.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate Anki flashcards with ACTUAL substantive answers, not just headers.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
|
|
def clean_text(text):
    """Convert a Markdown fragment into single-line HTML for an Anki field.

    Args:
        text: Markdown fragment; ``None`` or an empty string is accepted.

    Returns:
        The fragment with ``**bold**`` / ``*italic*`` rendered as ``<b>`` /
        ``<i>`` tags, double quotes escaped as ``&quot;``, tabs and line
        breaks replaced by spaces, runs of spaces collapsed and surrounding
        whitespace removed. Returns ``""`` for falsy input.
    """
    if not text:
        return ""

    # Bold is converted first so the italic pattern never sees "**".
    text = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', text)
    text = re.sub(r'(?<!\*)\*([^*]+)\*(?!\*)', r'<i>\1</i>', text)

    # The export is tab-separated with one note per line, so neither a tab
    # nor a line break may survive inside a field.
    text = re.sub(r'[\t\r\n]', ' ', text)

    # A raw double quote can be read as a field-quoting character by the
    # importer; the HTML entity is displayed identically on the card.
    text = text.replace('"', '&quot;')

    text = re.sub(r' +', ' ', text)
    return text.strip()
|
|
|
|
def extract_real_answer(content, section_name):
    """Summarise the body of one ``###`` section as an HTML snippet.

    The section is located by its level-3 heading (an optional ``N.`` number
    prefix is allowed). Its body ends at the next level-2 or level-3 heading.

    Args:
        content: Full Markdown text of the answer file.
        section_name: Heading text of the section, without the ``###``
            marker or number prefix.

    Returns:
        Up to six entries joined with ``<br>``: first the
        ``#### sub-heading`` lines paired with the line that directly follows
        them, then the ``- **term**: description`` bullets. When neither is
        found, the first two plain paragraphs (21-399 characters, tables and
        code fences excluded) are used instead. ``None`` when the section is
        missing or yields nothing.
    """
    # Built by concatenation because "{2,3}" would be parsed as a
    # replacement field inside an f-string.
    section_re = (
        r'^### (?:\d+\.\s*)?'
        + re.escape(section_name)
        # "^" keeps "#### name" from matching; the lookahead stops the body
        # at the next "## " or "### " heading ("#### " does not stop it).
        + r'\s*\n((?:(?!^#{2,3} ).)+)'
    )
    found = re.search(section_re, content, re.MULTILINE | re.DOTALL)
    if found is None:
        return None

    body = found.group(1).strip()

    # Sub-headings together with the line that immediately follows them.
    lines = [
        f"<b>{title.strip()}</b>: {first_line.strip()}"
        for title, first_line in re.findall(r'#### ([^\n]+)\n([^\n#]+)', body)
    ]

    # Bold-term bullets. The separator class is limited to the current line:
    # allowing "\n" there would let a description-less bullet swallow the
    # bullet on the next line.
    for term, desc in re.findall(r'[-•]\s*\*\*([^*]+)\*\*[:\t -]*([^\n]*)', body):
        term = term.strip()
        desc = desc.strip()
        lines.append(f"• <b>{term}</b>: {desc}" if desc else f"• <b>{term}</b>")

    # No structured content: fall back to the leading paragraphs.
    if not lines:
        paragraphs = [chunk.strip() for chunk in body.split('\n\n')]
        # Filter after stripping so indented tables/fences are caught too.
        paragraphs = [
            chunk for chunk in paragraphs
            if chunk and not chunk.startswith(('```', '|'))
        ]
        lines.extend(chunk for chunk in paragraphs[:2] if 20 < len(chunk) < 400)

    return '<br>'.join(lines[:6]) if lines else None
|
|
|
|
# Bullet of the form "- **term**: description". The separator class stays on
# the current line; allowing "\n" there would let a description-less bullet
# swallow the bullet that follows it.
_BOLD_BULLET_PATTERN = r'[-•]\s*\*\*([^*]+)\*\*[: \t]*([^\n]*)'


def _format_bullets(text, limit):
    """Render at most *limit* bold-term bullets found in *text* as HTML lines."""
    rendered = []
    for term, desc in re.findall(_BOLD_BULLET_PATTERN, text)[:limit]:
        desc = desc.strip()
        rendered.append(f"• <b>{term}</b>: {desc}" if desc else f"• <b>{term}</b>")
    return rendered


def _build_main_card(content, main_question, base_tags):
    """Build the card for the file's main question, or return None."""
    answer_parts = []

    # Automata questions: one line per machine with its language class.
    lowered = main_question.lower()
    if "automat" in lowered or "maszyn" in lowered:
        machines = (
            ("Automat Skończony", "Automat Skończony (FA)"),
            ("Automat ze Stosem", "Automat ze Stosem (PDA)"),
            ("Maszyna Turinga", "Maszyna Turinga (TM)"),
        )
        for heading, label in machines:
            found = re.search(
                re.escape(heading)
                + r'.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*',
                content,
                re.DOTALL,
            )
            if found:
                answer_parts.append(f"<b>{label}</b>: {found.group(1).strip()}")

    # Generic extraction: definitions and characterisations anywhere in the file.
    if not answer_parts:
        key_patterns = (
            r'#### Definicja\s*\n([^\n#]+)',
            r'#### Charakterystyka\s*\n([^\n#]+)',
            r'\*\*Definicja[:\s]*\*\*\s*([^\n]+)',
        )
        for pattern in key_patterns:
            for snippet in re.findall(pattern, content)[:3]:
                if len(snippet) > 20:
                    answer_parts.append(snippet.strip())

    # Last resort: first substantive paragraphs of the main answer section.
    if not answer_parts:
        main_answer = re.search(
            r'## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\Z)', content, re.DOTALL
        )
        if main_answer:
            answer_parts = re.findall(
                r'\n\n([^#\n][^\n]{50,300})', main_answer.group(1)
            )[:3]

    if not answer_parts:
        return None
    return {
        'front': clean_text(main_question),
        'back': '<br><br>'.join(clean_text(part) for part in answer_parts),
        'tags': f"{base_tags} pytanie_glowne",
    }


def _build_concept_cards(content, base_tags):
    """Build one card per substantive ``###`` section of the file."""
    cards = []
    # The lookahead ends a section body at the next "## " or "### " heading,
    # so the last subsection of a chapter does not absorb the next chapter.
    sections = re.findall(
        r'^### (?:\d+\.\s*)?([^\n]+)\n((?:(?!^#{2,3} ).)*)',
        content,
        re.MULTILINE | re.DOTALL,
    )

    for header, body in sections:
        header = header.strip()
        body = body.strip()

        # Skip short sections, mnemonics, examples and quoted headings.
        if len(body) < 80 or 'Przykład' in header or 'Mnemonic' in header or '"' in header:
            continue

        answer_lines = []

        definition = re.search(r'#### Definicja[^\n]*\n([^\n#]+(?:\n[^\n#]+)?)', body)
        if definition:
            # The capture may span two source lines; join them so the
            # exported field stays on a single line.
            answer_lines.append(re.sub(r'\s+', ' ', definition.group(1).strip()))

        characteristics = re.search(
            r'#### Charakterystyka\s*\n((?:[-•][^\n]+\n?)+)', body
        )
        if characteristics:
            answer_lines.extend(_format_bullets(characteristics.group(1), 4))

        # No structured content yet: any bold-term bullets in the section.
        if not answer_lines:
            answer_lines.extend(_format_bullets(body, 5))

        # Still nothing: first plain-text line of reasonable length.
        if not answer_lines:
            first_para = re.search(r'^([^#\n\-•|`][^\n]{30,250})', body, re.MULTILINE)
            if first_para:
                answer_lines.append(first_para.group(1))

        if not answer_lines:
            continue

        question = header if header.endswith('?') else f"Wyjaśnij: {header}"
        cards.append({
            'front': clean_text(question),
            'back': '<br>'.join(clean_text(line) for line in answer_lines),
            'tags': f"{base_tags} szczegoly",
        })

    return cards


def _build_qa_cards(content, base_tags):
    """Build cards from the first five ``### Qn:`` practice questions."""
    cards = []
    qa_matches = re.findall(
        r'### Q\d+:\s*["\']?([^"\'?\n]+)\?*["\']?\s*\n.*?Odpowiedź:\s*\n(.+?)(?=\n### |\n## |\Z)',
        content, re.DOTALL
    )

    for question, answer in qa_matches[:5]:
        # Only the first six lines of the answer are considered; code-fence
        # markers and table rows among them are dropped.
        candidates = [line.strip() for line in answer.strip().split('\n')[:6]]
        kept = [
            line for line in candidates
            if line and not line.startswith(('```', '|'))
        ]
        if kept:
            cards.append({
                'front': clean_text(question.strip() + '?'),
                'back': '<br>'.join(clean_text(line) for line in kept),
                'tags': f"{base_tags} qa",
            })

    return cards


def extract_cards(filepath):
    """Extract flashcards from one Markdown answer file.

    Args:
        filepath: Path (``str`` or ``os.PathLike``) of a UTF-8 Markdown file.
            A leading number in a ``<number>-<slug>.md`` file name becomes
            the ``pyt<number>`` tag; otherwise ``pyt00`` is used.

    Returns:
        A list of dicts with ``'front'``, ``'back'`` and ``'tags'`` keys, in
        this order: the main-question card (if an answer could be built),
        one card per substantive ``###`` section, then up to five Q&A cards.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    name_match = re.match(r'(\d+)-(.+)\.md', os.path.basename(filepath))
    num = name_match.group(1) if name_match else "00"

    subj_match = re.search(r'Przedmiot:\s*(\w+)', content)
    subject = subj_match.group(1) if subj_match else "Ogólne"
    base_tags = f"egzamin_magisterski pyt{num} {subject}"

    cards = []

    # Main question: the bold text right under the "## Pytanie" heading.
    q_match = re.search(
        r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
    )
    if q_match:
        main_question = re.sub(r'\s+', ' ', q_match.group(1).strip())
        # An empty question produces no main card.
        if main_question:
            main_card = _build_main_card(content, main_question, base_tags)
            if main_card is not None:
                cards.append(main_card)

    cards.extend(_build_concept_cards(content, base_tags))
    cards.extend(_build_qa_cards(content, base_tags))
    return cards
|
|
|
|
def main(odpowiedzi_dir=None, output_file=None):
    """Generate the Anki import file from every Markdown answer file.

    Args:
        odpowiedzi_dir: Directory holding the ``*.md`` answer files. Defaults
            to the author's thesis checkout.
        output_file: Path of the tab-separated Anki import file to write.
            Defaults to a file inside the same checkout.

    Files that fail to process are reported on stdout and skipped. Cards
    whose front text shares the same first 100 characters are written once.
    """
    if odpowiedzi_dir is None:
        odpowiedzi_dir = "/home/kuchy/praca_magisterska/pytania/odpowiedzi"
    if output_file is None:
        output_file = "/home/kuchy/praca_magisterska/pytania/anki_egzamin_magisterski.txt"
    odpowiedzi_dir = Path(odpowiedzi_dir)
    output_file = Path(output_file)

    all_cards = []
    for md_file in sorted(odpowiedzi_dir.glob("*.md")):
        print(f"Processing: {md_file.name}", end=" ")
        try:
            cards = extract_cards(md_file)
        except Exception as e:
            # Deliberate best effort at the script boundary: one unreadable
            # file must not abort the whole export.
            print(f"→ ERROR: {e}")
        else:
            all_cards.extend(cards)
            print(f"→ {len(cards)} cards")

    # Remove duplicates, keeping the first card for each front-text prefix.
    seen = set()
    unique_cards = []
    for card in all_cards:
        key = card['front'][:100]
        if key not in seen:
            seen.add(key)
            unique_cards.append(card)

    # Tabs separate fields and line breaks separate notes, so both are
    # replaced inside field text before writing.
    field_breaks = str.maketrans({'\t': ' ', '\n': ' ', '\r': ' '})

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("#separator:Tab\n")
        f.write("#html:true\n")
        f.write("#notetype:Basic\n")
        f.write("#deck:Egzamin Magisterski ISY\n")
        f.write("#columns:Front\tBack\tTags\n")
        f.write("#tags column:3\n")
        f.write("\n")

        f.writelines(
            "{}\t{}\t{}\n".format(
                card['front'].translate(field_breaks),
                card['back'].translate(field_breaks),
                card['tags'],
            )
            for card in unique_cards
        )

    print(f"\n✅ Generated {len(unique_cards)} flashcards")
    print(f"📁 Output: {output_file}")
|
|
|
|
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|