praca_magisterska/pytania/generate_anki.py

186 lines
7.1 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
Generate Anki flashcards from exam questions in odpowiedzi/ folder.
Creates a tab-separated file compatible with Anki import.
"""
import os
import re
from pathlib import Path
def _parse_card_filename(filename):
    """Split ``NN-some-topic.md`` into ``(number, topic)`` used for tagging."""
    match = re.match(r'(\d+)-(.+)\.md', filename)
    if match:
        return match.group(1), match.group(2).replace('-', '_')
    return "00", "unknown"


def _main_answer_parts(content):
    """Collect the lines shown on the back of the main question card."""
    parts = []

    # Up to five ### headers of the main answer section serve as an outline.
    main_answer = re.search(
        r'## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\n---\s*\n## |\Z)',
        content,
        re.DOTALL,
    )
    if main_answer:
        parts.extend(re.findall(r'### (.+)', main_answer.group(1))[:5])

    # "**term** - definition" / "**term**: definition" pairs from the whole
    # document; only the first three matches are considered, and of those only
    # medium-length definitions are kept.
    definitions = re.findall(r'\*\*([^*]+)\*\*\s*[-:]\s*([^*\n]+)', content)
    for term, definition in definitions[:3]:
        if 20 < len(definition) < 200:
            parts.append(f"{term}: {definition.strip()}")

    return parts[:8]  # Limit answer length


def _subsection_answer(body):
    """Return the answer text for one subsection body, or None if unusable."""
    # Prefer "- **name**: text" bullet lists (first five bullets).
    bullets = re.findall(r'[-•]\s*\*\*(.+?)\*\*[:\s]*([^\n]+)?', body)
    if bullets:
        return "<br>".join(
            f"{name}: {rest.strip()}" if rest else f"{name}"
            for name, rest in bullets[:5]
        )

    # Otherwise use the first prose paragraph. Strip *before* testing the
    # prefix so that a code fence or table row preceded by stray newlines or
    # spaces is still recognised and skipped.
    paragraphs = [p.strip() for p in body.split('\n\n')]
    paragraphs = [p for p in paragraphs if p and not p.startswith(('```', '|'))]
    if not paragraphs:
        return None

    # Drop markdown emphasis markers, keep the text.
    first_para = re.sub(r'\*\*(.+?)\*\*', r'\1', paragraphs[0])
    first_para = re.sub(r'\*(.+?)\*', r'\1', first_para)
    return first_para[:400]


def _subsection_cards(content, title, tags):
    """Build one card per usable ``###`` subsection of the document."""
    cards = []
    # The header is restricted to a single line ([^\n]) and may be followed by
    # one or more newlines. Under DOTALL a plain ``.+?\n\n`` would let a header
    # that has no blank line after it swallow the following lines, yielding a
    # multi-line "question" and hiding the next section.
    subsections = re.findall(
        r'### (\d+\.\s+)?([^\n]+?)\n+(.+?)(?=\n### |\n## |\n---|\Z)',
        content,
        re.DOTALL,
    )
    for _, header, body in subsections:
        if len(header) < 5 or header.startswith('Przykład'):
            continue

        body_clean = body.strip()
        # Skip very short sections
        if len(body_clean) < 50:
            continue

        answer_text = _subsection_answer(body_clean)
        if answer_text is None:
            continue

        sub_question = header if header.endswith('?') else f"Co to jest {header}?"
        if any(key in header for key in ("Charakterystyka", "Definicja", "Właściwości")):
            # These headers name an answer rather than a concept, so phrase
            # the card as "<header> - <parent topic>" instead.
            parent_topic = title.replace("Pytanie", "").strip(": 0123456789")
            sub_question = f"{header} - {parent_topic}"

        cards.append({
            'question': sub_question,
            'answer': answer_text,
            'tags': tags,
        })
    return cards


def _formula_cards(content, tags):
    """Build cards from bold-titled formula/theorem/definition blocks."""
    # NOTE(review): the name prefix only admits ASCII letters and whitespace
    # and must be non-empty, so a bare "**Definicja**" or a name with Polish
    # diacritics before the keyword is not matched. Left unchanged - confirm
    # whether that restriction is intended.
    formulas = re.findall(
        r'\*\*([A-Za-z\s]+(?:formuła|wzór|twierdzenie|definicja|lemat))\*\*[:\s]*\n?(.+?)(?=\n\n|\n\*\*|\Z)',
        content,
        re.IGNORECASE | re.DOTALL,
    )
    return [
        {
            'question': f"Podaj {formula_name.strip()}",
            'answer': formula_content.strip()[:300],
            'tags': tags,
        }
        for formula_name, formula_content in formulas
        if len(formula_content) > 20
    ]


def extract_question_and_answer(filepath):
    """Extract flashcards from one exam-answer markdown file.

    Args:
        filepath: Path (str or path-like) of a UTF-8 markdown file, ideally
            named ``NN-topic.md``; number and topic are used for tags and
            fall back to ``"00"`` / ``"unknown"`` otherwise.

    Returns:
        A list of dicts with the keys ``'question'``, ``'answer'`` and
        ``'tags'``, in this order: the main question card (only if any answer
        outline could be extracted), one card per usable ``###`` subsection,
        then one card per matched formula/definition block.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    num, topic = _parse_card_filename(os.path.basename(filepath))

    # Main title (usually contains the question)
    title_match = re.search(r'^# (.+)$', content, re.MULTILINE)
    title = title_match.group(1) if title_match else "Unknown"

    # The bold text under "## Pytanie" is the question; whitespace (including
    # line breaks inside the bold span) is collapsed to single spaces.
    question_match = re.search(
        r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
    )
    if question_match:
        main_question = re.sub(r'\s+', ' ', question_match.group(1).strip())
    else:
        main_question = title

    subject_match = re.search(r'Przedmiot:\s*(\w+)', content)
    subject = subject_match.group(1) if subject_match else "Ogólne"

    cards = []

    answer_parts = _main_answer_parts(content)
    if answer_parts:
        cards.append({
            'question': main_question,
            'answer': "<br>".join(answer_parts),
            'tags': f"egzamin_magisterski pytanie_{num} {subject} {topic}",
        })

    cards.extend(_subsection_cards(
        content,
        title,
        f"egzamin_magisterski pytanie_{num} {subject} {topic} szczegoly",
    ))
    cards.extend(_formula_cards(
        content,
        f"egzamin_magisterski pytanie_{num} {subject} formuly",
    ))
    return cards
def clean_for_anki(text):
    """Convert a card field to single-line HTML safe for a tab-separated row.

    Tabs become spaces, ``**bold**`` / ``*italic*`` markdown becomes
    ``<b>`` / ``<i>``, line breaks (LF, CRLF or lone CR) become ``<br>``,
    runs of ``<br>`` are collapsed, leading/trailing ``<br>`` and whitespace
    are removed, and double quotes are replaced by ``&quot;``.

    Args:
        text: Raw question or answer text; may already contain ``<br>``.

    Returns:
        The cleaned string, free of tab, CR and LF characters.
    """
    # A literal tab would be read as a column separator.
    text = text.replace('\t', ' ')

    # Bold must be converted before italic, otherwise the single-asterisk
    # pattern would consume the double-asterisk markers.
    text = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', text)
    text = re.sub(r'\*(.+?)\*', r'<i>\1</i>', text)

    # Treat CRLF and lone CR like LF so no carriage return survives in the row.
    text = re.sub(r'\r\n?|\n', '<br>', text)
    text = re.sub(r'(<br>)+', '<br>', text)

    # Trim whitespace and <br> markers from both ends in any interleaving
    # (e.g. "<br> " at the end), which a single anchored regex pass followed
    # by strip() leaves behind.
    text = text.strip()
    while text.startswith('<br>'):
        text = text[len('<br>'):].lstrip()
    while text.endswith('<br>'):
        text = text[:-len('<br>')].rstrip()

    # Escape quotes in a way that works with tab-separated import.
    return text.replace('"', '&quot;')
def main(odpowiedzi_dir=None, output_file=None):
    """Generate the Anki import file from every ``*.md`` answer file.

    Args:
        odpowiedzi_dir: Directory containing the markdown answer files.
            Defaults to the original hard-coded project location.
        output_file: Path of the tab-separated file to write. Defaults to the
            original hard-coded project location.

    Files that fail to parse are reported on stdout and skipped, so one bad
    file does not abort the whole run.
    """
    if odpowiedzi_dir is None:
        odpowiedzi_dir = "/home/kuchy/praca_magisterska/pytania/odpowiedzi"
    if output_file is None:
        output_file = "/home/kuchy/praca_magisterska/pytania/anki_egzamin_magisterski.txt"
    odpowiedzi_dir = Path(odpowiedzi_dir)
    output_file = Path(output_file)

    all_cards = []

    # Process each file in a stable (sorted) order
    for md_file in sorted(odpowiedzi_dir.glob("*.md")):
        print(f"Processing: {md_file.name}")
        try:
            cards = extract_question_and_answer(md_file)
        except Exception as e:
            # Deliberate best-effort: report and continue with the next file.
            print(f" -> Error: {e}")
        else:
            all_cards.extend(cards)
            print(f" -> Extracted {len(cards)} cards")

    # Write Anki file with headers
    with open(output_file, 'w', encoding='utf-8') as f:
        # Anki file headers
        f.write("#separator:tab\n")
        f.write("#html:true\n")
        f.write("#columns:Front\tBack\tTags\n")
        f.write("#deck:Egzamin Magisterski ISY\n")
        f.write("#notetype:Basic\n")
        f.write("\n")
        for card in all_cards:
            front = clean_for_anki(card['question'])
            back = clean_for_anki(card['answer'])
            # Tags are space-separated; normalising whitespace also ensures a
            # stray tab (e.g. from a file name) cannot add an extra column.
            tags = " ".join(card['tags'].split())
            f.write(f"{front}\t{back}\t{tags}\n")

    print(f"\n✅ Created {len(all_cards)} flashcards")
    print(f"📁 Output: {output_file}")
    print("\nTo import into Anki:")
    print("1. Open Anki → File → Import")
    print("2. Select the .txt file")
    print("3. Verify 'Allow HTML' is checked")
    print("4. Click Import")
# Script entry point: build the deck only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()