mirror of
https://github.com/kuhyx/praca_magisterska.git
synced 2026-07-04 13:23:05 +02:00
186 lines
7.1 KiB
Python
186 lines
7.1 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Generate Anki flashcards from exam questions in the odpowiedzi/ folder.
|
||
Creates a tab-separated file compatible with Anki import.
|
||
"""
|
||
|
||
import os
|
||
import re
|
||
from pathlib import Path
|
||
|
||
def _parse_file_tags(filepath):
    """Return ``(num, topic)`` parsed from a ``NN-some-topic.md`` file name.

    Falls back to ``("00", "unknown")`` when the name does not follow that
    pattern. Dashes in the topic become underscores so the topic stays a
    single Anki tag.
    """
    match = re.match(r'(\d+)-(.+)\.md', os.path.basename(filepath))
    if match:
        return match.group(1), match.group(2).replace('-', '_')
    return "00", "unknown"


def _main_answer_parts(content):
    """Collect the bullet lines that make up the answer of the main card."""
    parts = []

    # Up to five "###" headings of the main answer section act as key points.
    main_answer = re.search(
        r'## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\n---\s*\n## |\Z)',
        content,
        re.DOTALL,
    )
    if main_answer:
        headers = re.findall(r'### (.+)', main_answer.group(1))
        parts.extend(f"• {h}" for h in headers[:5])

    # "**term** - definition" pairs are searched in the whole document, not
    # only in the main answer section; only the first three are considered.
    definitions = re.findall(r'\*\*([^*]+)\*\*\s*[-–:]\s*([^*\n]+)', content)
    for term, definition in definitions[:3]:
        if 20 < len(definition) < 200:
            parts.append(f"• {term}: {definition.strip()}")

    return parts


def _subsection_cards(content, title, tags):
    """Build one card per substantive ``###`` subsection of the document."""
    cards = []

    # The header is limited to a single line ([^\n]+): under re.DOTALL a
    # plain ".+?" would swallow the following paragraph whenever the heading
    # is not followed by a blank line, producing a multi-line question.
    subsections = re.findall(
        r'### (\d+\.\s+)?([^\n]+)\n+(.+?)(?=\n### |\n## |\n---|\Z)',
        content,
        re.DOTALL,
    )

    for _, raw_header, body in subsections:
        header = raw_header.strip()
        if len(header) < 5 or header.startswith('Przykład'):
            continue

        # Skip very short sections; they carry no usable answer.
        body_clean = body.strip()
        if len(body_clean) < 50:
            continue

        # Prefer "- **name**: text" bullets; otherwise use the first paragraph.
        bullets = re.findall(r'[-•]\s*\*\*(.+?)\*\*[:\s]*([^\n]+)?', body_clean)
        if bullets:
            answer_text = "<br>".join(
                f"• {name}: {text.strip()}" if text else f"• {name}"
                for name, text in bullets[:5]
            )
        else:
            # Strip before testing the prefix so code fences and table rows
            # preceded by stray newlines are still filtered out.
            stripped = (p.strip() for p in body_clean.split('\n\n'))
            paragraphs = [
                p for p in stripped if p and not p.startswith(('```', '|'))
            ]
            if not paragraphs:
                continue
            # Drop markdown bold/italic markers from the plain-text answer.
            first_para = re.sub(r'\*\*(.+?)\*\*', r'\1', paragraphs[0])
            first_para = re.sub(r'\*(.+?)\*', r'\1', first_para)
            answer_text = first_para[:400]

        if any(key in header
               for key in ("Charakterystyka", "Definicja", "Właściwości")):
            # These are answer-type headers; reframe them around the title.
            parent_topic = title.replace("Pytanie", "").strip(": 0123456789")
            sub_question = f"{header} - {parent_topic}"
        elif header.endswith('?'):
            sub_question = header
        else:
            sub_question = f"Co to jest {header}?"

        cards.append({
            'question': sub_question,
            'answer': answer_text,
            'tags': tags,
        })

    return cards


def _formula_cards(content, tags):
    """Build cards for bold-named formulas, theorems, definitions and lemmas."""
    # [^\W\d_] matches any Unicode letter, so names containing Polish
    # diacritics (e.g. "Główne twierdzenie") are recognised as well.
    formulas = re.findall(
        r'\*\*((?:[^\W\d_]|\s)+(?:formuła|wzór|twierdzenie|definicja|lemat))\*\*'
        r'[:\s]*\n?(.+?)(?=\n\n|\n\*\*|\Z)',
        content,
        re.IGNORECASE | re.DOTALL,
    )
    return [
        {
            'question': f"Podaj {name.strip()}",
            'answer': body.strip()[:300],
            'tags': tags,
        }
        for name, body in formulas
        if len(body) > 20
    ]


def extract_question_and_answer(filepath):
    """Extract flashcards from one markdown answer file.

    Three kinds of cards are produced, in this order:

    1. one main card: the question from the ``## Pytanie`` section (or the
       document title) answered by key headings and definitions,
    2. one card per substantive ``###`` subsection,
    3. one card per bold-named formula/theorem/definition/lemma.

    Args:
        filepath: Path (``str`` or ``os.PathLike``) of a UTF-8 markdown file.

    Returns:
        A list of dicts with the keys ``'question'``, ``'answer'`` and
        ``'tags'`` (a space-separated tag string).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # File number and topic are only used for tagging.
    num, topic = _parse_file_tags(filepath)

    # The first level-1 heading is the title (usually contains the question).
    title_match = re.search(r'^# (.+)$', content, re.MULTILINE)
    title = title_match.group(1) if title_match else "Unknown"

    # The bold (optionally quoted) text under "## Pytanie" is the question.
    question_match = re.search(
        r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
    )
    if question_match:
        main_question = re.sub(r'\s+', ' ', question_match.group(1).strip())
    else:
        main_question = title

    subject_match = re.search(r'Przedmiot:\s*(\w+)', content)
    subject = subject_match.group(1) if subject_match else "Ogólne"

    base_tags = f"egzamin_magisterski pytanie_{num} {subject}"
    cards = []

    # The main card is created only if any answer material was found.
    answer_parts = _main_answer_parts(content)
    if answer_parts:
        cards.append({
            'question': main_question,
            'answer': "<br>".join(answer_parts[:8]),  # Limit answer length
            'tags': f"{base_tags} {topic}",
        })

    cards.extend(
        _subsection_cards(content, title, f"{base_tags} {topic} szczegoly")
    )
    cards.extend(_formula_cards(content, f"{base_tags} formuly"))

    return cards
|
||
|
||
def clean_for_anki(text):
    """Prepare one card field for a tab-separated Anki import with HTML on.

    Args:
        text: Raw field text, possibly containing tabs, newlines, markdown
            bold/italic markers and double quotes.

    Returns:
        The field as a single line: tabs become spaces, ``**x**`` / ``*x*``
        become ``<b>x</b>`` / ``<i>x</i>``, newlines become single ``<br>``
        tags (none at the very start or end), and double quotes are written
        as ``&quot;``.
    """
    # A literal tab would be read as a field separator.
    text = text.replace('\t', ' ')

    # Convert markdown formatting to HTML; bold first so "**" is not consumed
    # by the single-asterisk italic pattern.
    text = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', text)
    text = re.sub(r'\*(.+?)\*', r'<i>\1</i>', text)

    # Each record must stay on one line: newlines become <br>, runs of <br>
    # collapse to one, and a leading/trailing <br> is dropped.
    text = text.replace('\n', '<br>')
    text = re.sub(r'(<br>)+', '<br>', text)
    text = re.sub(r'^<br>|<br>$', '', text)

    # Emit double quotes as an HTML entity so a quote can never be read as
    # field quoting by the importer (the previous replace('"', '"') was a
    # no-op and left quotes untouched).
    text = text.replace('"', '&quot;')

    return text.strip()
|
||
|
||
def main(odpowiedzi_dir=None, output_file=None):
    """Generate the Anki import file from every ``*.md`` answer file.

    Args:
        odpowiedzi_dir: Directory containing the markdown answer files.
            Defaults to the original hard-coded project location.
        output_file: Path of the tab-separated file to write. Defaults to
            the original hard-coded project location.

    Raises:
        FileNotFoundError: If ``odpowiedzi_dir`` is not an existing directory.
        OSError: If the output file cannot be written.
    """
    if odpowiedzi_dir is None:
        odpowiedzi_dir = "/home/kuchy/praca_magisterska/pytania/odpowiedzi"
    if output_file is None:
        output_file = "/home/kuchy/praca_magisterska/pytania/anki_egzamin_magisterski.txt"
    odpowiedzi_dir = Path(odpowiedzi_dir)
    output_file = Path(output_file)

    # Fail loudly instead of silently writing a deck with no cards.
    if not odpowiedzi_dir.is_dir():
        raise FileNotFoundError(f"Answers directory not found: {odpowiedzi_dir}")

    all_cards = []

    # Process files in sorted order so the output is deterministic.
    for md_file in sorted(odpowiedzi_dir.glob("*.md")):
        print(f"Processing: {md_file.name}")
        try:
            cards = extract_question_and_answer(md_file)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"  -> Error: {e}")
        else:
            all_cards.extend(cards)
            print(f"  -> Extracted {len(cards)} cards")

    with open(output_file, 'w', encoding='utf-8') as f:
        # Anki file headers
        f.write("#separator:tab\n")
        f.write("#html:true\n")
        f.write("#columns:Front\tBack\tTags\n")
        f.write("#deck:Egzamin Magisterski ISY\n")
        f.write("#notetype:Basic\n")
        f.write("\n")

        # One record per card: front, back and tags separated by tabs.
        f.writelines(
            f"{clean_for_anki(card['question'])}\t"
            f"{clean_for_anki(card['answer'])}\t"
            f"{card['tags']}\n"
            for card in all_cards
        )

    print(f"\n✅ Created {len(all_cards)} flashcards")
    print(f"📁 Output: {output_file}")
    print("\nTo import into Anki:")
    print("1. Open Anki → File → Import")
    print("2. Select the .txt file")
    print("3. Verify 'Allow HTML' is checked")
    print("4. Click Import")
|
||
|
||
# Run the generator only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|