# File: praca_magisterska/pytania/generate_anki_final.py
# (289 lines, 11 KiB, Python - header copied from the repository web viewer)

#!/usr/bin/env python3
"""
Generate comprehensive Anki flashcards from exam questions.
Creates tab-separated file for Anki import with proper HTML formatting.
"""
import os
import re
from pathlib import Path
def clean_text(text):
    """Convert a Markdown fragment into the inline HTML used on a card.

    ``**bold**`` becomes ``<b>bold</b>``, ``*italic*`` becomes
    ``<i>italic</i>``, tabs become spaces, double quotes become ``&quot;``
    and runs of spaces are collapsed to one.  Newlines are left untouched.

    Args:
        text: Markdown source text; may be ``None`` or empty.

    Returns:
        The converted string with surrounding whitespace stripped, or ``""``
        when *text* is falsy.
    """
    if not text:
        return ""

    # Markdown emphasis -> HTML: bold first, then the remaining single
    # asterisks (the lookarounds keep the italic pattern away from "**").
    text = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', text)
    text = re.sub(r'(?<!\*)\*([^*]+)\*(?!\*)', r'<i>\1</i>', text)

    # Characters with a special meaning in the export file.
    text = text.replace('\t', ' ')
    text = text.replace('"', '&quot;')

    # Only spaces are collapsed, so line breaks survive this step.
    text = re.sub(r' +', ' ', text)
    return text.strip()
def format_list(items, numbered=False):
    """Render *items* as an HTML list.

    Args:
        items: Text fragments; each one is passed through ``clean_text`` and
            skipped when it cleans to an empty string.
        numbered: Emit ``<ol>`` when true, ``<ul>`` otherwise.

    Returns:
        The list markup, or ``""`` when *items* is empty or ``None``.
    """
    if not items:
        return ""

    tag = "ol" if numbered else "ul"
    cleaned_items = (clean_text(item) for item in items)
    entries = "".join(f"<li>{entry}</li>" for entry in cleaned_items if entry)
    return f"<{tag}>{entries}</{tag}>"
def _main_question_card(content, base_tags):
    """Return the card for the main exam question, or ``None`` if absent."""
    q_match = re.search(r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL)
    if not q_match:
        return None
    main_q = re.sub(r'\s+', ' ', q_match.group(1).strip())

    # NOTE(review): the "<5B>" inside the character class looks like a
    # mis-encoded character; as written the class matches '<', '5', 'B', '>'
    # or the target emoji. Confirm the intended heading marker.
    answer_match = re.search(r'## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## [<5B>🎯]|\n---\s*\n## |\Z)', content, re.DOTALL)
    if not answer_match:
        return None

    # The back of the card lists up to six "###" headings of the answer.
    headers = re.findall(r'^### (?:\d+\.\s*)?(.+)$', answer_match.group(1), re.MULTILINE)
    headers = [h.strip() for h in headers if len(h.strip()) > 3][:6]
    if not headers:
        return None

    return {
        'front': clean_text(main_q),
        'back': "<b>Kluczowe zagadnienia:</b>" + format_list(headers),
        'tags': f"{base_tags} pytanie_glowne"
    }


def _section_question(header):
    """Turn a "###" section heading into the question shown on the front."""
    if "Definicja" in header or "Co to" in header:
        return f"Co to jest: {header.replace('Definicja', '').strip()}?"
    if "Charakterystyka" in header:
        return f"Scharakteryzuj: {header.replace('Charakterystyka', '').strip()}"
    if header.endswith('?'):
        return header
    return f"Omów: {header}"


def _section_cards(content, base_tags):
    """Return one card per sufficiently long "###" section of *content*."""
    cards = []

    # A section body runs up to the next "### " heading. The lookahead needs
    # the trailing space: a bare "###" would also stop at "#### " lines and
    # the sub-header lookup below could never find anything.
    sections = re.findall(
        r'^### (?:\d+\.\s*)?(.+?)\n((?:(?!^### ).)+)',
        content,
        re.MULTILINE | re.DOTALL
    )
    for header, body in sections:
        header = header.strip()
        body = body.strip()

        # Skip very short sections and example sections.
        if len(body) < 50 or header.lower().startswith('przykład'):
            continue

        answer_parts = []

        # Up to four "####" sub-headers.
        subheaders = re.findall(r'^#### (.+)$', body, re.MULTILINE)
        if subheaders:
            answer_parts.extend(subheaders[:4])

        # Up to five bullets that start with a bold term. Only horizontal
        # whitespace is allowed around the term, so the description is taken
        # from the bullet's own line and never from the following line.
        bullets = re.findall(r'[-•][ \t]*\*\*([^*]+)\*\*[: \t-]*([^\n]+)?', body)
        for term, desc in bullets[:5]:
            if desc:
                answer_parts.append(f"<b>{term}</b>: {desc.strip()}")
            else:
                answer_parts.append(f"<b>{term}</b>")

        # No structured content: fall back to the first paragraph that is
        # neither a code fence nor a table, capped at 300 characters.
        if not answer_parts:
            paras = [p.strip() for p in body.split('\n\n')
                     if p.strip() and not p.strip().startswith(('```', '|'))]
            if paras:
                first = paras[0]
                if len(first) > 300:
                    first = first[:300] + "..."
                answer_parts.append(first)

        if not answer_parts:
            continue

        if len(answer_parts) > 1:
            answer_html = format_list(answer_parts)
        else:
            answer_html = clean_text(answer_parts[0])

        cards.append({
            'front': clean_text(_section_question(header)),
            'back': answer_html,
            'tags': f"{base_tags} szczegoly"
        })
    return cards


def _complexity_cards(content, base_tags):
    """Return complexity cards, at most one per search pattern."""
    cards = []
    algo_patterns = [
        r'#### Złożoność(?:\s+czasowa)?\s*\n(.+?)(?=\n####|\n###|\Z)',
        r'Złożoność:\s*\*\*([^*]+)\*\*',
    ]
    for pattern in algo_patterns:
        for found in list(re.finditer(pattern, content, re.DOTALL))[:2]:
            complexity = found.group(1)
            if len(complexity) <= 10:
                continue

            # The card is named after the nearest "###" heading that
            # precedes the match, hence the last element of the list.
            preceding = re.findall(
                r'^### (?:\d+\.\s*)?(.+)$',
                content[:found.start()],
                re.MULTILINE
            )
            if not preceding:
                continue

            algo_name = preceding[-1].strip()
            cards.append({
                'front': f"Jaka jest złożoność algorytmu/metody: {algo_name}?",
                'back': clean_text(complexity.strip()[:200]),
                'tags': f"{base_tags} zlozonosc"
            })
            # NOTE(review): assumed to stop after the first card produced
            # for a pattern - confirm against the intended behaviour.
            break
    return cards


def _comparison_card(content, base_tags, num):
    """Return a table card built from a comparison section, or ``None``."""
    # The heading is matched within a single line ([^\n]*). With '.*' under
    # DOTALL the heading part would span the whole document and the captured
    # section would shrink to the last line of the file.
    compare_match = re.search(
        r'## [^\n]*(Porównanie|Zestawienie|vs)[^\n]*\n(.+?)(?=\n## |\Z)',
        content,
        re.DOTALL | re.IGNORECASE
    )
    if not compare_match:
        return None

    # Table rows whose first cell is bold: | **aspect** | value |
    items = re.findall(r'\|\s*\*\*([^|*]+)\*\*\s*\|([^|]+)\|', compare_match.group(2))
    if not items:
        return None

    rows = "".join(
        f"<tr><td>{clean_text(aspect)}</td><td>{clean_text(value)}</td></tr>"
        for aspect, value in items[:6]
    )
    comparison_html = f"<table><tr><th>Aspekt</th><th>Wartość</th></tr>{rows}</table>"

    # The title pattern only gates card creation; its groups are not used.
    title_match = re.search(r'## .*(Porównanie|Zestawienie).*?(\w+.*?(?:vs|i|oraz).*?\w+)', compare_match.group(0), re.IGNORECASE)
    if not title_match:
        return None

    return {
        'front': f"Porównaj kluczowe różnice w temacie: pytanie {num}",
        'back': comparison_html,
        'tags': f"{base_tags} porownanie"
    }


def _practice_cards(content, base_tags):
    """Return up to three cards from the practice-questions section."""
    cards = []
    qa_section = re.search(r'## 🎓 Pytania.*?\n(.+?)(?=\n## |\Z)', content, re.DOTALL)
    if not qa_section:
        return cards

    qas = re.findall(r'### Q\d+:?\s*["\']?(.+?)["\']?\s*\n.*?Odpowiedź:\s*\n?(.+?)(?=\n### |\Z)', qa_section.group(1), re.DOTALL)
    for q, a in qas[:3]:
        q = re.sub(r'\s+', ' ', q.strip())
        a = a.strip()
        if len(a) <= 30:
            continue

        # Keep the first five lines of the answer, capped at 400 characters.
        a_short = '\n'.join(a.split('\n')[:5])
        if len(a_short) > 400:
            a_short = a_short[:400] + "..."

        cards.append({
            'front': clean_text(q),
            'back': clean_text(a_short).replace('\n', '<br>'),
            'tags': f"{base_tags} egzamin_praktyka"
        })
    return cards


def extract_from_file(filepath):
    """Extract flashcard data from one Markdown answer file.

    Cards are produced in this order: the main exam question, one card per
    "###" section, complexity cards, one comparison table and up to three
    practice questions.

    Args:
        filepath: Path of a UTF-8 Markdown file. A name of the form
            ``<number>-<slug>.md`` supplies the question number used in the
            tags; otherwise ``"00"`` is used.

    Returns:
        A list of dicts with the keys ``'front'``, ``'back'`` and ``'tags'``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Question number from the file name, subject from the file body.
    name_match = re.match(r'(\d+)-(.+)\.md', os.path.basename(filepath))
    num = name_match.group(1) if name_match else "00"
    subj_match = re.search(r'Przedmiot:\s*(\w+)', content)
    subject = subj_match.group(1) if subj_match else "Ogólne"
    base_tags = f"egzamin_magisterski pyt{num} {subject}"

    cards = []

    main_card = _main_question_card(content, base_tags)
    if main_card:
        cards.append(main_card)

    cards.extend(_section_cards(content, base_tags))
    cards.extend(_complexity_cards(content, base_tags))

    comparison_card = _comparison_card(content, base_tags, num)
    if comparison_card:
        cards.append(comparison_card)

    cards.extend(_practice_cards(content, base_tags))
    return cards
def _single_line_field(value):
    """Make *value* safe for one tab-separated record.

    Tabs become spaces and every line break becomes ``<br>``, so a field can
    neither add a column nor split the note across several lines.
    """
    return '<br>'.join(value.replace('\t', ' ').splitlines())


def main(odpowiedzi_dir=None, output_file=None):
    """Generate the Anki import file from all Markdown answer files.

    Every ``*.md`` file in *odpowiedzi_dir* is processed in sorted order,
    cards with an already seen front are dropped, and the result is written
    as a tab-separated file preceded by Anki file headers.

    Args:
        odpowiedzi_dir: Directory with the Markdown answers. Defaults to
            ``/home/kuchy/praca_magisterska/pytania/odpowiedzi``.
        output_file: Path of the file to write. Defaults to
            ``/home/kuchy/praca_magisterska/pytania/anki_egzamin_magisterski.txt``.

    Raises:
        OSError: If the output file cannot be written.
    """
    odpowiedzi_dir = Path(odpowiedzi_dir or "/home/kuchy/praca_magisterska/pytania/odpowiedzi")
    output_file = Path(output_file or "/home/kuchy/praca_magisterska/pytania/anki_egzamin_magisterski.txt")

    all_cards = []
    for md_file in sorted(odpowiedzi_dir.glob("*.md")):
        print(f"Processing: {md_file.name}", end=" ")
        try:
            cards = extract_from_file(md_file)
        except Exception as e:
            # Best effort: one unreadable file must not abort the export.
            print(f"→ ERROR: {e}")
        else:
            all_cards.extend(cards)
            print(f"{len(cards)} cards")

    # Remove potential duplicates (same front); the first occurrence wins.
    seen = set()
    unique_cards = []
    for card in all_cards:
        if card['front'] not in seen:
            seen.add(card['front'])
            unique_cards.append(card)

    with open(output_file, 'w', encoding='utf-8') as f:
        # Anki file headers
        f.write("#separator:tab\n")
        f.write("#html:true\n")
        f.write("#tags column:3\n")
        f.write("#deck:Egzamin Magisterski ISY\n")
        f.write("#notetype:Basic\n")
        f.write("\n")
        for card in unique_cards:
            # One note per line: fields are unquoted, so a raw tab or line
            # break inside a field would corrupt the record.
            front = _single_line_field(card['front'])
            back = _single_line_field(card['back'])
            tags = card['tags']
            f.write(f"{front}\t{back}\t{tags}\n")

    print(f"\n{'='*50}")
    print(f"✅ Generated {len(unique_cards)} unique flashcards")
    print(f"📁 Saved to: {output_file}")
    print(f"{'='*50}")
    print("\n📋 IMPORT INSTRUCTIONS:")
    print("-" * 40)
    print("Anki Desktop:")
    print(" 1. File → Import")
    print(f" 2. Select: {output_file.name}")
    print(" 3. Verify: Fields separated by Tab")
    print(" 4. Check: Allow HTML in fields")
    print(" 5. Click Import")
    print()
    print("AnkiWeb / AnkiDroid:")
    print(" 1. First import on Anki Desktop")
    print(" 2. Click Sync to upload to AnkiWeb")
    print(" 3. Sync on mobile to download")
# Run the generator only when the file is executed as a script.
if __name__ == "__main__":
    main()