#!/usr/bin/env python3
"""
Generate Anki flashcards with ACTUAL substantive answers, not just headers.

Reads Markdown answer files, extracts question/answer pairs with regular
expressions and writes a tab-separated text file that Anki can import.
Card fields are HTML (the output declares ``#html:true``), so Markdown
emphasis is converted to ``<b>``/``<i>`` and line breaks to ``<br>``.
"""

import os
import re
from pathlib import Path


def clean_text(text):
    """Convert a Markdown fragment into a single-line HTML string for Anki.

    Args:
        text: Raw text taken from a Markdown file (may be ``None`` or empty).

    Returns:
        The cleaned text: ``**bold**`` -> ``<b>bold</b>``, ``*italic*`` ->
        ``<i>italic</i>``, tabs replaced by spaces, double quotes escaped as
        ``&quot;``, runs of spaces collapsed, surrounding whitespace removed
        and inner line breaks replaced by ``<br>``. Returns ``""`` for falsy
        input.
    """
    if not text:
        return ""
    text = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', text)
    # Single-asterisk emphasis only: the lookarounds reject '**' and require
    # the asterisks to hug the text, so "2 * 3 * 4" is left untouched.
    text = re.sub(
        r'(?<!\*)\*(?!\s)([^*\n]+?)(?<!\s)\*(?!\*)', r'<i>\1</i>', text
    )
    # A tab is the field separator of the output file.
    text = text.replace('\t', ' ')
    text = text.replace('"', '&quot;')
    text = re.sub(r' +', ' ', text).strip()
    # A raw newline would end the note's row in the tab-separated output;
    # keep the visual line break as HTML instead.
    return re.sub(r'\s*\n\s*', '<br>', text)


def extract_real_answer(content, section_name):
    """Extract actual content from a ``###`` section, not just headers.

    Args:
        content: Full Markdown text of an answer file.
        section_name: Title of the ``###`` section (an optional ``N.``
            numbering prefix in the heading is ignored).

    Returns:
        Up to six items joined with ``<br>``, or ``None`` when the section
        is missing or yields nothing. Items are, in order: each ``####``
        sub-header with its first content line, then each ``**term**``
        bullet; if neither exists, up to two short plain paragraphs.
    """
    # Find the section: everything up to the next line starting with '### '.
    pattern = (
        rf'### (?:\d+\.\s*)?{re.escape(section_name)}\s*\n((?:(?!^### ).)+)'
    )
    match = re.search(pattern, content, re.MULTILINE | re.DOTALL)
    if not match:
        return None

    body = match.group(1).strip()
    lines = []

    # Sub-headers with their first line of content.
    subheader_pattern = r'#### ([^\n]+)\n([^\n#]+)'
    for sub_header, first_line in re.findall(subheader_pattern, body):
        lines.append(f"{sub_header.strip()}: {first_line.strip()}")

    # Bullet points of the form "- **term**: description".
    bullet_pattern = r'[-•]\s*\*\*([^*]+)\*\*[:\s-]*([^\n]*)'
    for term, desc in re.findall(bullet_pattern, body):
        if desc.strip():
            lines.append(f"• {term.strip()}: {desc.strip()}")
        else:
            lines.append(f"• {term.strip()}")

    # No structured content: fall back to the first plain paragraphs,
    # skipping code fences and tables.
    if not lines:
        paras = [
            p.strip()
            for p in body.split('\n\n')
            if p.strip() and not p.startswith('```') and not p.startswith('|')
        ]
        for p in paras[:2]:
            if 20 < len(p) < 400:
                lines.append(p)

    return '<br>'.join(lines[:6]) if lines else None


def extract_cards(filepath):
    """Extract flashcards from one Markdown answer file.

    Three kinds of cards are produced:
      * one main card from the ``## Pytanie`` question (tag
        ``pytanie_glowne``),
      * one concept card per sufficiently long ``###`` section (tag
        ``szczegoly``),
      * up to five cards from ``### Qn:`` practice questions (tag ``qa``).

    Args:
        filepath: Path to a UTF-8 Markdown file. A leading ``NN-`` in the
            file name becomes the ``pytNN`` tag (``pyt00`` otherwise).

    Returns:
        A list of dicts with the keys ``'front'``, ``'back'`` and ``'tags'``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    cards = []
    filename = os.path.basename(filepath)

    match = re.match(r'(\d+)-(.+)\.md', filename)
    num = match.group(1) if match else "00"

    subj_match = re.search(r'Przedmiot:\s*(\w+)', content)
    subject = subj_match.group(1) if subj_match else "Ogólne"

    base_tags = f"egzamin_magisterski pyt{num} {subject}"

    # Get main question (bold text right below the "## Pytanie" heading).
    q_match = re.search(
        r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL
    )
    main_question = (
        re.sub(r'\s+', ' ', q_match.group(1).strip()) if q_match else None
    )

    # ===============================================
    # MAIN CARD: Question with REAL answer summary
    # ===============================================
    if main_question:
        # Build a real answer from the main sections
        answer_parts = []

        # For automata question - extract key facts about each automaton
        lowered = main_question.lower()
        if "automat" in lowered or "maszyn" in lowered:
            # FA
            fa_match = re.search(
                r'Automat Skończony.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*',
                content, re.DOTALL
            )
            if fa_match:
                answer_parts.append(
                    f"Automat Skończony (FA): {fa_match.group(1).strip()}"
                )

            # PDA
            pda_match = re.search(
                r'Automat ze Stosem.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*',
                content, re.DOTALL
            )
            if pda_match:
                answer_parts.append(
                    f"Automat ze Stosem (PDA): {pda_match.group(1).strip()}"
                )

            # TM
            tm_match = re.search(
                r'Maszyna Turinga.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*',
                content, re.DOTALL
            )
            if tm_match:
                answer_parts.append(
                    f"Maszyna Turinga (TM): {tm_match.group(1).strip()}"
                )

        # Generic extraction if specific didn't work
        if not answer_parts:
            # Look for key definitions/summaries
            key_patterns = [
                r'#### Definicja\s*\n([^\n#]+)',
                r'#### Charakterystyka\s*\n([^\n#]+)',
                r'\*\*Definicja[:\s]*\*\*\s*([^\n]+)',
            ]
            for pattern in key_patterns:
                for found in re.findall(pattern, content)[:3]:
                    if len(found) > 20:
                        answer_parts.append(found.strip())

        # Still nothing? Get first substantive paragraphs from main answer
        if not answer_parts:
            main_answer = re.search(
                r'## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\Z)',
                content, re.DOTALL
            )
            if main_answer:
                # Skip headers, get actual content
                text = main_answer.group(1)
                paras = re.findall(r'\n\n([^#\n][^\n]{50,300})', text)
                answer_parts = paras[:3]

        if answer_parts:
            answer = '<br><br>'.join([clean_text(p) for p in answer_parts])
            cards.append({
                'front': clean_text(main_question),
                'back': answer,
                'tags': f"{base_tags} pytanie_glowne"
            })

    # ===============================================
    # CONCEPT CARDS: Specific topics with real content
    # ===============================================

    # Find all ### sections and extract their actual content
    sections = re.findall(
        r'^### (?:\d+\.\s*)?([^\n]+)\n((?:(?!^### ).)*)',
        content, re.MULTILINE | re.DOTALL
    )

    for header, body in sections:
        header = header.strip()
        body = body.strip()

        # Skip short sections, mnemonics, examples and quoted headings
        if (len(body) < 80 or 'Przykład' in header
                or 'Mnemonic' in header or '"' in header):
            continue

        # Extract real content
        answer_lines = []

        # Get definition if present (at most its first two lines)
        def_match = re.search(
            r'#### Definicja[^\n]*\n([^\n#]+(?:\n[^\n#]+)?)', body
        )
        if def_match:
            answer_lines.append(def_match.group(1).strip())

        # Get characterization
        char_match = re.search(
            r'#### Charakterystyka\s*\n((?:[-•][^\n]+\n?)+)', body
        )
        if char_match:
            bullets = re.findall(
                r'[-•]\s*\*\*([^*]+)\*\*[:\s]*([^\n]*)', char_match.group(1)
            )
            for term, desc in bullets[:4]:
                answer_lines.append(
                    f"• {term}: {desc.strip()}" if desc else f"• {term}"
                )

        # Get bullet points if no structured content yet
        if not answer_lines:
            bullets = re.findall(
                r'[-•]\s*\*\*([^*]+)\*\*[:\s]*([^\n]*)', body
            )
            for term, desc in bullets[:5]:
                answer_lines.append(
                    f"• {term}: {desc.strip()}" if desc else f"• {term}"
                )

        # Get first paragraph if still nothing
        if not answer_lines:
            first_para = re.search(
                r'^([^#\n\-•|`][^\n]{30,250})', body, re.MULTILINE
            )
            if first_para:
                answer_lines.append(first_para.group(1))

        if answer_lines:
            question = (
                f"Wyjaśnij: {header}" if not header.endswith('?') else header
            )
            answer = '<br>'.join([clean_text(part) for part in answer_lines])
            cards.append({
                'front': clean_text(question),
                'back': answer,
                'tags': f"{base_tags} szczegoly"
            })

    # ===============================================
    # Q&A CARDS: From practice questions section
    # ===============================================
    qa_matches = re.findall(
        r'### Q\d+:\s*["\']?([^"\'?\n]+)\?*["\']?\s*\n.*?Odpowiedź:\s*\n(.+?)(?=\n### |\n## |\Z)',
        content, re.DOTALL
    )

    for question, answer in qa_matches[:5]:
        question = question.strip()
        answer = answer.strip()

        # Clean up answer - keep the first meaningful lines, dropping
        # blank lines, code fences and table rows
        clean_answer = []
        for line in answer.split('\n')[:6]:
            line = line.strip()
            if line and not line.startswith('```') and not line.startswith('|'):
                clean_answer.append(line)

        if clean_answer:
            cards.append({
                'front': clean_text(question + '?'),
                'back': '<br>'.join(
                    [clean_text(line) for line in clean_answer]
                ),
                'tags': f"{base_tags} qa"
            })

    return cards


def main():
    """Build the Anki import file from every ``*.md`` answer file.

    Processes the files in sorted order, reports the card count (or the
    error) per file, drops cards whose first 100 front characters repeat an
    earlier card, and writes the tab-separated import file with Anki file
    headers.
    """
    odpowiedzi_dir = Path("/home/kuchy/praca_magisterska/pytania/odpowiedzi")
    output_file = Path(
        "/home/kuchy/praca_magisterska/pytania/anki_egzamin_magisterski.txt"
    )

    all_cards = []

    for md_file in sorted(odpowiedzi_dir.glob("*.md")):
        print(f"Processing: {md_file.name}", end=" ")
        # Best effort: one unreadable or malformed file must not abort the
        # whole run, so the error is reported and the file skipped.
        try:
            cards = extract_cards(md_file)
            all_cards.extend(cards)
            print(f"→ {len(cards)} cards")
        except Exception as e:
            print(f"→ ERROR: {e}")

    # Remove duplicates (first occurrence wins)
    seen = set()
    unique_cards = []
    for card in all_cards:
        key = card['front'][:100]
        if key not in seen:
            seen.add(key)
            unique_cards.append(card)

    # Write file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("#separator:Tab\n")
        f.write("#html:true\n")
        f.write("#notetype:Basic\n")
        f.write("#deck:Egzamin Magisterski ISY\n")
        f.write("#columns:Front\tBack\tTags\n")
        f.write("#tags column:3\n")
        f.write("\n")

        for card in unique_cards:
            # Tabs separate the columns, so none may remain inside a field.
            front = card['front'].replace('\t', ' ')
            back = card['back'].replace('\t', ' ')
            tags = card['tags']
            f.write(f"{front}\t{back}\t{tags}\n")

    print(f"\n✅ Generated {len(unique_cards)} flashcards")
    print(f"📁 Output: {output_file}")


if __name__ == "__main__":
    main()