praca_magisterska/pytania/generate_anki_v3.py

#!/usr/bin/env python3
"""
Generate Anki flashcards with ACTUAL substantive answers, not just headers.
"""

import os
import re
from pathlib import Path

def clean_text(text):
    """Convert a Markdown fragment into single-line HTML for an Anki field.

    Args:
        text: Markdown text, or a falsy value (``None`` / ``""``).

    Returns:
        The text with ``**bold**`` / ``*italic*`` turned into ``<b>`` / ``<i>``,
        tabs and line breaks replaced by a space, double quotes replaced by
        ``&quot;``, runs of spaces collapsed, and surrounding whitespace
        stripped. A falsy input yields ``""``.
    """
    if not text:
        return ""
    # The output file holds one note per line with tab-separated fields, so a
    # raw line break inside a field would split the note across rows. Fold
    # every line break (plus the whitespace around it) into a single space.
    text = re.sub(r'\s*[\r\n]+\s*', ' ', text)
    text = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', text)
    # Single asterisks only: the lookarounds keep leftover "**" from matching.
    text = re.sub(r'(?<!\*)\*([^*]+)\*(?!\*)', r'<i>\1</i>', text)
    text = text.replace('\t', ' ')
    text = text.replace('"', '&quot;')
    text = re.sub(r' +', ' ', text)
    return text.strip()

def extract_real_answer(content, section_name):
    """Summarise the body of one ``### `` section as an HTML snippet.

    Args:
        content: Full Markdown text of a notes file.
        section_name: Heading text to look for, without the ``### `` marker
            and without an optional leading ``N. `` number.

    Returns:
        Up to six entries joined with ``<br>``: first every ``#### ``
        sub-heading paired with the line directly below it, then every
        bullet that starts with a bold term. When neither is found, the first
        two plain paragraphs of 21-399 characters are used instead. Returns
        ``None`` when the section is missing or yields nothing.
    """
    # Anchor the heading at the start of a line; otherwise "### Name" also
    # matches inside "#### Name" and a sub-heading is treated as a section.
    pattern = rf'^### (?:\d+\.\s*)?{re.escape(section_name)}\s*\n((?:(?!^### ).)+)'
    match = re.search(pattern, content, re.MULTILINE | re.DOTALL)
    if not match:
        return None

    body = match.group(1).strip()
    lines = []

    # Sub-headings with the first line of content directly below them.
    subheader_pattern = r'#### ([^\n]+)\n([^\n#]+)'
    for sub_header, first_line in re.findall(subheader_pattern, body):
        lines.append(f"<b>{sub_header.strip()}</b>: {first_line.strip()}")

    # Bullets led by a bold term. The separator between term and description
    # must stay on the bullet's own line: if it could cross a newline, a
    # term-only bullet would swallow the following line as its description.
    bullet_pattern = r'[-•]\s*\*\*([^*]+)\*\*(?:[:\-]|[^\S\n])*([^\n]*)'
    for term, desc in re.findall(bullet_pattern, body):
        if desc.strip():
            lines.append(f"• <b>{term.strip()}</b>: {desc.strip()}")
        else:
            lines.append(f"• <b>{term.strip()}</b>")

    # Fallback: plain paragraphs. Strip before testing the prefix so a code
    # fence or table row preceded by stray whitespace is still filtered out.
    if not lines:
        paras = [p.strip() for p in body.split('\n\n')]
        paras = [p for p in paras if p and not p.startswith(('```', '|'))]
        for p in paras[:2]:
            if 20 < len(p) < 400:
                lines.append(p)

    return '<br>'.join(lines[:6]) if lines else None

# Bullet of the form "- **Term**: description". The separator between the bold
# term and the description may contain colons and same-line whitespace only;
# allowing a newline there would make a term-only bullet swallow the next line.
_BOLD_BULLET_PATTERN = r'[-•]\s*\*\*([^*]+)\*\*(?::|[^\S\n])*([^\n]*)'


def _format_bullets(text, limit):
    """Return at most `limit` '• <b>term</b>: desc' lines for bold-led bullets in `text`."""
    formatted = []
    for term, desc in re.findall(_BOLD_BULLET_PATTERN, text)[:limit]:
        formatted.append(f"• <b>{term}</b>: {desc.strip()}" if desc else f"• <b>{term}</b>")
    return formatted


def _main_answer_parts(content, main_question):
    """Collect the raw text fragments that form the back of the main-question card."""
    answer_parts = []

    # Automata questions: pull the recognised language class of each model.
    lowered = main_question.lower()
    if "automat" in lowered or "maszyn" in lowered:
        automata = [
            ('Automat Skończony', 'FA'),
            ('Automat ze Stosem', 'PDA'),
            ('Maszyna Turinga', 'TM'),
        ]
        for name, abbr in automata:
            found = re.search(
                name + r'.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*',
                content, re.DOTALL
            )
            if found:
                answer_parts.append(f"<b>{name} ({abbr})</b>: {found.group(1).strip()}")

    # Generic fallback: first lines under definition/characteristics markers.
    if not answer_parts:
        key_patterns = [
            r'#### Definicja\s*\n([^\n#]+)',
            r'#### Charakterystyka\s*\n([^\n#]+)',
            r'\*\*Definicja[:\s]*\*\*\s*([^\n]+)',
        ]
        for pattern in key_patterns:
            for fragment in re.findall(pattern, content)[:3]:
                if len(fragment) > 20:
                    answer_parts.append(fragment.strip())

    # Last resort: leading non-heading paragraphs of the main answer section.
    if not answer_parts:
        main_answer = re.search(r'## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\Z)', content, re.DOTALL)
        if main_answer:
            paras = re.findall(r'\n\n([^#\n][^\n]{50,300})', main_answer.group(1))
            answer_parts = paras[:3]

    return answer_parts


def _concept_cards(content, base_tags):
    """Build one 'szczegoly' card per substantial ``### `` section of `content`."""
    cards = []
    sections = re.findall(
        r'^### (?:\d+\.\s*)?([^\n]+)\n((?:(?!^### ).)*)',
        content,
        re.MULTILINE | re.DOTALL
    )

    for header, body in sections:
        header = header.strip()
        body = body.strip()

        # Skip short sections, mnemonics, examples and quoted headings.
        if len(body) < 80 or 'Przykład' in header or 'Mnemonic' in header or '"' in header:
            continue

        answer_lines = []

        # Definition: up to two lines. Fold them into one line, because a raw
        # newline in a field would break the one-note-per-line output file.
        def_match = re.search(r'#### Definicja[^\n]*\n([^\n#]+(?:\n[^\n#]+)?)', body)
        if def_match:
            answer_lines.append(re.sub(r'\s+', ' ', def_match.group(1).strip()))

        # Characteristics: the run of bullet lines directly under the heading.
        char_match = re.search(r'#### Charakterystyka\s*\n((?:[-•][^\n]+\n?)+)', body)
        if char_match:
            answer_lines.extend(_format_bullets(char_match.group(1), 4))

        # No structured content yet: any bold-led bullets in the section.
        if not answer_lines:
            answer_lines.extend(_format_bullets(body, 5))

        # Still nothing: first plain line that is not a heading, bullet,
        # table row or code fence.
        if not answer_lines:
            first_para = re.search(r'^([^#\n\-•|`][^\n]{30,250})', body, re.MULTILINE)
            if first_para:
                answer_lines.append(first_para.group(1))

        if answer_lines:
            question = header if header.endswith('?') else f"Wyjaśnij: {header}"
            cards.append({
                'front': clean_text(question),
                'back': '<br>'.join([clean_text(line) for line in answer_lines]),
                'tags': f"{base_tags} szczegoly"
            })

    return cards


def _qa_cards(content, base_tags):
    """Build 'qa' cards from the first five ``### Qn:`` practice questions."""
    cards = []
    qa_matches = re.findall(
        r'### Q\d+:\s*["\']?([^"\'?\n]+)\?*["\']?\s*\n.*?Odpowiedź:\s*\n(.+?)(?=\n### |\n## |\Z)',
        content, re.DOTALL
    )

    for question, answer in qa_matches[:5]:
        question = question.strip()

        # Keep the non-empty lines among the first six, dropping code fences
        # and table rows.
        kept = []
        for raw_line in answer.strip().split('\n')[:6]:
            line = raw_line.strip()
            if line and not line.startswith(('```', '|')):
                kept.append(line)

        if kept:
            cards.append({
                'front': clean_text(question + '?'),
                'back': '<br>'.join([clean_text(line) for line in kept]),
                'tags': f"{base_tags} qa"
            })

    return cards


def extract_cards(filepath):
    """Extract flashcards from one Markdown notes file.

    Three kinds of cards are produced, in this order: one card for the main
    question (tag ``pytanie_glowne``), one card per substantial ``### ``
    section (tag ``szczegoly``), and up to five cards from ``### Qn:``
    practice questions (tag ``qa``). Field text is passed through the
    module's ``clean_text``.

    Args:
        filepath: Path of a UTF-8 Markdown file. A leading ``NN-`` in the
            file name becomes the ``pytNN`` tag (``pyt00`` if absent).

    Returns:
        A list of dicts with the keys ``'front'``, ``'back'`` and ``'tags'``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    filename = os.path.basename(filepath)
    name_match = re.match(r'(\d+)-(.+)\.md', filename)
    num = name_match.group(1) if name_match else "00"

    subj_match = re.search(r'Przedmiot:\s*(\w+)', content)
    subject = subj_match.group(1) if subj_match else "Ogólne"
    base_tags = f"egzamin_magisterski pyt{num} {subject}"

    # Main question: the bold text under "## Pytanie", whitespace-normalised.
    q_match = re.search(r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL)
    main_question = re.sub(r'\s+', ' ', q_match.group(1).strip()) if q_match else None

    cards = []

    if main_question:
        answer_parts = _main_answer_parts(content, main_question)
        if answer_parts:
            cards.append({
                'front': clean_text(main_question),
                'back': '<br><br>'.join([clean_text(p) for p in answer_parts]),
                'tags': f"{base_tags} pytanie_glowne"
            })

    cards.extend(_concept_cards(content, base_tags))
    cards.extend(_qa_cards(content, base_tags))

    return cards

def main(odpowiedzi_dir=None, output_file=None):
    """Generate the Anki import file from every ``*.md`` file in a directory.

    Args:
        odpowiedzi_dir: Directory holding the Markdown answer files. Defaults
            to the author's original absolute path.
        output_file: Path of the tab-separated file to write. Defaults to the
            author's original absolute path.

    Files are processed in sorted order. A file that fails to parse is
    reported on stdout and skipped. Cards whose front text shares the same
    first 100 characters as an earlier card are dropped.
    """
    odpowiedzi_dir = Path(odpowiedzi_dir or "/home/kuchy/praca_magisterska/pytania/odpowiedzi")
    output_file = Path(output_file or "/home/kuchy/praca_magisterska/pytania/anki_egzamin_magisterski.txt")

    all_cards = []

    for md_file in sorted(odpowiedzi_dir.glob("*.md")):
        print(f"Processing: {md_file.name}", end=" ")
        try:
            cards = extract_cards(md_file)
            all_cards.extend(cards)
            print(f"→ {len(cards)} cards")
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the batch.
            print(f"→ ERROR: {e}")

    # Remove duplicates, keeping the first card seen for each front prefix.
    seen = set()
    unique_cards = []
    for card in all_cards:
        key = card['front'][:100]
        if key not in seen:
            seen.add(key)
            unique_cards.append(card)

    def single_line(value):
        # The file is one note per line with tab-separated fields, so tabs and
        # line breaks inside a field would corrupt the row.
        return ' '.join(value.replace('\t', ' ').splitlines())

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("#separator:Tab\n")
        f.write("#html:true\n")
        f.write("#notetype:Basic\n")
        f.write("#deck:Egzamin Magisterski ISY\n")
        f.write("#columns:Front\tBack\tTags\n")
        f.write("#tags column:3\n")
        f.write("\n")

        for card in unique_cards:
            front = single_line(card['front'])
            back = single_line(card['back'])
            tags = card['tags']
            f.write(f"{front}\t{back}\t{tags}\n")

    print(f"\n✅ Generated {len(unique_cards)} flashcards")
    print(f"📁 Output: {output_file}")

# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
feat: automatic and manual anki cards generator 2025-12-28 17:42:23 +01:00			`#!/usr/bin/env python3`
			`"""`
			`Generate Anki flashcards with ACTUAL substantive answers, not just headers.`
			`"""`

			`import os`
			`import re`
			`from pathlib import Path`

			`def clean_text(text):`
			`"""Clean text for Anki."""`
			`if not text:`
			`return ""`
			`text = re.sub(r'\\(.+?)\\', r'<b>\1</b>', text)`
			`text = re.sub(r'(?<!\)\([^]+)\(?!\*)', r'<i>\1</i>', text)`
			`text = text.replace('\t', ' ')`
			`text = text.replace('"', '"')`
			`text = re.sub(r' +', ' ', text)`
			`return text.strip()`

			`def extract_real_answer(content, section_name):`
			`"""Extract actual content from a section, not just headers."""`
			`# Find the section`
			`pattern = rf'### (?:\d+\.\s)?{re.escape(section_name)}\s\n((?:(?!^### ).)+)'`
			`match = re.search(pattern, content, re.MULTILINE \| re.DOTALL)`
			`if not match:`
			`return None`

			`body = match.group(1).strip()`

			`# Extract meaningful content`
			`lines = []`

			`# Get subheaders with their first line of content`
			`subheader_pattern = r'#### ([^\n]+)\n([^\n#]+)'`
			`for sub_header, first_line in re.findall(subheader_pattern, body):`
			`lines.append(f"<b>{sub_header.strip()}</b>: {first_line.strip()}")`

			`# Get bullet points`
			`bullet_pattern = r'[-•]\s\\([^]+)\\[:\s-]([^\n])'`
			`for term, desc in re.findall(bullet_pattern, body):`
			`if desc.strip():`
			`lines.append(f"• <b>{term.strip()}</b>: {desc.strip()}")`
			`else:`
			`lines.append(f"• <b>{term.strip()}</b>")`

			`# If no structured content, get paragraphs`
			`if not lines:`
			`paras = [p.strip() for p in body.split('\n\n')`
			if p.strip() and not p.startswith('```') and not p.startswith('\|')]
			`for p in paras[:2]:`
			`if len(p) > 20 and len(p) < 400:`
			`lines.append(p)`

			`return '<br>'.join(lines[:6]) if lines else None`

			`def extract_cards(filepath):`
			`"""Extract flashcards from a file."""`
			`with open(filepath, 'r', encoding='utf-8') as f:`
			`content = f.read()`

			`cards = []`
			`filename = os.path.basename(filepath)`
			`match = re.match(r'(\d+)-(.+)\.md', filename)`
			`num = match.group(1) if match else "00"`

			`subj_match = re.search(r'Przedmiot:\s*(\w+)', content)`
			`subject = subj_match.group(1) if subj_match else "Ogólne"`
			`base_tags = f"egzamin_magisterski pyt{num} {subject}"`

			`# Get main question`
			`q_match = re.search(r'## Pytanie\s\n\s\\["\']?(.+?)["\']?\\', content, re.DOTALL)`
			`main_question = re.sub(r'\s+', ' ', q_match.group(1).strip()) if q_match else None`

			`# ===============================================`
			`# MAIN CARD: Question with REAL answer summary`
			`# ===============================================`
			`if main_question:`
			`# Build a real answer from the main sections`
			`answer_parts = []`

			`# For automata question - extract key facts about each automaton`
			`if "automat" in main_question.lower() or "maszyn" in main_question.lower():`
			`# FA`
			`fa_match = re.search(r'Automat Skończony.?Rozpoznawana klasa języków\s\n\s\\([^]+)\\', content, re.DOTALL)`
			`if fa_match:`
			`answer_parts.append(f"<b>Automat Skończony (FA)</b>: {fa_match.group(1).strip()}")`

			`# PDA`
			`pda_match = re.search(r'Automat ze Stosem.?Rozpoznawana klasa języków\s\n\s\\([^]+)\\', content, re.DOTALL)`
			`if pda_match:`
			`answer_parts.append(f"<b>Automat ze Stosem (PDA)</b>: {pda_match.group(1).strip()}")`

			`# TM`
			`tm_match = re.search(r'Maszyna Turinga.?Rozpoznawana klasa języków\s\n\s\\([^]+)\\', content, re.DOTALL)`
			`if tm_match:`
			`answer_parts.append(f"<b>Maszyna Turinga (TM)</b>: {tm_match.group(1).strip()}")`

			`# Generic extraction if specific didn't work`
			`if not answer_parts:`
			`# Look for key definitions/summaries`
			`key_patterns = [`
			`r'#### Definicja\s*\n([^\n#]+)',`
			`r'#### Charakterystyka\s*\n([^\n#]+)',`
			`r'\\Definicja[:\s]\\\s([^\n]+)',`
			`]`
			`for pattern in key_patterns:`
			`for match in re.findall(pattern, content)[:3]:`
			`if len(match) > 20:`
			`answer_parts.append(match.strip())`

			`# Still nothing? Get first substantive paragraph from main answer`
			`if not answer_parts:`
			`main_answer = re.search(r'## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## \|\Z)', content, re.DOTALL)`
			`if main_answer:`
			`# Skip headers, get actual content`
			`text = main_answer.group(1)`
			`paras = re.findall(r'\n\n([^#\n][^\n]{50,300})', text)`
			`answer_parts = paras[:3]`

			`if answer_parts:`
			`answer = '<br><br>'.join([clean_text(p) for p in answer_parts])`
			`cards.append({`
			`'front': clean_text(main_question),`
			`'back': answer,`
			`'tags': f"{base_tags} pytanie_glowne"`
			`})`

			`# ===============================================`
			`# CONCEPT CARDS: Specific topics with real content`
			`# ===============================================`
			`# Find all ### sections and extract their actual content`
			`sections = re.findall(`
			`r'^### (?:\d+\.\s)?([^\n]+)\n((?:(?!^### ).))',`
			`content,`
			`re.MULTILINE \| re.DOTALL`
			`)`

			`for header, body in sections:`
			`header = header.strip()`
			`body = body.strip()`

			`# Skip short sections, mnemonics, examples`
			`if len(body) < 80 or 'Przykład' in header or 'Mnemonic' in header or '"' in header:`
			`continue`

			`# Extract real content`
			`answer_lines = []`

			`# Get definition if present`
			`def_match = re.search(r'#### Definicja[^\n]*\n([^\n#]+(?:\n[^\n#]+)?)', body)`
			`if def_match:`
			`answer_lines.append(def_match.group(1).strip())`

			`# Get characterization`
			`char_match = re.search(r'#### Charakterystyka\s*\n((?:[-•][^\n]+\n?)+)', body)`
			`if char_match:`
			`bullets = re.findall(r'[-•]\s\\([^]+)\\[:\s]([^\n])', char_match.group(1))`
			`for term, desc in bullets[:4]:`
			`answer_lines.append(f"• <b>{term}</b>: {desc.strip()}" if desc else f"• <b>{term}</b>")`

			`# Get bullet points if no structured content yet`
			`if not answer_lines:`
			`bullets = re.findall(r'[-•]\s\\([^]+)\\[:\s]([^\n])', body)`
			`for term, desc in bullets[:5]:`
			`answer_lines.append(f"• <b>{term}</b>: {desc.strip()}" if desc else f"• <b>{term}</b>")`

			`# Get first paragraph if still nothing`
			`if not answer_lines:`
			first_para = re.search(r'^([^#\n\-•\|`][^\n]{30,250})', body, re.MULTILINE)
			`if first_para:`
			`answer_lines.append(first_para.group(1))`

			`if answer_lines:`
			`question = f"Wyjaśnij: {header}" if not header.endswith('?') else header`
			`answer = '<br>'.join([clean_text(l) for l in answer_lines])`

			`cards.append({`
			`'front': clean_text(question),`
			`'back': answer,`
			`'tags': f"{base_tags} szczegoly"`
			`})`

			`# ===============================================`
			`# Q&A CARDS: From practice questions section`
			`# ===============================================`
			`qa_matches = re.findall(`
			`r'### Q\d+:\s["\']?([^"\'?\n]+)\?["\']?\s\n.?Odpowiedź:\s*\n(.+?)(?=\n### \|\n## \|\Z)',`
			`content, re.DOTALL`
			`)`

			`for question, answer in qa_matches[:5]:`
			`question = question.strip()`
			`answer = answer.strip()`

			`# Clean up answer - get first meaningful part`
			`answer_lines = answer.split('\n')`
			`clean_answer = []`
			`for line in answer_lines[:6]:`
			`line = line.strip()`
			if line and not line.startswith('```') and not line.startswith('\|'):
			`clean_answer.append(line)`

			`if clean_answer:`
			`cards.append({`
			`'front': clean_text(question + '?'),`
			`'back': '<br>'.join([clean_text(l) for l in clean_answer]),`
			`'tags': f"{base_tags} qa"`
			`})`

			`return cards`

			`def main():`
			`odpowiedzi_dir = Path("/home/kuchy/praca_magisterska/pytania/odpowiedzi")`
			`output_file = Path("/home/kuchy/praca_magisterska/pytania/anki_egzamin_magisterski.txt")`

			`all_cards = []`

			`for md_file in sorted(odpowiedzi_dir.glob("*.md")):`
			`print(f"Processing: {md_file.name}", end=" ")`
			`try:`
			`cards = extract_cards(md_file)`
			`all_cards.extend(cards)`
			`print(f"→ {len(cards)} cards")`
			`except Exception as e:`
			`print(f"→ ERROR: {e}")`

			`# Remove duplicates`
			`seen = set()`
			`unique_cards = []`
			`for card in all_cards:`
			`key = card['front'][:100]`
			`if key not in seen:`
			`seen.add(key)`
			`unique_cards.append(card)`

			`# Write file`
			`with open(output_file, 'w', encoding='utf-8') as f:`
			`f.write("#separator:Tab\n")`
			`f.write("#html:true\n")`
			`f.write("#notetype:Basic\n")`
			`f.write("#deck:Egzamin Magisterski ISY\n")`
			`f.write("#columns:Front\tBack\tTags\n")`
			`f.write("#tags column:3\n")`
			`f.write("\n")`

			`for card in unique_cards:`
			`front = card['front'].replace('\t', ' ')`
			`back = card['back'].replace('\t', ' ')`
			`tags = card['tags']`
			`f.write(f"{front}\t{back}\t{tags}\n")`

			`print(f"\n✅ Generated {len(unique_cards)} flashcards")`
			`print(f"📁 Output: {output_file}")`

			`if __name__ == "__main__":`
			`main()`