# praca_magisterska/pytania/generate_anki_v3.py

#!/usr/bin/env python3
"""
Generate Anki flashcards with ACTUAL substantive answers, not just headers.
"""

import os
import re
from pathlib import Path

def clean_text(text):
    """Convert a Markdown snippet into a single-line HTML field for Anki.

    Transformations, in order:
      * ``**bold**`` becomes ``<b>bold</b>`` and ``*italic*`` becomes
        ``<i>italic</i>``;
      * tabs become spaces (tab is the column separator of the output file);
      * double quotes become ``&quot;``;
      * runs of spaces collapse to one and surrounding whitespace is removed;
      * remaining line breaks become ``<br>``, so a multi-line value can never
        split one card across several rows of the tab-separated file.

    Args:
        text: Markdown text; may be ``None`` or empty.

    Returns:
        The cleaned text, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""
    text = re.sub(r'\*\*(.+?)\*\*', r'<b>\1</b>', text)
    text = re.sub(r'(?<!\*)\*([^*]+)\*(?!\*)', r'<i>\1</i>', text)
    text = text.replace('\t', ' ')
    text = text.replace('"', '&quot;')
    text = re.sub(r' +', ' ', text).strip()
    # Strip first so leading/trailing blank lines do not turn into <br>; the
    # optional spaces swallow indentation left around each line break.
    return re.sub(r' ?(?:\r\n|\r|\n) ?', '<br>', text)

def extract_real_answer(content, section_name):
    """Summarise the body of one ``###`` section as an HTML snippet.

    The section is located by its heading (an optional ``N.`` number prefix is
    accepted) and its body runs up to the next ``##`` or ``###`` heading. The
    summary is built from, in order:

      1. every ``####`` sub-heading together with the first line below it;
      2. every bullet that starts with a ``**bold**`` term, with the rest of
         that same line as its description;
      3. only if neither produced anything: the first two paragraphs that are
         not code fences or tables, kept when 21-399 characters long.

    Args:
        content: Full Markdown text of a document.
        section_name: Heading text of the wanted section, without the ``###``
            marker or number prefix.

    Returns:
        At most six entries joined by ``<br>``, or ``None`` when the section
        is missing or nothing could be extracted.
    """
    # "^" keeps "### X" from matching inside a "#### X" sub-heading; the
    # look-ahead ends the body at the next level-2 or level-3 heading.
    section_re = (
        rf'^### (?:\d+\.\s*)?{re.escape(section_name)}\s*\n'
        r'((?:(?!^#{2,3} ).)+)'
    )
    found = re.search(section_re, content, re.MULTILINE | re.DOTALL)
    if not found:
        return None

    body = found.group(1).strip()
    lines = []

    # Sub-headings with the first line of their content.
    for sub_header, first_line in re.findall(r'#### ([^\n]+)\n([^\n#]+)', body):
        lines.append(f"<b>{sub_header.strip()}</b>: {first_line.strip()}")

    # Bold-term bullets. The separator class is limited to same-line
    # characters so a bullet without a description does not swallow the
    # following line as its description.
    bullet_re = r'[-•][ \t]*\*\*([^*\n]+)\*\*[: \t-]*([^\n]*)'
    for term, desc in re.findall(bullet_re, body):
        term = term.strip()
        desc = desc.strip()
        lines.append(f"• <b>{term}</b>: {desc}" if desc else f"• <b>{term}</b>")

    # No structured content: fall back to plain paragraphs.
    if not lines:
        paragraphs = []
        for para in body.split('\n\n'):
            # Strip before testing the prefix so an indented table or code
            # fence is still recognised and skipped.
            para = para.strip()
            if para and not para.startswith(('```', '|')):
                paragraphs.append(para)
        lines = [p for p in paragraphs[:2] if 20 < len(p) < 400]

    return '<br>'.join(lines[:6]) if lines else None

def extract_cards(filepath):
    """Extract flashcards from one Markdown answer file.

    The file name is expected to look like ``<number>-<slug>.md``; the number
    becomes the ``pytNN`` tag (``00`` when the name does not match). The
    subject tag is read from a ``Przedmiot: <word>`` line in the content and
    defaults to ``Ogólne``.

    Three kinds of cards are produced, in this order:
      * one main card for the question under ``## Pytanie``;
      * one concept card per substantive ``###`` section;
      * up to five cards from ``### Q<n>:`` practice questions.

    Args:
        filepath: Path (``str`` or ``os.PathLike``) of a UTF-8 Markdown file.

    Returns:
        A list of dicts with the keys ``'front'``, ``'back'`` and ``'tags'``.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    filename = os.path.basename(filepath)
    name_match = re.match(r'(\d+)-(.+)\.md', filename)
    num = name_match.group(1) if name_match else "00"

    subj_match = re.search(r'Przedmiot:\s*(\w+)', content)
    subject = subj_match.group(1) if subj_match else "Ogólne"
    base_tags = f"egzamin_magisterski pyt{num} {subject}"

    cards = []
    main_card = _build_main_card(content, base_tags)
    if main_card:
        cards.append(main_card)
    cards.extend(_build_concept_cards(content, base_tags))
    cards.extend(_build_qa_cards(content, base_tags))
    return cards


# (card label, pattern) pairs for the automata question. Each pattern captures
# the bold text that follows the "Rozpoznawana klasa języków" line of the
# corresponding automaton.
_AUTOMATA_PATTERNS = (
    ("Automat Skończony (FA)",
     r'Automat Skończony.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*'),
    ("Automat ze Stosem (PDA)",
     r'Automat ze Stosem.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*'),
    ("Maszyna Turinga (TM)",
     r'Maszyna Turinga.*?Rozpoznawana klasa języków\s*\n\s*\*\*([^*]+)\*\*'),
)


def _collapse_whitespace(text):
    """Return ``text`` with every whitespace run (newlines too) as one space."""
    return ' '.join(text.split())


def _bold_bullets(text, limit):
    """Return up to ``limit`` ``• <b>term</b>: description`` lines from ``text``.

    Only bullets that start with a ``**bold**`` term are used. Matching stays
    on one line, so a bullet without a description does not pick up the next
    line as its description.
    """
    lines = []
    bullet_re = r'[-•][ \t]*\*\*([^*\n]+)\*\*[: \t-]*([^\n]*)'
    for term, desc in re.findall(bullet_re, text)[:limit]:
        desc = desc.strip()
        lines.append(f"• <b>{term}</b>: {desc}" if desc else f"• <b>{term}</b>")
    return lines


def _build_main_card(content, base_tags):
    """Build the card for the main ``## Pytanie`` question, or return ``None``."""
    q_match = re.search(r'## Pytanie\s*\n\s*\*\*["\']?(.+?)["\']?\*\*', content, re.DOTALL)
    if not q_match:
        return None
    main_question = re.sub(r'\s+', ' ', q_match.group(1).strip())
    if not main_question:
        return None

    answer_parts = []

    # Automata question: one fact per automaton (its recognised language class).
    lowered = main_question.lower()
    if "automat" in lowered or "maszyn" in lowered:
        for label, pattern in _AUTOMATA_PATTERNS:
            found = re.search(pattern, content, re.DOTALL)
            if found:
                # The bold span may wrap across lines; keep the field single-line.
                answer_parts.append(f"<b>{label}</b>: {_collapse_whitespace(found.group(1))}")

    # Generic extraction: first lines of definition/characteristic blocks.
    if not answer_parts:
        key_patterns = [
            r'#### Definicja\s*\n([^\n#]+)',
            r'#### Charakterystyka\s*\n([^\n#]+)',
            r'\*\*Definicja[:\s]*\*\*\s*([^\n]+)',
        ]
        for pattern in key_patterns:
            for snippet in re.findall(pattern, content)[:3]:
                if len(snippet) > 20:
                    answer_parts.append(snippet.strip())

    # Last resort: first substantive paragraphs of the main answer section.
    if not answer_parts:
        main_answer = re.search(r'## 📚 Odpowiedź główna\s*\n(.+?)(?=\n## |\Z)', content, re.DOTALL)
        if main_answer:
            # Paragraphs that do not start with a heading marker, 51-301 chars.
            paras = re.findall(r'\n\n([^#\n][^\n]{50,300})', main_answer.group(1))
            answer_parts = paras[:3]

    if not answer_parts:
        return None
    return {
        'front': clean_text(main_question),
        'back': '<br><br>'.join([clean_text(p) for p in answer_parts]),
        'tags': f"{base_tags} pytanie_glowne"
    }


def _build_concept_cards(content, base_tags):
    """Build one card per substantive ``###`` section of ``content``."""
    cards = []

    # A section body ends at the next "##" or "###" heading, so text that
    # belongs to a following level-2 section is not attributed to this one.
    sections = re.findall(
        r'^### (?:\d+\.\s*)?([^\n]+)\n((?:(?!^#{2,3} ).)*)',
        content,
        re.MULTILINE | re.DOTALL
    )

    for header, body in sections:
        header = header.strip()
        body = body.strip()

        # Skip short sections, mnemonics, examples and quoted headings.
        if len(body) < 80 or 'Przykład' in header or 'Mnemonic' in header or '"' in header:
            continue

        answer_lines = []

        # Definition: the first one or two lines under "#### Definicja". Each
        # line becomes its own entry so no newline ends up inside a field.
        def_match = re.search(r'#### Definicja[^\n]*\n([^\n#]+(?:\n[^\n#]+)?)', body)
        if def_match:
            for line in def_match.group(1).splitlines():
                line = line.strip()
                if line:
                    answer_lines.append(line)

        # Characteristics: bold-term bullets directly under the sub-heading.
        char_match = re.search(r'#### Charakterystyka\s*\n((?:[-•][^\n]+\n?)+)', body)
        if char_match:
            answer_lines.extend(_bold_bullets(char_match.group(1), 4))

        # Nothing structured yet: bold-term bullets from anywhere in the body.
        if not answer_lines:
            answer_lines.extend(_bold_bullets(body, 5))

        # Still nothing: first plain-text line (not heading/bullet/table/code).
        if not answer_lines:
            first_para = re.search(r'^([^#\n\-•|`][^\n]{30,250})', body, re.MULTILINE)
            if first_para:
                answer_lines.append(first_para.group(1))

        if answer_lines:
            question = header if header.endswith('?') else f"Wyjaśnij: {header}"
            cards.append({
                'front': clean_text(question),
                'back': '<br>'.join([clean_text(line) for line in answer_lines]),
                'tags': f"{base_tags} szczegoly"
            })

    return cards


def _build_qa_cards(content, base_tags):
    """Build cards from the first five ``### Q<n>:`` practice questions."""
    qa_matches = re.findall(
        r'### Q\d+:\s*["\']?([^"\'?\n]+)\?*["\']?\s*\n.*?Odpowiedź:\s*\n(.+?)(?=\n### |\n## |\Z)',
        content, re.DOTALL
    )

    cards = []
    for question, answer in qa_matches[:5]:
        question = question.strip()

        # Look at the first six lines of the answer; drop blanks, code fences
        # and table rows.
        kept = []
        for line in answer.strip().split('\n')[:6]:
            line = line.strip()
            if line and not line.startswith(('```', '|')):
                kept.append(line)

        if kept:
            cards.append({
                # The pattern strips any trailing "?", so it is re-added here.
                'front': clean_text(question + '?'),
                'back': '<br>'.join([clean_text(line) for line in kept]),
                'tags': f"{base_tags} qa"
            })

    return cards

def main(odpowiedzi_dir=None, output_file=None):
    """Generate the Anki import file from every ``*.md`` answer file.

    Each Markdown file in ``odpowiedzi_dir`` is passed to ``extract_cards``.
    A file that raises is reported on stdout and skipped, so one bad file does
    not abort the run. Cards whose front text (first 100 characters) was
    already seen are dropped; the rest are written as tab-separated rows
    below the Anki file headers.

    Args:
        odpowiedzi_dir: Directory containing the Markdown answers. Defaults to
            the project's ``pytania/odpowiedzi`` directory.
        output_file: Path of the text file to write. Defaults to the project's
            ``anki_egzamin_magisterski.txt``.
    """
    if odpowiedzi_dir is None:
        odpowiedzi_dir = "/home/kuchy/praca_magisterska/pytania/odpowiedzi"
    if output_file is None:
        output_file = "/home/kuchy/praca_magisterska/pytania/anki_egzamin_magisterski.txt"
    odpowiedzi_dir = Path(odpowiedzi_dir)
    output_file = Path(output_file)

    def as_field(value):
        # A raw tab or line break inside a field would shift columns or split
        # the row, so neutralise both before writing.
        value = value.replace('\t', ' ')
        return value.replace('\r\n', '\n').replace('\r', '\n').replace('\n', '<br>')

    all_cards = []

    for md_file in sorted(odpowiedzi_dir.glob("*.md")):
        print(f"Processing: {md_file.name}", end=" ")
        try:
            cards = extract_cards(md_file)
            all_cards.extend(cards)
            print(f"→ {len(cards)} cards")
        except Exception as e:
            # Deliberate best-effort: report the failing file and carry on.
            print(f"→ ERROR: {e}")

    # Remove duplicates; the first card with a given front wins.
    seen = set()
    unique_cards = []
    for card in all_cards:
        key = card['front'][:100]
        if key not in seen:
            seen.add(key)
            unique_cards.append(card)

    # Write file: header directives, a blank line, then one row per card.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("#separator:Tab\n")
        f.write("#html:true\n")
        f.write("#notetype:Basic\n")
        f.write("#deck:Egzamin Magisterski ISY\n")
        f.write("#columns:Front\tBack\tTags\n")
        f.write("#tags column:3\n")
        f.write("\n")

        for card in unique_cards:
            front = as_field(card['front'])
            back = as_field(card['back'])
            tags = card['tags']
            f.write(f"{front}\t{back}\t{tags}\n")

    print(f"\n✅ Generated {len(unique_cards)} flashcards")
    print(f"📁 Output: {output_file}")

# Run the generator only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()