"""Batch generation helpers for the learning pipe module.""" from __future__ import annotations from dataclasses import dataclass from typing import TYPE_CHECKING from python_pkg.word_frequency.excerpt_finder import find_best_excerpt import python_pkg.word_frequency.translator as _translator if TYPE_CHECKING: from python_pkg.word_frequency._learning_constants import LessonConfig def _detect_translation_language( text: str, config: LessonConfig, lines: list[str], ) -> tuple[str | None, str | None]: """Detect translation settings and return (from, to) pair.""" actual_from = config.translate_from actual_to = config.translate_to or "en" if actual_from == "auto" or (config.translate_to and not config.translate_from): detected = _translator.detect_language(text) if detected: actual_from = detected lines.append(f"Detected language: {detected}") else: lines.append( "Warning: Could not detect language " "(install langdetect: " "pip install langdetect)" ) actual_from = None return actual_from, actual_to def _format_word_list( batch_words: list[tuple[str, int]], start_idx: int, total_words: int, translations: dict[str, str], ) -> list[str]: """Format the vocabulary word list for a batch.""" lines: list[str] = [] for i, (word, count) in enumerate( batch_words, start=start_idx + 1, ): percentage = (count / total_words) * 100 if translations: trans = translations.get(word, "?") lines.append( f" {i:3}. {word:<20} -> {trans:<20}" f" ({count:,} occurrences, " f"{percentage:.2f}%)" ) else: lines.append( f" {i:3}. {word:<20} ({count:,} occurrences, {percentage:.2f}%)" ) return lines @dataclass(frozen=True) class _LessonContext: """Shared context for batch generation.""" text: str word_counts: dict[str, int] config: LessonConfig def _generate_batch_section( ctx: _LessonContext, batch_num: int, batch_words: list[tuple[str, int]], cumulative_words: list[str], ) -> list[str]: """Generate lines for a single batch section.""" config = ctx.config total_words = sum(ctx.word_counts.values()) start_idx = batch_num * config.batch_size end_idx = start_idx + config.batch_size lines: list[str] = [] lines.append("-" * 70) lines.append( f"BATCH {batch_num + 1}: Words " f"{start_idx + 1} - " f"{min(end_idx, start_idx + len(batch_words))}" ) lines.append("-" * 70) lines.append("") # Get translations if requested translations: dict[str, str] = {} do_translate = config.translate_from is not None and config.translate_to is not None if do_translate: words_to_translate = [word for word, _ in batch_words] translation_results = _translator.translate_words_batch( words_to_translate, config.translate_from, config.translate_to, ) translations = { r.source_word: r.translated_word for r in translation_results if r.success } lines.append("VOCABULARY TO LEARN:") lines.append("") lines.extend( _format_word_list( batch_words, start_idx, total_words, translations, ) ) lines.append("") # Cumulative coverage cumulative_count = sum( ctx.word_counts[w] for w in cumulative_words if w in ctx.word_counts ) coverage = (cumulative_count / total_words) * 100 lines.append( f"After learning these words, you'll recognize ~{coverage:.1f}% of the text" ) lines.append("") # Excerpts lines.append("PRACTICE EXCERPTS:") lines.append("(Excerpts where your learned vocabulary is most concentrated)") lines.append("") excerpts = find_best_excerpt( ctx.text, cumulative_words, config.excerpt_length, case_sensitive=config.case_sensitive, top_n=config.excerpts_per_batch, ) for j, excerpt in enumerate(excerpts, 1): lines.append(f" Excerpt {j} ({excerpt.match_percentage:.1f}% known words):") lines.append(f' "{excerpt.excerpt}"') lines.append("") return lines