testsAndMisc-archive/python_pkg/music_gen/_music_speech.py
Krzysztof kuhy Rudnicki 8f2fbd2311 refactor: enforce 500-line limit on all Python source files
Split 18+ Python files that exceeded 500 lines into smaller modules
with helper files (prefixed with _). All functions are re-exported
from the original modules to maintain backward compatibility with
test patches and external imports.

Files split:
- moviepy_showcase.py (1212 -> 302 + 3 helpers)
- anki_generator.py (1174 -> 473 + 4 helpers)
- test_analyze_chess_game.py (1152 -> 361 + 2 parts)
- poker_modifier_app.py (1024 -> 263 + 2 helpers)
- transcribe_fw.py (1007 -> 342 + 3 helpers)
- music_generator.py (1002 -> 319 + 2 helpers)
- translator.py (951 -> 442 + 2 helpers)
- cinema_planner.py (893 -> 369 + 2 helpers)
- lichess_bot/main.py (757 -> 495 + _game_logic.py)
- test_translator.py (725 -> 289 + part2 + conftest)
- test_lichess_api.py (680 -> 475 + part2)
- learning_pipe.py (668 -> 375 + 2 helpers)
- cache.py (655 -> 360 + _cache_decks.py)
- analyze_chess_game.py (632 -> 463 + _move_analysis.py)
- visualize_q02.py (609 -> 371 + helper)
- repo_explorer.py (602 -> 347 + 2 helpers)
- keyboard_coop/main.py (515 -> 416 + _dictionary.py)
- scanning.py (501 -> 314 + _enforce_loop.py)

All tests pass: 144 lichess_bot (100% branch coverage), 243 others.
No new lint errors introduced.
2026-03-17 22:47:42 +01:00

381 lines
11 KiB
Python

"""Bark speech synthesis, vocal generation, and song mixing."""
from __future__ import annotations
from datetime import datetime, timezone
from pathlib import Path
from python_pkg.music_gen._music_generation import (
SEGMENT_DURATION,
_generate_long_audio,
generate_segment,
load_model,
select_model_size,
)
# Upper bound on characters sent to Bark in one segment (~13s of speech)
BARK_MAX_CHARS = 200

# Available Bark voice presets: the English speakers numbered 0 through 9
BARK_VOICES = [f"v2/en_speaker_{speaker_index}" for speaker_index in range(10)]
def generate_speech(
    text: str,
    voice: str = "v2/en_speaker_6",
    output_dir: Path | None = None,
) -> Path:
    """Generate speech audio from text using Bark and save it as a WAV file.

    Bark supports various speech patterns:
    - [laughter], [laughs], [sighs], [music]
    - [gasps], [clears throat], — or ... for hesitations
    - ♪ for singing

    Args:
        text: Text to convert to speech. It is split into segments with
            ``_split_into_sentences`` and the generated audio is concatenated.
        voice: Voice preset to use (see BARK_VOICES)
        output_dir: Directory to save output. Defaults to an ``output``
            directory next to this module. The directory, including any
            missing parent directories, is created when absent.

    Returns:
        Path to the generated audio file
    """
    import functools

    import numpy as np
    import scipy.io.wavfile
    import torch

    # Bark uses older checkpoint format with pickle.
    # Monkey-patch torch.load so ``weights_only`` defaults to False while the
    # Bark models are loaded; the original function is restored in ``finally``.
    original_torch_load = torch.load

    @functools.wraps(original_torch_load)
    def patched_load(*args: object, **kwargs: object) -> object:
        kwargs.setdefault("weights_only", False)
        return original_torch_load(*args, **kwargs)

    torch.load = patched_load
    try:
        from bark import SAMPLE_RATE, generate_audio, preload_models

        if output_dir is None:
            output_dir = Path(__file__).parent / "output"
        # parents=True so a nested, not-yet-existing directory is accepted
        # instead of raising FileNotFoundError.
        output_dir.mkdir(parents=True, exist_ok=True)

        print("\nLoading Bark model...")
        print("(First run will download models, ~5GB total)")
        preload_models()
        print(f"\nGenerating speech with voice: {voice}")
        print(f"Text: {text!r}")

        # Bark can only generate ~13s at a time, so longer text is generated
        # piece by piece and stitched together afterwards.
        sentences = _split_into_sentences(text)
        audio_segments = []
        for i, sentence in enumerate(sentences):
            if len(sentences) > 1:
                print(f" Generating segment {i + 1}/{len(sentences)}...")
            audio = generate_audio(
                sentence.strip(),
                history_prompt=voice,
            )
            audio_segments.append(audio)

        # Combine segments (a single segment is used as-is, without a copy)
        if len(audio_segments) > 1:
            audio_data = np.concatenate(audio_segments)
        else:
            audio_data = audio_segments[0]

        # Build a filesystem-safe filename from a UTC timestamp and the first
        # 30 characters of the text (alphanumerics, spaces, '-' and '_' only).
        timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
        safe_text = "".join(c if c.isalnum() or c in " -_" else "" for c in text[:30])
        safe_text = safe_text.strip().replace(" ", "_")
        filename = f"{timestamp}_speech_{safe_text}.wav"
        output_path = output_dir / filename

        scipy.io.wavfile.write(output_path, SAMPLE_RATE, audio_data)
        print(f"\nSaved to: {output_path}")
        print(f"Duration: {len(audio_data) / SAMPLE_RATE:.1f}s")
        return output_path
    finally:
        # Restore original torch.load even when generation fails
        torch.load = original_torch_load
def _split_into_sentences(text: str, max_chars: int | None = None) -> list[str]:
    """Split text into chunks short enough for one Bark generation call.

    The text is first split on sentence-ending punctuation (``.``, ``!``,
    ``?``) followed by whitespace. A sentence that is longer than the limit
    on its own is wrapped at word boundaries (a single over-long word is
    hard-split) so that no chunk exceeds the limit. Consecutive short pieces
    are then grouped together, joined by a single space, while the combined
    length stays within the limit.

    Args:
        text: Text to split
        max_chars: Maximum characters per chunk. Defaults to BARK_MAX_CHARS
            when omitted.

    Returns:
        List of chunks. When the text contains nothing to split (for example
        an empty string) the list holds the original text as its only item,
        so the result is never empty.

    Raises:
        ValueError: If the effective limit is smaller than 1.
    """
    import re
    import textwrap

    limit = BARK_MAX_CHARS if max_chars is None else max_chars
    if limit < 1:
        msg = "max_chars must be a positive integer"
        raise ValueError(msg)

    # Split on sentence-ending punctuation followed by whitespace
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())

    # Break up any sentence that alone would exceed the limit; without this
    # step an over-long sentence would be handed to Bark in one piece.
    pieces: list[str] = []
    for sentence in sentences:
        if len(sentence) > limit:
            pieces.extend(
                textwrap.wrap(sentence, width=limit, break_on_hyphens=False),
            )
        else:
            pieces.append(sentence)

    # Group very short pieces together. The strict "<" leaves room for the
    # joining space, so a grouped chunk is at most ``limit`` characters.
    result: list[str] = []
    current = ""
    for piece in pieces:
        if len(current) + len(piece) < limit:
            current = f"{current} {piece}".strip()
        else:
            if current:
                result.append(current)
            current = piece
    if current:
        result.append(current)
    return result or [text]
def _resample_audio(
    audio: object,
    orig_sr: int,
    target_sr: int,
) -> object:
    """Resample audio to a different sample rate.

    Args:
        audio: Audio data as numpy array (the first axis is treated as time)
        orig_sr: Original sample rate
        target_sr: Target sample rate

    Returns:
        The input object unchanged when both rates are equal; otherwise the
        resampled audio as a float32 numpy array whose length is
        ``len(audio) * target_sr // orig_sr``.

    Raises:
        ValueError: If the rates differ and either one is not positive.
    """
    import numpy as np
    from scipy import signal

    if orig_sr == target_sr:
        return audio
    if orig_sr <= 0 or target_sr <= 0:
        msg = "sample rates must be positive"
        raise ValueError(msg)

    # Multiply before dividing and use floor division: going through a float
    # duration can round just below the exact value and drop one sample
    # (e.g. 29 samples at 100 Hz -> 200 Hz must give 58, not 57).
    target_length = int(len(audio) * target_sr // orig_sr)
    if target_length == 0:
        # Empty (or too short) input: nothing to resample
        return np.asarray(audio)[:0].astype(np.float32)
    return signal.resample(audio, target_length).astype(np.float32)
def _mix_audio(
    instrumental: object,
    vocals: object,
    vocal_volume: float = 0.8,
    instrumental_volume: float = 0.6,
) -> object:
    """Mix vocals over instrumental track.

    The vocals are padded with trailing silence or trimmed so that they match
    the instrumental length; the result always has the instrumental's length.

    Args:
        instrumental: Instrumental audio (numpy array or array-like)
        vocals: Vocal audio (numpy array or array-like)
        vocal_volume: Volume multiplier for vocals (0.0-1.0)
        instrumental_volume: Volume multiplier for instrumental (0.0-1.0)

    Returns:
        Mixed audio data as a float32 numpy array, scaled down so that its
        peak absolute value does not exceed 1.0.
    """
    import numpy as np

    # Accept plain sequences as well as arrays; a list would otherwise fail
    # on the volume multiplication below.
    backing = np.asarray(instrumental)
    voice_track = np.asarray(vocals)

    # Ensure same length - pad or trim vocals to match instrumental.
    # NOTE(review): the padding spec assumes 1-D (mono) audio - confirm
    # against callers before passing multi-channel data.
    missing = len(backing) - len(voice_track)
    if missing > 0:
        # Pad vocals with silence at the end
        voice_track = np.pad(voice_track, (0, missing))
    elif missing < 0:
        # Trim vocals to match instrumental
        voice_track = voice_track[: len(backing)]

    # Mix the tracks
    mixed = (backing * instrumental_volume) + (voice_track * vocal_volume)

    # np.max raises on a zero-size array, so an empty mix skips normalization
    if mixed.size == 0:
        return mixed.astype(np.float32)

    # Normalize to prevent clipping
    peak = np.max(np.abs(mixed))
    if peak > 1.0:
        mixed = mixed / peak
    return mixed.astype(np.float32)
def _generate_vocals_for_song(lyrics: str, voice: str) -> tuple[object, int]:
    """Render lyrics to a vocal track with Bark, for later mixing into a song.

    While Bark is in use, ``torch.load`` is temporarily replaced by a wrapper
    that defaults ``weights_only`` to False; the original function is put
    back before returning, even if generation fails.

    Args:
        lyrics: Text/lyrics to sing
        voice: Bark voice preset

    Returns:
        Tuple of (vocal audio array, sample rate)
    """
    import functools

    import numpy as np
    import torch

    # Keep a handle on the real loader so it can be restored afterwards
    unpatched_load = torch.load

    @functools.wraps(unpatched_load)
    def load_allowing_pickle(*args: object, **kwargs: object) -> object:
        kwargs.setdefault("weights_only", False)
        return unpatched_load(*args, **kwargs)

    torch.load = load_allowing_pickle
    try:
        from bark import SAMPLE_RATE as BARK_SR
        from bark import generate_audio, preload_models

        print("Loading Bark model...")
        preload_models()
        print(f"Generating vocals with voice: {voice}")
        print(f"Lyrics: {lyrics!r}")

        chunks = _split_into_sentences(lyrics)
        total = len(chunks)
        rendered = []
        for number, chunk in enumerate(chunks, start=1):
            # Progress is only worth reporting when there are several chunks
            if total > 1:
                print(f" Vocal segment {number}/{total}...")
            rendered.append(generate_audio(chunk.strip(), history_prompt=voice))

        # A lone segment is returned as-is; several are joined end to end
        vocals = np.concatenate(rendered) if len(rendered) > 1 else rendered[0]
        return vocals, BARK_SR
    finally:
        torch.load = unpatched_load
def _generate_instrumental_for_song(
    music_prompt: str,
    duration: int,
) -> tuple[object, int]:
    """Produce the instrumental backing track for a song.

    The model size is chosen by ``select_model_size(None)``. Durations up to
    SEGMENT_DURATION are generated with a single ``generate_segment`` call;
    longer ones are delegated to ``_generate_long_audio``.

    Args:
        music_prompt: Description of the music
        duration: Duration in seconds

    Returns:
        Tuple of (instrumental audio array, sample rate)
    """
    chosen_size = select_model_size(None)
    model, processor = load_model(chosen_size)
    print(f"Music prompt: {music_prompt!r}")
    print(f"Duration: {duration}s")

    # The device is read from the model's first parameter
    first_param = next(model.parameters())
    device = str(first_param.device)
    sample_rate = model.config.audio_encoder.sampling_rate

    if duration > SEGMENT_DURATION:
        long_track = _generate_long_audio(
            music_prompt,
            model,
            processor,
            duration,
        )
        return long_track, sample_rate

    short_track = generate_segment(
        music_prompt,
        model,
        processor,
        duration,
        device,
    )
    return short_track, sample_rate
def generate_song(
    lyrics: str,
    music_prompt: str,
    voice: str = "v2/en_speaker_6",
    output_dir: Path | None = None,
) -> Path:
    """Generate a complete song with vocals over instrumental music.

    This combines Bark for vocals and MusicGen for instrumental backing.
    The vocals are generated first, the instrumental is generated to last two
    seconds longer than the (whole-second) vocal duration, the vocals are
    resampled to the instrumental's sample rate, and both are mixed and
    written as a WAV file.

    Args:
        lyrics: The lyrics/text to sing (use ♪ for singing style)
        music_prompt: Description of the instrumental music
        voice: Bark voice preset (default: v2/en_speaker_6)
        output_dir: Directory to save output. Defaults to an ``output``
            directory next to this module. The directory, including any
            missing parent directories, is created when absent.

    Returns:
        Path to the generated song file
    """
    import scipy.io.wavfile

    if output_dir is None:
        output_dir = Path(__file__).parent / "output"
    # parents=True so a nested, not-yet-existing directory is accepted instead
    # of raising FileNotFoundError. Done before generation so a bad path
    # fails before the expensive model work starts.
    output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 60)
    print("GENERATING SONG WITH VOCALS")
    print("=" * 60)

    # Step 1: Generate vocals
    print("\n[1/3] Generating vocals...")
    vocals, bark_sr = _generate_vocals_for_song(lyrics, voice)
    vocal_duration = len(vocals) / bark_sr
    print(f"Vocals generated: {vocal_duration:.1f}s")

    # Step 2: Generate instrumental (match vocal duration + buffer)
    print("\n[2/3] Generating instrumental music...")
    music_duration = int(vocal_duration) + 2
    instrumental, musicgen_sr = _generate_instrumental_for_song(
        music_prompt,
        music_duration,
    )
    print(f"Instrumental generated: {len(instrumental) / musicgen_sr:.1f}s")

    # Step 3: Mix vocals and instrumental at the instrumental's sample rate
    print("\n[3/3] Mixing vocals and instrumental...")
    vocals_resampled = _resample_audio(vocals, bark_sr, musicgen_sr)
    mixed = _mix_audio(instrumental, vocals_resampled)

    # Save the song under a UTC timestamp plus the first 20 characters of the
    # lyrics (alphanumerics, spaces, '-' and '_' only).
    timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
    safe_lyrics = "".join(c if c.isalnum() or c in " -_" else "" for c in lyrics[:20])
    safe_lyrics = safe_lyrics.strip().replace(" ", "_")
    filename = f"{timestamp}_song_{safe_lyrics}.wav"
    output_path = output_dir / filename
    scipy.io.wavfile.write(output_path, musicgen_sr, mixed)

    print("\n" + "=" * 60)
    print(f"Song saved to: {output_path}")
    print(f"Duration: {len(mixed) / musicgen_sr:.1f}s")
    print("=" * 60)
    return output_path