From 740726a3aea0de0227c2c5183c1e9db0df017ec9 Mon Sep 17 00:00:00 2001 From: Krzysztof kuhy Rudnicki Date: Thu, 4 Dec 2025 21:26:52 +0100 Subject: [PATCH] music_gen: add segmented generation, Bark vocals, and song mixing - Add segmented generation with crossfading for long audio (>30s) - Add Bark integration for speech/vocal generation (--speech flag) - Add full song generation with vocals over instrumental (--song flag) - Auto-select MusicGen model size based on available VRAM - Enforce CUDA for NVIDIA GPUs (no CPU fallback) - Update README with new features and examples --- python_pkg/music_gen/README.md | 75 ++- python_pkg/music_gen/music_generator.py | 705 ++++++++++++++++++++++-- 2 files changed, 713 insertions(+), 67 deletions(-) diff --git a/python_pkg/music_gen/README.md b/python_pkg/music_gen/README.md index ee39f56..8cc5621 100644 --- a/python_pkg/music_gen/README.md +++ b/python_pkg/music_gen/README.md @@ -1,6 +1,14 @@ -# MusicGen - Local AI Music Generator +# MusicGen - Local AI Music & Speech Generator -Generate music from text prompts using Meta's open-source MusicGen model. +Generate music and speech/vocals from text prompts using Meta's MusicGen and Suno's Bark. + +## Features + +- **Music Generation**: Create instrumental music from text descriptions (MusicGen) +- **Long Audio Support**: Generate music of any length via automatic segmentation with crossfading +- **Speech/Vocals**: Generate speech and singing with Bark (optional) +- **CUDA Optimized**: Auto-detects GPU and selects best model for your VRAM +- **No API Keys**: Runs 100% locally on your hardware ## Quick Start @@ -18,22 +26,53 @@ python music_generator.py "upbeat electronic dance music with synths" ## Usage -### Single Generation +### Music Generation (MusicGen) ```bash # Basic usage python music_generator.py "jazz piano with soft drums" -# Set duration (in seconds, max ~30 recommended) -python music_generator.py --duration 20 "epic orchestral soundtrack" +# Set duration (any length supported via segmentation) +python music_generator.py --duration 60 "epic orchestral soundtrack" + +# Generate a full 3-minute track +python music_generator.py --duration 180 "ambient electronic music" # Use smaller/faster model python music_generator.py --model small "rock guitar riff" -# Use larger/better quality model (needs 16GB+ VRAM) +# Use larger/better quality model (needs 12GB+ VRAM) python music_generator.py --model large "ambient electronic" ``` +### Speech/Vocals Generation (Bark) + +```bash +# First install Bark (not included in base setup) +pip install git+https://github.com/suno-ai/bark.git + +# Generate speech +python music_generator.py --speech "Hello, how are you today?" + +# Use different voice +python music_generator.py --speech --voice v2/en_speaker_3 "Welcome!" + +# Generate singing +python music_generator.py --speech "♪ La la la, I love to sing ♪" + +# With laughter and expression +python music_generator.py --speech "That's so funny! [laughter] I can't believe it." +``` + +**Bark special tokens:** + +- `[laughter]`, `[laughs]`, `[sighs]`, `[gasps]` - expressions +- `[music]`, `[clears throat]` - sounds +- `♪` - singing +- `...` or `—` - hesitations + +**Available voices:** `v2/en_speaker_0` through `v2/en_speaker_9` + ### Interactive Mode ```bash @@ -47,20 +86,20 @@ In interactive mode: - `:h` - Show example prompts - `:q` - Quit -## Model Sizes +## Model Sizes (Auto-Selected by VRAM) | Model | Size | VRAM | Quality | Speed | | ------ | ------ | ----- | ------- | ------ | -| small | ~500MB | ~4GB | Good | Fast | -| medium | ~3.3GB | ~8GB | Better | Medium | -| large | ~6.5GB | ~16GB | Best | Slow | +| small | ~500MB | 3GB+ | Good | Fast | +| medium | ~3.3GB | 8GB+ | Better | Medium | +| large | ~6.5GB | 12GB+ | Best | Slow | ## Requirements - Python 3.10+ -- 8GB+ RAM (16GB recommended) -- GPU recommended (CUDA or Apple Silicon MPS) -- Works on CPU but much slower +- NVIDIA GPU with CUDA (required for NVIDIA systems) +- Apple Silicon supported via MPS +- 8GB+ VRAM recommended for best results ## Output @@ -83,7 +122,7 @@ Generated audio files are saved to `./output/` as WAV files with timestamps. ### Out of Memory - Try `--model small` for lower VRAM usage -- Reduce duration with `--duration 5` +- Reduce duration with `--duration 10` - Close other GPU applications ### Slow Generation @@ -96,3 +135,11 @@ Generated audio files are saved to `./output/` as WAV files with timestamps. - Check if scipy is installed: `pip install scipy` - Try a different audio player (VLC recommended) + +### CUDA Not Available + +If you see "NVIDIA GPU detected but CUDA is not available": + +```bash +pip install torch --index-url https://download.pytorch.org/whl/cu121 +``` diff --git a/python_pkg/music_gen/music_generator.py b/python_pkg/music_gen/music_generator.py index 212898b..3bcb84c 100755 --- a/python_pkg/music_gen/music_generator.py +++ b/python_pkg/music_gen/music_generator.py @@ -27,9 +27,18 @@ warnings.filterwarnings("ignore", category=UserWarning) VRAM_THRESHOLD_LARGE = 12 # Use large model with 12GB+ VRAM VRAM_THRESHOLD_MEDIUM = 8 # Use medium model with 8GB+ VRAM +# Generation settings for segmented long audio +SEGMENT_DURATION = 25 # Seconds per segment (under 30s MusicGen limit) +CROSSFADE_DURATION = 2 # Seconds of crossfade between segments +BARK_MAX_CHARS = 200 # Max characters per Bark segment (~13s of speech) -def check_dependencies() -> bool: - """Check if required packages are installed.""" + +def check_dependencies(*, include_bark: bool = False) -> bool: + """Check if required packages are installed. + + Args: + include_bark: Whether to check for Bark dependencies as well. + """ missing = [] try: @@ -47,12 +56,21 @@ def check_dependencies() -> bool: except ImportError: missing.append("scipy") + if include_bark: + try: + from bark import generate_audio as _bark_gen # noqa: F401 + except ImportError: + missing.append("git+https://github.com/suno-ai/bark.git") + if missing: print("Missing dependencies. Install with:") print(f" pip install {' '.join(missing)}") print("\nFor CUDA support:") print(" pip install torch --index-url https://download.pytorch.org/whl/cu121") print(" pip install transformers scipy") + if include_bark: + print("\nFor Bark vocals:") + print(" pip install git+https://github.com/suno-ai/bark.git") return False return True @@ -182,6 +200,547 @@ def load_model( return model, processor +# Available Bark voice presets +BARK_VOICES = [ + "v2/en_speaker_0", + "v2/en_speaker_1", + "v2/en_speaker_2", + "v2/en_speaker_3", + "v2/en_speaker_4", + "v2/en_speaker_5", + "v2/en_speaker_6", + "v2/en_speaker_7", + "v2/en_speaker_8", + "v2/en_speaker_9", +] + + +def generate_speech( + text: str, + voice: str = "v2/en_speaker_6", + output_dir: Path | None = None, +) -> Path: + """Generate speech audio from text using Bark. + + Bark supports various speech patterns: + - [laughter], [laughs], [sighs], [music] + - [gasps], [clears throat], — or ... for hesitations + - ♪ for singing + + Args: + text: Text to convert to speech (max ~13s per segment) + voice: Voice preset to use (see BARK_VOICES) + output_dir: Directory to save output (defaults to ./output) + + Returns: + Path to the generated audio file + """ + import functools + + import numpy as np + import scipy.io.wavfile + import torch + + # Bark uses older checkpoint format with pickle + # Monkey-patch torch.load to allow unsafe loading for Bark models + original_torch_load = torch.load + + @functools.wraps(original_torch_load) + def patched_load(*args: object, **kwargs: object) -> object: + kwargs.setdefault("weights_only", False) + return original_torch_load(*args, **kwargs) + + torch.load = patched_load + + try: + from bark import SAMPLE_RATE, generate_audio, preload_models + + if output_dir is None: + output_dir = Path(__file__).parent / "output" + output_dir.mkdir(exist_ok=True) + + print("\nLoading Bark model...") + print("(First run will download models, ~5GB total)") + preload_models() + + print(f"\nGenerating speech with voice: {voice}") + print(f"Text: {text!r}") + + # Bark can only generate ~13s at a time + # For longer text, we need to split into sentences + audio_segments = [] + + # Split on sentence boundaries for longer texts + sentences = _split_into_sentences(text) + + for i, sentence in enumerate(sentences): + if len(sentences) > 1: + print(f" Generating segment {i + 1}/{len(sentences)}...") + + audio = generate_audio( + sentence.strip(), + history_prompt=voice, + ) + audio_segments.append(audio) + + # Combine segments + if len(audio_segments) > 1: + audio_data = np.concatenate(audio_segments) + else: + audio_data = audio_segments[0] + + # Create filename + timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") + safe_text = "".join(c if c.isalnum() or c in " -_" else "" for c in text[:30]) + safe_text = safe_text.strip().replace(" ", "_") + filename = f"{timestamp}_speech_{safe_text}.wav" + output_path = output_dir / filename + + scipy.io.wavfile.write(output_path, SAMPLE_RATE, audio_data) + + print(f"\nSaved to: {output_path}") + print(f"Duration: {len(audio_data) / SAMPLE_RATE:.1f}s") + + return output_path + finally: + # Restore original torch.load + torch.load = original_torch_load + + +def _split_into_sentences(text: str) -> list[str]: + """Split text into sentences for Bark processing. + + Args: + text: Text to split + + Returns: + List of sentences + """ + import re + + # Split on sentence-ending punctuation followed by space + sentences = re.split(r"(?<=[.!?])\s+", text.strip()) + + # Group very short sentences together + result = [] + current = "" + for sentence in sentences: + if len(current) + len(sentence) < BARK_MAX_CHARS: + current = f"{current} {sentence}".strip() + else: + if current: + result.append(current) + current = sentence + if current: + result.append(current) + + return result if result else [text] + + +def _resample_audio( + audio: object, + orig_sr: int, + target_sr: int, +) -> object: + """Resample audio to a different sample rate. + + Args: + audio: Audio data as numpy array + orig_sr: Original sample rate + target_sr: Target sample rate + + Returns: + Resampled audio data + """ + import numpy as np + from scipy import signal + + if orig_sr == target_sr: + return audio + + # Calculate the resampling ratio + duration = len(audio) / orig_sr + target_length = int(duration * target_sr) + + return signal.resample(audio, target_length).astype(np.float32) + + +def _mix_audio( + instrumental: object, + vocals: object, + vocal_volume: float = 0.8, + instrumental_volume: float = 0.6, +) -> object: + """Mix vocals over instrumental track. + + Args: + instrumental: Instrumental audio (numpy array) + vocals: Vocal audio (numpy array) + vocal_volume: Volume multiplier for vocals (0.0-1.0) + instrumental_volume: Volume multiplier for instrumental (0.0-1.0) + + Returns: + Mixed audio data + """ + import numpy as np + + # Ensure same length - pad or trim vocals to match instrumental + if len(vocals) < len(instrumental): + # Pad vocals with silence at the end + vocals = np.pad(vocals, (0, len(instrumental) - len(vocals))) + elif len(vocals) > len(instrumental): + # Trim vocals to match instrumental + vocals = vocals[: len(instrumental)] + + # Mix the tracks + mixed = (instrumental * instrumental_volume) + (vocals * vocal_volume) + + # Normalize to prevent clipping + max_val = np.max(np.abs(mixed)) + if max_val > 1.0: + mixed = mixed / max_val + + return mixed.astype(np.float32) + + +def _generate_vocals_for_song(lyrics: str, voice: str) -> tuple[object, int]: + """Generate vocals using Bark for song mixing. + + Args: + lyrics: Text/lyrics to sing + voice: Bark voice preset + + Returns: + Tuple of (vocal audio array, sample rate) + """ + import functools + + import numpy as np + import torch + + # Patch torch.load for Bark compatibility + original_torch_load = torch.load + + @functools.wraps(original_torch_load) + def patched_load(*args: object, **kwargs: object) -> object: + kwargs.setdefault("weights_only", False) + return original_torch_load(*args, **kwargs) + + torch.load = patched_load + + try: + from bark import SAMPLE_RATE as BARK_SR + from bark import generate_audio, preload_models + + print("Loading Bark model...") + preload_models() + + print(f"Generating vocals with voice: {voice}") + print(f"Lyrics: {lyrics!r}") + + sentences = _split_into_sentences(lyrics) + vocal_segments = [] + + for i, sentence in enumerate(sentences): + if len(sentences) > 1: + print(f" Vocal segment {i + 1}/{len(sentences)}...") + audio = generate_audio(sentence.strip(), history_prompt=voice) + vocal_segments.append(audio) + + if len(vocal_segments) > 1: + vocals = np.concatenate(vocal_segments) + else: + vocals = vocal_segments[0] + + return vocals, BARK_SR + + finally: + torch.load = original_torch_load + + +def _generate_instrumental_for_song( + music_prompt: str, + duration: int, +) -> tuple[object, int]: + """Generate instrumental music using MusicGen for song mixing. + + Args: + music_prompt: Description of the music + duration: Duration in seconds + + Returns: + Tuple of (instrumental audio array, sample rate) + """ + model_size = select_model_size(None) + model, processor = load_model(model_size) + + print(f"Music prompt: {music_prompt!r}") + print(f"Duration: {duration}s") + + device = str(next(model.parameters()).device) + sample_rate = model.config.audio_encoder.sampling_rate + + if duration <= SEGMENT_DURATION: + instrumental = generate_segment( + music_prompt, + model, + processor, + duration, + device, + ) + else: + instrumental = _generate_long_audio( + music_prompt, + model, + processor, + duration, + ) + + return instrumental, sample_rate + + +def generate_song( + lyrics: str, + music_prompt: str, + voice: str = "v2/en_speaker_6", + output_dir: Path | None = None, +) -> Path: + """Generate a complete song with vocals over instrumental music. + + This combines Bark for vocals and MusicGen for instrumental backing. + + Args: + lyrics: The lyrics/text to sing (use ♪ for singing style) + music_prompt: Description of the instrumental music + voice: Bark voice preset (default: v2/en_speaker_6) + output_dir: Directory to save output + + Returns: + Path to the generated song file + """ + import scipy.io.wavfile + + if output_dir is None: + output_dir = Path(__file__).parent / "output" + output_dir.mkdir(exist_ok=True) + + print("=" * 60) + print("GENERATING SONG WITH VOCALS") + print("=" * 60) + + # Step 1: Generate vocals + print("\n[1/3] Generating vocals...") + vocals, bark_sr = _generate_vocals_for_song(lyrics, voice) + vocal_duration = len(vocals) / bark_sr + print(f"Vocals generated: {vocal_duration:.1f}s") + + # Step 2: Generate instrumental (match vocal duration + buffer) + print("\n[2/3] Generating instrumental music...") + music_duration = int(vocal_duration) + 2 + instrumental, musicgen_sr = _generate_instrumental_for_song( + music_prompt, + music_duration, + ) + print(f"Instrumental generated: {len(instrumental) / musicgen_sr:.1f}s") + + # Step 3: Mix vocals and instrumental + print("\n[3/3] Mixing vocals and instrumental...") + vocals_resampled = _resample_audio(vocals, bark_sr, musicgen_sr) + mixed = _mix_audio(instrumental, vocals_resampled) + + # Save the song + timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") + safe_lyrics = "".join(c if c.isalnum() or c in " -_" else "" for c in lyrics[:20]) + safe_lyrics = safe_lyrics.strip().replace(" ", "_") + filename = f"{timestamp}_song_{safe_lyrics}.wav" + output_path = output_dir / filename + + scipy.io.wavfile.write(output_path, musicgen_sr, mixed) + + print("\n" + "=" * 60) + print(f"Song saved to: {output_path}") + print(f"Duration: {len(mixed) / musicgen_sr:.1f}s") + print("=" * 60) + + return output_path + + +def crossfade_audio( + audio1: object, + audio2: object, + crossfade_samples: int, +) -> object: + """Crossfade two audio segments together. + + Args: + audio1: First audio segment (numpy array) + audio2: Second audio segment (numpy array) + crossfade_samples: Number of samples to use for crossfade + + Returns: + Combined audio with crossfade applied (numpy array) + """ + import numpy as np + + if crossfade_samples <= 0 or len(audio1) < crossfade_samples: + return np.concatenate([audio1, audio2]) + + # Create fade curves + fade_out = np.linspace(1.0, 0.0, crossfade_samples) + fade_in = np.linspace(0.0, 1.0, crossfade_samples) + + # Apply fades + audio1_end = audio1[-crossfade_samples:] * fade_out + audio2_start = audio2[:crossfade_samples] * fade_in + + # Combine + crossfaded = audio1_end + audio2_start + + # Build final audio + return np.concatenate( + [ + audio1[:-crossfade_samples], + crossfaded, + audio2[crossfade_samples:], + ] + ) + + +def generate_segment( + prompt: str, + model: object, + processor: object, + duration_seconds: int, + device: str, +) -> object: + """Generate a single audio segment. + + Args: + prompt: Text description of the music + model: The MusicGen model + processor: The MusicGen processor + duration_seconds: Length of segment to generate + device: Device to generate on + + Returns: + Audio data as numpy array + """ + import torch + + inputs = processor( + text=[prompt], + padding=True, + return_tensors="pt", + ) + inputs = {k: v.to(device) for k, v in inputs.items()} + + max_new_tokens = int(duration_seconds * 50) + + with torch.no_grad(): + audio_values = model.generate( + **inputs, + max_new_tokens=max_new_tokens, + do_sample=True, + ) + + return audio_values[0, 0].cpu().numpy() + + +def _calculate_segment_duration( + segment_index: int, + num_segments: int, + generated_samples: int, + sample_rate: int, + total_duration: int, +) -> int: + """Calculate duration for a specific segment. + + Args: + segment_index: Current segment index + num_segments: Total number of segments + generated_samples: Number of samples generated so far + sample_rate: Audio sample rate + total_duration: Target total duration + + Returns: + Duration in seconds for this segment + """ + if segment_index == num_segments - 1: + # Last segment: calculate remaining time + generated_so_far = generated_samples / sample_rate + remaining = total_duration - generated_so_far + min_duration = max(5, int(remaining) + CROSSFADE_DURATION) + return min(SEGMENT_DURATION, min_duration) + return SEGMENT_DURATION + + +def _generate_long_audio( + prompt: str, + model: object, + processor: object, + duration_seconds: int, +) -> object: + """Generate long audio by segmenting with crossfades. + + Args: + prompt: Text description of the music + model: The MusicGen model + processor: The MusicGen processor + duration_seconds: Total duration to generate + + Returns: + Audio data as numpy array + """ + import numpy as np + + device = str(next(model.parameters()).device) + sample_rate = model.config.audio_encoder.sampling_rate + crossfade_samples = CROSSFADE_DURATION * sample_rate + + effective_segment = SEGMENT_DURATION - CROSSFADE_DURATION + total = duration_seconds + effective_segment - 1 + num_segments = max(1, total // effective_segment) + + print(f"Generating {num_segments} segments of ~{SEGMENT_DURATION}s each...") + + audio_data = np.array([], dtype=np.float32) + + for i in range(num_segments): + segment_duration = _calculate_segment_duration( + i, + num_segments, + len(audio_data), + sample_rate, + duration_seconds, + ) + + seg_num = i + 1 + msg = f" Segment {seg_num}/{num_segments} ({segment_duration}s)..." + print(msg, end=" ", flush=True) + + segment = generate_segment( + prompt, + model, + processor, + segment_duration, + device, + ) + + if len(audio_data) == 0: + audio_data = segment + else: + audio_data = crossfade_audio(audio_data, segment, crossfade_samples) + + print(f"done (total: {len(audio_data) / sample_rate:.1f}s)") + + # Trim to exact duration if needed + target_samples = int(duration_seconds * sample_rate) + if len(audio_data) > target_samples: + audio_data = audio_data[:target_samples] + + return audio_data + + def generate_music( prompt: str, model: object, @@ -191,53 +750,43 @@ def generate_music( ) -> Path: """Generate music from a text prompt. + For durations over 30 seconds, generates in segments with crossfading. + Args: prompt: Text description of the music to generate model: The MusicGen model processor: The MusicGen processor - duration_seconds: Length of audio to generate (max ~30s recommended) + duration_seconds: Length of audio to generate (any duration supported) output_dir: Directory to save output (defaults to ./output) Returns: Path to the generated audio file """ import scipy.io.wavfile - import torch if output_dir is None: output_dir = Path(__file__).parent / "output" output_dir.mkdir(exist_ok=True) - print(f"\nGenerating {duration_seconds}s of music...") - print(f"Prompt: {prompt!r}") - - device = next(model.parameters()).device - - # Prepare inputs - inputs = processor( - text=[prompt], - padding=True, - return_tensors="pt", - ) - inputs = {k: v.to(device) for k, v in inputs.items()} - - # Calculate tokens needed for duration - # MusicGen generates ~50 tokens per second of audio - max_new_tokens = int(duration_seconds * 50) - - # Generate - with torch.no_grad(): - audio_values = model.generate( - **inputs, - max_new_tokens=max_new_tokens, - do_sample=True, - ) - - # Get sample rate from model config sample_rate = model.config.audio_encoder.sampling_rate - # Convert to numpy and save - audio_data = audio_values[0, 0].cpu().numpy() + # For short durations, generate directly + if duration_seconds <= SEGMENT_DURATION: + print(f"\nGenerating {duration_seconds}s of music...") + print(f"Prompt: {prompt!r}") + device = str(next(model.parameters()).device) + audio_data = generate_segment( + prompt, + model, + processor, + duration_seconds, + device, + ) + else: + # Long duration: generate in segments with crossfading + print(f"\nGenerating {duration_seconds}s of music in segments...") + print(f"Prompt: {prompt!r}") + audio_data = _generate_long_audio(prompt, model, processor, duration_seconds) # Create filename with timestamp and sanitized prompt timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") @@ -328,46 +877,58 @@ def interactive_mode(model: object, processor: object) -> None: def main() -> None: """Main entry point.""" parser = argparse.ArgumentParser( - description="Generate music from text prompts using MusicGen", + description="Generate music or speech from text prompts", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: + # Music generation (MusicGen): %(prog)s "upbeat electronic dance music" - %(prog)s --duration 20 "calm piano melody" + %(prog)s --duration 60 "calm piano melody" %(prog)s --model small "jazz guitar solo" %(prog)s --interactive -Model sizes (auto-selected based on VRAM if not specified): + # Speech/vocals generation (Bark): + %(prog)s --speech "Hello, how are you today?" + %(prog)s --speech --voice v2/en_speaker_3 "Welcome!" + %(prog)s --speech "♪ La la la, I love to sing ♪" + + # Full song with vocals over music: + %(prog)s --song "♪ Hello world, this is my song ♪" --music "upbeat pop" + +Model sizes for MusicGen (auto-selected based on VRAM if not specified): small - ~500MB, fastest, lower quality (3GB+ VRAM) medium - ~3.3GB, good balance (8GB+ VRAM) large - ~6.5GB, best quality (12GB+ VRAM) + +Bark voices: v2/en_speaker_0 to v2/en_speaker_9 +Bark tokens: [laughter] [laughs] [sighs] [music] [gasps] ♪ (singing) """, ) parser.add_argument( "prompt", nargs="?", - help="Text description of music to generate", + help="Text description of music/speech to generate", ) parser.add_argument( "-d", "--duration", type=int, default=10, - help="Duration in seconds (default: 10, max recommended: 30)", + help="Duration in seconds (default: 10, any length supported)", ) parser.add_argument( "-m", "--model", choices=["small", "medium", "large"], default=None, - help="Model size (default: auto-select based on VRAM, largest possible)", + help="MusicGen model size (auto-select based on VRAM by default)", ) parser.add_argument( "-i", "--interactive", action="store_true", - help="Run in interactive mode", + help="Run in interactive mode (MusicGen only)", ) parser.add_argument( "-o", @@ -375,6 +936,29 @@ Model sizes (auto-selected based on VRAM if not specified): type=Path, help="Output directory (default: ./output)", ) + parser.add_argument( + "-s", + "--speech", + action="store_true", + help="Generate speech/vocals using Bark instead of music", + ) + parser.add_argument( + "-v", + "--voice", + default="v2/en_speaker_6", + help="Bark voice preset (default: v2/en_speaker_6)", + ) + parser.add_argument( + "--song", + action="store_true", + help="Generate a full song with vocals over instrumental", + ) + parser.add_argument( + "--music", + type=str, + default="upbeat pop instrumental backing track", + help="Music style for --song mode (default: upbeat pop)", + ) args = parser.parse_args() @@ -384,25 +968,40 @@ Model sizes (auto-selected based on VRAM if not specified): sys.exit(1) # Check dependencies - if not check_dependencies(): + use_bark = args.speech or args.song + if not check_dependencies(include_bark=use_bark): sys.exit(1) - # Select model size based on VRAM if not specified - model_size = select_model_size(args.model) - - # Load model - model, processor = load_model(model_size) - - if args.interactive: - interactive_mode(model, processor) - else: - generate_music( + if args.song: + # Full song generation mode (vocals + instrumental) + generate_song( args.prompt, - model, - processor, - duration_seconds=args.duration, + args.music, + voice=args.voice, output_dir=args.output, ) + elif args.speech: + # Bark speech generation mode + generate_speech( + args.prompt, + voice=args.voice, + output_dir=args.output, + ) + else: + # MusicGen music generation mode + model_size = select_model_size(args.model) + model, processor = load_model(model_size) + + if args.interactive: + interactive_mode(model, processor) + else: + generate_music( + args.prompt, + model, + processor, + duration_seconds=args.duration, + output_dir=args.output, + ) if __name__ == "__main__":