mirror of
https://github.com/kuhyx/testsAndMisc.git
synced 2026-07-04 14:43:01 +02:00
music_gen: add segmented generation, Bark vocals, and song mixing
- Add segmented generation with crossfading for long audio (>30s) - Add Bark integration for speech/vocal generation (--speech flag) - Add full song generation with vocals over instrumental (--song flag) - Auto-select MusicGen model size based on available VRAM - Enforce CUDA for NVIDIA GPUs (no CPU fallback) - Update README with new features and examples
This commit is contained in:
parent
9e6d07b4e7
commit
740726a3ae
@ -1,6 +1,14 @@
|
||||
# MusicGen - Local AI Music Generator
|
||||
# MusicGen - Local AI Music & Speech Generator
|
||||
|
||||
Generate music from text prompts using Meta's open-source MusicGen model.
|
||||
Generate music and speech/vocals from text prompts using Meta's MusicGen and Suno's Bark.
|
||||
|
||||
## Features
|
||||
|
||||
- **Music Generation**: Create instrumental music from text descriptions (MusicGen)
|
||||
- **Long Audio Support**: Generate music of any length via automatic segmentation with crossfading
|
||||
- **Speech/Vocals**: Generate speech and singing with Bark (optional)
|
||||
- **CUDA Optimized**: Auto-detects GPU and selects best model for your VRAM
|
||||
- **No API Keys**: Runs 100% locally on your hardware
|
||||
|
||||
## Quick Start
|
||||
|
||||
@ -18,22 +26,53 @@ python music_generator.py "upbeat electronic dance music with synths"
|
||||
|
||||
## Usage
|
||||
|
||||
### Single Generation
|
||||
### Music Generation (MusicGen)
|
||||
|
||||
```bash
|
||||
# Basic usage
|
||||
python music_generator.py "jazz piano with soft drums"
|
||||
|
||||
# Set duration (in seconds, max ~30 recommended)
|
||||
python music_generator.py --duration 20 "epic orchestral soundtrack"
|
||||
# Set duration (any length supported via segmentation)
|
||||
python music_generator.py --duration 60 "epic orchestral soundtrack"
|
||||
|
||||
# Generate a full 3-minute track
|
||||
python music_generator.py --duration 180 "ambient electronic music"
|
||||
|
||||
# Use smaller/faster model
|
||||
python music_generator.py --model small "rock guitar riff"
|
||||
|
||||
# Use larger/better quality model (needs 16GB+ VRAM)
|
||||
# Use larger/better quality model (needs 12GB+ VRAM)
|
||||
python music_generator.py --model large "ambient electronic"
|
||||
```
|
||||
|
||||
### Speech/Vocals Generation (Bark)
|
||||
|
||||
```bash
|
||||
# First install Bark (not included in base setup)
|
||||
pip install git+https://github.com/suno-ai/bark.git
|
||||
|
||||
# Generate speech
|
||||
python music_generator.py --speech "Hello, how are you today?"
|
||||
|
||||
# Use different voice
|
||||
python music_generator.py --speech --voice v2/en_speaker_3 "Welcome!"
|
||||
|
||||
# Generate singing
|
||||
python music_generator.py --speech "♪ La la la, I love to sing ♪"
|
||||
|
||||
# With laughter and expression
|
||||
python music_generator.py --speech "That's so funny! [laughter] I can't believe it."
|
||||
```
|
||||
|
||||
**Bark special tokens:**
|
||||
|
||||
- `[laughter]`, `[laughs]`, `[sighs]`, `[gasps]` - expressions
|
||||
- `[music]`, `[clears throat]` - sounds
|
||||
- `♪` - singing
|
||||
- `...` or `—` - hesitations
|
||||
|
||||
**Available voices:** `v2/en_speaker_0` through `v2/en_speaker_9`
|
||||
|
||||
### Interactive Mode
|
||||
|
||||
```bash
|
||||
@ -47,20 +86,20 @@ In interactive mode:
|
||||
- `:h` - Show example prompts
|
||||
- `:q` - Quit
|
||||
|
||||
## Model Sizes
|
||||
## Model Sizes (Auto-Selected by VRAM)
|
||||
|
||||
| Model | Size | VRAM | Quality | Speed |
|
||||
| ------ | ------ | ----- | ------- | ------ |
|
||||
| small | ~500MB | ~4GB | Good | Fast |
|
||||
| medium | ~3.3GB | ~8GB | Better | Medium |
|
||||
| large | ~6.5GB | ~16GB | Best | Slow |
|
||||
| small | ~500MB | 3GB+ | Good | Fast |
|
||||
| medium | ~3.3GB | 8GB+ | Better | Medium |
|
||||
| large | ~6.5GB | 12GB+ | Best | Slow |
|
||||
|
||||
## Requirements
|
||||
|
||||
- Python 3.10+
|
||||
- 8GB+ RAM (16GB recommended)
|
||||
- GPU recommended (CUDA or Apple Silicon MPS)
|
||||
- Works on CPU but much slower
|
||||
- NVIDIA GPU with a CUDA-enabled PyTorch build (on NVIDIA systems CUDA is required; there is no CPU fallback)
|
||||
- Apple Silicon supported via MPS
|
||||
- 8GB+ VRAM recommended for best results
|
||||
|
||||
## Output
|
||||
|
||||
@ -83,7 +122,7 @@ Generated audio files are saved to `./output/` as WAV files with timestamps.
|
||||
### Out of Memory
|
||||
|
||||
- Try `--model small` for lower VRAM usage
|
||||
- Reduce duration with `--duration 5`
|
||||
- Reduce duration with `--duration 10`
|
||||
- Close other GPU applications
|
||||
|
||||
### Slow Generation
|
||||
@ -96,3 +135,11 @@ Generated audio files are saved to `./output/` as WAV files with timestamps.
|
||||
|
||||
- Check if scipy is installed: `pip install scipy`
|
||||
- Try a different audio player (VLC recommended)
|
||||
|
||||
### CUDA Not Available
|
||||
|
||||
If you see "NVIDIA GPU detected but CUDA is not available":
|
||||
|
||||
```bash
|
||||
pip install torch --index-url https://download.pytorch.org/whl/cu121
|
||||
```
|
||||
|
||||
@ -27,9 +27,18 @@ warnings.filterwarnings("ignore", category=UserWarning)
|
||||
VRAM_THRESHOLD_LARGE = 12 # Use large model with 12GB+ VRAM
|
||||
VRAM_THRESHOLD_MEDIUM = 8 # Use medium model with 8GB+ VRAM
|
||||
|
||||
# Generation settings for segmented long audio.
# Requests longer than SEGMENT_DURATION are produced as several segments that
# are blended together with a crossfade of CROSSFADE_DURATION seconds.
SEGMENT_DURATION = 25  # Seconds per segment (kept under the 30s MusicGen limit)
CROSSFADE_DURATION = 2  # Seconds of overlap blended between adjacent segments
BARK_MAX_CHARS = 200  # Max characters per Bark segment (~13s of speech)
|
||||
|
||||
def check_dependencies() -> bool:
|
||||
"""Check if required packages are installed."""
|
||||
|
||||
def check_dependencies(*, include_bark: bool = False) -> bool:
|
||||
"""Check if required packages are installed.
|
||||
|
||||
Args:
|
||||
include_bark: Whether to check for Bark dependencies as well.
|
||||
"""
|
||||
missing = []
|
||||
|
||||
try:
|
||||
@ -47,12 +56,21 @@ def check_dependencies() -> bool:
|
||||
except ImportError:
|
||||
missing.append("scipy")
|
||||
|
||||
if include_bark:
|
||||
try:
|
||||
from bark import generate_audio as _bark_gen # noqa: F401
|
||||
except ImportError:
|
||||
missing.append("git+https://github.com/suno-ai/bark.git")
|
||||
|
||||
if missing:
|
||||
print("Missing dependencies. Install with:")
|
||||
print(f" pip install {' '.join(missing)}")
|
||||
print("\nFor CUDA support:")
|
||||
print(" pip install torch --index-url https://download.pytorch.org/whl/cu121")
|
||||
print(" pip install transformers scipy")
|
||||
if include_bark:
|
||||
print("\nFor Bark vocals:")
|
||||
print(" pip install git+https://github.com/suno-ai/bark.git")
|
||||
return False
|
||||
return True
|
||||
|
||||
@ -182,6 +200,547 @@ def load_model(
|
||||
return model, processor
|
||||
|
||||
|
||||
# Available Bark voice presets
|
||||
# English Bark speaker presets, numbered 0 through 9.
BARK_VOICES = [f"v2/en_speaker_{speaker_index}" for speaker_index in range(10)]
|
||||
|
||||
|
||||
def generate_speech(
    text: str,
    voice: str = "v2/en_speaker_6",
    output_dir: Path | None = None,
) -> Path:
    """Generate speech audio from text using Bark and save it as a WAV file.

    Bark supports various speech patterns:
    - [laughter], [laughs], [sighs], [music]
    - [gasps], [clears throat], — or ... for hesitations
    - ♪ for singing

    The text is split into chunks with ``_split_into_sentences``; each chunk
    is generated separately and the resulting audio is concatenated.

    Args:
        text: Text to convert to speech (max ~13s per segment)
        voice: Voice preset to use (see BARK_VOICES)
        output_dir: Directory to save output (defaults to ./output next to
            this file). Missing parent directories are created.

    Returns:
        Path to the generated audio file
    """
    import functools

    import numpy as np
    import scipy.io.wavfile
    import torch

    # Bark uses an older checkpoint format that relies on pickle, so torch.load
    # is temporarily monkey-patched to default to weights_only=False.
    # SECURITY NOTE: weights_only=False allows arbitrary code execution from a
    # malicious checkpoint. The patch is scoped to this call and undone in the
    # ``finally`` below; only load Bark checkpoints from a trusted source.
    original_torch_load = torch.load

    @functools.wraps(original_torch_load)
    def patched_load(*args: object, **kwargs: object) -> object:
        kwargs.setdefault("weights_only", False)
        return original_torch_load(*args, **kwargs)

    torch.load = patched_load

    try:
        from bark import SAMPLE_RATE, generate_audio, preload_models

        if output_dir is None:
            output_dir = Path(__file__).parent / "output"
        # parents=True so a nested --output path (e.g. out/speech) does not
        # fail with FileNotFoundError when intermediate directories are missing.
        output_dir.mkdir(parents=True, exist_ok=True)

        print("\nLoading Bark model...")
        print("(First run will download models, ~5GB total)")
        preload_models()

        print(f"\nGenerating speech with voice: {voice}")
        print(f"Text: {text!r}")

        # Bark can only generate ~13s at a time, so longer text is generated
        # chunk by chunk.
        audio_segments = []
        sentences = _split_into_sentences(text)

        for i, sentence in enumerate(sentences):
            if len(sentences) > 1:
                print(f" Generating segment {i + 1}/{len(sentences)}...")

            audio = generate_audio(
                sentence.strip(),
                history_prompt=voice,
            )
            audio_segments.append(audio)

        # Combine segments
        if len(audio_segments) > 1:
            audio_data = np.concatenate(audio_segments)
        else:
            audio_data = audio_segments[0]

        # Build a filesystem-safe filename from a UTC timestamp and the first
        # 30 characters of the text (alphanumerics, space, '-' and '_' only).
        timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
        safe_text = "".join(c if c.isalnum() or c in " -_" else "" for c in text[:30])
        safe_text = safe_text.strip().replace(" ", "_")
        filename = f"{timestamp}_speech_{safe_text}.wav"
        output_path = output_dir / filename

        scipy.io.wavfile.write(output_path, SAMPLE_RATE, audio_data)

        print(f"\nSaved to: {output_path}")
        print(f"Duration: {len(audio_data) / SAMPLE_RATE:.1f}s")

        return output_path
    finally:
        # Restore original torch.load even if generation failed.
        torch.load = original_torch_load
|
||||
|
||||
|
||||
def _split_into_sentences(text: str) -> list[str]:
    """Split text into chunks that fit Bark's per-call limit.

    The text is first split on sentence-ending punctuation (``.``, ``!``,
    ``?``) followed by whitespace. A single sentence longer than
    ``BARK_MAX_CHARS`` is further wrapped at word boundaries so that no chunk
    exceeds the limit. Short neighbouring pieces are then merged back together
    while the merged chunk stays within ``BARK_MAX_CHARS``.

    Args:
        text: Text to split

    Returns:
        List of text chunks; ``[text]`` when nothing could be extracted
        (for example, an empty string).
    """
    import re
    import textwrap

    # Split on sentence-ending punctuation followed by whitespace.
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())

    # Break up sentences that on their own exceed the limit; otherwise they
    # would be handed to Bark whole, which defeats the purpose of chunking.
    pieces: list[str] = []
    for sentence in sentences:
        if len(sentence) > BARK_MAX_CHARS:
            pieces.extend(textwrap.wrap(sentence, width=BARK_MAX_CHARS))
        else:
            pieces.append(sentence)

    # Group very short pieces together to reduce the number of Bark calls.
    result = []
    current = ""
    for piece in pieces:
        if len(current) + len(piece) < BARK_MAX_CHARS:
            current = f"{current} {piece}".strip()
        else:
            if current:
                result.append(current)
            current = piece
    if current:
        result.append(current)

    return result if result else [text]
|
||||
|
||||
|
||||
def _resample_audio(
|
||||
audio: object,
|
||||
orig_sr: int,
|
||||
target_sr: int,
|
||||
) -> object:
|
||||
"""Resample audio to a different sample rate.
|
||||
|
||||
Args:
|
||||
audio: Audio data as numpy array
|
||||
orig_sr: Original sample rate
|
||||
target_sr: Target sample rate
|
||||
|
||||
Returns:
|
||||
Resampled audio data
|
||||
"""
|
||||
import numpy as np
|
||||
from scipy import signal
|
||||
|
||||
if orig_sr == target_sr:
|
||||
return audio
|
||||
|
||||
# Calculate the resampling ratio
|
||||
duration = len(audio) / orig_sr
|
||||
target_length = int(duration * target_sr)
|
||||
|
||||
return signal.resample(audio, target_length).astype(np.float32)
|
||||
|
||||
|
||||
def _mix_audio(
|
||||
instrumental: object,
|
||||
vocals: object,
|
||||
vocal_volume: float = 0.8,
|
||||
instrumental_volume: float = 0.6,
|
||||
) -> object:
|
||||
"""Mix vocals over instrumental track.
|
||||
|
||||
Args:
|
||||
instrumental: Instrumental audio (numpy array)
|
||||
vocals: Vocal audio (numpy array)
|
||||
vocal_volume: Volume multiplier for vocals (0.0-1.0)
|
||||
instrumental_volume: Volume multiplier for instrumental (0.0-1.0)
|
||||
|
||||
Returns:
|
||||
Mixed audio data
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
# Ensure same length - pad or trim vocals to match instrumental
|
||||
if len(vocals) < len(instrumental):
|
||||
# Pad vocals with silence at the end
|
||||
vocals = np.pad(vocals, (0, len(instrumental) - len(vocals)))
|
||||
elif len(vocals) > len(instrumental):
|
||||
# Trim vocals to match instrumental
|
||||
vocals = vocals[: len(instrumental)]
|
||||
|
||||
# Mix the tracks
|
||||
mixed = (instrumental * instrumental_volume) + (vocals * vocal_volume)
|
||||
|
||||
# Normalize to prevent clipping
|
||||
max_val = np.max(np.abs(mixed))
|
||||
if max_val > 1.0:
|
||||
mixed = mixed / max_val
|
||||
|
||||
return mixed.astype(np.float32)
|
||||
|
||||
|
||||
def _generate_vocals_for_song(lyrics: str, voice: str) -> tuple[object, int]:
    """Produce a Bark vocal track for later mixing with an instrumental.

    The lyrics are chunked with ``_split_into_sentences``, each chunk is
    rendered by Bark with the given voice preset, and the rendered chunks are
    joined end to end.

    Args:
        lyrics: Text/lyrics to sing
        voice: Bark voice preset

    Returns:
        Tuple of (vocal audio array, Bark sample rate)
    """
    import functools

    import numpy as np
    import torch

    # Bark checkpoints need torch.load(weights_only=False); install a wrapper
    # that supplies that default and put the real function back afterwards.
    real_torch_load = torch.load

    @functools.wraps(real_torch_load)
    def _load_allowing_pickle(*args: object, **kwargs: object) -> object:
        kwargs.setdefault("weights_only", False)
        return real_torch_load(*args, **kwargs)

    torch.load = _load_allowing_pickle

    try:
        from bark import SAMPLE_RATE as BARK_SR
        from bark import generate_audio, preload_models

        print("Loading Bark model...")
        preload_models()

        print(f"Generating vocals with voice: {voice}")
        print(f"Lyrics: {lyrics!r}")

        chunks = _split_into_sentences(lyrics)
        chunk_count = len(chunks)
        announce_progress = chunk_count > 1
        rendered = []

        for position, chunk in enumerate(chunks, start=1):
            if announce_progress:
                print(f" Vocal segment {position}/{chunk_count}...")
            rendered.append(generate_audio(chunk.strip(), history_prompt=voice))

        # A single chunk is returned as-is; several chunks are concatenated.
        vocals = np.concatenate(rendered) if len(rendered) > 1 else rendered[0]
        return vocals, BARK_SR

    finally:
        torch.load = real_torch_load
|
||||
|
||||
|
||||
def _generate_instrumental_for_song(
    music_prompt: str,
    duration: int,
) -> tuple[object, int]:
    """Create the MusicGen backing track used by the song mixer.

    The model size is chosen by ``select_model_size(None)``. Durations up to
    ``SEGMENT_DURATION`` are generated in one pass; longer ones go through
    ``_generate_long_audio``.

    Args:
        music_prompt: Description of the music
        duration: Duration in seconds

    Returns:
        Tuple of (instrumental audio array, sample rate)
    """
    chosen_size = select_model_size(None)
    model, processor = load_model(chosen_size)

    print(f"Music prompt: {music_prompt!r}")
    print(f"Duration: {duration}s")

    device = str(next(model.parameters()).device)
    sample_rate = model.config.audio_encoder.sampling_rate

    if duration > SEGMENT_DURATION:
        # Too long for one pass: build it from crossfaded segments.
        long_track = _generate_long_audio(music_prompt, model, processor, duration)
        return long_track, sample_rate

    short_track = generate_segment(music_prompt, model, processor, duration, device)
    return short_track, sample_rate
|
||||
|
||||
|
||||
def generate_song(
    lyrics: str,
    music_prompt: str,
    voice: str = "v2/en_speaker_6",
    output_dir: Path | None = None,
) -> Path:
    """Generate a complete song with vocals over instrumental music.

    This combines Bark for vocals and MusicGen for instrumental backing:
    vocals are generated first, an instrumental slightly longer than the
    vocals is generated next, the vocals are resampled to the instrumental's
    sample rate, and both are mixed and written as a WAV file.

    Args:
        lyrics: The lyrics/text to sing (use ♪ for singing style)
        music_prompt: Description of the instrumental music
        voice: Bark voice preset (default: v2/en_speaker_6)
        output_dir: Directory to save output (defaults to ./output next to
            this file). Missing parent directories are created.

    Returns:
        Path to the generated song file
    """
    import scipy.io.wavfile

    if output_dir is None:
        output_dir = Path(__file__).parent / "output"
    # parents=True so a nested output path does not raise FileNotFoundError
    # when intermediate directories do not exist yet.
    output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 60)
    print("GENERATING SONG WITH VOCALS")
    print("=" * 60)

    # Step 1: Generate vocals
    print("\n[1/3] Generating vocals...")
    vocals, bark_sr = _generate_vocals_for_song(lyrics, voice)
    vocal_duration = len(vocals) / bark_sr
    print(f"Vocals generated: {vocal_duration:.1f}s")

    # Step 2: Generate instrumental (match vocal duration + buffer)
    print("\n[2/3] Generating instrumental music...")
    music_duration = int(vocal_duration) + 2
    instrumental, musicgen_sr = _generate_instrumental_for_song(
        music_prompt,
        music_duration,
    )
    print(f"Instrumental generated: {len(instrumental) / musicgen_sr:.1f}s")

    # Step 3: Mix vocals and instrumental. The two models may use different
    # sample rates, so the vocals are first brought to the instrumental's rate.
    print("\n[3/3] Mixing vocals and instrumental...")
    vocals_resampled = _resample_audio(vocals, bark_sr, musicgen_sr)
    mixed = _mix_audio(instrumental, vocals_resampled)

    # Save the song under a UTC timestamp plus a sanitized lyrics prefix.
    timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
    safe_lyrics = "".join(c if c.isalnum() or c in " -_" else "" for c in lyrics[:20])
    safe_lyrics = safe_lyrics.strip().replace(" ", "_")
    filename = f"{timestamp}_song_{safe_lyrics}.wav"
    output_path = output_dir / filename

    scipy.io.wavfile.write(output_path, musicgen_sr, mixed)

    print("\n" + "=" * 60)
    print(f"Song saved to: {output_path}")
    print(f"Duration: {len(mixed) / musicgen_sr:.1f}s")
    print("=" * 60)

    return output_path
|
||||
|
||||
|
||||
def crossfade_audio(
    audio1: object,
    audio2: object,
    crossfade_samples: int,
) -> object:
    """Crossfade two audio segments together.

    The last ``crossfade_samples`` samples of ``audio1`` are faded out
    linearly while the first ``crossfade_samples`` samples of ``audio2`` are
    faded in, and the two are summed over that overlap. When the crossfade
    length is not positive, or either segment is shorter than the crossfade,
    the segments are simply concatenated without any fade.

    Args:
        audio1: First audio segment (numpy array)
        audio2: Second audio segment (numpy array)
        crossfade_samples: Number of samples to use for crossfade

    Returns:
        Combined audio with crossfade applied (numpy array). Floating-point
        input keeps its dtype (e.g. float32 stays float32).
    """
    import numpy as np

    audio1 = np.asarray(audio1)
    audio2 = np.asarray(audio2)

    # Both segments must cover the overlap; checking only audio1 would let a
    # short audio2 fail with a shape mismatch when the fade is applied.
    shortest = min(len(audio1), len(audio2))
    if crossfade_samples <= 0 or shortest < crossfade_samples:
        return np.concatenate([audio1, audio2])

    # Build the fade curves in the audio's own float dtype so float32 audio is
    # not silently promoted to float64 (which would later be written as a
    # 64-bit float WAV). Non-float input falls back to float64 curves.
    fade_dtype = np.result_type(audio1, audio2)
    if not np.issubdtype(fade_dtype, np.floating):
        fade_dtype = np.float64

    fade_out = np.linspace(1.0, 0.0, crossfade_samples, dtype=fade_dtype)
    fade_in = np.linspace(0.0, 1.0, crossfade_samples, dtype=fade_dtype)

    # Overlap region: tail of audio1 fading out plus head of audio2 fading in.
    crossfaded = audio1[-crossfade_samples:] * fade_out + audio2[:crossfade_samples] * fade_in

    return np.concatenate(
        [
            audio1[:-crossfade_samples],
            crossfaded,
            audio2[crossfade_samples:],
        ]
    )
|
||||
|
||||
|
||||
def generate_segment(
    prompt: str,
    model: object,
    processor: object,
    duration_seconds: int,
    device: str,
) -> object:
    """Run one MusicGen generation pass and return the raw audio.

    Args:
        prompt: Text description of the music
        model: The MusicGen model
        processor: The MusicGen processor
        duration_seconds: Length of segment to generate
        device: Device to generate on

    Returns:
        Audio data as numpy array (first channel of the first batch item)
    """
    import torch

    encoded = processor(text=[prompt], padding=True, return_tensors="pt")
    # Move every processor output onto the generation device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    # The token budget is 50 tokens per requested second.
    token_budget = int(duration_seconds * 50)

    with torch.no_grad():
        generated = model.generate(
            **on_device,
            max_new_tokens=token_budget,
            do_sample=True,
        )

    first_channel = generated[0, 0]
    return first_channel.cpu().numpy()
|
||||
|
||||
|
||||
def _calculate_segment_duration(
    segment_index: int,
    num_segments: int,
    generated_samples: int,
    sample_rate: int,
    total_duration: int,
) -> int:
    """Calculate duration for a specific segment.

    Every segment except the last uses the full ``SEGMENT_DURATION``. The last
    segment only needs to cover the time still missing from the target, plus
    ``CROSSFADE_DURATION`` seconds for the overlap, with a floor of 5 seconds
    and a ceiling of ``SEGMENT_DURATION``.

    Args:
        segment_index: Current segment index (0-based)
        num_segments: Total number of segments
        generated_samples: Number of samples generated so far
        sample_rate: Audio sample rate
        total_duration: Target total duration in seconds

    Returns:
        Duration in seconds for this segment
    """
    import math

    if segment_index != num_segments - 1:
        return SEGMENT_DURATION

    # Last segment: work out how much audio is still missing.
    generated_so_far = generated_samples / sample_rate
    remaining = total_duration - generated_so_far

    # Round the remainder UP: truncating a fractional remainder would leave
    # the finished track up to one second shorter than requested. Any surplus
    # is harmless because the caller trims to the exact target length.
    min_duration = max(5, math.ceil(remaining) + CROSSFADE_DURATION)
    return min(SEGMENT_DURATION, min_duration)
|
||||
|
||||
|
||||
def _generate_long_audio(
    prompt: str,
    model: object,
    processor: object,
    duration_seconds: int,
) -> object:
    """Build a long track out of consecutive, crossfaded MusicGen segments.

    Segments are generated one after another with the same prompt; each new
    segment is blended onto the running track with ``crossfade_audio``. The
    finished track is cut down to the requested length if it overshoots.

    Args:
        prompt: Text description of the music
        model: The MusicGen model
        processor: The MusicGen processor
        duration_seconds: Total duration to generate

    Returns:
        Audio data as numpy array
    """
    import numpy as np

    device = str(next(model.parameters()).device)
    sample_rate = model.config.audio_encoder.sampling_rate
    crossfade_samples = CROSSFADE_DURATION * sample_rate

    # Each segment contributes its length minus the overlap, so the segment
    # count is the requested duration divided by that net length, rounded up.
    net_per_segment = SEGMENT_DURATION - CROSSFADE_DURATION
    padded_total = duration_seconds + net_per_segment - 1
    num_segments = max(1, padded_total // net_per_segment)

    print(f"Generating {num_segments} segments of ~{SEGMENT_DURATION}s each...")

    track = np.array([], dtype=np.float32)

    for seg_num in range(1, num_segments + 1):
        this_duration = _calculate_segment_duration(
            seg_num - 1,
            num_segments,
            len(track),
            sample_rate,
            duration_seconds,
        )

        print(
            f" Segment {seg_num}/{num_segments} ({this_duration}s)...",
            end=" ",
            flush=True,
        )

        piece = generate_segment(prompt, model, processor, this_duration, device)

        # The first piece starts the track; later pieces are blended onto it.
        if len(track):
            track = crossfade_audio(track, piece, crossfade_samples)
        else:
            track = piece

        print(f"done (total: {len(track) / sample_rate:.1f}s)")

    # Drop anything generated beyond the requested length.
    wanted_samples = int(duration_seconds * sample_rate)
    if len(track) > wanted_samples:
        track = track[:wanted_samples]

    return track
|
||||
|
||||
|
||||
def generate_music(
|
||||
prompt: str,
|
||||
model: object,
|
||||
@ -191,53 +750,43 @@ def generate_music(
|
||||
) -> Path:
|
||||
"""Generate music from a text prompt.
|
||||
|
||||
For durations over 30 seconds, generates in segments with crossfading.
|
||||
|
||||
Args:
|
||||
prompt: Text description of the music to generate
|
||||
model: The MusicGen model
|
||||
processor: The MusicGen processor
|
||||
duration_seconds: Length of audio to generate (max ~30s recommended)
|
||||
duration_seconds: Length of audio to generate (any duration supported)
|
||||
output_dir: Directory to save output (defaults to ./output)
|
||||
|
||||
Returns:
|
||||
Path to the generated audio file
|
||||
"""
|
||||
import scipy.io.wavfile
|
||||
import torch
|
||||
|
||||
if output_dir is None:
|
||||
output_dir = Path(__file__).parent / "output"
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
print(f"\nGenerating {duration_seconds}s of music...")
|
||||
print(f"Prompt: {prompt!r}")
|
||||
|
||||
device = next(model.parameters()).device
|
||||
|
||||
# Prepare inputs
|
||||
inputs = processor(
|
||||
text=[prompt],
|
||||
padding=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
inputs = {k: v.to(device) for k, v in inputs.items()}
|
||||
|
||||
# Calculate tokens needed for duration
|
||||
# MusicGen generates ~50 tokens per second of audio
|
||||
max_new_tokens = int(duration_seconds * 50)
|
||||
|
||||
# Generate
|
||||
with torch.no_grad():
|
||||
audio_values = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=max_new_tokens,
|
||||
do_sample=True,
|
||||
)
|
||||
|
||||
# Get sample rate from model config
|
||||
sample_rate = model.config.audio_encoder.sampling_rate
|
||||
|
||||
# Convert to numpy and save
|
||||
audio_data = audio_values[0, 0].cpu().numpy()
|
||||
# For short durations, generate directly
|
||||
if duration_seconds <= SEGMENT_DURATION:
|
||||
print(f"\nGenerating {duration_seconds}s of music...")
|
||||
print(f"Prompt: {prompt!r}")
|
||||
device = str(next(model.parameters()).device)
|
||||
audio_data = generate_segment(
|
||||
prompt,
|
||||
model,
|
||||
processor,
|
||||
duration_seconds,
|
||||
device,
|
||||
)
|
||||
else:
|
||||
# Long duration: generate in segments with crossfading
|
||||
print(f"\nGenerating {duration_seconds}s of music in segments...")
|
||||
print(f"Prompt: {prompt!r}")
|
||||
audio_data = _generate_long_audio(prompt, model, processor, duration_seconds)
|
||||
|
||||
# Create filename with timestamp and sanitized prompt
|
||||
timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
|
||||
@ -328,46 +877,58 @@ def interactive_mode(model: object, processor: object) -> None:
|
||||
def main() -> None:
|
||||
"""Main entry point."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Generate music from text prompts using MusicGen",
|
||||
description="Generate music or speech from text prompts",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Music generation (MusicGen):
|
||||
%(prog)s "upbeat electronic dance music"
|
||||
%(prog)s --duration 20 "calm piano melody"
|
||||
%(prog)s --duration 60 "calm piano melody"
|
||||
%(prog)s --model small "jazz guitar solo"
|
||||
%(prog)s --interactive
|
||||
|
||||
Model sizes (auto-selected based on VRAM if not specified):
|
||||
# Speech/vocals generation (Bark):
|
||||
%(prog)s --speech "Hello, how are you today?"
|
||||
%(prog)s --speech --voice v2/en_speaker_3 "Welcome!"
|
||||
%(prog)s --speech "♪ La la la, I love to sing ♪"
|
||||
|
||||
# Full song with vocals over music:
|
||||
%(prog)s --song "♪ Hello world, this is my song ♪" --music "upbeat pop"
|
||||
|
||||
Model sizes for MusicGen (auto-selected based on VRAM if not specified):
|
||||
small - ~500MB, fastest, lower quality (3GB+ VRAM)
|
||||
medium - ~3.3GB, good balance (8GB+ VRAM)
|
||||
large - ~6.5GB, best quality (12GB+ VRAM)
|
||||
|
||||
Bark voices: v2/en_speaker_0 to v2/en_speaker_9
|
||||
Bark tokens: [laughter] [laughs] [sighs] [music] [gasps] ♪ (singing)
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"prompt",
|
||||
nargs="?",
|
||||
help="Text description of music to generate",
|
||||
help="Text description of music/speech to generate",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-d",
|
||||
"--duration",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Duration in seconds (default: 10, max recommended: 30)",
|
||||
help="Duration in seconds (default: 10, any length supported)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-m",
|
||||
"--model",
|
||||
choices=["small", "medium", "large"],
|
||||
default=None,
|
||||
help="Model size (default: auto-select based on VRAM, largest possible)",
|
||||
help="MusicGen model size (auto-select based on VRAM by default)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-i",
|
||||
"--interactive",
|
||||
action="store_true",
|
||||
help="Run in interactive mode",
|
||||
help="Run in interactive mode (MusicGen only)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-o",
|
||||
@ -375,6 +936,29 @@ Model sizes (auto-selected based on VRAM if not specified):
|
||||
type=Path,
|
||||
help="Output directory (default: ./output)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-s",
|
||||
"--speech",
|
||||
action="store_true",
|
||||
help="Generate speech/vocals using Bark instead of music",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--voice",
|
||||
default="v2/en_speaker_6",
|
||||
help="Bark voice preset (default: v2/en_speaker_6)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--song",
|
||||
action="store_true",
|
||||
help="Generate a full song with vocals over instrumental",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--music",
|
||||
type=str,
|
||||
default="upbeat pop instrumental backing track",
|
||||
help="Music style for --song mode (default: upbeat pop)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@ -384,25 +968,40 @@ Model sizes (auto-selected based on VRAM if not specified):
|
||||
sys.exit(1)
|
||||
|
||||
# Check dependencies
|
||||
if not check_dependencies():
|
||||
use_bark = args.speech or args.song
|
||||
if not check_dependencies(include_bark=use_bark):
|
||||
sys.exit(1)
|
||||
|
||||
# Select model size based on VRAM if not specified
|
||||
model_size = select_model_size(args.model)
|
||||
|
||||
# Load model
|
||||
model, processor = load_model(model_size)
|
||||
|
||||
if args.interactive:
|
||||
interactive_mode(model, processor)
|
||||
else:
|
||||
generate_music(
|
||||
if args.song:
|
||||
# Full song generation mode (vocals + instrumental)
|
||||
generate_song(
|
||||
args.prompt,
|
||||
model,
|
||||
processor,
|
||||
duration_seconds=args.duration,
|
||||
args.music,
|
||||
voice=args.voice,
|
||||
output_dir=args.output,
|
||||
)
|
||||
elif args.speech:
|
||||
# Bark speech generation mode
|
||||
generate_speech(
|
||||
args.prompt,
|
||||
voice=args.voice,
|
||||
output_dir=args.output,
|
||||
)
|
||||
else:
|
||||
# MusicGen music generation mode
|
||||
model_size = select_model_size(args.model)
|
||||
model, processor = load_model(model_size)
|
||||
|
||||
if args.interactive:
|
||||
interactive_mode(model, processor)
|
||||
else:
|
||||
generate_music(
|
||||
args.prompt,
|
||||
model,
|
||||
processor,
|
||||
duration_seconds=args.duration,
|
||||
output_dir=args.output,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Loading…
Reference in New Issue
Block a user