From 39d0405a6b8f69938960ad5db322006ef5949448 Mon Sep 17 00:00:00 2001 From: Krzysztof kuhy Rudnicki Date: Thu, 4 Dec 2025 20:43:44 +0100 Subject: [PATCH] Add local AI music generator using Meta's MusicGen Features: - Generate music from text prompts using open-source MusicGen model - Support for small/medium/large models (500MB to 6.5GB) - CUDA, Apple Silicon MPS, and CPU support - Interactive mode with example prompts - Setup script that handles venv and GPU detection Usage: cd python_pkg/music_gen && ./setup.sh python music_generator.py 'upbeat electronic dance music' --- .pre-commit-config.yaml | 2 +- pyproject.toml | 8 + python_pkg/music_gen/README.md | 98 +++++++ python_pkg/music_gen/__init__.py | 1 + python_pkg/music_gen/music_generator.py | 324 ++++++++++++++++++++++++ python_pkg/music_gen/setup.sh | 73 ++++++ 6 files changed, 505 insertions(+), 1 deletion(-) create mode 100644 python_pkg/music_gen/README.md create mode 100644 python_pkg/music_gen/__init__.py create mode 100755 python_pkg/music_gen/music_generator.py create mode 100755 python_pkg/music_gen/setup.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 20af177..519595b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -96,7 +96,7 @@ repos: - types-requests - types-PyYAML - types-python-dateutil - exclude: ^(Bash/|\.venv/) + exclude: ^(Bash/|\.venv/|python_pkg/music_gen/) # =========================================================================== # PYLINT - Comprehensive Python linter diff --git a/pyproject.toml b/pyproject.toml index e070139..d32b164 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,6 +68,13 @@ unfixable = [] "python_pkg/screen_locker/screen_lock.py" = [ "FBT003", # Boolean positional values in tkinter API calls ] +# Music generator - CLI script with intentional patterns +"python_pkg/music_gen/music_generator.py" = [ + "T201", # print() is intentional for CLI feedback + "PLC0415", # Late imports for dependency checking + "C901", # Complex interactive mode is acceptable + "PLR0912", # Too many branches in interactive mode +] [tool.ruff.lint.pydocstyle] convention = "google" # Use Google docstring convention @@ -127,6 +134,7 @@ color_output = true exclude = [ "Bash/ffmpeg-build/", ".venv/", + "python_pkg/music_gen/", # Uses dynamic imports from transformers ] # ============================================================================ diff --git a/python_pkg/music_gen/README.md b/python_pkg/music_gen/README.md new file mode 100644 index 0000000..ee39f56 --- /dev/null +++ b/python_pkg/music_gen/README.md @@ -0,0 +1,98 @@ +# MusicGen - Local AI Music Generator + +Generate music from text prompts using Meta's open-source MusicGen model. + +## Quick Start + +```bash +# 1. Run the setup script (creates venv, installs dependencies) +cd python_pkg/music_gen +./setup.sh + +# 2. Activate the virtual environment +source venv/bin/activate + +# 3. Generate music! +python music_generator.py "upbeat electronic dance music with synths" +``` + +## Usage + +### Single Generation + +```bash +# Basic usage +python music_generator.py "jazz piano with soft drums" + +# Set duration (in seconds, max ~30 recommended) +python music_generator.py --duration 20 "epic orchestral soundtrack" + +# Use smaller/faster model +python music_generator.py --model small "rock guitar riff" + +# Use larger/better quality model (needs 16GB+ VRAM) +python music_generator.py --model large "ambient electronic" +``` + +### Interactive Mode + +```bash +python music_generator.py --interactive +``` + +In interactive mode: + +- Type prompts to generate music +- `:d 15` - Set duration to 15 seconds +- `:h` - Show example prompts +- `:q` - Quit + +## Model Sizes + +| Model | Size | VRAM | Quality | Speed | +| ------ | ------ | ----- | ------- | ------ | +| small | ~500MB | ~4GB | Good | Fast | +| medium | ~3.3GB | ~8GB | Better | Medium | +| large | ~6.5GB | ~16GB | Best | Slow | + +## Requirements + +- Python 3.10+ +- 8GB+ RAM (16GB recommended) +- GPU recommended (CUDA or Apple Silicon MPS) +- Works on CPU but much slower + +## Output + +Generated audio files are saved to `./output/` as WAV files with timestamps. + +## Example Prompts + +- "upbeat electronic dance music with heavy bass" +- "calm acoustic guitar melody with soft percussion" +- "epic orchestral soundtrack with dramatic strings" +- "lo-fi hip hop beats for studying" +- "80s synthwave with retro vibes" +- "jazz piano trio with upright bass" +- "ambient electronic music for relaxation" +- "rock guitar riff with drums" +- "classical piano sonata in minor key" + +## Troubleshooting + +### Out of Memory + +- Try `--model small` for lower VRAM usage +- Reduce duration with `--duration 5` +- Close other GPU applications + +### Slow Generation + +- Make sure GPU is detected (check output at startup) +- Use `--model small` for faster generation +- Reduce duration + +### No Sound / Corrupted File + +- Check if scipy is installed: `pip install scipy` +- Try a different audio player (VLC recommended) diff --git a/python_pkg/music_gen/__init__.py b/python_pkg/music_gen/__init__.py new file mode 100644 index 0000000..841fdd2 --- /dev/null +++ b/python_pkg/music_gen/__init__.py @@ -0,0 +1 @@ +"""AI Music Generator package using MusicGen.""" diff --git a/python_pkg/music_gen/music_generator.py b/python_pkg/music_gen/music_generator.py new file mode 100755 index 0000000..a296f0d --- /dev/null +++ b/python_pkg/music_gen/music_generator.py @@ -0,0 +1,324 @@ +#!/usr/bin/env python3 +"""Local AI music generator using Meta's MusicGen. + +Generates music from text prompts using the open-source MusicGen model. +First run will download the model (~3.3GB for medium, ~500MB for small). + +Usage: + python music_generator.py "upbeat electronic dance music with synths" + python music_generator.py --duration 15 "calm acoustic guitar melody" + python music_generator.py --model small "jazz piano solo" + python music_generator.py --interactive # Interactive mode +""" + +from __future__ import annotations + +import argparse +from datetime import datetime, timezone +from pathlib import Path +import sys +import warnings + +# Suppress warnings for cleaner output +warnings.filterwarnings("ignore", category=FutureWarning) +warnings.filterwarnings("ignore", category=UserWarning) + + +def check_dependencies() -> bool: + """Check if required packages are installed.""" + missing = [] + + try: + import torch # noqa: F401 + except ImportError: + missing.append("torch") + + try: + import torchaudio # noqa: F401 + except ImportError: + missing.append("torchaudio") + + try: + import transformers # noqa: F401 + except ImportError: + missing.append("transformers") + + if missing: + print("Missing dependencies. Install with:") + print(f" pip install {' '.join(missing)}") + print("\nOr run the full setup:") + print(" pip install torch torchaudio transformers scipy") + return False + return True + + +def get_device() -> str: + """Get the best available device (CUDA, MPS, or CPU).""" + import torch + + if torch.cuda.is_available(): + device = "cuda" + gpu_name = torch.cuda.get_device_name(0) + vram = torch.cuda.get_device_properties(0).total_memory / 1024**3 + print(f"Using CUDA GPU: {gpu_name} ({vram:.1f}GB VRAM)") + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + device = "mps" + print("Using Apple Silicon (MPS)") + else: + device = "cpu" + print("Using CPU (this will be slow)") + return device + + +def load_model( + model_size: str = "medium", +) -> tuple: # type: ignore[type-arg] + """Load the MusicGen model. + + Args: + model_size: One of 'small', 'medium', or 'large' + - small: ~500MB, fastest, lower quality + - medium: ~3.3GB, good balance (recommended) + - large: ~6.5GB, best quality, needs more VRAM + + Returns: + Tuple of (model, processor) + """ + from transformers import AutoProcessor, MusicgenForConditionalGeneration + + model_name = f"facebook/musicgen-{model_size}" + print(f"\nLoading MusicGen {model_size} model...") + print("(First run will download the model, this may take a while)") + + device = get_device() + + processor = AutoProcessor.from_pretrained(model_name) + model = MusicgenForConditionalGeneration.from_pretrained(model_name) + model = model.to(device) + + print(f"Model loaded successfully on {device}!") + return model, processor + + +def generate_music( + prompt: str, + model: object, + processor: object, + duration_seconds: int = 10, + output_dir: Path | None = None, +) -> Path: + """Generate music from a text prompt. + + Args: + prompt: Text description of the music to generate + model: The MusicGen model + processor: The MusicGen processor + duration_seconds: Length of audio to generate (max ~30s recommended) + output_dir: Directory to save output (defaults to ./output) + + Returns: + Path to the generated audio file + """ + import scipy.io.wavfile + import torch + + if output_dir is None: + output_dir = Path(__file__).parent / "output" + output_dir.mkdir(exist_ok=True) + + print(f"\nGenerating {duration_seconds}s of music...") + print(f"Prompt: {prompt!r}") + + device = next(model.parameters()).device + + # Prepare inputs + inputs = processor( + text=[prompt], + padding=True, + return_tensors="pt", + ) + inputs = {k: v.to(device) for k, v in inputs.items()} + + # Calculate tokens needed for duration + # MusicGen generates ~50 tokens per second of audio + max_new_tokens = int(duration_seconds * 50) + + # Generate + with torch.no_grad(): + audio_values = model.generate( + **inputs, + max_new_tokens=max_new_tokens, + do_sample=True, + ) + + # Get sample rate from model config + sample_rate = model.config.audio_encoder.sampling_rate + + # Convert to numpy and save + audio_data = audio_values[0, 0].cpu().numpy() + + # Create filename with timestamp and sanitized prompt + timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S") + safe_prompt = "".join(c if c.isalnum() or c in " -_" else "" for c in prompt[:30]) + safe_prompt = safe_prompt.strip().replace(" ", "_") + filename = f"{timestamp}_{safe_prompt}.wav" + output_path = output_dir / filename + + scipy.io.wavfile.write(output_path, sample_rate, audio_data) + + print(f"\nSaved to: {output_path}") + print(f"Duration: {len(audio_data) / sample_rate:.1f}s") + + return output_path + + +def interactive_mode(model: object, processor: object) -> None: + """Run interactive prompt mode.""" + print("\n" + "=" * 60) + print("INTERACTIVE MODE") + print("=" * 60) + print("Enter prompts to generate music. Commands:") + print(" :q or :quit - Exit") + print(" :d - Set duration (e.g., ':d 15')") + print(" :h or :help - Show example prompts") + print("=" * 60) + + duration = 10 + + example_prompts = [ + "upbeat electronic dance music with heavy bass", + "calm acoustic guitar melody with soft percussion", + "epic orchestral soundtrack with dramatic strings", + "lo-fi hip hop beats for studying", + "80s synthwave with retro vibes", + "jazz piano trio with upright bass", + "ambient electronic music for relaxation", + "rock guitar riff with drums", + "classical piano sonata in minor key", + "tropical house with steel drums", + ] + + while True: + try: + prompt = input(f"\n[{duration}s] Enter prompt: ").strip() + except (EOFError, KeyboardInterrupt): + print("\nExiting...") + break + + if not prompt: + continue + + if prompt.lower() in (":q", ":quit", "quit", "exit"): + print("Exiting...") + break + + if prompt.lower() in (":h", ":help", "help"): + print("\nExample prompts:") + for i, ex in enumerate(example_prompts, 1): + print(f" {i}. {ex}") + continue + + if prompt.startswith(":d "): + try: + duration = int(prompt[3:].strip()) + duration = max(1, min(30, duration)) # Clamp to 1-30 + print(f"Duration set to {duration}s") + except ValueError: + print("Invalid duration. Use ':d ' e.g., ':d 15'") + continue + + # Check if user entered a number to use example prompt + if prompt.isdigit(): + idx = int(prompt) - 1 + if 0 <= idx < len(example_prompts): + prompt = example_prompts[idx] + print(f"Using: {prompt}") + else: + print(f"Invalid number. Enter 1-{len(example_prompts)}") + continue + + try: + generate_music(prompt, model, processor, duration_seconds=duration) + except (RuntimeError, ValueError, OSError) as e: + print(f"Error generating music: {e}") + + +def main() -> None: + """Main entry point.""" + parser = argparse.ArgumentParser( + description="Generate music from text prompts using MusicGen", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s "upbeat electronic dance music" + %(prog)s --duration 20 "calm piano melody" + %(prog)s --model small "jazz guitar solo" + %(prog)s --interactive + +Model sizes: + small - ~500MB, fastest, lower quality + medium - ~3.3GB, good balance (default) + large - ~6.5GB, best quality, needs 16GB+ VRAM + """, + ) + + parser.add_argument( + "prompt", + nargs="?", + help="Text description of music to generate", + ) + parser.add_argument( + "-d", + "--duration", + type=int, + default=10, + help="Duration in seconds (default: 10, max recommended: 30)", + ) + parser.add_argument( + "-m", + "--model", + choices=["small", "medium", "large"], + default="medium", + help="Model size (default: medium)", + ) + parser.add_argument( + "-i", + "--interactive", + action="store_true", + help="Run in interactive mode", + ) + parser.add_argument( + "-o", + "--output", + type=Path, + help="Output directory (default: ./output)", + ) + + args = parser.parse_args() + + if not args.prompt and not args.interactive: + parser.print_help() + print("\nError: Either provide a prompt or use --interactive mode") + sys.exit(1) + + # Check dependencies + if not check_dependencies(): + sys.exit(1) + + # Load model + model, processor = load_model(args.model) + + if args.interactive: + interactive_mode(model, processor) + else: + generate_music( + args.prompt, + model, + processor, + duration_seconds=args.duration, + output_dir=args.output, + ) + + +if __name__ == "__main__": + main() diff --git a/python_pkg/music_gen/setup.sh b/python_pkg/music_gen/setup.sh new file mode 100755 index 0000000..3958268 --- /dev/null +++ b/python_pkg/music_gen/setup.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# Setup script for local AI music generation using MusicGen +# Run this script to install all dependencies + +set -e + +echo "========================================" +echo " MusicGen Local Setup" +echo "========================================" + +# Check Python version +python_version=$(python3 --version 2>&1 | grep -oE '[0-9]+\.[0-9]+') +echo "Python version: $python_version" + +# Detect if we're in a virtual environment +if [ -z "$VIRTUAL_ENV" ]; then + echo "" + echo "No virtual environment detected." + echo "Creating one in ./venv..." + python3 -m venv venv + source venv/bin/activate + echo "Virtual environment activated: $VIRTUAL_ENV" +else + echo "Using existing virtual environment: $VIRTUAL_ENV" +fi + +# Upgrade pip +echo "" +echo "Upgrading pip..." +pip install --upgrade pip + +# Detect GPU +echo "" +echo "Detecting GPU..." + +if command -v nvidia-smi &> /dev/null; then + echo "NVIDIA GPU detected!" + nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || true + echo "" + echo "Installing PyTorch with CUDA support..." + pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121 +elif [[ "$(uname)" == "Darwin" ]] && [[ "$(uname -m)" == "arm64" ]]; then + echo "Apple Silicon detected!" + echo "Installing PyTorch with MPS support..." + pip install torch torchaudio +else + echo "No GPU detected, using CPU (generation will be slower)" + pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu +fi + +# Install transformers and other dependencies +echo "" +echo "Installing transformers and other dependencies..." +pip install transformers scipy + +# Create output directory +mkdir -p output + +echo "" +echo "========================================" +echo " Setup Complete!" +echo "========================================" +echo "" +echo "To activate the virtual environment:" +echo " source venv/bin/activate" +echo "" +echo "Usage examples:" +echo " python music_generator.py 'upbeat electronic dance music'" +echo " python music_generator.py --duration 15 'calm acoustic guitar'" +echo " python music_generator.py --interactive" +echo "" +echo "Model will be downloaded on first run (~3.3GB for medium model)" +echo ""