import ray
from ray import tune
from ray.rllib.agents.ppo import PPOTrainer
from metadrive import MultiAgentTIntersectionEnv
import random

# Bring up the Ray runtime; with ignore_reinit_error set, calling init again
# in the same process does not raise.
ray.init(
    ignore_reinit_error=True,
)

# Custom environment that re-selects a map name at the start of every episode
class MultiMapEnv(MultiAgentTIntersectionEnv):
    """Multi-agent environment that writes a randomly chosen map name on reset.

    On every ``reset`` one entry of ``self.maps`` is stored under
    ``self.config["map"]`` before the parent environment is reset.

    NOTE(review): whether the parent environment actually reads
    ``config["map"]`` on reset, and whether it accepts these particular map
    names, is not visible from this file -- confirm against MetaDrive.
    """

    def __init__(self, config):
        """Record the candidate map names, then build the parent environment.

        Args:
            config: Environment configuration, forwarded unchanged to the
                parent constructor.
        """
        # Candidate map names; assigned before the parent constructor runs.
        self.maps = ["TIntersection", "Roundabout", "Straight"]
        super().__init__(config)

    def reset(self, *args, **kwargs):
        """Pick a random map name for the new episode and reset the parent.

        Args:
            *args: Positional arguments forwarded to the parent ``reset``.
            **kwargs: Keyword arguments forwarded to the parent ``reset``
                (for example a seed), so this override does not narrow the
                parent's signature.

        Returns:
            Whatever the parent ``reset`` returns.
        """
        self.config["map"] = random.choice(self.maps)
        return super().reset(*args, **kwargs)

# Multi-agent configuration with two independent policies
def _policy_for_agent(agent_id, *args, **kwargs):
    """Return the ID of the policy that should control ``agent_id``.

    Agents are split between the two policies by the integer at the end of
    their ID: an even index goes to ``"policy_1"``, an odd index to
    ``"policy_2"``. The literal ID ``"agent_1"`` is still mapped to
    ``"policy_1"`` and IDs without a trailing index fall back to
    ``"policy_2"``, exactly as the previous exact-match rule did.

    NOTE(review): MetaDrive multi-agent environments appear to name agents
    ``"agent0"``, ``"agent1"``, ... (never ``"agent_1"``), which would have
    left ``"policy_1"`` without any agent under the exact-match rule --
    confirm the ID format against the environment.

    Args:
        agent_id: Agent identifier supplied by RLlib.
        *args: Extra positional arguments RLlib may pass (e.g. the episode);
            ignored.
        **kwargs: Extra keyword arguments RLlib may pass (e.g. the worker);
            ignored.

    Returns:
        ``"policy_1"`` or ``"policy_2"``.
    """
    name = str(agent_id)
    # Whatever rstrip removed from the end is the numeric agent index.
    index_digits = name[len(name.rstrip("0123456789")):]
    if agent_id == "agent_1" or (index_digits and int(index_digits) % 2 == 0):
        return "policy_1"
    return "policy_2"


config = {
    "env": MultiMapEnv,
    "env_config": {
        "num_agents": 2,           # Set to 2 agents for this multi-agent scenario
    },
    "framework": "torch",          # Use PyTorch as the backend
    "num_workers": 1,              # Set to 1 worker for simplicity
    "multiagent": {
        # RLlib rejects a bare dict as a policy spec; each value must be a
        # (policy_cls, obs_space, action_space, config) tuple. The None
        # entries leave the policy class and the spaces to RLlib.
        "policies": {
            "policy_1": (None, None, None, {}),  # First policy
            "policy_2": (None, None, None, {}),  # Second policy
        },
        "policy_mapping_fn": _policy_for_agent,
    },
}

# Build the PPO trainer, run the training iterations, and always release
# resources afterwards -- even if construction or training raises.
try:
    trainer = PPOTrainer(env=MultiMapEnv, config=config)
    try:
        # Training loop
        print("Starting training for two agents across multiple maps...")
        for i in range(10):  # Number of training iterations
            result = trainer.train()
            print(f"Iteration {i + 1}: reward = {result['episode_reward_mean']}")
    finally:
        # stop() is the public teardown entry point; it calls cleanup() itself.
        trainer.stop()
finally:
    # Shut the Ray runtime down whether or not the trainer was ever created.
    ray.shutdown()