behaviors:
  BensonImitationV3:
    trainer_type: ppo

    hyperparameters:
      # Hyperparameters common to PPO and SAC
      batch_size: 50
      buffer_size: 15000
      learning_rate: 3.0e-4
      learning_rate_schedule: linear

      # PPO-specific hyperparameters (not used when trainer_type is sac)
      beta: 5.0e-2
      epsilon: 0.1
      lambd: 0.95
      num_epoch: 3

    # Configuration of the neural network (common to PPO/SAC)
    network_settings:
      vis_encode_type: simple
      normalize: false
      hidden_units: 128
      num_layers: 2

    # Trainer configurations common to all trainers
    max_steps: 2.4e5
    time_horizon: 64
    summary_freq: 9000
    keep_checkpoints: 5
    checkpoint_interval: 100000
    threaded: true
    init_path: null

    # behavioral cloning (supervised pretraining from the recorded demo below)
    behavioral_cloning:
      demo_path: 'c:\Users\noahk\Documents\Unity projects\Racesm\Assets\Demonstrations\BensonV3M.demo'
      strength: 0.5
      # steps: 150000
      # batch_size: 512
      # num_epoch: 3
      # samples_per_update: 0

    reward_signals:
      # environment reward (default)
      extrinsic:
        strength: 1.0
        gamma: 0.99

      # curiosity module
      curiosity:
        strength: 0.02
        gamma: 0.99
        encoding_size: 256
        learning_rate: 3.0e-4

      # GAIL
      gail:
        strength: 0.5
        # gamma: 0.99
        # encoding_size: 128
        demo_path: 'c:\Users\noahk\Documents\Unity projects\Racesm\Assets\Demonstrations\BensonV3M.demo'
        # learning_rate: 3.0e-4
        # use_actions: false
        # use_vail: false
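
# Usage note: a typical invocation for this config, assuming it is saved as
# config/BensonImitationV3.yaml (the filename is illustrative, not from the
# original source):
#
#   mlagents-learn config/BensonImitationV3.yaml --run-id=BensonImitationV3
#
# The behavior name "BensonImitationV3" must match the Behavior Name set on the
# agent's Behavior Parameters component in the Unity scene, and the trainer will
# write checkpoints and summaries under results/<run-id>/.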