---
# Unity ML-Agents trainer configuration for the "MuscleTrack2" behavior (PPO).
behaviors:
  MuscleTrack2:
    trainer_type: ppo

    hyperparameters:
      # Hyperparameters common to PPO and SAC
      batch_size: 50
      buffer_size: 10240
      learning_rate: 3.0e-4
      learning_rate_schedule: linear

      # PPO-specific hyperparameters
      beta: 5.0e-2
      epsilon: 0.2
      lambd: 0.95
      num_epoch: 3

    # Configuration of the neural network (common to PPO/SAC)
    network_settings:
      vis_encoder_type: simple
      normalize: false
      hidden_units: 128
      num_layers: 2

    # Trainer configurations common to all trainers
    # Was "2.0e7": PyYAML only recognizes floats with a signed exponent
    # (e.g. 2.0e+7), so "2.0e7" loads as a string. A plain int is unambiguous.
    max_steps: 20000000
    time_horizon: 64
    summary_freq: 10000
    keep_checkpoints: 5
    checkpoint_interval: 500000
    threaded: false
    init_path: null

    # # behavior cloning
    # behavioral_cloning:
    #   demo_path: 'c:\Users\noahk\Documents\Unity projects\Racesm\Assets\Demonstrations\BensonV3M.demo'
    #   strength: 0.5
    #   # steps: 150000
    #   # batch_size: 512
    #   # num_epoch: 3
    #   # samples_per_update: 0

    reward_signals:
      # environment reward (default)
      extrinsic:
        strength: 1.0
        gamma: 0.99

      # NOTE(review): if re-enabled, self_play belongs at the trainer level
      # (sibling of reward_signals), not inside it — confirm against the
      # ML-Agents training-configuration docs.
      # self_play:
      #   window: 3
      #   play_against_latest_model_ratio: 0.5
      #   save_steps: 100000
      #   swap_steps: 200000
      #   team_change: 1000000

      # # curiosity module
      # curiosity:
      #   strength: 0.02
      #   gamma: 0.99
      #   encoding_size: 256
      #   learning_rate: 3.0e-4

      # # GAIL
      # gail:
      #   strength: 0.5
      #   # gamma: 0.99
      #   # encoding_size: 128
      #   demo_path: 'c:\Users\noahk\Documents\Unity projects\Racesm\Assets\Demonstrations\BensonV3M.demo'
      #   # learning_rate: 3.0e-4
      #   # use_actions: false
      #   # use_vail: false