---
# PPO training hyperparameters.
# NOTE(review): keys match a TRL `PPOConfig` dump (model gpt2, imdb sentiment
# reward) — confirm against the consuming trainer before renaming anything.
# Original file had the whole mapping collapsed onto one line, which is not
# valid YAML; restored here as one key per line with values unchanged.
adap_kl_ctrl: true
backward_batch_size: 32
batch_size: 32
cliprange: 0.2
cliprange_value: 0.2
compare_steps: 1
early_stopping: false
exp_name: a2b7701778d64d8395823410aec38be2
forward_batch_size: null
gamma: 1
global_backward_batch_size: 32
global_batch_size: 32
gradient_accumulation_steps: 1
gradient_checkpointing: false
horizon: 10000
init_kl_coef: 0.2
is_encoder_decoder: false
is_peft_model: false
kl_penalty: kl
lam: 0.95
learning_rate: 0.001
log_with: tensorboard
max_grad_norm: null
mini_batch_size: 32
model_name: gpt2
optimize_cuda_cache: null
optimize_device_cache: false
ppo_epochs: 4
# Slash in the key is intentional (flattened nested option) — leave as-is.
project_kwargs/logging_dir: ./data/logs
query_dataset: imdb
ratio_threshold: 10.0
remove_unused_columns: true
# Quoted: value contains colons (pipeline-task:model-id form).
reward_model: "sentiment-analysis:lvwerra/distilbert-imdb"
score_clip: null
seed: 42
steps: 20000
target: 6
target_kl: 1
task_name: a2b7701778d64d8395823410aec38be2
total_ppo_epochs: 625
tracker_project_name: trl
use_score_norm: false
use_score_scaling: false
vf_coef: 0.1
whiten_rewards: false
world_size: 1