HH ^uA brain.Event:2R. ,tensorboard.summary.writer.event_file_writer"x= ruA*  objective/kl+'F (ruA*  objective/kl_coefL>$'F ruA*  objective/entropy}B /m]P ruA*" ppo/mean_non_score_reward,-%6 ruA*  ppo/mean_scores9[= $B+M ruA*  ppo/std_scores!>=QC-'Kkt0_ ruA*# ! ppo/policy/advantages_mean432{thc&sO ruA*  ppo/returns/mean_?>%6 ruA*  ppo/returns/var>ߞH#wC %ruA*  ppo/val/vpred-_>J~&#wC ruA*  ppo/val/erroroAv|&sO lruA*  ppo/val/clipfracgf>"x= ruA*  ppo/val/meanS@6 !{ ruA*  ppo/val/varg?e+K KruA*  ppo/val/var_explained-'F ruA*  ppo/learning_rateo:+K ruA*  time/ppo/forward_passPa>nr.W 0ruA*!  time/ppo/compute_rewardsذ;N{1 ruA*$ " time/ppo/compute_advantages;,E jruA*  time/ppo/optimize_step–?SFm)7_ ruA*  time/ppo/calc_stats3O>K9$B+M ruA*  time/ppo/total?%6 ?ruA*  env/reward_mean9[=`$B+M ruA*  env/reward_std!>=Mz"x= a#~uA*  objective/klxB.'F #~uA*  objective/kl_coef?L>\K'F #~uA*  objective/entropyoBXuj/m]P $~uA*" ppo/mean_non_score_reward.85%6 "$~uA*  ppo/mean_scores4 >Y'$B+M 4$~uA*  ppo/std_scoresNID>mVe-@YU )7_ (~uA*  ppo/policy/approxkl3?u)7_ +*~uA*  ppo/policy/policykl"f)7_ q+~uA*  ppo/policy/clipfracff>'%{0_ `,~uA*# ! ppo/policy/advantages_mean2G n&sO %-~uA*  ppo/returns/mean%l+%6 -~uA*  ppo/returns/varB0D#wC .~uA*  ppo/val/vpred饫#wC ;/~uA*  ppo/val/error/>Bل&sO /~uA*  ppo/val/clipfrac>?=! "x= 0~uA*  ppo/val/mean'@z !{ D1~uA*  ppo/val/varA4e>q+K 1~uA*  ppo/val/var_explainedd%V'F 2~uA*  ppo/learning_rateo:%}+K N3~uA*  time/ppo/forward_pass|b>!j.W 4~uA*!  time/ppo/compute_rewards@:WE1 4~uA*$ " time/ppo/compute_advantages{;ݨQ,E `5~uA*  time/ppo/optimize_step?H)7_ 6~uA*  time/ppo/calc_statsОN>TD$B+M 6~uA*  time/ppo/total^?%6 ^7~uA*  env/reward_mean4 >$B+M 8~uA*  env/reward_stdNID>|"x= RuA*  objective/klKEBڭ'F SuA*  objective/kl_coefL>|Z'F 9SuA*  objective/entropy֢B,x/m]P OSuA*" ppo/mean_non_score_reward҉j+%6 bSuA*  ppo/mean_scoreszy[>}e$B+M tSuA*  ppo/std_scoresLd>k-CUn0_ \uA*# ! ppo/policy/advantages_mean3327S&sO \uA*  ppo/returns/mean%*.%6 \uA*  ppo/returns/var@W|#wC ]uA*  ppo/val/vpredf>#wC (^uA*  ppo/val/errorhA̺&sO ^uA*  ppo/val/clipfrac43>~|"x= i_uA*  ppo/val/mean~C!{ `uA*  ppo/val/var22>+K `uA*  ppo/val/var_explained㪿$('F [auA*  ppo/learning_rateo:ܷ+K auA*  time/ppo/forward_pass=.W buA*!  time/ppo/compute_rewardsp:Al1 FcuA*$ " time/ppo/compute_advantages;|B,E cuA*  time/ppo/optimize_stepHx&?z)7_ duA*  time/ppo/calc_statsY=$B+M euA*  time/ppo/totalc?r%6 euA*  env/reward_meanzy[>J-$B+M ]fuA*  env/reward_stdLd>drw"x= cuA*  objective/kl*$BT'F KcuA*  objective/kl_coefUL>Bv'F gcuA*  objective/entropyB;D,/m]P {cuA*" ppo/mean_non_score_rewardVDҾ З%6 cuA*  ppo/mean_scoresw>7$B+M cuA*  ppo/std_scores`>-U(pJ cuA*  ppo/policy/entropy|@)7_ cuA*  ppo/policy/approxklT>K*)7_ !cuA*  ppo/policy/policyklM=)7_ !cuA*  ppo/policy/clipfrac>aKƺ0_ !cuA*# ! ppo/policy/advantages_mean2T&Η&sO "cuA*  ppo/returns/meanƏVr23%6 $cuA*  ppo/returns/var#@n#wC $cuA*  ppo/val/vpredN#wC <&cuA*  ppo/val/error(@pf&sO 'cuA*  ppo/val/clipfrac?z"x= 'cuA*  ppo/val/meanm:!{ c(cuA*  ppo/val/var8=/s +K )cuA*  ppo/val/var_explainedT;p'F )cuA*  ppo/learning_rateo:laE+K O*cuA*  time/ppo/forward_passK>Q.W *cuA*!  time/ppo/compute_rewards:C1 +cuA*$ " time/ppo/compute_advantagese;-R,E 5,cuA*  time/ppo/optimize_step?M)7_ ,cuA*  time/ppo/calc_stats ?>h6'Z$B+M j-cuA*  time/ppo/totalĶ?K %6 .cuA*  env/reward_meanw>Qi$B+M .cuA*  env/reward_std`>"x= [9uA*  objective/kl!B<;'F 9uA*  objective/kl_coefM>D'F F:uA*  objective/entropywB%O/m]P \:uA*" ppo/mean_non_score_rewardhϾH1%6 o:uA*  ppo/mean_scorese>?m$B+M :uA*  ppo/std_scores? >gX-](pJ CuA*  ppo/policy/entropy>@)7_ #CuA*  ppo/policy/approxkl=#c)7_ 6CuA*  ppo/policy/policykl$=a)7_ ICuA*  ppo/policy/clipfrac33>i0_ FHuA*# ! ppo/policy/advantages_mean2jewv&sO rHuA*  ppo/returns/meanJu%6 HuA*  ppo/returns/varQ@^#wC HuA*  ppo/val/vpred51 #wC HuA*  ppo/val/error0@&sO HuA*  ppo/val/clipfrac̲>?Ӌ["x= HuA*  ppo/val/meanxv%-!{ (JuA*  ppo/val/var5=ou+K KuA*  ppo/val/var_explained/C'F KuA*  ppo/learning_rateo:+K LuA*  time/ppo/forward_passeM><.W %MuA*!  time/ppo/compute_rewards:k;1 MuA*$ " time/ppo/compute_advantagesi;,E [NuA*  time/ppo/optimize_stepvć?[zM%)7_ NuA*  time/ppo/calc_stats?>:O_$B+M OuA*  time/ppo/totalF?%6 !PuA*  env/reward_meane>)}$B+M PuA*  env/reward_std? >"x= <uA*  objective/kl:,B`N'F uA*  objective/kl_coef1M>9'F (uA*  objective/entropyB)/m]P ?uA*" ppo/mean_non_score_reward6ݾS;T%6 SuA*  ppo/mean_scores`$m>$B+M huA*  ppo/std_scores@s>c-X&(pJ uA*  ppo/policy/entropy^*@/)7_ uA*  ppo/policy/approxkl=͸!)7_ uA*  ppo/policy/policykl(m=- )7_ uA*  ppo/policy/clipfrac33>K5>%6 uA*  ppo/returns/var@#wC uA*  ppo/val/vpredH |#wC uA*  ppo/val/error@gn'9&sO uA*  ppo/val/clipfrac33>"x= IuA*  ppo/val/meanz.W uA*!  time/ppo/compute_rewards:=]1 uA*$ " time/ppo/compute_advantagesLh;'S,E uA*  time/ppo/optimize_step?)7_ suA*  time/ppo/calc_stats0:>5 $B+M uA*  time/ppo/total?4%6 $uA*  env/reward_mean`$m>.d$B+M |uA*  env/reward_std@s>YH'"x= Q:vA*  objective/kl(.B'F :vA*  objective/kl_coef SM>('F :vA*  objective/entropyBO/m]P :vA*" ppo/mean_non_score_rewardq߾jU%6 :vA*  ppo/mean_scores.O>A$B+M :vA*  ppo/std_scores>XFn-s7(pJ %:vA*  ppo/policy/entropy@є)7_ ;:vA*  ppo/policy/approxkl>Ҧ)7_ f:vA*  ppo/policy/policyklq=N)7_ }:vA*  ppo/policy/clipfrac>30_ :vA*# ! ppo/policy/advantages_mean43305&sO :vA*  ppo/returns/mean2̅-z"%6 :vA*  ppo/returns/varV@R#wC :vA*  ppo/val/vpredsU:#wC :vA*  ppo/val/errorT@\f&sO :vA*  ppo/val/clipfrac>4]"x= :vA*  ppo/val/meanZlH7W!{ T:vA*  ppo/val/varV >+K :vA*  ppo/val/var_explained<>p-'F :vA*  ppo/learning_rateo: I+K Y:vA*  time/ppo/forward_pass0K>K.W :vA*!  time/ppo/compute_rewards:a1 :vA*$ " time/ppo/compute_advantagesb;L,E T:vA*  time/ppo/optimize_stepĆ?)7_ :vA*  time/ppo/calc_stats<>$B+M :vA*  time/ppo/totalx?gd%6 K:vA*  env/reward_mean.O>zl$B+M :vA*  env/reward_std>/"x= f}EvA*  objective/klt@Bl$j'F }EvA*  objective/kl_coeftM>s'F }EvA*  objective/entropyFB3_[m/m]P ~EvA*" ppo/mean_non_score_reward!j9%%6 ~EvA*  ppo/mean_scores[>tFt$B+M *~EvA*  ppo/std_scoresm>6"-_LRF(pJ EvA*  ppo/policy/entropyf@|0;)7_ 7EvA*  ppo/policy/approxkl>)7_ EvA*  ppo/policy/policyklZF=g )7_ EvA*  ppo/policy/clipfrac>K&0_ ؇EvA*# ! ppo/policy/advantages_mean,1G3&sO EvA*  ppo/returns/mean@Q{Z%6 EvA*  ppo/returns/var@Bע#wC EvA*  ppo/val/vpredz!L#wC #EvA*  ppo/val/error@Sy&sO 3EvA*  ppo/val/clipfrac>"x= *EvA*  ppo/val/meanRrq#!{ SEvA*  ppo/val/vară>5~+K hEvA*  ppo/val/var_explainedRS>R'F yEvA*  ppo/learning_rateo:H+K EvA*  time/ppo/forward_pass`K>h"R.W EvA*!  time/ppo/compute_rewards:m1 EvA*$ " time/ppo/compute_advantagesa;~\/,E EvA*  time/ppo/optimize_stepp?{)7_ ˌEvA*  time/ppo/calc_stats=>vK$B+M یEvA*  time/ppo/total?)-2%6 EvA*  env/reward_mean[>ǽ$B+M ُEvA*  env/reward_stdm>U"x= hZnvA *  objective/klXEB-ML'F ZnvA *  objective/kl_coefmM>$'F ZnvA *  objective/entropyIbB>c/m]P [nvA *" ppo/mean_non_score_rewardRa%6 [nvA *  ppo/mean_scores>>G9$B+M +[nvA *  ppo/std_scores>Y-z?$B+M \nvA *  ppo/loss/valueX|8@*F$B+M ^nvA *  ppo/loss/totalۍU>x^i(pJ anvA *  ppo/policy/entropy9Z@VX1)7_ anvA *  ppo/policy/approxklSc>r F)7_ :bnvA *  ppo/policy/policykl$>>u)7_ dnvA *  ppo/policy/clipfrac>=ѫ.0_ dnvA *# ! ppo/policy/advantages_meanMt&sO dnvA *  ppo/returns/meanŠ&%6 envA *  ppo/returns/varZ Ac#wC envA *  ppo/val/vpred'm@#wC %envA *  ppo/val/error>@XyK&sO 7envA *  ppo/val/clipfrac33>aiR"x= GenvA *  ppo/val/mean82!{ ZenvA *  ppo/val/var>i2+K lenvA *  ppo/val/var_explained>>41'F envA *  ppo/learning_rateo:w+K (fnvA *  time/ppo/forward_pass6>.W fnvA *!  time/ppo/compute_rewardsP: U1 fnvA *$ " time/ppo/compute_advantages0E;,E ;gnvA *  time/ppo/optimize_stepԍo?6)7_ gnvA *  time/ppo/calc_stats@;&>zn$B+M gnvA *  time/ppo/total?+Ib%6 ChnvA *  env/reward_mean>>jE $B+M hnvA *  env/reward_std>pf8L"x= ayvA *  objective/kl'B,"}'F yvA *  objective/kl_coefM>8'F yvA *  objective/entropykB:NZ'$$B+M #yvA *  ppo/std_scores˗>+̰-Tj"x= yvA *  ppo/val/mean3%!{ yvA *  ppo/val/varc?ɾ+K yvA *  ppo/val/var_explained ?i'F yvA *  ppo/learning_rateo:+K yvA *  time/ppo/forward_pass0@8>?.W yvA *!  time/ppo/compute_rewards:Od1 yvA *$ " time/ppo/compute_advantagesO;,E #yvA *  time/ppo/optimize_stepbo?tBdI)7_ 5yvA *  time/ppo/calc_stats&>L5 $B+M yvA *  time/ppo/total04?+%6 yvA *  env/reward_mean>g $B+M 8yvA *  env/reward_std˗>o} "x= dvA *  objective/klBABz'F evA *  objective/kl_coefM>B'F evA *  objective/entropy?Ca s/m]P 1evA *" ppo/mean_non_score_reward@GL6%6 DevA *  ppo/mean_scores&>]$y$B+M XevA *  ppo/std_scores9(>f-w(pJ &lvA *  ppo/policy/entropywκ@Mm)7_ >lvA *  ppo/policy/approxklO?o)7_ RlvA *  ppo/policy/policyklj>ZY)7_ clvA *  ppo/policy/clipfrac>ݾz0_ wlvA *# ! ppo/policy/advantages_meangf1SQ1&sO lvA *  ppo/returns/meanH6e%6 lvA *  ppo/returns/varA#wC lvA *  ppo/val/vpred¸#wC lvA *  ppo/val/error@3&sO mvA *  ppo/val/clipfracff>p"x= imvA *  ppo/val/mean}]!{ mvA *  ppo/val/varP"@Q+K nvA *  ppo/val/var_explainedf31?i5'F nvA *  ppo/learning_rateo:& +K nvA *  time/ppo/forward_pass`\8> C.W 3ovA *!  time/ppo/compute_rewardsH: 1 ovA *$ " time/ppo/compute_advantagesA;-,E ovA *  time/ppo/optimize_stepo? R)7_ (pvA *  time/ppo/calc_statsp&>G$B+M xpvA *  time/ppo/totalJC?:T%6 pvA *  env/reward_mean&> $B+M qvA *  env/reward_std9(>h"x= ZvA *  objective/klO6B'F zZvA *  objective/kl_coefM>ɧ˾'F ZvA *  objective/entropyB/V/m]P ZvA *" ppo/mean_non_score_reward4%6 ZvA *  ppo/mean_scores>~$B+M ZvA *  ppo/std_scoresR>N-/m]P [vA *" tokens/responses_len_meanAl.W [vA *!  tokens/responses_len_stdK2G%6 "[vA *  ppo/loss/policyih$B+M \vA *  ppo/loss/value(5@>:$B+M ]vA *  ppo/loss/totalx%Y>JJ(pJ ]vA *  ppo/policy/entropymг@=v)7_ ]vA *  ppo/policy/approxkl8—>)7_ \^vA *  ppo/policy/policykl.>^)7_ ^vA *  ppo/policy/clipfrac>DkC0_ _vA *# ! ppo/policy/advantages_mean43*}&sO w_vA *  ppo/returns/means%6 _vA *  ppo/returns/var>oAy`$#wC 1`vA *  ppo/val/vpred[Ƥѧ|#wC `vA *  ppo/val/error|@&sO `vA *  ppo/val/clipfrac43>o"x= 2avA *  ppo/val/meanFؼ!{ avA *  ppo/val/var@[+K avA *  ppo/val/var_explained||/?2'F 7bvA *  ppo/learning_rateo:lRx+K bvA *  time/ppo/forward_pass\[>XF.W bvA *!  time/ppo/compute_rewards:&A1 AcvA *$ " time/ppo/compute_advantagesg;^,E cvA *  time/ppo/optimize_step?)7_ cvA *  time/ppo/calc_stats`E>R$B+M AdvA *  time/ppo/total־?%6 dvA *  env/reward_mean>=$B+M dvA *  env/reward_stdR>X"x= vA *  objective/kl=Bp'F vA *  objective/kl_coefJN> 5'F vA *  objective/entropy~Bs/m]P 3vA *" ppo/mean_non_score_reward&Vr%6 GvA *  ppo/mean_scores= $B+M XvA *  ppo/std_scores<=*;#-=z(pJ 'vA *  ppo/policy/entropy@LG)7_ vA *  ppo/policy/approxklgy?Jc)7_ vA *  ppo/policy/policyklܦ>1c)7_ cvA *  ppo/policy/clipfrac>{lm0_ vA *# ! ppo/policy/advantages_mean/ed+&sO vA *  ppo/returns/mean'8?V%6 |vA *  ppo/returns/var A^FH#wC vA *  ppo/val/vpred{#wC .vA *  ppo/val/errorK@i<&sO vA *  ppo/val/clipfracgf>cD"x= vA *  ppo/val/meane%!{ /vA *  ppo/val/var At7+K vA *  ppo/val/var_explained>? "'F vA *  ppo/learning_rateo:軄+K 8vA *  time/ppo/forward_pass[>>0S.W vA *!  time/ppo/compute_rewards:h1 vA *$ " time/ppo/compute_advantagesTq;,E YvA *  time/ppo/optimize_stepDK?=)7_ vA *  time/ppo/calc_statsG>WTT$B+M vA *  time/ppo/totalm?%6 RvA *  env/reward_mean=c$B+M vA *  env/reward_std<=L )"x= vA*  objective/klBs'F vA*  objective/kl_coef?N>cE'F 5vA*  objective/entropywB}7/m]P KvA*" ppo/mean_non_score_reward6f%6 _vA*  ppo/mean_scores ]>1$B+M rvA*  ppo/std_scores>@-C(pJ vA*  ppo/policy/entropys @`)7_ vA*  ppo/policy/approxkl?|@Y)7_ -vA*  ppo/policy/policykl>PbY)7_ =vA*  ppo/policy/clipfracff>0_ OvA*# ! ppo/policy/advantages_mean3331Ii&sO dvA*  ppo/returns/mean~%6 vvA*  ppo/returns/varnA #wC vA*  ppo/val/vpredfiȨ#wC vA*  ppo/val/errorȏ@=&sO vA*  ppo/val/clipfrac43>/}D}"x= .vA*  ppo/val/mean 3!{ [vA*  ppo/val/var6A)+p+K vA*  ppo/val/var_explainedJ?M݃'F vA*  ppo/learning_rateo:^+K vA*  time/ppo/forward_passoY>SKc.W vA*!  time/ppo/compute_rewards:-]1 vA*$ " time/ppo/compute_advantageso;n,E *vA*  time/ppo/optimize_step?D)7_ ;vA*  time/ppo/calc_stats0G>,$B+M JvA*  time/ppo/totalF?[X%6 vA*  env/reward_mean ]>>$B+M vA*  env/reward_std>1"x= 7LvA*  objective/klBpJF'F LvA*  objective/kl_coef`N>͚'F LvA*  objective/entropyޝB@XH/m]P LvA*" ppo/mean_non_score_rewardZf!%6 LvA*  ppo/mean_scoresM0=$B+M LvA*  ppo/std_scoresj=,-׺y(pJ ?LvA*  ppo/policy/entropy@m)7_ RLvA*  ppo/policy/approxkl c>)7_ bLvA*  ppo/policy/policykl@}>fCS)7_ rLvA*  ppo/policy/clipfrac>A00_ LvA*# ! ppo/policy/advantages_meanܱ;3A&sO LvA*  ppo/returns/mean߁%6 LvA*  ppo/returns/varA6#wC LvA*  ppo/val/vpred 7q#wC LvA*  ppo/val/errorsa@b&sO LvA*  ppo/val/clipfrac43>"x= LvA*  ppo/val/meandĩ!{ LvA*  ppo/val/var`hA#-+K LvA*  ppo/val/var_explainedxS?+p'F tLvA*  ppo/learning_rateo:;y+K LvA*  time/ppo/forward_passY]>j<.W $LvA*!  time/ppo/compute_rewards(:++W1 zLvA*$ " time/ppo/compute_advantagesl;GCl ,E LvA*  time/ppo/optimize_step֌?Ut)7_ LvA*  time/ppo/calc_stats@D>X$B+M LvA*  time/ppo/totalVy?6M%6 LvA*  env/reward_meanM0=$B+M +LvA*  env/reward_stdj=8 s"x= vA*  objective/kl>B'7'F $vA*  objective/kl_coefN>|N$'F vA*  objective/entropyyB/m]P vA*" ppo/mean_non_score_rewardlFT%6 ̪vA*  ppo/mean_scores_0>Z $B+M ުvA*  ppo/std_scorese>KL-bN (pJ vA*  ppo/policy/entropy @)7_ NvA*  ppo/policy/approxkl>)7_ wvA*  ppo/policy/policykl>b)7_ vA*  ppo/policy/clipfrac>h}0_ vA*# ! ppo/policy/advantages_mean &sO "vA*  ppo/returns/meanh>aj%6 LvA*  ppo/returns/varA7<#wC vA*  ppo/val/vpredw#wC vA*  ppo/val/errorC a@3-|&sO ҽvA*  ppo/val/clipfracff>bn"x= vA*  ppo/val/meanH7!{ vA*  ppo/val/varꑤA+K vA*  ppo/val/var_explained4d_? 'F vA*  ppo/learning_rateo:U+K vA*  time/ppo/forward_passPfZ>ӓ}s.W cvA*!  time/ppo/compute_rewards:_O)1 vA*$ " time/ppo/compute_advantagesm;$B+M vA*  time/ppo/totalRe?&%6 vA*  env/reward_mean_0>!)$B+M *vA*  env/reward_stde>1"x= WvA*  objective/klB'F nWvA*  objective/kl_coefN>׬d'F WvA*  objective/entropyB1r/m]P WvA*" ppo/mean_non_score_rewardKmrtu%6 WvA*  ppo/mean_scorese>$B+M WvA*  ppo/std_scoresC>>}.-Id])7_ ^vA*  ppo/policy/clipfrac>80_ ^vA*# ! ppo/policy/advantages_mean43&sO ^vA*  ppo/returns/mean%6 ^vA*  ppo/returns/varAmU#wC c`vA*  ppo/val/vpredO]e#wC avA*  ppo/val/error=@J&sO avA*  ppo/val/clipfrac̎>'"x= bvA*  ppo/val/means,\&p!{ DdvA*  ppo/val/varAda+K evA*  ppo/val/var_explainede? 'F evA*  ppo/learning_rateo:M+K ^fvA*  time/ppo/forward_passPgY>9M.W gvA*!  time/ppo/compute_rewardsh:=t@1 gvA*$ " time/ppo/compute_advantagess;v,E !;$B+M pivA*  time/ppo/totalva?0%6 jvA*  env/reward_meane><#$B+M jvA*  env/reward_stdC>>."x= NvA*  objective/klYFBx'F vA*  objective/kl_coef[N> 'F )vA*  objective/entropy2B*L/m]P =vA*" ppo/mean_non_score_rewardX^C6%6 OvA*  ppo/mean_scoresK;>r$B+M bvA*  ppo/std_scoresܪ>-,\=)7_ hvA*  ppo/policy/policykl >cC/=)7_ yvA*  ppo/policy/clipfracff>qY0_ vA*# ! ppo/policy/advantages_meanLc &sO vA*  ppo/returns/meanr1+{NY%6 vA*  ppo/returns/var9AC#wC vA*  ppo/val/vpred@."x= vA*  ppo/val/meanJm?!{ vA*  ppo/val/var ޺AO:+K vA*  ppo/val/var_explained}Ih?RcU'F &vA*  ppo/learning_rateo:8+K vA*  time/ppo/forward_passZ>2k\..W lvA*!  time/ppo/compute_rewardsH:>m;q1 vA*$ " time/ppo/compute_advantagesXj;ޙ,E vA*  time/ppo/optimize_step?)7_ @vA*  time/ppo/calc_statsPG>$B+M vA*  time/ppo/totalt?]%6 uvA*  env/reward_meanK;>$B+M vA*  env/reward_stdܪ>,f"x= vA*  objective/klB'F vA*  objective/kl_coef<'F vA*  objective/entropyJ̰BSڂ2/m]P 1vA*" ppo/mean_non_score_reward#eU %6 DvA*  ppo/mean_scores|=M<$B+M VvA*  ppo/std_scoresx=X,-/)7_ vA*  ppo/policy/clipfracgf>"0_ vA*# ! ppo/policy/advantages_meaňV\&sO ͎vA*  ppo/returns/means%6 ގvA*  ppo/returns/vareAr#wC vA*  ppo/val/vpreder #wC vA*  ppo/val/error8@ko&sO dvA*  ppo/val/clipfracZ>w I"x= vA*  ppo/val/mean !{ vA*  ppo/val/varEAtb+K hvA*  ppo/val/var_explainedi?N\'F vA*  ppo/learning_rateo:8dsH+K vA*  time/ppo/forward_pass@\>ۇ.W pvA*!  time/ppo/compute_rewards:>1 vA*$ " time/ppo/compute_advantages`j;Mk,E vA*  time/ppo/optimize_step,&?)7_ fvA*  time/ppo/calc_stats7F>dN$B+M vA*  time/ppo/total)?n%6 vA*  env/reward_mean|= n$B+M ^vA*  env/reward_stdx=?0"x= a wA*  objective/kltB'F ¯ wA*  objective/kl_coef" O>j'F ݯ wA*  objective/entropyBCJ/m]P wA*" ppo/mean_non_score_rewardbdF%6  wA*  ppo/mean_scores= $B+M  wA*  ppo/std_scores=h ^-:(pJ wA*  ppo/policy/entropy]@)7_ 綑 wA*  ppo/policy/approxklG?8v)7_  wA*  ppo/policy/policykl>v )7_ * wA*  ppo/policy/clipfracff>SIk0_ E wA*# ! ppo/policy/advantages_mean1z&sO v wA*  ppo/returns/meanRC%6 wA*  ppo/returns/varXA_i#wC wA*  ppo/val/vpred g#wC wA*  ppo/val/errorA:@[*o&sO ¹ wA*  ppo/val/clipfracf>WZ"x= ӹ wA*  ppo/val/mean``!{ 幑 wA*  ppo/val/varA:5f+K wA*  ppo/val/var_explaineda?:'F Z wA*  ppo/learning_rateo:>+K wA*  time/ppo/forward_passZ>#(.W wA*!  time/ppo/compute_rewards:o1 a wA*$ " time/ppo/compute_advantagesm;,E wA*  time/ppo/optimize_stepTL?:f})7_ wA*  time/ppo/calc_statsF>lg$B+M _ wA*  time/ppo/total$?}Q3%6 wA*  env/reward_mean= $B+M wA*  env/reward_std=4/"x= 4wA*  objective/kl_?BrAd'F |4wA*  objective/kl_coef,O>-{'F 4wA*  objective/entropy|Bdw/m]P 4wA*" ppo/mean_non_score_rewardI$o%6 4wA*  ppo/mean_scores >a$B+M 4wA*  ppo/std_scores$>LX-N(pJ 9wA*  ppo/policy/entropyMZ@W)7_ 9wA*  ppo/policy/approxkl>c)7_ 9wA*  ppo/policy/policykl"=fx)7_ 9wA*  ppo/policy/clipfracff>j]0_ 9wA*# ! ppo/policy/advantages_meanceJ[&sO :wA*  ppo/returns/mean c:^%6 :wA*  ppo/returns/varӚAw@#wC E=wA*  ppo/val/vpredtʅs#wC n=wA*  ppo/val/error>C @ֱ&sO =wA*  ppo/val/clipfracff>՜5"x= (AwA*  ppo/val/mean'!{ QAwA*  ppo/val/var_AY+K hAwA*  ppo/val/var_explainedb?yH'F zAwA*  ppo/learning_rateo:t'+K AwA*  time/ppo/forward_pass`/[>c5.W AwA*!  time/ppo/compute_rewards:R1 AwA*$ " time/ppo/compute_advantagesl;܁,E AwA*  time/ppo/optimize_step??w)7_ AwA*  time/ppo/calc_statsE>ܰQ$B+M AwA*  time/ppo/total?)W,+%6 7BwA*  env/reward_mean >6>u$B+M BwA*  env/reward_std$>t"x= "&wA*  objective/klGB(-m'F "&wA*  objective/kl_coefMO>7 E'F #&wA*  objective/entropySB^}@/m]P $#&wA*" ppo/mean_non_score_rewardG%6 7#&wA*  ppo/mean_scores=|$B+M H#&wA*  ppo/std_scores~=m <-')7_ '&wA*  ppo/policy/policykl3=78)7_ '&wA*  ppo/policy/clipfrac>0_ '&wA*# ! ppo/policy/advantages_meanY F &sO \+&wA*  ppo/returns/mean*P-%6 +&wA*  ppo/returns/varAb+m#wC +&wA*  ppo/val/vpredb}-j#wC +&wA*  ppo/val/error=?S&sO +&wA*  ppo/val/clipfrac43>l"x= +&wA*  ppo/val/meanl@gJ!{ +&wA*  ppo/val/var˓AМ+K +&wA*  ppo/val/var_explained.i?'F ,&wA*  ppo/learning_rateo:2+K /&wA*  time/ppo/forward_passpY>N[3@.W /&wA*!  time/ppo/compute_rewards:{1 /&wA*$ " time/ppo/compute_advantagesn;,E 0&wA*  time/ppo/optimize_step?K#%)7_ !0&wA*  time/ppo/calc_statspF>,T<$B+M 20&wA*  time/ppo/total?H2%6 E0&wA*  env/reward_mean=H`U$B+M U0&wA*  env/reward_std~=2l"x= Iq4wA*  objective/klBRR'F q4wA*  objective/kl_coefoO>p/'F q4wA*  objective/entropyaBV-/m]P q4wA*" ppo/mean_non_score_reward:B-%6 q4wA*  ppo/mean_scoresve>H!_k$B+M q4wA*  ppo/std_scores|>)G-`:%6 dq4wA*  ppo/loss/policyLɼm$B+M q4wA*  ppo/loss/value7܅?ޖ$B+M q4wA*  ppo/loss/total٣=mh(pJ q4wA*  ppo/policy/entropyl<@a2)7_ I q4wA*  ppo/policy/approxklU=:)7_ u q4wA*  ppo/policy/policyklf=)7_ q4wA*  ppo/policy/clipfrac2}>0_ q4wA*# ! ppo/policy/advantages_mean>2V\&sO q4wA*  ppo/returns/meanFG%6 q4wA*  ppo/returns/varmAnp#wC q4wA*  ppo/val/vpredRqo#wC q4wA*  ppo/val/errorjn?ƈ?&sO q4wA*  ppo/val/clipfrac?>["x=  q4wA*  ppo/val/meanco?!{ x q4wA*  ppo/val/varJ~A+K \ q4wA*  ppo/val/var_explained~i?I@'F q4wA*  ppo/learning_rateo:9+K q4wA*  time/ppo/forward_pass Z>!*.W q4wA*!  time/ppo/compute_rewards: s1 q4wA*$ " time/ppo/compute_advantagesDz;Uj!,E q4wA*  time/ppo/optimize_step`?Yu)7_ q4wA*  time/ppo/calc_stats0E>$B+M q4wA*  time/ppo/total?f+G%6 F q4wA*  env/reward_meanve>$B+M q4wA*  env/reward_std|>%z"x= ]wA*  objective/kl$BX<'F ]wA*  objective/kl_coefO>cϟ'F (]wA*  objective/entropyrXBw/m]P <]wA*" ppo/mean_non_score_rewardI<%6 M]wA*  ppo/mean_scoresꍈ>`f$B+M `]wA*  ppo/std_scores`Ca> L-BvĄ(pJ ]wA*  ppo/policy/entropy@'%;)7_ ]wA*  ppo/policy/approxklug=o8)7_ ]wA*  ppo/policy/policykls=ރ)7_ ]wA*  ppo/policy/clipfrac>]e0_ u]wA*# ! ppo/policy/advantages_mean2.&sO ,]wA*  ppo/returns/meanFtj%6 V]wA*  ppo/returns/varAj]#wC k]wA*  ppo/val/vpred 3x#wC }]wA*  ppo/val/error @42&sO ]wA*  ppo/val/clipfrac ?ȷ"x= ]wA*  ppo/val/meani#;!{ ]wA*  ppo/val/varo{@P; +K ]wA*  ppo/val/var_explainedJ?o'F w]wA*  ppo/learning_rateo:eE+K ]wA*  time/ppo/forward_pass~H>P.W +]wA*!  time/ppo/compute_rewards:y1 ]wA*$ " time/ppo/compute_advantagesH[;/ر,E ]wA*  time/ppo/optimize_step?)7_  ]wA*  time/ppo/calc_statsP8>4K$B+M 1 ]wA*  time/ppo/totalW?%6 C ]wA*  env/reward_meanꍈ>Bt`$B+M T ]wA*  env/reward_std`Ca> VA"x= /DtwA*  objective/klB:'F DtwA*  objective/kl_coefO>H'F DtwA*  objective/entropy6AB*/m]P DtwA*" ppo/mean_non_score_rewardfBa,%6 DtwA*  ppo/mean_scores*>$B+M DtwA*  ppo/std_scoresa>or-F.(pJ DtwA*  ppo/policy/entropy{@ 0r)7_ DtwA*  ppo/policy/approxkl>g)7_ DtwA*  ppo/policy/policyklV >Wlɦ)7_ DtwA*  ppo/policy/clipfrac33>7\0_ DDtwA*# ! ppo/policy/advantages_mean$K&sO DtwA*  ppo/returns/meand!%6  DtwA*  ppo/returns/varAo#wC a DtwA*  ppo/val/vpred6hf#wC DtwA*  ppo/val/error?1W@A&sO  DtwA*  ppo/val/clipfrac?"x= m DtwA*  ppo/val/mean(o!{ DtwA*  ppo/val/varAvX+K & DtwA*  ppo/val/var_explainedDW?uٓ'F ~ DtwA*  ppo/learning_rateo:n̓+K DtwA*  time/ppo/forward_passm6>>.W 3 DtwA*!  time/ppo/compute_rewards:Ϙ1 DtwA*$ " time/ppo/compute_advantagesJ;Nc,E DtwA*  time/ppo/optimize_stepPo?8X)7_ : DtwA*  time/ppo/calc_stats1&>oM$B+M DtwA*  time/ppo/total?\čK%6 DtwA*  env/reward_mean*>K$B+M =DtwA*  env/reward_stda>Zi"x= |wA*  objective/klB8Z'F ^}wA*  objective/kl_coefO>ٯ0'F ~}wA*  objective/entropyu+B8Żw/m]P }wA*" ppo/mean_non_score_rewardJIi%6 }wA*  ppo/mean_scores̤>%$B+M }wA*  ppo/std_scores>$ڂ-,E }wA*  tokens/queries_len_stdP+g/m]P }wA*" tokens/responses_len_meanA4}0.W ~wA*!  tokens/responses_len_std<%6 ~wA*  ppo/loss/policy޺$B+M ~wA*  ppo/loss/valueT?U$B+M wA*  ppo/loss/totaltB>& D(pJ ZwA*  ppo/policy/entropy?r" )7_ ۀwA*  ppo/policy/approxklK"=g&@)7_ 9wA*  ppo/policy/policykl3=Yk)7_ wA*  ppo/policy/clipfracV>؝ 0_ wA*# ! ppo/policy/advantages_meanٱ!o&sO XwA*  ppo/returns/meanpX^%6 wA*  ppo/returns/varݩAmx#wC wA*  ppo/val/vpredg#wC owA*  ppo/val/error`Z@i D&sO ÃwA*  ppo/val/clipfrac>Jpj"x= wA*  ppo/val/mean?hg!{ swA*  ppo/val/varqTA!D+K ̈́wA*  ppo/val/var_explainedNV?au'F %wA*  ppo/learning_rateo:¤q+K }wA*  time/ppo/forward_passB8>}.W ۅwA*!  time/ppo/compute_rewards:1 9wA*$ " time/ppo/compute_advantagesE;抮,E wA*  time/ppo/optimize_stepHn?܏)7_ wA*  time/ppo/calc_stats0"&>(w;$B+M DwA*  time/ppo/total?V~%6 wA*  env/reward_mean̤>r$B+M wA*  env/reward_std>H25"x= ɠwA*  objective/klkBT'F ɠwA*  objective/kl_coef O>l'F ɠwA*  objective/entropyeBH4/m]P ɠwA*" ppo/mean_non_score_reward<85AI%%6 ɠwA*  ppo/mean_scoresB>*$B+M ɠwA*  ppo/std_scoresJ >-d(pJ ʠwA*  ppo/policy/entropy.@pR})7_ ʠwA*  ppo/policy/approxkl և>0)7_ ʠwA*  ppo/policy/policykl>[)7_ ʠwA*  ppo/policy/clipfracff>0_ ʠwA*# ! ppo/policy/advantages_mean 2ǜ&sO ʠwA*  ppo/returns/meanȓ%6 ʠwA*  ppo/returns/varAAY#wC ʠwA*  ppo/val/vpredXek!#wC ʠwA*  ppo/val/errorS@&sO ( ʠwA*  ppo/val/clipfracff>Y""x= R ʠwA*  ppo/val/meanu!{ g ʠwA*  ppo/val/varx@P+K { ʠwA*  ppo/val/var_explainedF?6tI'F ʠwA*  ppo/learning_rateo:WN+K ʠwA*  time/ppo/forward_pass\>`/.W Z ʠwA*!  time/ppo/compute_rewards:eB1 ʠwA*$ " time/ppo/compute_advantages;,E ʠwA*  time/ppo/optimize_stepW?S5)7_ ` ʠwA*  time/ppo/calc_statsqo>} $B+M ʠwA*  time/ppo/totalb?[%6 ʠwA*  env/reward_meanB>3=$B+M b ʠwA*  env/reward_stdJ >lHL"x= _QwA*  objective/kl4|B'F _QwA*  objective/kl_coefP>.]u'F `QwA*  objective/entropyBiBI+L/m]P `QwA*" ppo/mean_non_score_reward$J8%6 .`QwA*  ppo/mean_scoresF<>5O$B+M @`QwA*  ppo/std_scores>XI-G(pJ eQwA*  ppo/policy/entropy/@v^)7_ gQwA*  ppo/policy/approxklE=:Ǒ)7_ 1gQwA*  ppo/policy/policykl;x%l[)7_ UiQwA*  ppo/policy/clipfracf>;90_ iQwA*# ! ppo/policy/advantages_mean 3k &sO lQwA*  ppo/returns/meanFQ>%6 LlQwA*  ppo/returns/var4?A#wC alQwA*  ppo/val/vpredԋK,#wC 9nQwA*  ppo/val/errorl5@Yk~&sO nQwA*  ppo/val/clipfracff>"x= 7oQwA*  ppo/val/mean0!{ oQwA*  ppo/val/var@1+K oQwA*  ppo/val/var_explained(NC?'F HpQwA*  ppo/learning_rateo:.+K pQwA*  time/ppo/forward_pass ف>!QJ.W pQwA*!  time/ppo/compute_rewards:}1 EqQwA*$ " time/ppo/compute_advantages,;;m,E qQwA*  time/ppo/optimize_stepz|?·)7_ qQwA*  time/ppo/calc_statso>R$B+M 5rQwA*  time/ppo/total?a%6 rQwA*  env/reward_meanF<>$>#$B+M rQwA*  env/reward_std>q*"x= qwA*  objective/klvwB%Q'F wA*  objective/kl_coef5

_'F wA*  objective/entropy QgBXc/m]P wA*" ppo/mean_non_score_reward"!I%6 wA*  ppo/mean_scores&q>O|M$B+M /wA*  ppo/std_scores.=w-Y_0_ wA*# ! ppo/policy/advantages_mean 2ej&sO wA*  ppo/returns/mean*Y%6 wA*  ppo/returns/varlBA]F#wC wA*  ppo/val/vpred.#wC wA*  ppo/val/errorY?&sO 8wA*  ppo/val/clipfracd>"x= wA*  ppo/val/meanŖ4!{ wA*  ppo/val/var-.AS+K <wA*  ppo/val/var_explainedZ?2'F wA*  ppo/learning_rateo:l+K wA*  time/ppo/forward_pass΂>+i/.W =wA*!  time/ppo/compute_rewards:fH1 wA*$ " time/ppo/compute_advantages6;O,E wA*  time/ppo/optimize_step|?F@)7_ <wA*  time/ppo/calc_stats s>@އ$B+M wA*  time/ppo/total*i? %6 wA*  env/reward_mean&q>8$B+M 7 wA*  env/reward_std.= {1"x= H'F 09$B+M پ>-Fq[0_ 9k^"x= mX.W d$B+M {$B+M ^}"x= TխwA*  objective/klZ@gBOv'F խwA*  objective/kl_coefwP>#4'F խwA*  objective/entropyC^Bj(./m]P խwA*" ppo/mean_non_score_reward/%6 ֭wA*  ppo/mean_scores>b$B+M ֭wA*  ppo/std_scoresz>*-\F0_ qޭwA*# ! ppo/policy/advantages_mean@&sO ޭwA*  ppo/returns/mean"3%6 /߭wA*  ppo/returns/var }CA l#wC ߭wA*  ppo/val/vpredx'#wC ߭wA*  ppo/val/error?/|&sO >wA*  ppo/val/clipfracff> W"x= wA*  ppo/val/mean؞#x!{ wA*  ppo/val/varR+A]b+K RwA*  ppo/val/var_explainedWe?f6'F wA*  ppo/learning_rateo:+K wA*  time/ppo/forward_pass>S?%.W dwA*!  time/ppo/compute_rewards:O߯1 wA*$ " time/ppo/compute_advantagesz;iwr,E wA*  time/ppo/optimize_stepȪ?1)7_ mwA*  time/ppo/calc_statsk>U[$B+M wA*  time/ppo/total؃?/e%6 wA*  env/reward_mean>\ $$B+M uwA*  env/reward_stdz>#q"x= BwA *  objective/kl ZB@'F BwA *  objective/kl_coefP>9Ӓ'F #BwA *  objective/entropyfKBuЃ/m]P 8BwA *" ppo/mean_non_score_reward"x%6 JBwA *  ppo/mean_scoresŬ>/$B+M ZBwA *  ppo/std_scores\t>񇊀- 0_ BwA *# ! ppo/policy/advantages_meanxm:&sO ABwA *  ppo/returns/mean܎:|%6 WBwA *  ppo/returns/varԩ$Aw#wC gBwA *  ppo/val/vpredB`0#wC yBwA *  ppo/val/error@\&sO BwA *  ppo/val/clipfrac33>n$"x= BwA *  ppo/val/mean}r)!{ BwA *  ppo/val/var|A5 +K BwA *  ppo/val/var_explainedQH?'F iBwA *  ppo/learning_rateo:Ct+K BwA *  time/ppo/forward_passc>;.W BwA *!  time/ppo/compute_rewardsh: 1 qBwA *$ " time/ppo/compute_advantages<};Y,E BwA *  time/ppo/optimize_step\?q)7_ BwA *  time/ppo/calc_statsPP> $B+M mBwA *  time/ppo/totalV7?; %6 BwA *  env/reward_meanŬ>F3$B+M IBwA *  env/reward_std\t> z"x= zIxA!*  objective/klPBU'F IxA!*  objective/kl_coefP>'F JxA!*  objective/entropyDBg9/m]P JxA!*" ppo/mean_non_score_rewardz7Uwc%6 .JxA!*  ppo/mean_scoresc>45 $B+M AJxA!*  ppo/std_scores#N}>EE-(pJ {MxA!*  ppo/policy/entropyФ@uк)7_ HOxA!*  ppo/policy/approxkl눠)P0_ OxA!*# ! ppo/policy/advantages_meanX &sO OxA!*  ppo/returns/meann荁)%6 OxA!*  ppo/returns/varqA&/H#wC OxA!*  ppo/val/vpredUx#wC GPxA!*  ppo/val/errort?)p&sO PxA!*  ppo/val/clipfrac>X"x= PxA!*  ppo/val/meanOZ!{ VQxA!*  ppo/val/varAu͟+K QxA!*  ppo/val/var_explained\?(*'F RxA!*  ppo/learning_rateo:XB+K bRxA!*  time/ppo/forward_passc>.W RxA!*!  time/ppo/compute_rewards:4!1 SxA!*$ " time/ppo/compute_advantages;̜,E pSxA!*  time/ppo/optimize_stepٔ?>6)7_ SxA!*  time/ppo/calc_statspQ>E$B+M TxA!*  time/ppo/totalE?%6 tTxA!*  env/reward_meanc>GEq$B+M TxA!*  env/reward_std#N}>Eq"x= xA"*  objective/klw DB/'F ZxA"*  objective/kl_coefP>'F uxA"*  objective/entropy9Bwf]C/m]P xA"*" ppo/mean_non_score_reward]G%6 xA"*  ppo/mean_scoresD> y$B+M xA"*  ppo/std_scoresm>lw-a0_ JxA"*# ! ppo/policy/advantages_meangffC&sO axA"*  ppo/returns/mean u%6 sxA"*  ppo/returns/varyp Am#wC xA"*  ppo/val/vpredLk}+T#wC xA"*  ppo/val/error !?T&sO xA"*  ppo/val/clipfracff>"x= xA"*  ppo/val/meanbDm !{ xA"*  ppo/val/varN@aR+K xA"*  ppo/val/var_explainedw]?~bӃ'F xA"*  ppo/learning_rateo:>NQ+K xA"*  time/ppo/forward_passЗb>v.W 0xA"*!  time/ppo/compute_rewards:1 AxA"*$ " time/ppo/compute_advantages~;i,E xA"*  time/ppo/optimize_step?gz)7_ xA"*  time/ppo/calc_statsPlQ>Q$B+M KxA"*  time/ppo/total(?o%6 xA"*  env/reward_meanD>p$B+M xA"*  env/reward_stdm>+i"x= ,txA#*  objective/kl;IBz5/'F -txA#*  objective/kl_coef= Q>Е2'F ,-txA#*  objective/entropy0B9R/m]P ?-txA#*" ppo/mean_non_score_reward,tƺ%6 S-txA#*  ppo/mean_scoresane>vL$B+M e-txA#*  ppo/std_scores ]>-K%$B+M 2txA#*  ppo/loss/total<i(pJ 2txA#*  ppo/policy/entropy @;-J)7_ >7txA#*  ppo/policy/approxklS'L<8)7_ h7txA#*  ppo/policy/policyklA%"x= 8txA#*  ppo/val/mean8g!{ ;:txA#*  ppo/val/var@/#n+K h:txA#*  ppo/val/var_explained\k?'F :txA#*  ppo/learning_rateo:T,B+K =txA#*  time/ppo/forward_passGc>z.W =txA#*!  time/ppo/compute_rewards:@1 >txA#*$ " time/ppo/compute_advantages}; ,E >txA#*  time/ppo/optimize_step?I)7_ +>txA#*  time/ppo/calc_stats)Q>',$B+M <>txA#*  time/ppo/total'?SDf%6 L>txA#*  env/reward_meanane>x\$B+M ]>txA#*  env/reward_std ]>p"x= L(xA$*  objective/klCBЊ2'F mL(xA$*  objective/kl_coef}+Q>R%'F L(xA$*  objective/entropyrK%Bn^E~/m]P L(xA$*" ppo/mean_non_score_rewardm0%6 L(xA$*  ppo/mean_scores}c>r$B+M L(xA$*  ppo/std_scoresWXx>/{-$B+M L(xA$*  ppo/loss/totalԩ<1^(pJ L(xA$*  ppo/policy/entropy0L? z)7_ 0L(xA$*  ppo/policy/approxklHa0_ jL(xA$*# ! ppo/policy/advantages_mean +)&sO L(xA$*  ppo/returns/mean~nt-~F%6 L(xA$*  ppo/returns/varA #wC L(xA$*  ppo/val/vpredǜΦ#wC n L(xA$*  ppo/val/errorj_?BQ&sO L(xA$*  ppo/val/clipfrac33A>IHs1"x= '!L(xA$*  ppo/val/meanlZ]!{ !L(xA$*  ppo/val/var~@(+K ?"L(xA$*  ppo/val/var_explainede?G'F "L(xA$*  ppo/learning_rateo: պ?+K "L(xA$*  time/ppo/forward_passb>0.W Q#L(xA$*!  time/ppo/compute_rewardsx:Gi51 #L(xA$*$ " time/ppo/compute_advantagesTx;< ;,E $L(xA$*  time/ppo/optimize_stepW?)7_ v$L(xA$*  time/ppo/calc_stats@NQ>H;a$B+M $L(xA$*  time/ppo/total v? (iq%6 !%L(xA$*  env/reward_mean}c>oD$B+M x%L(xA$*  env/reward_stdWXx>E3"x= Ļ4xA%*  objective/klBBz]'F Ż4xA%*  objective/kl_coefMQ>'F Ż4xA%*  objective/entropyB{&k/m]P Ż4xA%*" ppo/mean_non_score_reward`S%6 Ż4xA%*  ppo/mean_scores>P`$B+M Ż4xA%*  ppo/std_scoresh>j-7$B+M }̻4xA%*  ppo/loss/total[<(pJ ̻4xA%*  ppo/policy/entropyN? X)7_ ̻4xA%*  ppo/policy/approxkl]{hN0_ jѻ4xA%*# ! ppo/policy/advantages_mean332|&sO ѻ4xA%*  ppo/returns/meano҉q%6 ӻ4xA%*  ppo/returns/var~aA##wC Ի4xA%*  ppo/val/vpredwj'#wC 3Ի4xA%*  ppo/val/errorg?޶&sO ׻4xA%*  ppo/val/clipfracgf>;s"x= ׻4xA%*  ppo/val/meanf!{ ػ4xA%*  ppo/val/var@+K ػ4xA%*  ppo/val/var_explainedm?7D'F 3ػ4xA%*  ppo/learning_rateo:F/+K Dػ4xA%*  time/ppo/forward_passb>C5V.W Tػ4xA%*!  time/ppo/compute_rewards:ٺ&1 eػ4xA%*$ " time/ppo/compute_advantages};W,E uػ4xA%*  time/ppo/optimize_stepD?Mo)7_ ػ4xA%*  time/ppo/calc_stats@!Q>9Ȱ$B+M ػ4xA%*  time/ppo/totalv?c2%6 :ٻ4xA%*  env/reward_mean>~@|$B+M ٻ4xA%*  env/reward_stdh>rE'F (xdKxA&*  objective/entropyDBt{-l/m]P 1$B+M bxdKxA&*  ppo/std_scoresM>tg\-I >(pJ }dKxA&*  ppo/policy/entropyU?ž)7_ dKxA&*  ppo/policy/approxkld 6=w)7_ dKxA&*  ppo/policy/policyklI=p)7_ #dKxA&*  ppo/policy/clipfrac%>;0_ 8dKxA&*# ! ppo/policy/advantages_mean}3#L&sO OdKxA&*  ppo/returns/meanτ|Ґ%6 adKxA&*  ppo/returns/var{!AL#wC qdKxA&*  ppo/val/vpred?š%#wC dKxA&*  ppo/val/errorFE@˕&sO dKxA&*  ppo/val/clipfrac?!+Г"x= dKxA&*  ppo/val/meanև!{ قdKxA&*  ppo/val/varV[K@(2r+K dKxA&*  ppo/val/var_explained1?~'F dKxA&*  ppo/learning_rateo:Y!+K ,dKxA&*  time/ppo/forward_pass0i>q"7.W x$B+M dKxA&*  env/reward_stdM>/