[Binary TensorBoard event log (events.out.tfevents, record format `brain.Event:2`, written by
tensorboard.summary.writer.event_file_writer). The scalar values themselves are binary-encoded
and not recoverable as text; only the logged tag names survive. The run records PPO training
scalars, consistent with the keys logged by trl's PPOTrainer, under these tags:

  objective/kl, objective/kl_coef, objective/entropy
  ppo/mean_non_score_reward, ppo/mean_scores, ppo/std_scores
  ppo/loss/policy, ppo/loss/value, ppo/loss/total
  ppo/policy/entropy, ppo/policy/approxkl, ppo/policy/policykl,
  ppo/policy/clipfrac, ppo/policy/advantages_mean
  ppo/returns/mean, ppo/returns/var
  ppo/val/vpred, ppo/val/error, ppo/val/clipfrac, ppo/val/mean,
  ppo/val/var, ppo/val/var_explained
  ppo/learning_rate
  tokens/queries_len_std, tokens/responses_len_mean, tokens/responses_len_std
  time/ppo/forward_pass, time/ppo/compute_rewards, time/ppo/compute_advantages,
  time/ppo/optimize_step, time/ppo/calc_stats, time/ppo/total
  env/reward_mean, env/reward_std]
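To inspect a log like this outside the TensorBoard UI, the scalars can be read back with
TensorBoard's EventAccumulator. Below is a minimal sketch, assuming the standard `tensorboard`
package is installed; the file path is a placeholder, not this run's actual path:

```python
# Sketch: read PPO scalars back out of a binary tfevents file.
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

EVENT_FILE = "runs/ppo/events.out.tfevents.example"  # placeholder path (assumption)

# size_guidance={"scalars": 0} keeps every scalar event instead of downsampling.
acc = EventAccumulator(EVENT_FILE, size_guidance={"scalars": 0})
acc.Reload()  # parse the binary event records

# List the scalar tags present in the file (objective/kl, ppo/..., time/ppo/..., etc.).
print(acc.Tags()["scalars"])

# Each entry is a ScalarEvent(wall_time, step, value) namedtuple.
for event in acc.Scalars("objective/kl"):
    print(event.step, event.value)
```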