[Binary TensorBoard event file (`brain.Event:2` records written by `tensorboard.summary.writer.event_file_writer`) from a PPO training run. The raw protobuf payload is not human-readable; only the scalar tag names are recoverable. Scalars logged at each PPO step:

  objective/kl, objective/kl_coef, objective/entropy
  env/reward_mean, env/reward_std
  ppo/mean_scores, ppo/std_scores, ppo/mean_non_score_reward
  ppo/loss/policy, ppo/loss/value, ppo/loss/total
  ppo/policy/entropy, ppo/policy/approxkl, ppo/policy/policykl, ppo/policy/clipfrac, ppo/policy/advantages_mean
  ppo/returns/mean, ppo/returns/var
  ppo/val/vpred, ppo/val/error, ppo/val/clipfrac, ppo/val/mean, ppo/val/var, ppo/val/var_explained
  ppo/learning_rate
  tokens/queries_len_mean, tokens/queries_len_std, tokens/responses_len_mean, tokens/responses_len_std
  time/ppo/forward_pass, time/ppo/compute_rewards, time/ppo/compute_advantages, time/ppo/optimize_step, time/ppo/calc_stats, time/ppo/total]
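Since the binary records above cannot be read inline, here is a minimal sketch of how such a file can be decoded with TensorBoard's `EventAccumulator` (a real, stable API). The file path is a hypothetical placeholder; substitute the actual tfevents file or its parent run directory.

```python
# Minimal sketch: decode the PPO scalars from a tfevents file using
# TensorBoard's EventAccumulator. The path below is hypothetical.
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

EVENT_FILE = "runs/ppo/events.out.tfevents"  # placeholder path (assumption)

acc = EventAccumulator(EVENT_FILE)
acc.Reload()  # parse all brain.Event:2 protobuf records from disk

# List every scalar tag the run logged
# (objective/*, ppo/*, time/ppo/*, env/*, tokens/*).
print(acc.Tags()["scalars"])

# Each tag yields a list of ScalarEvent(wall_time, step, value) tuples,
# e.g. the per-step KL between the policy and the reference model.
for event in acc.Scalars("objective/kl"):
    print(event.step, event.value)
```

From here the series can be plotted or exported (e.g. into a pandas DataFrame) to inspect how `objective/kl`, `env/reward_mean`, and the `ppo/val/*` value-function diagnostics evolve over training.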