HH A brain.Event:2R. ,tensorboard.summary.writer.event_file_writer08<"x= tA*  objective/kl-W+'F euA*  objective/kl_coefL>L~'F uA*  objective/entropy B>/m]P uA*" ppo/mean_non_score_reward^%6 uA*  ppo/mean_scores^u=Xm$B+M uA*  ppo/std_scores²=_2-(N"0_ A*# ! ppo/policy/advantages_mean"z&sO A*  ppo/returns/meanP?%6 A*  ppo/returns/var>N#wC A*  ppo/val/vpred>3#wC A*  ppo/val/error,AO&sO A*  ppo/val/clipfrac>ĵ="x= ڈA*  ppo/val/meanO@B-t!{ A*  ppo/val/var5@jB.+K 8A*  ppo/val/var_explainedaz'F 劍A*  ppo/learning_rateo:r+K A*  time/ppo/forward_pass0gG>f+.W 5A*!  time/ppo/compute_rewards;vN 1 ،A*$ " time/ppo/compute_advantages|;E,E zA*  time/ppo/optimize_stepJ=? /)7_ A*  time/ppo/calc_statsP;>\*$B+M A*  time/ppo/totalͺ?W%%6 _A*  env/reward_mean^u=Ѣ$B+M A*  env/reward_std²=\"x= GA*  objective/kl'LB'F A*  objective/kl_coef?L>ř`'F 5A*  objective/entropyB;/m]P JA*" ppo/mean_non_score_rewardQ9%6 ]A*  ppo/mean_scores!>XOS$B+M rA*  ppo/std_scores1u>Ȁ-Y0_ 1A*# ! ppo/policy/advantages_meanL1oi&sO KA*  ppo/returns/mean^A%6 _A*  ppo/returns/var-A+"+h#wC pA*  ppo/val/vpredku#wC [A*  ppo/val/errornŢAuN&sO A*  ppo/val/clipfrac:?-:"x= A*  ppo/val/meanV@l!{ A*  ppo/val/var>):,+K A*  ppo/val/var_explained\ݹ'F A*  ppo/learning_rateo:Z+K A*  time/ppo/forward_passPG>iԴ.W *A*!  time/ppo/compute_rewardsx: 1 A*$ " time/ppo/compute_advantages j;Î,E kA*  time/ppo/optimize_stepDž?)7_ A*  time/ppo/calc_stats<>^$B+M A*  time/ppo/total ?%6 EA*  env/reward_mean!>P$B+M A*  env/reward_std1u>*"x= NA*  objective/kl{#B\$'F KNA*  objective/kl_coefL>w'F gNA*  objective/entropyBi /m]P yNA*" ppo/mean_non_score_rewardоBz%6 NA*  ppo/mean_scores>>$B+M NA*  ppo/std_scoresR>0-:(pJ kNA*  ppo/policy/entropy@An])7_ }NA*  ppo/policy/approxkl ?ڛ)7_ NA*  ppo/policy/policykl8z>)7_ $NA*  ppo/policy/clipfrac33><0_ ?NA*# ! ppo/policy/advantages_meancz&sO SNA*  ppo/returns/meanT+%6 fNA*  ppo/returns/vare@]WV(#wC NA*  ppo/val/vpred vd#wC NA*  ppo/val/error@W>Ȓ&sO NA*  ppo/val/clipfrac?ۼ"x= NA*  ppo/val/meanh!{ ϜNA*  ppo/val/varC3>Ø+K NA*  ppo/val/var_explainedľ'F NA*  ppo/learning_rateo:<+K *NA*  time/ppo/forward_passD>~.W :NA*!  time/ppo/compute_rewardsP:bi1 1NA*$ " time/ppo/compute_advantagesk;M,E ]NA*  time/ppo/optimize_step2?s)7_ rNA*  time/ppo/calc_stats`Z;>ɮ~$B+M NA*  time/ppo/totalж?QO%6 NA*  env/reward_mean>>K*$B+M 4NA*  env/reward_stdR>ef"x= #A*  objective/klC.BW(Z'F F#A*  objective/kl_coefUL>;'F #A*  objective/entropyC [/m]P #A*" ppo/mean_non_score_reward3߾j&%6 #A*  ppo/mean_scores@&="Z$B+M /#A*  ppo/std_scoresX=1x- (pJ $#A*  ppo/policy/entropy@Ja)7_ $#A*  ppo/policy/approxkl>WAl)7_ )#A*  ppo/policy/policyklU9=:k)7_ 4)#A*  ppo/policy/clipfrac>Y/U0_ F-#A*# ! ppo/policy/advantages_mean Xp&sO r-#A*  ppo/returns/mean02J%6 /#A*  ppo/returns/varo]@#6#wC 0#A*  ppo/val/vpredR8#wC 4#A*  ppo/val/errorv,@aP&P&sO 4#A*  ppo/val/clipfracf ?ێ"x= 4#A*  ppo/val/mean V!{ 4#A*  ppo/val/var/=+O+K 4#A*  ppo/val/var_explainedB 'F 5#A*  ppo/learning_rateo:R@4+K 6#A*  time/ppo/forward_passp5>8Ԡ.W R7#A*!  time/ppo/compute_rewards:z1 7#A*$ " time/ppo/compute_advantagesTP;B,E 8#A*  time/ppo/optimize_stepu?9e)7_ ;9#A*  time/ppo/calc_stats@+>$B+M 9#A*  time/ppo/total?G.%6 s:#A*  env/reward_mean@&=$/$B+M ;#A*  env/reward_stdX=M@"x= V`1(A*  objective/kl3B$3"'F `1(A*  objective/kl_coefM>'F `1(A*  objective/entropyC6Պ/m]P Pa1(A*" ppo/mean_non_score_reward:#>J u%6 ea1(A*  ppo/mean_scores/<=l$B+M ya1(A*  ppo/std_scores;1=Ul- (pJ hm1(A*  ppo/policy/entropyb!@1)7_ m1(A*  ppo/policy/approxkl=ѯ)7_ m1(A*  ppo/policy/policykld(=K\)7_ m1(A*  ppo/policy/clipfracff>LT0_ m1(A*# ! ppo/policy/advantages_mean2&sO m1(A*  ppo/returns/meanRۅD '%6 m1(A*  ppo/returns/var̜@ɂ#E#wC m1(A*  ppo/val/vpred=&#wC n1(A*  ppo/val/error_@fL&sO n1(A*  ppo/val/clipfrac>"x= Ao1(A*  ppo/val/mean>x0i4!{ o1(A*  ppo/val/var42=iSj+K p1(A*  ppo/val/var_explained`Qc1n'F !q1(A*  ppo/learning_rateo:+K q1(A*  time/ppo/forward_pass=3.W ar1(A*!  time/ppo/compute_rewards:H1 r1(A*$ " time/ppo/compute_advantages;+ $,E s1(A*  time/ppo/optimize_step?0)7_ /t1(A*  time/ppo/calc_stats`Y=8 $B+M t1(A*  time/ppo/totalW?|%6 _u1(A*  env/reward_mean/<$B+M u1(A*  env/reward_std;1=Π"x= L)A*  objective/kll'KB 'F >L)A*  objective/kl_coef1M> 'F ZL)A*  objective/entropy@Cј/m]P oL)A*" ppo/mean_non_score_rewardD]ȹ%6 L)A*  ppo/mean_scoresE<ҋ$B+M L)A*  ppo/std_scores =-0,E L)A*  tokens/queries_len_std3/m]P L)A*" tokens/responses_len_meanA.W L)A*!  tokens/responses_len_std%6 L)A*  ppo/loss/policy^ Ƚȯ_$B+M L)A*  ppo/loss/valuef@; $B+M L)A*  ppo/loss/total i>^I(pJ  L)A*  ppo/policy/entropyɏ@;l)7_ 2 L)A*  ppo/policy/approxkl8=kط)7_ G L)A*  ppo/policy/policykl<€)7_ \ L)A*  ppo/policy/clipfrac3?{ i0_ p L)A*# ! ppo/policy/advantages_meanLL.|&sO L)A*  ppo/returns/mean$5a%6 L)A*  ppo/returns/var~@#wC L)A*  ppo/val/vpred9O{#wC L)A*  ppo/val/errorAIT&sO L)A*  ppo/val/clipfrac?"x= L)A*  ppo/val/mean2#==Ll!{ SL)A*  ppo/val/var6U=lo8+K L)A*  ppo/val/var_explainedQ['F L)A*  ppo/learning_rateo:+K L)A*  time/ppo/forward_pass@= 0.W `L)A*!  time/ppo/compute_rewards0:C1 L)A*$ " time/ppo/compute_advantages;`g,E L)A*  time/ppo/optimize_step8.?/`)7_ dL)A*  time/ppo/calc_stats@j=@$B+M L)A*  time/ppo/totalW?:]d%6 L)A*  env/reward_meanE>F'F [Z*A*  objective/entropy> CA!/m]P qZ*A*" ppo/mean_non_score_reward4%6 Z*A*  ppo/mean_scoresb;}$B+M Z*A*  ppo/std_scores2`s(pJ Z*A*  ppo/policy/entropy@(o)7_ +Z*A*  ppo/policy/approxklA>^)7_ EZ*A*  ppo/policy/policyklT=։)7_ WZ*A*  ppo/policy/clipfracff>6H0_ oZ*A*# ! ppo/policy/advantages_meanٲ*6&sO Z*A*  ppo/returns/mean/c%6 Z*A*  ppo/returns/var(@h>#wC Z*A*  ppo/val/vpreddo^5#wC Z*A*  ppo/val/errorYd@e"&sO Z*A*  ppo/val/clipfrac33>N"x= CZ*A*  ppo/val/meanM?И!{ tZ*A*  ppo/val/var;\=+K Z*A*  ppo/val/var_explainedбT'F Z*A*  ppo/learning_rateo:O+K Z*A*  time/ppo/forward_pass@=M.W Z*A*!  time/ppo/compute_rewards;,hj1 4Z*A*$ " time/ppo/compute_advantagesX;`:,E Z*A*  time/ppo/optimize_step,?/)7_ \~,'F _J.A*  objective/entropyBmHJ/m]P tJ.A*" ppo/mean_non_score_reward 0%6 J.A*  ppo/mean_scores id(pJ P.A*  ppo/policy/entropya@O)7_ ?Q.A*  ppo/policy/approxklj><)7_ T.A*  ppo/policy/policyklb=2)7_ T.A*  ppo/policy/clipfrac> 0_ T.A*# ! ppo/policy/advantages_meanZ܄3\&sO T.A*  ppo/returns/mean٣GpW%6 U.A*  ppo/returns/varlE@(¶}#wC U.A*  ppo/val/vpredjH}#wC V.A*  ppo/val/errorA\&sO W.A*  ppo/val/clipfrac>c"x= bW.A*  ppo/val/meanV!{ W.A*  ppo/val/varȭ>++K X.A*  ppo/val/var_explained৳!'F `X.A*  ppo/learning_rateo:fCl+K X.A*  time/ppo/forward_passP"> .W Y.A*!  time/ppo/compute_rewards:pL1 aY.A*$ " time/ppo/compute_advantages2; i,E Y.A*  time/ppo/optimize_stepU?ע)7_ Z.A*  time/ppo/calc_statsp>$B+M WZ.A*  time/ppo/total9?"o%6 Z.A*  env/reward_mean >_'F [/A *  objective/entropy^B̗/m]P $[/A *" ppo/mean_non_score_rewardn n\%6 6[/A *  ppo/mean_scoresy/2r$B+M H[/A *  ppo/std_scores\?vq-So(pJ M_/A *  ppo/policy/entropy@q4)7_ ^_/A *  ppo/policy/approxklͻ>*)7_ n_/A *  ppo/policy/policyklY>:F)7_ _/A *  ppo/policy/clipfracL?0_ _/A *# ! ppo/policy/advantages_mean4332j*_&sO _/A *  ppo/returns/meanz;%6 _/A *  ppo/returns/varAPr1#wC _/A *  ppo/val/vpred*٦#wC 0`/A *  ppo/val/error@e&sO `/A *  ppo/val/clipfrac43>?"x= `/A *  ppo/val/mean߈t!{ &a/A *  ppo/val/varHZ>+K {a/A *  ppo/val/var_explained=7> 'F a/A *  ppo/learning_rateo:;+K b/A *  time/ppo/forward_pass>V.W ~b/A *!  time/ppo/compute_rewards8:_1 b/A *$ " time/ppo/compute_advantages9;,,E c/A *  time/ppo/optimize_step0`U?)7_ fc/A *  time/ppo/calc_statsh>Q63$B+M c/A *  time/ppo/totallk?GЯ%6 c/A *  env/reward_meany/2H.$B+M Nd/A *  env/reward_std\?~"x= q0A *  objective/kliB%Ą'F ˪0A *  objective/kl_coefM>ԯ{'F 0A *  objective/entropy*B7/m]P 0A *" ppo/mean_non_score_rewardn\fpJ%6 0A *  ppo/mean_scores^U<]y$B+M &0A *  ppo/std_scores|< -Y(pJ a0A *  ppo/policy/entropya@)7_ T0A *  ppo/policy/approxklF>_)7_ 0A *  ppo/policy/policykl Ru><™0_ 0A *# ! ppo/policy/advantages_meanƋ"&sO <0A *  ppo/returns/meanб2%6 [0A *  ppo/returns/varA FF#wC P0A *  ppo/val/vpred`#wC 0A *  ppo/val/error@>&sO 0A *  ppo/val/clipfrac>ʿ"x= 0A *  ppo/val/meanҏ|d!{ E0A *  ppo/val/var?pG+K ]0A *  ppo/val/var_explainedDh>Oi+'F p0A *  ppo/learning_rateo:\+K ̽0A *  time/ppo/forward_pass3>Xپy.W C0A *!  time/ppo/compute_rewards8:\LA1 0A *$ " time/ppo/compute_advantagesH9;/,E 0A *  time/ppo/optimize_step~U?Sa()7_ M0A *  time/ppo/calc_stats >+<$B+M 0A *  time/ppo/totalI?@'-%6 0A *  env/reward_mean^U#'F 2A *  objective/entropyoB/m]P 2A *" ppo/mean_non_score_reward%T%6 2A *  ppo/mean_scoresgQάx(pJ 2A *  ppo/policy/entropyn@k)7_ P2A *  ppo/policy/approxklk?)7_ 2A *  ppo/policy/policykl>?|Z)7_ 2A *  ppo/policy/clipfracff>dq0_ ~2A *# ! ppo/policy/advantages_mean̲]uh&sO 2A *  ppo/returns/meanEq0%6 @2A *  ppo/returns/var( "x= 2A *  ppo/val/mean !{ 2A *  ppo/val/varoNn@D}+K d2A *  ppo/val/var_explained?8c'F 2A *  ppo/learning_rateo:;r+K 12A *  time/ppo/forward_passZ>.W 2A *!  time/ppo/compute_rewards:471 2A *$ " time/ppo/compute_advantages3;tx,E :2A *  time/ppo/optimize_stepV?9?w0)7_ 2A *  time/ppo/calc_stats>/$B+M 2A *  time/ppo/totalf?5Pi%6 :2A *  env/reward_meang<|NO$B+M 2A *  env/reward_stdܥ7<|"x= *b3A *  objective/klaBظ'F b3A *  objective/kl_coefM>ة'F b3A *  objective/entropyjUB3vQ/m]P b3A *" ppo/mean_non_score_rewardQ9T%6 b3A *  ppo/mean_scores3=ݼf$B+M b3A *  ppo/std_scores!>`X-c3A *  ppo/loss/policy4&C$B+M g3A *  ppo/loss/value;@aj$B+M Ag3A *  ppo/loss/totalB>;(pJ Xg3A *  ppo/policy/entropyٛ@@)7_ lg3A *  ppo/policy/approxklH;?`)7_ }g3A *  ppo/policy/policykl> u)7_ g3A *  ppo/policy/clipfracgf>v0_ g3A *# ! ppo/policy/advantages_mean˖&sO g3A *  ppo/returns/mean%6 g3A *  ppo/returns/var'nAhA#wC g3A *  ppo/val/vpredtf#wC i3A *  ppo/val/errorx@*&sO &j3A *  ppo/val/clipfrac>3i"x= >j3A *  ppo/val/meanbЯ!{ Rj3A *  ppo/val/var<@Ow+K dj3A *  ppo/val/var_explainedt.=?w0'F uj3A *  ppo/learning_rateo:ʋ+K j3A *  time/ppo/forward_passp>Jqt.W j3A *!  time/ppo/compute_rewardsx:~1 j3A *$ " time/ppo/compute_advantagesL4;]rpQ,E k3A *  time/ppo/optimize_step U?)7_ ]k3A *  time/ppo/calc_stats>R]`F$B+M k3A *  time/ppo/totalL?w?%6 l3A *  env/reward_mean3=W$B+M el3A *  env/reward_std!>_Z"x= 74A *  objective/kl2B'F m74A *  objective/kl_coefJN>$c'F 74A *  objective/entropyB^]/m]P 74A *" ppo/mean_non_score_rewardI %6 74A *  ppo/mean_scoreso<{-R$B+M ˎ74A *  ppo/std_scoresk=Ҹ-ȿ<(pJ 74A *  ppo/policy/entropyai@ h)7_ 74A *  ppo/policy/approxkl>)7_ f74A *  ppo/policy/policykl]>ԋ)7_ ƒ74A *  ppo/policy/clipfrac>T[0_ *74A *# ! ppo/policy/advantages_meanaбd&sO 74A *  ppo/returns/mean]%6 74A *  ppo/returns/vaȓA?p#wC N74A *  ppo/val/vpred;ǹ#wC 74A *  ppo/val/errorʕ@)&sO 74A *  ppo/val/clipfracw>ӵw"x= [74A *  ppo/val/mean9 !{ 74A *  ppo/val/varz@?+K 74A *  ppo/val/var_explainedb?>?oX,'F Ֆ74A *  ppo/learning_rateo:+K 374A *  time/ppo/forward_pass>c.W 74A *!  time/ppo/compute_rewards:Y1 74A *$ " time/ppo/compute_advantagesRa$B+M 74A *  time/ppo/total?Vm%6 n74A *  env/reward_meano$'F ^5A*  objective/kl_coef?N>S_'F ^5A*  objective/entropy.B&MOT/m]P ^5A*" ppo/mean_non_score_rewardG%6 #^5A*  ppo/mean_scoresq<_$B+M 6^5A*  ppo/std_scores<N-M" (pJ -^5A*  ppo/policy/entropy5 @,HD^)7_ ^5A*  ppo/policy/approxklq@?{A')7_ ^5A*  ppo/policy/policykle>+)7_ '^5A*  ppo/policy/clipfrac>p"0_ <^5A*# ! ppo/policy/advantages_mean)3Ž&sO N^5A*  ppo/returns/mean\r,&}%6 `^5A*  ppo/returns/var S"x= ^5A*  ppo/val/mean*!{ ^5A*  ppo/val/varڈAw)V+K ^5A*  ppo/val/var_explained W?H32'F d ^5A*  ppo/learning_rateo:6Q|+K 4"^5A*  time/ppo/forward_pass>`.W b"^5A*!  time/ppo/compute_rewards: ݆1 y"^5A*$ " time/ppo/compute_advantages6;TQ,E "^5A*  time/ppo/optimize_step|TV?w)7_ "^5A*  time/ppo/calc_stats l>)_X$B+M "^5A*  time/ppo/totalHe?F%6 %$^5A*  env/reward_meanq<+$B+M $^5A*  env/reward_std<8"x= >Cd6A*  objective/klj,B9VC'F Cd6A*  objective/kl_coef`N>dR'F Cd6A*  objective/entropy3^B.)/m]P Cd6A*" ppo/mean_non_score_reward`WVN%6 Cd6A*  ppo/mean_scores`|< $B+M Dd6A*  ppo/std_scores<<-(pJ 2Gd6A*  ppo/policy/entropyp~v@X)7_ Gd6A*  ppo/policy/approxklXQ?()7_ Hd6A*  ppo/policy/policykl>]4w)7_ sHd6A*  ppo/policy/clipfrac>,0_ Hd6A*# ! ppo/policy/advantages_mean2"&sO :Id6A*  ppo/returns/mean?%6 Id6A*  ppo/returns/varQYA [#wC Id6A*  ppo/val/vpred*˺#wC OJd6A*  ppo/val/errorR@@ˉ&sO Jd6A*  ppo/val/clipfrac33>WO"x= Kd6A*  ppo/val/meanP@x!{ aKd6A*  ppo/val/var~AA+K Kd6A*  ppo/val/var_explainedf]? F'F Ld6A*  ppo/learning_rateo:Tn`+K sLd6A*  time/ppo/forward_pass`>.W Ld6A*!  time/ppo/compute_rewards:1 +Md6A*$ " time/ppo/compute_advantages4;a+,E Md6A*  time/ppo/optimize_stepJV? ع)7_ Md6A*  time/ppo/calc_statsP>C$B+M GNd6A*  time/ppo/total?*%6 Nd6A*  env/reward_mean`|<1$B+M Nd6A*  env/reward_std<ɐ"x= 7A*  objective/klBϿ'F b7A*  objective/kl_coefN>$!'F |7A*  objective/entropyB@/m]P 7A*" ppo/mean_non_score_reward%>`=|%6 7A*  ppo/mean_scoresY <$B+M 7A*  ppo/std_scoresKП(pJ 7A*  ppo/policy/entropyN6x@#\)7_ 7A*  ppo/policy/approxklBݸ@…])7_ 7A*  ppo/policy/policykla?6)7_ )7A*  ppo/policy/clipfrac>C20_ <7A*# ! ppo/policy/advantages_meangf2&sO M7A*  ppo/returns/mean-L%6 `7A*  ppo/returns/varA#wC r7A*  ppo/val/vpredH#wC o7A*  ppo/val/error+nv@/bA&sO 7A*  ppo/val/clipfracff>PN"x= 7A*  ppo/val/meanJd?X!{ 7A*  ppo/val/var >JA}O+K 7A*  ppo/val/var_explainedyZ? {'F 7A*  ppo/learning_rateo:yQ +K 7A*  time/ppo/forward_passj>hg .W 7A*!  time/ppo/compute_rewards:}vZ1 7A*$ " time/ppo/compute_advantages;;xa,E ~7A*  time/ppo/optimize_step8V?d\2)7_ 7A*  time/ppo/calc_statsI>=9$B+M *7A*  time/ppo/total?%6 |7A*  env/reward_meanY \N5'F 8A*  objective/entropyDB%./m]P 8A*" ppo/mean_non_score_reward͌S%6 8A*  ppo/mean_scoresS{@ `vt%6 N8A*  ppo/loss/policy $B+M *8A*  ppo/loss/value>?b2"$B+M )8A*  ppo/loss/total=X(pJ 8A*  ppo/policy/entropya~@f)7_ &8A*  ppo/policy/approxklh(AM)7_ 8A*  ppo/policy/policyklC? j)7_ 8A*  ppo/policy/clipfracN>d0_ T8A*# ! ppo/policy/advantages_mean:=1]_Fa&sO 8A*  ppo/returns/meann|%6 8A*  ppo/returns/var.A+[#wC n8A*  ppo/val/vpredDi:#wC 8A*  ppo/val/error7ACW&sO (8A*  ppo/val/clipfrac]I>eK"x= 8A*  ppo/val/mean!{ 8A*  ppo/val/varAYэ+K 98A*  ppo/val/var_explainedg?p'F 8A*  ppo/learning_rateo:;+K 8A*  time/ppo/forward_pass>YFr.W I8A*!  time/ppo/compute_rewards`:?n1 8A*$ " time/ppo/compute_advantages2;,E 8A*  time/ppo/optimize_steplT?X8)7_ I8A*  time/ppo/calc_stats >4WC$B+M 8A*  time/ppo/total*А?װ%6 8A*  env/reward_meanS)'F 9A*  objective/entropyxB%/m]P '9A*" ppo/mean_non_score_reward<*%6 ;9A*  ppo/mean_scorese<3b$B+M M9A*  ppo/std_scoresͬ=i-N.(pJ 9A*  ppo/policy/entropy}@;[&e)7_ d9A*  ppo/policy/approxkl>pOR)7_ 9A*  ppo/policy/policykl A>z)7_ F9A*  ppo/policy/clipfrac>:{z0_ ^9A*# ! ppo/policy/advantages_meanl42*K&sO q9A*  ppo/returns/mean7g%6 9A*  ppo/returns/varRZAC#wC 9A*  ppo/val/vpred[Q^-#wC 9A*  ppo/val/errorp@XGV&sO 9A*  ppo/val/clipfracF >-lI"x= Y9A*  ppo/val/meanWؗ!{ 9A*  ppo/val/varC?1zb+K 9A*  ppo/val/var_explainedɼ?<'F K9A*  ppo/learning_rateo:=tqz+K 9A*  time/ppo/forward_passP> .W 9A*!  time/ppo/compute_rewards:x1 D9A*$ " time/ppo/compute_advantages@:;zWAR,E 9A*  time/ppo/optimize_stepľV?*)7_ 9A*  time/ppo/calc_statsP>@$B+M :9A*  time/ppo/total?r{%6 9A*  env/reward_meane<$B+M 9A*  env/reward_stdͬ=d1"x= .;A*  objective/klCBOae'F ;A*  objective/kl_coefs^'F ;A*  objective/entropy2ӴB_$ /m]P ;A*" ppo/mean_non_score_rewardZy%6 ϸ;A*  ppo/mean_scores\=$B+M ;A*  ppo/std_scores=p-~Tq0_ ;A*# ! ppo/policy/advantages_mean0Wv&sO ;A*  ppo/returns/mean&"%6 ɿ;A*  ppo/returns/varA.#wC ؿ;A*  ppo/val/vpredl۶#wC ;A*  ppo/val/error|}A4&sO ;A*  ppo/val/clipfrac>/="x= ;A*  ppo/val/meanW!{ ;A*  ppo/val/varR@(]+K ;A*  ppo/val/var_explained~>MCr'F ;A*  ppo/learning_rateo:Cll+K ;A*  time/ppo/forward_passP>t%.W !;A*!  time/ppo/compute_rewards:R? 1 1;A*$ " time/ppo/compute_advantages9;1h,E B;A*  time/ppo/optimize_stepV?.8)7_ R;A*  time/ppo/calc_stats >j)^'$B+M ;A*  time/ppo/total?[~%6 ;A*  env/reward_mean\=C$B+M R;A*  env/reward_std=R"x= 6Kcu'F K5O-+-u0_ Pg#wC $R"x= RZ .W vTQ $B+M U3i"x= d?=A*  objective/kl`) CO<}'F Le?=A*  objective/kl_coef,O>Y%'F ke?=A*  objective/entropyBTf/m]P e?=A*" ppo/mean_non_score_reward| %6 e?=A*  ppo/mean_scoreskk>d$B+M e?=A*  ppo/std_scoresÜ>-)7_ h?=A*  ppo/policy/approxkl@t)7_ Di?=A*  ppo/policy/policykl]l?p)7_ i?=A*  ppo/policy/clipfrac>.0_ j?=A*# ! ppo/policy/advantages_meanJT&sO j?=A*  ppo/returns/meanVɭ"%6 j?=A*  ppo/returns/varՃB'#wC Fk?=A*  ppo/val/vpredTz#wC k?=A*  ppo/val/error>Al&sO k?=A*  ppo/val/clipfrac33>"x= Pl?=A*  ppo/val/meanG$vx!{ l?=A*  ppo/val/var~gA@l++K m?=A*  ppo/val/var_explainedn;?'F bm?=A*  ppo/learning_rateo:Z+K m?=A*  time/ppo/forward_pass4>.W n?=A*!  time/ppo/compute_rewards(:ʥG1 {n?=A*$ " time/ppo/compute_advantages4;-=,E n?=A*  time/ppo/optimize_stepV?})7_ .o?=A*  time/ppo/calc_stats> $B+M o?=A*  time/ppo/total?YW%6 o?=A*  env/reward_meankk>= "x= X>A*  objective/klvC'F `X>A*  objective/kl_coefMO>sP'F }X>A*  objective/entropy`B/m]P X>A*" ppo/mean_non_score_rewardd(H%6 X>A*  ppo/mean_scoresXG>1$B+M X>A*  ppo/std_scores>HR-A*  tokens/queries_len_meanBI J,E X>A*  tokens/queries_len_stddHD/m]P X>A*" tokens/responses_len_meanAy{*.W X>A*!  tokens/responses_len_std%6 X>A*  ppo/loss/policy|Ǔ=$B+M X>A*  ppo/loss/valueA}$B+M X>A*  ppo/loss/total!v?Cp(pJ 2X>A*  ppo/policy/entropypS@ >)7_ JX>A*  ppo/policy/approxkl?)7_ \X>A*  ppo/policy/policykl@?!t)7_ lX>A*  ppo/policy/clipfrac>v)0_ }X>A*# ! ppo/policy/advantages_mean̰wL&sO X>A*  ppo/returns/mean.U[#%6 BX>A*  ppo/returns/varIBF7`#wC X>A*  ppo/val/vpredGAU#wC X>A*  ppo/val/error0lA{'&sO >X>A*  ppo/val/clipfrac>j)c"x= X>A*  ppo/val/meany:X!{ X>A*  ppo/val/varA++K 7X>A*  ppo/val/var_explainedE?š 'F X>A*  ppo/learning_rateo:^+K X>A*  time/ppo/forward_pass>eә+.W 8X>A*!  time/ppo/compute_rewards:pJ1 X>A*$ " time/ppo/compute_advantagesL5;4,E ۶X>A*  time/ppo/optimize_steprV?>l)7_ .X>A*  time/ppo/calc_statsH><ޢ$B+M |X>A*  time/ppo/totaly?l?%6 ڷX>A*  env/reward_meanXG>/P?u$B+M #X>A*  env/reward_std>B"x= Hl?A*  objective/klH C'F Il?A*  objective/kl_coefoO>Rv'F -Il?A*  objective/entropyloBn/m]P BIl?A*" ppo/mean_non_score_rewardSRf%6 SIl?A*  ppo/mean_scores={>\q$B+M fIl?A*  ppo/std_scores>,7-`(pJ XOl?A*  ppo/policy/entropyL@5)7_ pOl?A*  ppo/policy/approxkl?")7_ Ol?A*  ppo/policy/policykl>T.)7_ Ol?A*  ppo/policy/clipfrac̏>@0_ Ol?A*# ! ppo/policy/advantages_mean1h&sO Ol?A*  ppo/returns/mean! ]7r)%6 Ol?A*  ppo/returns/varXViB),*#wC Ol?A*  ppo/val/vpred\B#wC Ol?A*  ppo/val/error @i I&sO  "x= Pl?A*  ppo/val/mean9Z(ɬ!{ Ql?A*  ppo/val/varB+K mQl?A*  ppo/val/var_explainedN`?gŶ'F Ql?A*  ppo/learning_rateo:U+K Rl?A*  time/ppo/forward_pass`e>.W iRl?A*!  time/ppo/compute_rewards:j1 Rl?A*$ " time/ppo/compute_advantages3;!,E Sl?A*  time/ppo/optimize_stepV?#S)7_ bSl?A*  time/ppo/calc_stats>Wo$B+M Sl?A*  time/ppo/total`?%6 Tl?A*  env/reward_mean={>Z1"x= Ox%DA*  objective/klĽC !'F x%DA*  objective/kl_coefO>^R'F x%DA*  objective/entropyzB:/m]P x%DA*" ppo/mean_non_score_rewardML%6 x%DA*  ppo/mean_scores0<)$B+M y%DA*  ppo/std_scores_< n-$B+M A}%DA*  ppo/loss/valuebA$B+M l}%DA*  ppo/loss/total?%(pJ }%DA*  ppo/policy/entropyJ?j)7_ }%DA*  ppo/policy/approxkl@ѽ)7_ }%DA*  ppo/policy/policyklid?r)7_ }%DA*  ppo/policy/clipfrac33>EȠ0_ }%DA*# ! ppo/policy/advantages_mean3w&sO }%DA*  ppo/returns/mean@7/Uى%6 }%DA*  ppo/returns/varΊ.B #wC a~%DA*  ppo/val/vpred -7#wC ~%DA*  ppo/val/errorDAnj&sO %DA*  ppo/val/clipfracff>A"x= %DA*  ppo/val/meanS!{ %DA*  ppo/val/vargAZױ+K %DA*  ppo/val/var_explained >+&'F I%DA*  ppo/learning_rateo:"+K %DA*  time/ppo/forward_pass%>y J.W %DA*!  time/ppo/compute_rewards@:f1 D%DA*$ " time/ppo/compute_advantagesHI;ۚ,E %DA*  time/ppo/optimize_stepCd?T)7_ %DA*  time/ppo/calc_stats0>'%$B+M 3%DA*  time/ppo/totalx?c%6 %DA*  env/reward_mean0< $B+M ׂ%DA*  env/reward_std_<@/t"x= qCEA*  objective/klB'F CEA*  objective/kl_coefO>r ~E'F CEA*  objective/entropyrOB t/m]P CEA*" ppo/mean_non_score_rewardNS%6 CEA*  ppo/mean_scores:R=$B+M *CEA*  ppo/std_scores¿>2-ž!S)7_ CEA*  ppo/policy/policyklה>yE)7_ CEA*  ppo/policy/clipfrac33>0_ 'CEA*# ! ppo/policy/advantages_mean33s283&sO 9CEA*  ppo/returns/meansS<4%6 JCEA*  ppo/returns/vargB~_#wC ZCEA*  ppo/val/vpredlbP#wC jCEA*  ppo/val/error~#ApuD&sO CEA*  ppo/val/clipfrac>4gQ"x= CEA*  ppo/val/meanX6!{ CEA*  ppo/val/vardA+K CEA*  ppo/val/var_explainedJ:?d'F CEA*  ppo/learning_rateo: 8h+K ZCEA*  time/ppo/forward_pass)>-U.W CEA*!  time/ppo/compute_rewards:o1  CEA*$ " time/ppo/compute_advantagesK;y_,E t CEA*  time/ppo/optimize_step\zd?RSh)7_ CEA*  time/ppo/calc_statsp>h$B+M "!CEA*  time/ppo/total?+%6 y!CEA*  env/reward_mean:R=L$B+M !CEA*  env/reward_std¿>α"x= XIA*  objective/klAB;%'F lXIA*  objective/kl_coefO>V'F XIA*  objective/entropyzB/m]P XIA*" ppo/mean_non_score_reward֝9'%6 XIA*  ppo/mean_scores =$B+M XIA*  ppo/std_scores)=Dr-Kͫ)7_ wXIA*  ppo/policy/policyklk7=گV)7_ XIA*  ppo/policy/clipfrac>Vbq0_ =XIA*# ! ppo/policy/advantages_mean At&sO XIA*  ppo/returns/meanUg'Ƿ%6 XIA*  ppo/returns/varB#wC XXIA*  ppo/val/vpred>#wC XIA*  ppo/val/error߀A΂1&sO XIA*  ppo/val/clipfracff>tF"x= aXIA*  ppo/val/mean#i`!{ XIA*  ppo/val/varAb9+K XIA*  ppo/val/var_explained?| 'F tXIA*  ppo/learning_rateo:kɼ+K XIA*  time/ppo/forward_passo1>%.W *XIA*!  time/ppo/compute_rewardsX:k1 XIA*$ " time/ppo/compute_advantagesC; ^0,E XIA*  time/ppo/optimize_step l?%`)7_ 3XIA*  time/ppo/calc_statsP $>BF$B+M XIA*  time/ppo/totalP?;Wy%6 XIA*  env/reward_mean =C|z$B+M AXIA*  env/reward_std)=K"x= JA*  objective/kl"uBM'F cJA*  objective/kl_coef O>7'F JA*  objective/entropy6sBԵ*/m]P JA*" ppo/mean_non_score_reward.%6 JA*  ppo/mean_scoress='1y$B+M JA*  ppo/std_scoresM=1PG-i3W1)7_ uJA*  ppo/policy/policykl?L>)7_ JA*  ppo/policy/clipfrac>Ha<0_ >JA*# ! ppo/policy/advantages_mean3vN&sO JA*  ppo/returns/meanzvV%6 JA*  ppo/returns/varWBo_]#wC kJA*  ppo/val/vpredR^7#wC JA*  ppo/val/error*A0=&sO %JA*  ppo/val/clipfrac?b"x= JA*  ppo/val/mean@Ȋ!{ JA*  ppo/val/varh]AaT^+K OJA*  ppo/val/var_explainedV>5?!'F JA*  ppo/learning_rateo:uH+K =JA*  time/ppo/forward_pass=0>{.W JA*!  time/ppo/compute_rewardsP:ǡ(1  JA*$ " time/ppo/compute_advantagesTE;*,E t JA*  time/ppo/optimize_stepl?k)7_ JA*  time/ppo/calc_stats#>+P$B+M + JA*  time/ppo/totalꄡ?C\S%6 JA*  env/reward_means="2$B+M JA*  env/reward_stdM=o2?"x= KA*  objective/klB#'F KA*  objective/kl_coefP>a'F KA*  objective/entropyVBt/m]P $KA*" ppo/mean_non_score_rewardRVa%6 8KA*  ppo/mean_scoresu= $B+M LKA*  ppo/std_scores>`-(pJ læKA*  ppo/policy/entropy?@4)7_ æKA*  ppo/policy/approxklt>˪$)7_ æKA*  ppo/policy/policyklq=.)7_ æKA*  ppo/policy/clipfrac43]>JD 0_ æKA*# ! ppo/policy/advantages_mean/Y&sO æKA*  ppo/returns/mean{dT%6 æKA*  ppo/returns/varqBfԂ#wC æKA*  ppo/val/vpred C#wC ĦKA*  ppo/val/error[=@Ϻ܌&sO cĦKA*  ppo/val/clipfracff>"x= ĦKA*  ppo/val/mean^$ !{ ŦKA*  ppo/val/varTuBB;E+K mŦKA*  ppo/val/var_explained0L??R'F ŦKA*  ppo/learning_rateo:2+K ƦKA*  time/ppo/forward_pass@q2>6.W qƦKA*!  time/ppo/compute_rewards:1 ƦKA*$ " time/ppo/compute_advantagesD;!`!,E ǦKA*  time/ppo/optimize_step$l?cD)7_ sǦKA*  time/ppo/calc_stats$>7qa$B+M ǦKA*  time/ppo/total?MP%6 ȦKA*  env/reward_meanu=8t$B+M qȦKA*  env/reward_std>F\"x= &LA*  objective/klB>'F *'LA*  objective/kl_coef5

'F G'LA*  objective/entropylB"/m]P ]'LA*" ppo/mean_non_score_rewardbL(%6 r'LA*  ppo/mean_scoresּA=c.$B+M 'LA*  ppo/std_scores 9y=MV=-3E(pJ S,LA*  ppo/policy/entropy!@g)7_ k,LA*  ppo/policy/approxklEO>z#)7_ ~,LA*  ppo/policy/policykl>CL()7_ ,LA*  ppo/policy/clipfracff>R0_ ,LA*# ! ppo/policy/advantages_meanss&sO ,LA*  ppo/returns/meanu%6 1-LA*  ppo/returns/varBaJ#wC 1LA*  ppo/val/vpredNnQr#wC 1LA*  ppo/val/error@Cx&sO 1LA*  ppo/val/clipfrac43>P{D$"x= 1LA*  ppo/val/meand+/!o!{ 1LA*  ppo/val/var4QA'8ˢ+K 2LA*  ppo/val/var_explained&Y?L%'F 2LA*  ppo/learning_rateo:Q&+K /2LA*  time/ppo/forward_pass/>8V.W ?2LA*!  time/ppo/compute_rewards:{1 4LA*$ " time/ppo/compute_advantagesK;ye,E 4LA*  time/ppo/optimize_stepl?ւ\))7_ 4LA*  time/ppo/calc_stats#>>$B+M 5LA*  time/ppo/totalJ?G%6 6LA*  env/reward_meanּA=#$B+M ~6LA*  env/reward_std 9y=qQ"x= MA*  objective/klB;N'F FMA*  objective/kl_coefS^P>٩O'F bMA*  objective/entropy`B7d[/m]P uMA*" ppo/mean_non_score_rewardvO|~em%6 MA*  ppo/mean_scores^=O`$B+M MA*  ppo/std_scores)F=--O(pJ MA*  ppo/policy/entropy@4)7_ MA*  ppo/policy/approxklj=2)7_ #MA*  ppo/policy/policyklp=P)7_ RMA*  ppo/policy/clipfrac@>>0_ iMA*# ! ppo/policy/advantages_mean@&sO MA*  ppo/returns/mean /|#%6 9MA*  ppo/returns/var A@C#wC PMA*  ppo/val/vpreduWxl#wC aMA*  ppo/val/errorp-@h?&sO qMA*  ppo/val/clipfrac̃>v"x= MA*  ppo/val/meanf !!{ MA*  ppo/val/var.W MA*!  time/ppo/compute_rewards8:1 oMA*$ " time/ppo/compute_advantages@C;d,E µMA*  time/ppo/optimize_step@l?yK)7_ MA*  time/ppo/calc_stats@#><$B+M vMA*  time/ppo/total¡?â7%6 MA*  env/reward_mean^=钥$B+M MA*  env/reward_std)F=$"x= OA*  objective/klНBWb'F OA*  objective/entropy?2BO/m]P OA*" ppo/mean_non_score_rewardMSW%6 OA*  ppo/mean_scoresp=q$B+M OA*  ppo/std_scores9>} N-q(pJ OA*  ppo/policy/entropy@ur )7_ OA*  ppo/policy/approxkl(Z>!)7_ OA*  ppo/policy/policykl=+4)7_ OA*  ppo/policy/clipfracff$>)0_ &OA*# ! ppo/policy/advantages_meanffn}!&sO 9OA*  ppo/returns/mean%6 JOA*  ppo/returns/varFA##wC ZOA*  ppo/val/vpredS]o#wC kOA*  ppo/val/error `@:`&sO zOA*  ppo/val/clipfrac23>g"x= OA*  ppo/val/meanݧЭL!{ OA*  ppo/val/var5RAg+K OA*  ppo/val/var_explainedK?8'F OA*  ppo/learning_rateo:+K OA*  time/ppo/forward_pass1>.W OA*!  time/ppo/compute_rewards:'G1 OA*$ " time/ppo/compute_advantagesE;.&,E 0OA*  time/ppo/optimize_stepm?sE,)7_ OA*  time/ppo/calc_stats0T%>$B+M OA*  time/ppo/totalN?F%6 HOA*  env/reward_meanp=4$B+M OA*  env/reward_std9>#۹ "x= l^PA *  objective/kl3Bv'F ^PA *  objective/kl_coefP>W'F ^PA *  objective/entropy@6 /m]P ^PA *" ppo/mean_non_score_rewardC9'%6 ^PA *  ppo/mean_scores,q<>j4$B+M ^PA *  ppo/std_scoresΪ>BЩ-')7_ "^PA *  ppo/policy/approxkl=-)7_ "^PA *  ppo/policy/policykl*M)7_ "^PA *  ppo/policy/clipfrac33<] F0_ "^PA *# ! ppo/policy/advantages_meanff*4!&sO #^PA *  ppo/returns/mean a&%6 #^PA *  ppo/returns/varIA,z_#wC ##^PA *  ppo/val/vpredTѾ#wC 3#^PA *  ppo/val/errorw_AB7z&sO #^PA *  ppo/val/clipfrac ?(c"x= '^PA *  ppo/val/meanp7!{ '^PA *  ppo/val/varr@0+K '^PA *  ppo/val/var_explainedpܽW.'F '^PA *  ppo/learning_rateo:\y+K '^PA *  time/ppo/forward_passp/>x >.W '^PA *!  time/ppo/compute_rewardsX:;1 (^PA *$ " time/ppo/compute_advantagesE;r,E (^PA *  time/ppo/optimize_stephl? U)7_ *(^PA *  time/ppo/calc_statsp$>[)<$B+M ;(^PA *  time/ppo/total@?%6 (^PA *  env/reward_mean,q<>.+$B+M (^PA *  env/reward_stdΪ>ܼ&"x= ˃QA!*  objective/kl7Bbb'F 0QA!*  objective/kl_coefP>Gh'F KQA!*  objective/entropy`NAVACy/m]P `QA!*" ppo/mean_non_score_reward "6%6 rQA!*  ppo/mean_scores=RLc$B+M QA!*  ppo/std_scoresN=-%6 ׄQA!*  ppo/loss/policy;<$B+M QA!*  ppo/loss/value? $B+M ɊQA!*  ppo/loss/totalN>%y(pJ QA!*  ppo/policy/entropy>B)7_ QA!*  ppo/policy/approxkl@=`)7_ #QA!*  ppo/policy/policykl,=<)7_ 3QA!*  ppo/policy/clipfrac<ِ0_ FQA!*# ! ppo/policy/advantages_meanzUݯ&sO WQA!*  ppo/returns/mean8HҔ%6 hQA!*  ppo/returns/var`@d#wC xQA!*  ppo/val/vpred2nfB,#wC QA!*  ppo/val/error&c@"`&sO QA!*  ppo/val/clipfrac>=;$"x= QA!*  ppo/val/meant !{ ٍQA!*  ppo/val/varm?XL8+K QA!*  ppo/val/var_explainedjC>'F QA!*  ppo/learning_rateo:sT+K QA!*  time/ppo/forward_pass 0> .W (QA!*!  time/ppo/compute_rewards:LY1 QA!*$ " time/ppo/compute_advantagesTE;*j},E QA!*  time/ppo/optimize_stephl??0)7_ CQA!*  time/ppo/calc_stats$>YHv$B+M QA!*  time/ppo/totalt?Rϟ%6 QA!*  env/reward_mean=$B+M AQA!*  env/reward_stdN=v@"x= BORA"*  objective/klR(B"@'F ORA"*  objective/kl_coefP>4'F ORA"*  objective/entropy>/m]P ORA"*" ppo/mean_non_score_rewardܾ_1%6 ORA"*  ppo/mean_scoresd=g$B+M PRA"*  ppo/std_scores"}=]~>-((pJ VRA"*  ppo/policy/entropyb.:~ )7_ VRA"*  ppo/policy/approxklc:)7_ VRA"*  ppo/policy/policyklU:h)7_ VRA"*  ppo/policy/clipfrac:i0_ VRA"*# ! ppo/policy/advantages_mean43k3;&sO WRA"*  ppo/returns/meanMY!8%6 WRA"*  ppo/returns/vara@T#wC &WRA"*  ppo/val/vpred<:$#wC 6WRA"*  ppo/val/erroraG?&sO YRA"*  ppo/val/clipfrac?U"x= YRA"*  ppo/val/mean\jap}!{ YRA"*  ppo/val/var?x7+K ZRA"*  ppo/val/var_explained(6?&'F ZRA"*  ppo/learning_rateo:pI +K %ZRA"*  time/ppo/forward_passP1>m'A.W 6ZRA"*!  time/ppo/compute_rewardsh:Ͽ1 GZRA"*$ " time/ppo/compute_advantagesJ;$,E WZRA"*  time/ppo/optimize_stepl?$)7_ hZRA"*  time/ppo/calc_statsp$>=7$B+M ZRA"*  time/ppo/totalJʡ?p%6 [RA"*  env/reward_meand=͚$B+M l[RA"*  env/reward_std"}=& "x= _5TA#*  objective/kl\x(B'F 5TA#*  objective/kl_coef= Q>s77'F 5TA#*  objective/entropyX :,/m]P 5TA#*" ppo/mean_non_score_reward&ܾM%6 5TA#*  ppo/mean_scores,=$B+M +5TA#*  ppo/std_scoresE=[-yp1$B+M 5TA#*  ppo/loss/totalbp)7_ 5TA#*  ppo/policy/approxkl-R)7_ ~5TA#*  ppo/policy/policyklh4ҵN)7_ 5TA#*  ppo/policy/clipfract50_ 5TA#*# ! ppo/policy/advantages_mean926c&sO 5TA#*  ppo/returns/mean%6 5TA#*  ppo/returns/var@#wC 5TA#*  ppo/val/vpredjڮGX#wC 5TA#*  ppo/val/error>&sO 5TA#*  ppo/val/clipfrac?޺"x= 25TA#*  ppo/val/mean:g!{ A5TA#*  ppo/val/vare.@E+K :5TA#*  ppo/val/var_explaineduk?K'F k5TA#*  ppo/learning_rateo:ǣ+K 5TA#*  time/ppo/forward_pass@>1>3&.W 5TA#*!  time/ppo/compute_rewards:O1 5TA#*$ " time/ppo/compute_advantagesJ;G,E 5TA#*  time/ppo/optimize_stepul?r$)7_ ,6TA#*  time/ppo/calc_stats@F%>RG$B+M [6TA#*  time/ppo/totalꪡ?%6 t6TA#*  env/reward_mean,=.|$B+M 6TA#*  env/reward_stdE= B|"x= gxUA$*  objective/kltx(BgR'F xUA$*  objective/kl_coef}+Q>A'F xUA$*  objective/entropy؎9h/m]P xUA$*" ppo/mean_non_score_rewardU>ܾ6%6 yUA$*  ppo/mean_scoresc=8$B+M $yUA$*  ppo/std_scoresr=կH-SWA&sO ~UA$*  ppo/val/clipfrac>"x= ʂUA$*  ppo/val/meanC|!{ UA$*  ppo/val/varx@Ow+K UA$*  ppo/val/var_explained0w?At~'F 'UA$*  ppo/learning_rateo:Ⱥ+K UA$*  time/ppo/forward_passe1>P.W UA$*!  time/ppo/compute_rewards:^Ѐ1 ʅUA$*$ " time/ppo/compute_advantagesxE;(1K,E QUA$*  time/ppo/optimize_stepl?ON)7_ UA$*  time/ppo/calc_statsp$>jl3$B+M UA$*  time/ppo/totalD?h?%6 UA$*  env/reward_meanc=D39$B+M UA$*  env/reward_stdr=Z9A"x= wVA%*  objective/klzx(BϜ'F VA%*  objective/kl_coefMQ>yp'F VA%*  objective/entropy9w/m]P VA%*" ppo/mean_non_score_rewardsbܾh3%6 +VA%*  ppo/mean_scores=>$B+M i%6 VA%*  ppo/returns/var}@*S]K#wC VA%*  ppo/val/vpredʿR1#wC NVA%*  ppo/val/error`>#<&sO VA%*  ppo/val/clipfrac=L:"x= VA%*  ppo/val/meanпs(!{ bVA%*  ppo/val/vari@ u+K VA%*  ppo/val/var_explained[w?'T'F VA%*  ppo/learning_rateo:M_{+K ~VA%*  time/ppo/forward_passТ.>Dt .W VA%*!  time/ppo/compute_rewards@:[)81 FVA%*$ " time/ppo/compute_advantageslB; ,E VA%*  time/ppo/optimize_step^l?)7_ VA%*  time/ppo/calc_stats$>uf@$B+M YVA%*  time/ppo/total2?1q+%6 VA%*  env/reward_mean=$B+M VA%*  env/reward_std=:ro"x= ]XA&*  objective/klx(B_a'F ]XA&*  objective/kl_coef pQ>'F ]XA&*  objective/entropy96,|/m]P #]XA&*" ppo/mean_non_score_rewardܾq6%6 7]XA&*  ppo/mean_scoresЬ=oA$B+M G]XA&*  ppo/std_scores }=Lw-+K &]XA&*  time/ppo/forward_passp0>j.W &]XA&*!  time/ppo/compute_rewards:̐2'1 &]XA&*$ " time/ppo/compute_advantagesF;4 ,E ']XA&*  time/ppo/optimize_stepk?{,)7_ ']XA&*  time/ppo/calc_stats$>\$B+M {']XA&*  time/ppo/total4?p%6 ']XA&*  env/reward_meanЬ=G~n$B+M M(]XA&*  env/reward_std }=l9"x= ,YA'*  objective/klx(B'F :-YA'*  objective/kl_coef^Q>mc'F _-YA'*  objective/entropybɜ9!2/m]P t-YA'*" ppo/mean_non_score_rewardڪܾn=%6 -YA'*  ppo/mean_scores=??}$B+M -YA'*  ppo/std_scores$5|=y<-'F P5YA'*  ppo/learning_rateo:r+K 5YA'*  time/ppo/forward_pass/>.W 6YA'*!  time/ppo/compute_rewards`:D1 ^6YA'*$ " time/ppo/compute_advantagesD;$J_,E 6YA'*  time/ppo/optimize_stepm?Yy)7_ 7YA'*  time/ppo/calc_stats ">O{C$B+M d7YA'*  time/ppo/total?DZ%6 7YA'*  env/reward_mean= $B+M 8YA'*  env/reward_std$5|=K]"x= LZA(*  objective/klx(B+p'F *MZA(*  objective/kl_coefQ>A'F LMZA(*  objective/entropy!97/m]P _MZA(*" ppo/mean_non_score_reward ܾo(%6 qMZA(*  ppo/mean_scoresW=(.x$B+M MZA(*  ppo/std_scores(=>^5-.W UZA(*!  time/ppo/compute_rewardsP:at1 HVZA(*$ " time/ppo/compute_advantages?; 9,E VZA(*  time/ppo/optimize_stepm?MvW)7_ WZA(*  time/ppo/calc_stats#>Ai$B+M \WZA(*  time/ppo/total?Xj%6 WZA(*  env/reward_meanW=͠?L$B+M XZA(*  env/reward_std(=