HH !#WA brain.Event:2R. ,tensorboard.summary.writer.event_file_writerB"x= Օ^A*  objective/kl4'F ^A*  objective/kl_coefL>'F ֖^A*  objective/entropybB9q/m]P 떏^A*" ppo/mean_non_score_reward`%6 ^A*  ppo/mean_scores={$B+M ^A*  ppo/std_scores=)h-0_ ^A*# ! ppo/policy/advantages_mean1d,&sO ڧ^A*  ppo/returns/mean ?۶%6 ^A*  ppo/returns/varv>G/#wC !^A*  ppo/val/vpred"?i#wC 3^A*  ppo/val/error6mA5j&sO E^A*  ppo/val/clipfrac33>2m?"x= _^A*  ppo/val/meanM@SCF!{ ^A*  ppo/val/var&@L +K ^A*  ppo/val/var_explainedEb'F ^A*  ppo/learning_rateo:+K O^A*  time/ppo/forward_pass >cP.W #^A*!  time/ppo/compute_rewards;>1 װ^A*$ " time/ppo/compute_advantagesI;r,E ^A*  time/ppo/optimize_steptNb?:)7_ %^A*  time/ppo/calc_stats >eb!$B+M ˲^A*  time/ppo/totalފ?wm%6 o^A*  env/reward_mean=Pje$B+M ^A*  env/reward_std= "x= lA*  objective/klpBC'F lA*  objective/kl_coef?L>'F slA*  objective/entropyLB}/m]P lA*" ppo/mean_non_score_reward#J>%6 lA*  ppo/mean_scores >$B+M lA*  ppo/std_scoresw4r>ĉ-Y)7_ lA*  ppo/policy/policykl-/>' )7_ ɨlA*  ppo/policy/clipfrac ?S00_ 㨈lA*# ! ppo/policy/advantages_mean)j&sO lA*  ppo/returns/meani P%6 ݫlA*  ppo/returns/var;?j#wC lA*  ppo/val/vpredlL? #wC lA*  ppo/val/error@Sb&sO lA*  ppo/val/clipfrac33>?M"x= -lA*  ppo/val/mean@9}!{ VlA*  ppo/val/var}C0>/$L+K mlA*  ppo/val/var_explainedNm;'F lA*  ppo/learning_rateo::8+K lA*  time/ppo/forward_passp,>bl.W hlA*!  time/ppo/compute_rewards:}O1 lA*$ " time/ppo/compute_advantagesK;kR,E lA*  time/ppo/optimize_stepDk?D߰)7_ KlA*  time/ppo/calc_stats@a">@1$B+M 鲈lA*  time/ppo/totalX?2X%6 lA*  env/reward_mean >I$B+M %lA*  env/reward_stdw4r>"x= V!|A*  objective/kl^ BŗxD'F !|A*  objective/kl_coefL>:;'F !|A*  objective/entropyԣB:/m]P !|A*" ppo/mean_non_score_rewardBn%6 !|A*  ppo/mean_scoresO=ʦ$B+M !|A*  ppo/std_scoresEXs>Q"-tM9(pJ !|A*  ppo/policy/entropyŬ@nT$)7_ /!|A*  ppo/policy/approxklh?3E 4)7_ @!|A*  ppo/policy/policykla >$S)7_ P!|A*  ppo/policy/clipfrac?(*0_ c!|A*# ! ppo/policy/advantages_mean*3&sO u!|A*  ppo/returns/mean`;i&9%6 !|A*  ppo/returns/varC@/#wC !|A*  ppo/val/vpred.߿\ #wC !|A*  ppo/val/errorNB@C&sO !|A*  ppo/val/clipfrac?E"x= !|A*  ppo/val/meanCk!{ .!|A*  ppo/val/var,>܂_+K !|A*  ppo/val/var_explained!&,)Ҽ'F !|A*  ppo/learning_rateo:i+K !|A*  time/ppo/forward_pass5:>}.W !|A*!  time/ppo/compute_rewardsH:ǭ\:1 !|A*$ " time/ppo/compute_advantages`V;hu,E %!|A*  time/ppo/optimize_step,w?_)7_ 5!|A*  time/ppo/calc_stats{->g$B+M E!|A*  time/ppo/total4?/%6 W!|A*  env/reward_meanO=$B+M g!|A*  env/reward_stdEXs>G!"x= ?գA*  objective/kl,BJ'F p?գA*  objective/kl_coefUL>r'F ?գA*  objective/entropyBJ &/m]P ?գA*" ppo/mean_non_score_reward ʾce8%6 ?գA*  ppo/mean_scores=$B+M ?գA*  ppo/std_scores|>-ʹ&(pJ CգA*  ppo/policy/entropy@ )7_ DգA*  ppo/policy/approxkl:>$k)7_ EգA*  ppo/policy/policykl=$n)7_ )FգA*  ppo/policy/clipfrac?S0_ DFգA*# ! ppo/policy/advantages_meanx&&sO ZFգA*  ppo/returns/mean^r%6 lFգA*  ppo/returns/var @#wC ~FգA*  ppo/val/vpredރ-([ #wC FգA*  ppo/val/errorn|@mca&sO FգA*  ppo/val/clipfrac?"x= GգA*  ppo/val/mean5!{ HգA*  ppo/val/var0=+K IգA*  ppo/val/var_explained7!'F 3IգA*  ppo/learning_rateo:*7+K EIգA*  time/ppo/forward_passi>[.W VIգA*!  time/ppo/compute_rewards:tF1 fIգA*$ " time/ppo/compute_advantages{;M_,E wIգA*  time/ppo/optimize_step$ ?i)7_ IգA*  time/ppo/calc_statsS>8h$B+M IգA*  time/ppo/totalP?2%6 3JգA*  env/reward_mean=Yb$B+M JգA*  env/reward_std|>'r"x= CA*  objective/kl}4$BP>/'F A*  objective/kl_coefM>\'F A*  objective/entropyBҷ/m]P A*" ppo/mean_non_score_rewardwӾN%6 A*  ppo/mean_scores(?=U$B+M A*  ppo/std_scores>N-k-Y%6 TA*  ppo/loss/policyP$B+M A*  ppo/loss/value0F3@$B+M A*  ppo/loss/total*>>|1?(pJ A*  ppo/policy/entropyk@?)7_ A*  ppo/policy/approxklkS>]>)7_ A*  ppo/policy/policykl"=M)7_ A*  ppo/policy/clipfrac ?RG0_ A*# ! ppo/policy/advantages_mean'qM&sO A*  ppo/returns/meany%6 !A*  ppo/returns/var/ @1re#wC 2A*  ppo/val/vpred%:h#wC BA*  ppo/val/errorҌ@dd&sO A*  ppo/val/clipfrac,>Ak"x= A*  ppo/val/mean΍+o/!{ A*  ppo/val/var$> o+K xA*  ppo/val/var_explained` 'F A*  ppo/learning_rateo:"+K A*  time/ppo/forward_passC_> 5Ŋ.W A*!  time/ppo/compute_rewardsX:E1 A*$ " time/ppo/compute_advantages~;Y!,E A*  time/ppo/optimize_stepd?L׻H)7_ A*  time/ppo/calc_statsUT>*2$B+M A*  time/ppo/total?}N%6 !A*  env/reward_mean(?=G\l$B+M 0A*  env/reward_std>ޟ"x= |lA*  objective/kly.B?v#'F lA*  objective/kl_coef1M>'F lA*  objective/entropyB/m]P mA*" ppo/mean_non_score_reward߾u%6 mA*  ppo/mean_scores ɋ=\9^$B+M "mA*  ppo/std_scorespD>'FHq-SqT(pJ sA*  ppo/policy/entropyj@)7_ sA*  ppo/policy/approxkl*>P)7_ sA*  ppo/policy/policykliH=,)7_ sA*  ppo/policy/clipfrac?f6|0_ sA*# ! ppo/policy/advantages_meanffJ&sO tA*  ppo/returns/meanB|@%6 tA*  ppo/returns/var2˾@A#wC /tA*  ppo/val/vpredXz/M#wC >tA*  ppo/val/errorJ@ &sO tA*  ppo/val/clipfrac>a{"x= tA*  ppo/val/mean INI!{ DuA*  ppo/val/var[Ң>4+K uA*  ppo/val/var_explainedب='F uA*  ppo/learning_rateo:ȏ+K HvA*  time/ppo/forward_pass`=*1.W vA*!  time/ppo/compute_rewards:&*1 vA*$ " time/ppo/compute_advantages\;nw,E CwA*  time/ppo/optimize_step/$?ՙ)7_ wA*  time/ppo/calc_stats=2$B+M wA*  time/ppo/total_?r^%6 9xA*  env/reward_mean ɋ=.0$B+M xA*  env/reward_stdpD>V _"x= OA*  objective/kl,z'F wPA*  objective/entropyFCzFV/m]P PA*" ppo/mean_non_score_rewardUX%6 PA*  ppo/mean_scores74%6 QA*  ppo/loss/policytݥG$B+M ,SA*  ppo/loss/valuefJ@9$B+M TA*  ppo/loss/total'p>ԣH(pJ UA*  ppo/policy/entropy9@E9s)7_ !UA*  ppo/policy/approxkl0?ʸ)7_ 2UA*  ppo/policy/policykl>I)7_ CUA*  ppo/policy/clipfrac?Й0_ UA*# ! ppo/policy/advantages_mean6 24$n&sO VA*  ppo/returns/meane%6 ]VA*  ppo/returns/var`@*#wC VA*  ppo/val/vpreddWe#wC WA*  ppo/val/errorԒ@&sO fWA*  ppo/val/clipfrac>B"x= WA*  ppo/val/meanRc]4!{ XA*  ppo/val/varf+??7 +K YXA*  ppo/val/var_explainedy>lYS'F XA*  ppo/learning_rateo: U#+K YA*  time/ppo/forward_pass=Rד.W TYA*!  time/ppo/compute_rewards:j1 YA*$ " time/ppo/compute_advantages ;g,E YA*  time/ppo/optimize_step$?c)7_ FZA*  time/ppo/calc_stats=וv|$B+M ZA*  time/ppo/total4_?68%6 ZA*  env/reward_mean-_'F xdpA*  objective/entropyBu/m]P dpA*" ppo/mean_non_score_rewardNA徽\%6 dpA*  ppo/mean_scores=(l$B+M dpA*  ppo/std_scores>- (pJ jpA*  ppo/policy/entropy?@yӇ)7_ ;mpA*  ppo/policy/approxklк>-)7_ hmpA*  ppo/policy/policyklh>u20)7_ mpA*  ppo/policy/clipfrac:>? 0_ /qpA*# ! ppo/policy/advantages_mean-]&sO ^qpA*  ppo/returns/meanȁ0?Y%6 vqpA*  ppo/returns/var7:@z#wC qpA*  ppo/val/vpreddT<#wC qpA*  ppo/val/errorY@} &sO qpA*  ppo/val/clipfrac> "x= qpA*  ppo/val/meanUY(!{ qpA*  ppo/val/varݎ?%m+K qpA*  ppo/val/var_explained >$'F qpA*  ppo/learning_rateo:/ +K SspA*  time/ppo/forward_pass4>.W XupA*!  time/ppo/compute_rewards8:k(1 upA*$ " time/ppo/compute_advantagesG;>,E upA*  time/ppo/optimize_stepq?sW)7_ upA*  time/ppo/calc_statsn(> ?;$B+M upA*  time/ppo/total?]J%6 upA*  env/reward_mean=8;$B+M upA*  env/reward_std>7,"x= &A *  objective/kl.)B'F c'A *  objective/entropyTBj3/m]P y'A *" ppo/mean_non_score_reward(;"%6 'A *  ppo/mean_scoresB=S$B+M 'A *  ppo/std_scores= o-Bl(pJ ,A *  ppo/policy/entropy6@kl)7_ x,A *  ppo/policy/approxkl?$6)7_ ,A *  ppo/policy/policyklyP>h Z)7_ "-A *  ppo/policy/clipfracD ?2K0_ x-A *# ! ppo/policy/advantages_mean}P&sO -A *  ppo/returns/meanjh7%6 #.A *  ppo/returns/var @R#wC x.A *  ppo/val/vpred* i7 #wC .A *  ppo/val/error@sf@U&sO #/A *  ppo/val/clipfrac(b>IU"x= /A *  ppo/val/meanfYO!{ /A *  ppo/val/var]?څ[x+K .0A *  ppo/val/var_explained>f0'F 0A *  ppo/learning_rateo:Pf2+K 0A *  time/ppo/forward_pass 4> D.W &1A *!  time/ppo/compute_rewards: Q1 u1A *$ " time/ppo/compute_advantagesZ;],E 1A *  time/ppo/optimize_stepq?))7_ 92A *  time/ppo/calc_stats(>ae$B+M 2A *  time/ppo/total?EIa%6 2A *  env/reward_meanB=6Bl$B+M 53A *  env/reward_std=d'"x= ѷA *  objective/kl6Bz'F ѷA *  objective/kl_coefM>֧'F ѷA *  objective/entropy$fB}/m]P ѷA *" ppo/mean_non_score_reward],V(H%6 $ѷA *  ppo/mean_scores(hD>ג}:$B+M 6ѷA *  ppo/std_scores >U-?-@,E VѷA *  tokens/queries_len_std /m]P gѷA *" tokens/responses_len_mean1A*d.W wѷA *!  tokens/responses_len_std=@GX%6 ѷA *  ppo/loss/policy8 $B+M ѷA *  ppo/loss/value/@D3$B+M ѷA *  ppo/loss/total r>r5(pJ 6ѷA *  ppo/policy/entropy?@e)7_ ѷA *  ppo/policy/approxklzi@\q)7_ ѷA *  ppo/policy/policykl(?5cn)7_ 5ѷA *  ppo/policy/clipfracX?Z0_ ѷA *# ! ppo/policy/advantages_meaněo_&sO ѷA *  ppo/returns/meanx%6 >ѷA *  ppo/returns/var@Q#wC ѷA *  ppo/val/vpred92:q#wC ѷA *  ppo/val/errorL @n&sO PѷA *  ppo/val/clipfrac>o "x= ѷA *  ppo/val/meanQ1!{ ѷA *  ppo/val/varS @A+K YѷA *  ppo/val/var_explainedd>30'F ѷA *  ppo/learning_rateo:lFn+K ѷA *  time/ppo/forward_pass2>WA.W aѷA *!  time/ppo/compute_rewards:֑1 ѷA *$ " time/ppo/compute_advantagesS;,E ѷA *  time/ppo/optimize_step+q?w)7_ dѷA *  time/ppo/calc_stats@@(>)$B+M ѷA *  time/ppo/total?%6 ѷA *  env/reward_mean(hD>X $B+M ^ѷA *  env/reward_std > U"x= ;˸A *  objective/klAI'F <˸A *  objective/kl_coefM>}E'F :<˸A *  objective/entropy)o?b/m]P V<˸A *" ppo/mean_non_score_rewardLt%6 t<˸A *  ppo/mean_scores'?$B+M <˸A *  ppo/std_scoress>uX-_%6 <˸A *  ppo/loss/policyz˸A *  ppo/loss/value"A-g$B+M f?˸A *  ppo/loss/totalh?J(pJ ?˸A *  ppo/policy/entropyX@IYq)7_ F@˸A *  ppo/policy/approxklLRaA%[H)7_ @˸A *  ppo/policy/policykl~@앋)7_ @˸A *  ppo/policy/clipfrac.:>걨R0_ RA˸A *# ! ppo/policy/advantages_mean>:&sO A˸A *  ppo/returns/meanmDO0qy%6 B˸A *  ppo/returns/varr񪙐.W dE˸A *!  time/ppo/compute_rewards(:v1 E˸A *$ " time/ppo/compute_advantages=;Pa,E )F˸A *  time/ppo/optimize_stepĭO?1p_)7_ F˸A *  time/ppo/calc_statsP>X_$B+M F˸A *  time/ppo/totalXҍ?,2%6 E"x= <ṃA *  objective/klՋBHw'F ṃA *  objective/kl_coefM>(s'F ṃA *  objective/entropy0B:7/m]P ṃA *" ppo/mean_non_score_rewardtv7 &%6 ṃA *  ppo/mean_scores>#>{x$B+M ṃA *  ppo/std_scoresQ>۷-e6$B+M DṃA *  ppo/loss/value6A5($B+M ṃA *  ppo/loss/total(?lj(pJ OṃA *  ppo/policy/entropyRA@)7_ ṃA *  ppo/policy/approxkl@v)7_ ṃA *  ppo/policy/policykl+? )7_ [ṃA *  ppo/policy/clipfracա>0_ ṃA *# ! ppo/policy/advantages_mean! 3g&sO ṃA *  ppo/returns/meanLlw%6 cṃA *  ppo/returns/var'A.W MṃA *!  time/ppo/compute_rewardsp:a4"1 ṃA *$ " time/ppo/compute_advantagesP;^ג,E ṃA *  time/ppo/optimize_stepq?[3)7_ GṃA *  time/ppo/calc_stats(>$B+M ṃA *  time/ppo/total?`I%6 ṃA *  env/reward_mean>#>kq 8$B+M 5ṃA *  env/reward_stdQ>:"x= BA *  objective/klsB/'F A *  objective/kl_coefJN>64'F A *  objective/entropyšB~p/m]P A *" ppo/mean_non_score_rewardDf%6 A *  ppo/mean_scores$=&$B+M A *  ppo/std_scorespZ>N{#-A *!  tokens/responses_len_std84%6 NA *  ppo/loss/policyNB]$B+M A *  ppo/loss/value6@UO$B+M  A *  ppo/loss/totaln4?䊆(pJ 4 A *  ppo/policy/entropyp@*)7_ H A *  ppo/policy/approxkl|/?&#N)7_ Y A *  ppo/policy/policyklő>D;)7_ j A *  ppo/policy/clipfrac>Y0_ } A *# ! ppo/policy/advantages_meant&/C&sO A *  ppo/returns/meanx%%6  A *  ppo/returns/varA 8#wC A *  ppo/val/vpredR #wC A *  ppo/val/erroroXA`Vk&sO A *  ppo/val/clipfrac?R"x= A *  ppo/val/mean.qs!{ A *  ppo/val/var>1+K #A *  ppo/val/var_explainedFF?'F 3A *  ppo/learning_rateo:P;+K A *  time/ppo/forward_pass5>ҏQ.W A *!  time/ppo/compute_rewards::1 VA *$ " time/ppo/compute_advantages\O; e,E A *  time/ppo/optimize_step;r?Gv)7_ A *  time/ppo/calc_stats0z)>$B+M LA *  time/ppo/total?r%6 A *  env/reward_mean$=AI$B+M A *  env/reward_stdpZ>9"x= A*  objective/kl,lB 'F SA*  objective/kl_coef?N>'F uA*  objective/entropyLÿB/m]P A*" ppo/mean_non_score_rewardu%%6 A*  ppo/mean_scores K=pAW$B+M A*  ppo/std_scores\>x0-Y60_ A*# ! ppo/policy/advantages_meanff62],&sO JA*  ppo/returns/mean&%6 A*  ppo/returns/var< Ag #wC A*  ppo/val/vpred\2Bj#wC cA*  ppo/val/error45A&sO A*  ppo/val/clipfrac?]WI"x= A*  ppo/val/meanf!{ mA*  ppo/val/varjb>+K ˵A*  ppo/val/var_explainedw'F &A*  ppo/learning_rateo:D+K ~A*  time/ppo/forward_pass@N6>' .W ۶A*!  time/ppo/compute_rewardsH:fN1 8A*$ " time/ppo/compute_advantagesQ;#9 ,E A*  time/ppo/optimize_step r?U)7_ A*  time/ppo/calc_stats)>y*9$B+M BA*  time/ppo/total?%6 A*  env/reward_mean K=u.$B+M A*  env/reward_std\>_Q"x= J4A*  objective/klzB`~'F 4A*  objective/kl_coef`N>B'F Ƙ4A*  objective/entropyB:?/m]P ܘ4A*" ppo/mean_non_score_reward!%6 4A*  ppo/mean_scoresY=R*$B+M 4A*  ppo/std_scoresAYC=i$Q-)7_ ӝ4A*  ppo/policy/approxkl>Y&)7_ 4A*  ppo/policy/policykl=\,u)7_ 4A*  ppo/policy/clipfrac>60_ +4A*# ! ppo/policy/advantages_mean̼2a&sO =4A*  ppo/returns/meanr%6 N4A*  ppo/returns/vareA=a#wC _4A*  ppo/val/vpred$Ydxiۨ#wC n4A*  ppo/val/errorTPAt&sO ~4A*  ppo/val/clipfrac?a"x= 4A*  ppo/val/meanhHZM!{ <4A*  ppo/val/var"?F+K 4A*  ppo/val/var_explained^;ݮh'F 4A*  ppo/learning_rateo:+K A4A*  time/ppo/forward_pass0N1>/.W 4A*!  time/ppo/compute_rewards:}1 4A*$ " time/ppo/compute_advantages\L;n(,E 04A*  time/ppo/optimize_stepur?'T8)7_ 4A*  time/ppo/calc_statsP(>$B+M ޡ4A*  time/ppo/total?w)%6 .4A*  env/reward_meanY=E$B+M }4A*  env/reward_stdAYC=t"x= IdA*  objective/klҨB'F ZJdA*  objective/kl_coefN>àp'F uJdA*  objective/entropyB/m]P JdA*" ppo/mean_non_score_reward{k93~%6 JdA*  ppo/mean_scoresL=5 6$B+M JdA*  ppo/std_scores>YK-$B+M RdA*  ppo/loss/totaly ?0R(pJ 9RdA*  ppo/policy/entropyj\@d;)7_ RRdA*  ppo/policy/approxklh>D)7_ fRdA*  ppo/policy/policykl>|H)7_ wRdA*  ppo/policy/clipfrac33>P40_ RdA*# ! ppo/policy/advantages_mean5,&sO RdA*  ppo/returns/meanvB%6 RdA*  ppo/returns/varfjA2#wC RdA*  ppo/val/vpred6#wC RdA*  ppo/val/errorA2&sO :SdA*  ppo/val/clipfracff>V"x= UdA*  ppo/val/meanL4T!{ UdA*  ppo/val/var3C@#-+K UdA*  ppo/val/var_explainedjT>H'F XdA*  ppo/learning_rateo:غ+K XdA*  time/ppo/forward_pass0>/g.W YdA*!  time/ppo/compute_rewards:$T1 "YdA*$ " time/ppo/compute_advantagesV;ݘ,E ][dA*  time/ppo/optimize_step@4r?jS)7_ [dA*  time/ppo/calc_statsp)>Hl$B+M \dA*  time/ppo/totalh? f%6 \dA*  env/reward_meanL=\$B+M \dA*  env/reward_std>W*"x= ѕA*  objective/klBE9v'F ѕA*  objective/kl_coefN>V>'F ҕA*  objective/entropyB`z/m]P +ҕA*" ppo/mean_non_score_rewardd7zA%6 >ҕA*  ppo/mean_scores T7(pJ ؕA*  ppo/policy/entropy~@;)7_ ؕA*  ppo/policy/approxkl>лΥ)7_ ؕA*  ppo/policy/policyklf">&)7_ ٕA*  ppo/policy/clipfracff>{00_ ٕA*# ! ppo/policy/advantages_mean)2cX&sO ,ٕA*  ppo/returns/mean솽u%6 =ٕA*  ppo/returns/varhyjADkX#wC MٕA*  ppo/val/vpred#wC ]ٕA*  ppo/val/error@"2&sO ٕA*  ppo/val/clipfracgf>k"x= 'ڕA*  ppo/val/meanɛ"!{ ڕA*  ppo/val/var@g>+K ڕA*  ppo/val/var_explained ?hh'F 0ەA*  ppo/learning_rateo:ˌ+K ەA*  time/ppo/forward_pass 32>St.W ەA*!  time/ppo/compute_rewards:Uɞ1 0ܕA*$ " time/ppo/compute_advantagesW;*Qs,E ܕA*  time/ppo/optimize_step t?{7)7_ ܕA*  time/ppo/calc_statsp $>Bw$B+M 'ݕA*  time/ppo/total~x?qA%6 xݕA*  env/reward_mean <A$B+M ݕA*  env/reward_std/u=G"x= >A*  objective/klnB84A'F ,?A*  objective/kl_coef[N>'F I?A*  objective/entropyhBZݎ/m]P ]?A*" ppo/mean_non_score_reward(iB%6 q?A*  ppo/mean_scores4=ːW$B+M ?A*  ppo/std_scores>k)-եh(pJ ^FA*  ppo/policy/entropyw~@3)7_ tFA*  ppo/policy/approxkl|?ě)7_ FA*  ppo/policy/policyklJ?L)7_ FA*  ppo/policy/clipfrac43>0_ FA*# ! ppo/policy/advantages_mean43g}&sO FA*  ppo/returns/mean mX%6 FA*  ppo/returns/vardPAR#wC FA*  ppo/val/vpredv#wC FA*  ppo/val/error^@yl6 &sO HA*  ppo/val/clipfrac>ӡ]$"x= IA*  ppo/val/meantN!{ IA*  ppo/val/var'&@xg+K .IA*  ppo/val/var_explained,?sE'F ?IA*  ppo/learning_rateo:S2+K KA*  time/ppo/forward_passP6>O.W LA*!  time/ppo/compute_rewards\ ;lH1 LA*$ " time/ppo/compute_advantagesk;_,E ,LA*  time/ppo/optimize_stepWr?M5%)7_ =LA*  time/ppo/calc_statsa)>sި$B+M LA*  time/ppo/total6? dB%6 LA*  env/reward_mean4=P$B+M @MA*  env/reward_std>7"x= DA*  objective/klPGBQ:'F A*  objective/kl_coef8F'F A*  objective/entropy4BʯU/m]P A*" ppo/mean_non_score_reward$s%6 A*  ppo/mean_scores=|$B+M A*  ppo/std_scoresk[>p-/m]P JA*" tokens/responses_len_meanAˆ.W [A*!  tokens/responses_len_stdæ%6 nA*  ppo/loss/policy"1=8˜$B+M OA*  ppo/loss/valueAf$B+M JA*  ppo/loss/total+ p?\t(pJ A*  ppo/policy/entropy N@ )7_ ﲞ0_ ~A*# ! ppo/policy/advantages_mean2=&sO A*  ppo/returns/meanI{%6 >A*  ppo/returns/varߩ B,#wC A*  ppo/val/vpredƏ!x#wC A*  ppo/val/error aAb-@&sO WA*  ppo/val/clipfrac33>Ag"x= A*  ppo/val/meanN4Q!{ A*  ppo/val/vard A(+K xA*  ppo/val/var_explainedV^?-+'F A*  ppo/learning_rateo:$+K 2A*  time/ppo/forward_pass`S2>n5.W A*!  time/ppo/compute_rewards:t1 A*$ " time/ppo/compute_advantages`;i,E FA*  time/ppo/optimize_step<:s?&7)7_ A*  time/ppo/calc_stats*>l4$B+M A*  time/ppo/total֥?ugf%6 PA*  env/reward_mean="O$B+M A*  env/reward_stdk[>/a"x= ƒA*  objective/klH(B|H'F ƒA*  objective/kl_coef" O>wE'F ƒA*  objective/entropy*Blb/m]P ^ƒA*" ppo/mean_non_score_rewardQK1ė%6 ƒA*  ppo/mean_scores\=9$B+M ƒA*  ppo/std_scores*39>w7-$@(pJ &!ƒA*  ppo/policy/entropyе@j)7_ !ƒA*  ppo/policy/approxklPcoA5)7_ !ƒA*  ppo/policy/policykl;@ݏ)7_ 7"ƒA*  ppo/policy/clipfrac1>|~-0_ "ƒA*# ! ppo/policy/advantages_mean3y&sO "ƒA*  ppo/returns/meanK%6 H#ƒA*  ppo/returns/varȔAo#wC #ƒA*  ppo/val/vpred-#wC #ƒA*  ppo/val/errorP3A&sO K$ƒA*  ppo/val/clipfrac><͢V"x= $ƒA*  ppo/val/meann!{ $ƒA*  ppo/val/vartA(r+K J%ƒA*  ppo/val/var_explained#>D4'F %ƒA*  ppo/learning_rateo:F%+K %ƒA*  time/ppo/forward_pass=3>U.W Q&ƒA*!  time/ppo/compute_rewards:ۅ1 &ƒA*$ " time/ppo/compute_advantages];K\,E 6'ƒA*  time/ppo/optimize_step\n?.2)7_ G'ƒA*  time/ppo/calc_statsP,%>ف6$B+M 'ƒA*  time/ppo/total??E%6 'ƒA*  env/reward_mean\=$B+M C(ƒA*  env/reward_std*39>o[o"x= **ăA*  objective/kl:=BI'F D+*ăA*  objective/kl_coef,O>c'F `+*ăA*  objective/entropycB H/m]P w+*ăA*" ppo/mean_non_score_rewardP`_%6 +*ăA*  ppo/mean_scoresYX<=L$B+M +*ăA*  ppo/std_scoresy=5/-(pJ /*ăA*  ppo/policy/entropy'@u)7_ 0*ăA*  ppo/policy/approxklXP>Li)7_ j0*ăA*  ppo/policy/policykl =չ)7_ 0*ăA*  ppo/policy/clipfracff>0_ 1*ăA*# ! ppo/policy/advantages_mean33ӲNr&sO g1*ăA*  ppo/returns/mean%6 1*ăA*  ppo/returns/varA0ª)#wC 2*ăA*  ppo/val/vpreduN8#wC ]2*ăA*  ppo/val/errorA W&sO 2*ăA*  ppo/val/clipfrac̘>rȔ"x= 2*ăA*  ppo/val/means?!{ S3*ăA*  ppo/val/varyJ@S+K 3*ăA*  ppo/val/var_explained>~$'F 3*ăA*  ppo/learning_rateo:ň+K G4*ăA*  time/ppo/forward_pass1>i.W 4*ăA*!  time/ppo/compute_rewards:1 f7*ăA*$ " time/ppo/compute_advantagesO;qr$B+M 7*ăA*  time/ppo/total?FZ%6 7*ăA*  env/reward_meanYX<'F 5T'F U/TŃA*  objective/entropyUB+/m]P g/TŃA*" ppo/mean_non_score_rewardнmp:R%6 x/TŃA*  ppo/mean_scores/= $B+M /TŃA*  ppo/std_scoresK[>-4-1$B+M 1TŃA*  ppo/loss/value<@c$B+M 4TŃA*  ppo/loss/total!>j^(pJ 4TŃA*  ppo/policy/entropy:8@@HI)7_ 4TŃA*  ppo/policy/approxkl?% @)7_ 5TŃA*  ppo/policy/policyklh}Y>)7_ 5TŃA*  ppo/policy/clipfrac33>VS0_ 7TŃA*# ! ppo/policy/advantages_mean2&sO 7TŃA*  ppo/returns/meanM%6 7TŃA*  ppo/returns/varդAg|#wC 7TŃA*  ppo/val/vpredfu5#wC 8TŃA*  ppo/val/errorG@H(&sO 8TŃA*  ppo/val/clipfrac33>_"x= &8TŃA*  ppo/val/meanIt\;!{ 88TŃA*  ppo/val/varI@i+K 8TŃA*  ppo/val/var_explained=?'F 8TŃA*  ppo/learning_rateo:/>+K H9TŃA*  time/ppo/forward_pass42>f~.W 9TŃA*!  time/ppo/compute_rewards0:1 9TŃA*$ " time/ppo/compute_advantagesM;(,E P:TŃA*  time/ppo/optimize_stepq?NeJ)7_ :TŃA*  time/ppo/calc_statsp*>9o$B+M :TŃA*  time/ppo/total"?%6 O;TŃA*  env/reward_mean/=$$B+M ;TŃA*  env/reward_stdK[>ԯ"x= 8oƃA*  objective/klӨB2t'F L9oƃA*  objective/kl_coefoO>>o'F k9oƃA*  objective/entropyxYBt/m]P 9oƃA*" ppo/mean_non_score_rewardZ%6 9oƃA*  ppo/mean_scoresv=i$B+M 9oƃA*  ppo/std_scoresiS!>VN-]&$B+M ?ne)7_ k?oƃA*  ppo/policy/clipfrac̵>0_ ?oƃA*# ! ppo/policy/advantages_mean1J@&sO ?oƃA*  ppo/returns/meanN+%6 ?oƃA*  ppo/returns/varzGAnoo#wC ?oƃA*  ppo/val/vpred?@#wC ?oƃA*  ppo/val/errorg@v_|d&sO ?oƃA*  ppo/val/clipfrac>ɮW"x= ?@oƃA*  ppo/val/meanz>u!{ @oƃA*  ppo/val/varfA[Ek4+K @oƃA*  ppo/val/var_explained-/?h'F QAoƃA*  ppo/learning_rateo:+K AoƃA*  time/ppo/forward_pass ?1>% .W BoƃA*!  time/ppo/compute_rewards:OJz1 dBoƃA*$ " time/ppo/compute_advantagesM;-I,E BoƃA*  time/ppo/optimize_stepdo?)7_ CoƃA*  time/ppo/calc_stats$>q$B+M hCoƃA*  time/ppo/total!?uߴ%6 CoƃA*  env/reward_meanv= "$B+M DoƃA*  env/reward_stdiS!>F"x= {ǃA*  objective/klvBl 'F ~{ǃA*  objective/kl_coefO>'F {ǃA*  objective/entropyA3BD1/m]P {ǃA*" ppo/mean_non_score_rewardB[Z%6 {ǃA*  ppo/mean_scoresf=eּ$B+M {ǃA*  ppo/std_scores[>@;P-GE(pJ ЀǃA*  ppo/policy/entropyN$ @r)7_ ǃA*  ppo/policy/approxkl`=0C?)7_ ǃA*  ppo/policy/policykl<4Ї)7_ )ǃA*  ppo/policy/clipfrac[>xLk0_ <ǃA*# ! ppo/policy/advantages_mean0{&sO LǃA*  ppo/returns/mean]1"x= ؂ǃA*  ppo/val/meanVx}!{ 2ǃA*  ppo/val/var@N+K ǃA*  ppo/val/var_explainedM?ÏF/'F 惒ǃA*  ppo/learning_rateo:ZI+K >ǃA*  time/ppo/forward_pass;3>h.W ǃA*!  time/ppo/compute_rewards:_> 1 ǃA*$ " time/ppo/compute_advantagesL;-Μ,E EǃA*  time/ppo/optimize_stepr?e$)7_ ǃA*  time/ppo/calc_stats(>Ό$B+M ǃA*  time/ppo/total"?y(%6 WǃA*  env/reward_meanf=nw$B+M ǃA*  env/reward_std[>ca"x= ȃA*  objective/klyBm.'F zȃA*  objective/kl_coefO>'F ȃA*  objective/entropyj.Bo4/m]P ȃA*" ppo/mean_non_score_reward&" %6 ȃA*  ppo/mean_scoresH>*$B+M ȃA*  ppo/std_scores3>1]˺-=A:H)7_ ȃA*  ppo/policy/clipfracr>%0_ 7ȃA*# ! ppo/policy/advantages_meanffƱo&sO ȃA*  ppo/returns/mean}%6 ȃA*  ppo/returns/varJA]#wC cȃA*  ppo/val/vpredڰϖ#wC ȃA*  ppo/val/error$«?X &sO ȃA*  ppo/val/clipfrac>s"x= ȃA*  ppo/val/mean%4!{ lȃA*  ppo/val/var"AQ@.W ȃA*!  time/ppo/compute_rewards:]1 TȃA*$ " time/ppo/compute_advantagesH;2,E ȃA*  time/ppo/optimize_stepXq?'dC)7_ ȃA*  time/ppo/calc_stats`)>ےg$B+M uȃA*  time/ppo/total?%6 ȃA*  env/reward_meanH>A5_z$B+M 8ȃA*  env/reward_std3>2X"x= @ʃA*  objective/kl=tBFa'F @ʃA*  objective/kl_coefO>S)'F @ʃA*  objective/entropy$b>B1/m]P @ʃA*" ppo/mean_non_score_rewards %6 @ʃA*  ppo/mean_scoresd>=.XiB$B+M @ʃA*  ppo/std_scores~>ͧ-$.W @ʃA*!  tokens/responses_len_std%6 @ʃA*  ppo/loss/policy/ A$B+M @ʃA*  ppo/loss/value;:?3B$B+M @ʃA*  ppo/loss/total=Tz(pJ }@ʃA*  ppo/policy/entropyh@l)7_ @ʃA*  ppo/policy/approxklI=S)7_ s@ʃA*  ppo/policy/policykl_=)7_ @ʃA*  ppo/policy/clipfrac>A,"x= W@ʃA*  ppo/val/mean"W!{ @ʃA*  ppo/val/var,AH!+K @ʃA*  ppo/val/var_explainede?9@'F a@ʃA*  ppo/learning_rateo:h@+K @ʃA*  time/ppo/forward_passP5>+9.W @ʃA*!  time/ppo/compute_rewards:`1 f@ʃA*$ " time/ppo/compute_advantagesQ; #^,E @ʃA*  time/ppo/optimize_step=q?:")7_ @ʃA*  time/ppo/calc_stats`(>$B+M l@ʃA*  time/ppo/total?%6 @ʃA*  env/reward_meand>=u7t$B+M @ʃA*  env/reward_std~>T"x= J˃A*  objective/klaB] ~'F J˃A*  objective/kl_coef O><'F K˃A*  objective/entropyLBBP/m]P K˃A*" ppo/mean_non_score_rewardF/b%6 0K˃A*  ppo/mean_scoresZ>U($B+M BK˃A*  ppo/std_scores>+i]Q-Y0_ S˃A*# ! ppo/policy/advantages_meanొNI&sO #S˃A*  ppo/returns/mean`jE%6 4S˃A*  ppo/returns/var_+A|/)#wC DS˃A*  ppo/val/vpredׅR֩7#wC SS˃A*  ppo/val/error@?V&sO U˃A*  ppo/val/clipfrac33]>W"x= U˃A*  ppo/val/mean=!{ >X˃A*  ppo/val/var"0A[nA+K iX˃A*  ppo/val/var_explainedc?;h'F X˃A*  ppo/learning_rateo:fuD+K [˃A*  time/ppo/forward_pass`3>Jr.W :[˃A*!  time/ppo/compute_rewards(:1 U[˃A*$ " time/ppo/compute_advantagesI;F,E h[˃A*  time/ppo/optimize_stepz+$B+M ^˃A*  time/ppo/totalO?m`%6 7^˃A*  env/reward_meanZ> S$B+M ^˃A*  env/reward_std>t"x= ̃A*  objective/kl7[Bi'F ̃A*  objective/kl_coefP>y1'F ̃A*  objective/entropy1Bw/m]P ̃A*" ppo/mean_non_score_reward~c%6 ̃A*  ppo/mean_scoresd=tߎ{$B+M ̃A*  ppo/std_scoresKC> J-C0_ ̃A*# ! ppo/policy/advantages_mean4332Ŷ~&sO (̃A*  ppo/returns/mean^%p%6 8̃A*  ppo/returns/var Asx\]#wC H̃A*  ppo/val/vpred3hJ#wC X̃A*  ppo/val/error$Gt?,&sO ̃A*  ppo/val/clipfraci>e["x= ̃A*  ppo/val/mean/3!{ ̃A*  ppo/val/var)A y,+K ̃A*  ppo/val/var_explainedg?^$N'F ̃A*  ppo/learning_rateo:@+K  ̃A*  time/ppo/forward_passp4>xi^.W  ̃A*!  time/ppo/compute_rewards:e.%1 ' ̃A*$ " time/ppo/compute_advantageshL;t,E 8 ̃A*  time/ppo/optimize_stepq?0)7_ H ̃A*  time/ppo/calc_statsЊ(>~F$B+M ̃A*  time/ppo/total??ǀ%6 0̃A*  env/reward_meand=P$B+M G̃A*  env/reward_stdKC>b"x= ̓A*  objective/klMVB\jR'F A̓A*  objective/kl_coef5

Z~'F ]̓A*  objective/entropy /B7Ɗ/m]P q̓A*" ppo/mean_non_score_rewardt F%6 ̓A*  ppo/mean_scoresO>١g$B+M ̓A*  ppo/std_scores>ib-0_ $̓A*# ! ppo/policy/advantages_meangfc&sO }̓A*  ppo/returns/meanzq%6 ̓A*  ppo/returns/varBAum#wC .̓A*  ppo/val/vpred%b#wC ̓A*  ppo/val/errorv?^ܨ&sO ̓A*  ppo/val/clipfrac*>c"x= -̓A*  ppo/val/mean2=!{ ̓A*  ppo/val/varA,+K ̓A*  ppo/val/var_explained e?!'F .̓A*  ppo/learning_rateo:,u+K ̓A*  time/ppo/forward_pass3>-ɤ.W ̓A*!  time/ppo/compute_rewards:G?Q1 4̓A*$ " time/ppo/compute_advantagesdK;ޅ,E ̓A*  time/ppo/optimize_step$r?`T)7_ ̓A*  time/ppo/calc_stats (>˙J$B+M 3̓A*  time/ppo/totaly?C%6 ̓A*  env/reward_meanO>AK*$B+M ̓A*  env/reward_std>%y%P"x= Q߃A*  objective/kl;\BcO'F ߃A*  objective/kl_coefS^P>B'F ߃A*  objective/entropyXE*BҜ /m]P ߃A*" ppo/mean_non_score_rewardgX%6 ߃A*  ppo/mean_scoresj!=>$B+M ߃A*  ppo/std_scores]L>tX-1k(pJ ߃A*  ppo/policy/entropy? S)7_ R߃A*  ppo/policy/approxkl%=tqv)7_ ߃A*  ppo/policy/policykl:=gI)7_ 1߃A*  ppo/policy/clipfracq>0_ K߃A*# ! ppo/policy/advantages_meangf8j&sO ^߃A*  ppo/returns/meanun%6 p߃A*  ppo/returns/var$`@$#wC ߃A*  ppo/val/vpredA#wC ߃A*  ppo/val/error @#&sO @߃A*  ppo/val/clipfracff>9,"x= ߃A*  ppo/val/meanE'ܺ!{ ߃A*  ppo/val/var=@]@<+K C߃A*  ppo/val/var_explained_6?$q'F ߃A*  ppo/learning_rateo:t+K ߃A*  time/ppo/forward_passP9>J<.W J߃A*!  time/ppo/compute_rewards:Vբ1 ߃A*$ " time/ppo/compute_advantages([;t,E ߃A*  time/ppo/optimize_stepGw?;)7_ G߃A*  time/ppo/calc_stats`E.>ǰ$B+M ߃A*  time/ppo/totalM?F!%6 ߃A*  env/reward_meanj!=IȪ$B+M ?߃A*  env/reward_std]L>o"x= yრA*  objective/kl\PBkl^'F =yრA*  objective/kl_coefwP>`N+'F [yრA*  objective/entropy6B./m]P ryრA*" ppo/mean_non_score_rewardJx%6 yრA*  ppo/mean_scores=;\Q$B+M yრA*  ppo/std_scores[m>{-pD0_ ̐yრA*# ! ppo/policy/advantages_mean@YrP&sO ߐyრA*  ppo/returns/meankA%6 yრA*  ppo/returns/var@d#wC yრA*  ppo/val/vpredTP#wC yრA*  ppo/val/error:?aZ&sO ZyრA*  ppo/val/clipfrac>2]"x= yრA*  ppo/val/meanp"3O |!{ yრA*  ppo/val/var@U}+K yრA*  ppo/val/var_explaineddEL?MI'F ^yრA*  ppo/learning_rateo:P+K yრA*  time/ppo/forward_pass`7>$.W yრA*!  time/ppo/compute_rewards :IF1 [yრA*$ " time/ppo/compute_advantages^;,E yრA*  time/ppo/optimize_step\jw?@>)7_ yრA*  time/ppo/calc_stats+>nI$B+M RyრA*  time/ppo/total8ը?uf%6 yრA*  env/reward_mean=$B+M yრA*  env/reward_std[m>\z"x= Β⃠A *  objective/klJWB(!'F 7⃠A *  objective/kl_coefP>5 'F S⃠A *  objective/entropy>BPչI/m]P k⃠A *" ppo/mean_non_score_rewardk ^ŒC%6 }⃠A *  ppo/mean_scores=1E$B+M ⃠A *  ppo/std_scoresI>L>`,-ř0_ ⃠A *# ! ppo/policy/advantages_mean4332&sO ⃠A *  ppo/returns/meanU%6 .⃠A *  ppo/returns/varuA'l#wC ?⃠A *  ppo/val/vpredtVؽv#wC N⃠A *  ppo/val/errorn?j*H&sO _⃠A *  ppo/val/clipfracr> H"x= o⃠A *  ppo/val/meant3裎!{ ⛳⃠A *  ppo/val/varz@QG+K v⃠A *  ppo/val/var_explainedX?!);'F ]⃠A *  ppo/learning_rateo:pš+K О⃠A *  time/ppo/forward_pass7>G.W P⃠A *!  time/ppo/compute_rewards:Dk1 j⃠A *$ " time/ppo/compute_advantagesZ;~,E ա⃠A *  time/ppo/optimize_stepx?JY)7_ .⃠A *  time/ppo/calc_stats`,>a$B+M ⃠A *  time/ppo/totalY?kU%6 뢳⃠A *  env/reward_mean=$B+M 9⃠A *  env/reward_stdI>L>w."x= ムA!*  objective/klxQB_ 'F f!ムA!*  objective/kl_coefP>>'F !ムA!*  objective/entropy UB7/m]P !ムA!*" ppo/mean_non_score_rewardF4%6 !ムA!*  ppo/mean_scorest=$B+M !ムA!*  ppo/std_scoresH>DV-&-$B+M $ムA!*  ppo/loss/total)<#I(pJ $ムA!*  ppo/policy/entropy?R)7_ %ムA!*  ppo/policy/approxklY<V)7_ g%ムA!*  ppo/policy/policykll<*d5)7_ %ムA!*  ppo/policy/clipfrac=y0_ 1&ムA!*# ! ppo/policy/advantages_mean'8o&sO &ムA!*  ppo/returns/meanzQ,B%6 &ムA!*  ppo/returns/varUVA#wC X'ムA!*  ppo/val/vpredDɜ #wC 'ムA!*  ppo/val/errorD.e?:o&sO (ムA!*  ppo/val/clipfracgf=fJ>"x= l(ムA!*  ppo/val/mean } s!{ (ムA!*  ppo/val/var@\\+K ()ムA!*  ppo/val/var_explainede?=aQ'F )ムA!*  ppo/learning_rateo:8G +K )ムA!*  time/ppo/forward_passo8>0.W E*ムA!*!  time/ppo/compute_rewardsh:c1 *ムA!*$ " time/ppo/compute_advantagesT;Αh,E +ムA!*  time/ppo/optimize_stepTw?@)7_ \+ムA!*  time/ppo/calc_stats `,>rd$B+M +ムA!*  time/ppo/totalB.??-%6 ,ムA!*  env/reward_meant=hP$B+M k,ムA!*  env/reward_stdH>"+"x= 䃠A"*  objective/kl+qJB&\P8'F 䃠A"*  objective/kl_coefP>'F 䃠A"*  objective/entropyB/m]P 䃠A"*" ppo/mean_non_score_reward(cd%6 䃠A"*  ppo/mean_scoresV>ۙl7$B+M 䃠A"*  ppo/std_scoresH>--s$B+M (䃠A"*  ppo/loss/total< (pJ 䃠A"*  ppo/policy/entropyE?*)7_ 䃠A"*  ppo/policy/approxklۥ =93D)7_ |䃠A"*  ppo/policy/policyklu =fe)7_ 䃠A"*  ppo/policy/clipfrac:>;50_ 8䃠A"*# ! ppo/policy/advantages_mean1&sO 䃠A"*  ppo/returns/meanw@k{5%6 䃠A"*  ppo/returns/varAtL#wC P䃠A"*  ppo/val/vpred~ ę#wC 䃠A"*  ppo/val/error#V?&sO 䃠A"*  ppo/val/clipfracff>@T"x= X䃠A"*  ppo/val/meanI76V!{ 䃠A"*  ppo/val/varvD@'+K 䃠A"*  ppo/val/var_explained.Nf?q8'F 䃠A"*  ppo/learning_rateo:G%+K 䃠A"*  time/ppo/forward_pass@7>T#.W =䃠A"*!  time/ppo/compute_rewardsH:0b1 䃠A"*$ " time/ppo/compute_advantages$X;,E 䃠A"*  time/ppo/optimize_stepx?EQ)7_ C䃠A"*  time/ppo/calc_stats@,>yh$B+M 䃠A"*  time/ppo/total^F?o%6 䃠A"*  env/reward_meanV>O$B+M F䃠A"*  env/reward_stdH>l~"x= e$惠A#*  objective/kl8>Brk'F Bf$惠A#*  objective/kl_coef= Q>%g'F ef$惠A#*  objective/entropy BB /m]P xf$惠A#*" ppo/mean_non_score_reward}3hʘ%6 f$惠A#*  ppo/mean_scores=U>$B+M f$惠A#*  ppo/std_scoresFz>o-0_ j$惠A#*# ! ppo/policy/advantages_meanff*V&sO >k$惠A#*  ppo/returns/meang V%6 k$惠A#*  ppo/returns/var A5ޕ#wC k$惠A#*  ppo/val/vpredLre#wC Ql$惠A#*  ppo/val/errorA*c?) 0&sO l$惠A#*  ppo/val/clipfrac>Gz;"x= l$惠A#*  ppo/val/meanUU!{ \m$惠A#*  ppo/val/varPN@ҢK+K m$惠A#*  ppo/val/var_explainedqc?L-'F n$惠A#*  ppo/learning_rateo:y)c+K rn$惠A#*  time/ppo/forward_pass`i;>&@.W n$惠A#*!  time/ppo/compute_rewards:i\p1 )o$惠A#*$ " time/ppo/compute_advantages8\;"k,E o$惠A#*  time/ppo/optimize_stepZx?`)7_ o$惠A#*  time/ppo/calc_statsPe,>~$B+M +p$惠A#*  time/ppo/totalѩ?\D"x= ꃠA$*  objective/klnQBu'F ~ꃠA$*  objective/kl_coef}+Q>X'F ꃠA$*  objective/entropyd&BiX/m]P ꃠA$*" ppo/mean_non_score_reward.%6 ꃠA$*  ppo/mean_scores͌=QN$B+M ꃠA$*  ppo/std_scoresJ=Χ-W@K$B+M ꃠA$*  ppo/loss/totalc>(pJ ꃠA$*  ppo/policy/entropy?hR)7_ ꃠA$*  ppo/policy/approxkl<<{)7_ b ꃠA$*  ppo/policy/policyklK.=)7_ ꃠA$*  ppo/policy/clipfrac>],0_ &!ꃠA$*# ! ppo/policy/advantages_meaň2&sO !ꃠA$*  ppo/returns/mean@?%6 !ꃠA$*  ppo/returns/var`6A@+#wC ?"ꃠA$*  ppo/val/vpred"ؾ#wC "ꃠA$*  ppo/val/error&@R&sO "ꃠA$*  ppo/val/clipfrac?8g"x= H#ꃠA$*  ppo/val/meanNJg!{ #ꃠA$*  ppo/val/var\@}Ww+K $ꃠA$*  ppo/val/var_explainedE?K8'F \$ꃠA$*  ppo/learning_rateo:#*s+K $ꃠA$*  time/ppo/forward_pass[%>'.W %ꃠA$*!  time/ppo/compute_rewards:21 %ꃠA$*$ " time/ppo/compute_advantages>;,E %ꃠA$*  time/ppo/optimize_step,7^?@y@<)7_ 5&ꃠA$*  time/ppo/calc_stats>$B+M &ꃠA$*  time/ppo/totalT?.h%6 &ꃠA$*  env/reward_mean͌=tk$B+M 3'ꃠA$*  env/reward_stdJ=;"x= 냠A%*  objective/klDPBP'F c냠A%*  objective/kl_coefMQ>w'F |냠A%*  objective/entropyW'!B|/m]P 냠A%*" ppo/mean_non_score_reward*9t%6 냠A%*  ppo/mean_scorest=: )$B+M 냠A%*  ppo/std_scores`={T-]$B+M !냠A%*  ppo/loss/totalX >(pJ !냠A%*  ppo/policy/entropy??Y)7_ U"냠A%*  ppo/policy/approxkl=Ɠ)7_ "냠A%*  ppo/policy/policykl|k=5:)7_ "냠A%*  ppo/policy/clipfracj>iR0_ Q#냠A%*# ! ppo/policy/advantages_mean3/zX&sO #냠A%*  ppo/returns/mean{%6 #냠A%*  ppo/returns/var:;AeC>#wC T$냠A%*  ppo/val/vpred (r#wC $냠A%*  ppo/val/errorʼn?&&sO $냠A%*  ppo/val/clipfrac ?"x= T%냠A%*  ppo/val/mean6^Wذ!{ %냠A%*  ppo/val/var² A[{+K %냠A%*  ppo/val/var_explaineda?aO'F M&냠A%*  ppo/learning_rateo:g+K &냠A%*  time/ppo/forward_pass?'>..W (냠A%*!  time/ppo/compute_rewards:_1 (냠A%*$ " time/ppo/compute_advantages<=;WV ,E x)냠A%*  time/ppo/optimize_step^?I#)7_ )냠A%*  time/ppo/calc_stats>FdGG$B+M b*냠A%*  time/ppo/totalH?`%6 *냠A%*  env/reward_meant="B3$B+M (+냠A%*  env/reward_std`=uq"x= 샠A&*  objective/klNB| 'F 샠A&*  objective/kl_coef pQ>.'F 샠A&*  objective/entropy0BF/m]P *샠A&*" ppo/mean_non_score_rewardw]9a%6 <샠A&*  ppo/mean_scoresw(=[$B+M O샠A&*  ppo/std_scoresh:=*F-*p0_ 샠A&*# ! ppo/policy/advantages_mean332#&sO 샠A&*  ppo/returns/mean3H1%6 샠A&*  ppo/returns/var<*A(۾#wC 샠A&*  ppo/val/vpredY#wC 샠A&*  ppo/val/errorу?@慥&sO 샠A&*  ppo/val/clipfrac>X{"x= 샠A&*  ppo/val/mean .E !{ 샠A&*  ppo/val/varHR@Aie+K [샠A&*  ppo/val/var_explained0g?ͣ?'F 샠A&*  ppo/learning_rateo:+K 샠A&*  time/ppo/forward_pass %>'^I.W d샠A&*!  time/ppo/compute_rewards:y<1 샠A&*$ " time/ppo/compute_advantages>;Sg,E 샠A&*  time/ppo/optimize_step"^?Ϩګ)7_ ^샠A&*  time/ppo/calc_stats>+e$B+M 샠A&*  time/ppo/total\ݗ?rF%6 5샠A&*  env/reward_meanw(=VV>$B+M 샠A&*  env/reward_stdh:=^#"x= 9A'*  objective/kl@Bt'F H9A'*  objective/kl_coef^Q>Ew'F b9A'*  objective/entropyB\s/m]P w9A'*" ppo/mean_non_score_rewardvbu %6 9A'*  ppo/mean_scoresM>$B+M 9A'*  ppo/std_scores>8_-]-(pJ 9A'*  ppo/policy/entropy?)7_ F9A'*  ppo/policy/approxklr[=[&")7_ 9A'*  ppo/policy/policykl*=2)7_ 9A'*  ppo/policy/clipfracff=mb0_ 9A'*# ! ppo/policy/advantages_mean43{x dp&sO 9A'*  ppo/returns/mean/93Y%6 a9A'*  ppo/returns/varw@b<#wC 9A'*  ppo/val/vpredlzp#wC 9A'*  ppo/val/errory2@0i&sO l9A'*  ppo/val/clipfrac33>6 IE"x= 9A'*  ppo/val/mean;uD*!{ "9A'*  ppo/val/var1?UN̍+K ~9A'*  ppo/val/var_explainedl>,'F 9A'*  ppo/learning_rateo:Γ+K 19A'*  time/ppo/forward_passS>uf.W 9A'*!  time/ppo/compute_rewards:J"1 9A'*$ " time/ppo/compute_advantages4w;gV,E 79A'*  time/ppo/optimize_stepLT?P1)7_ 9A'*  time/ppo/calc_statsD>G$B+M 9A'*  time/ppo/total"? %6 :9A'*  env/reward_meanM>Qmo$B+M 9A'*  env/reward_std>'F cA(*  objective/entropyAy(/m]P xA(*" ppo/mean_non_score_reward5.fU%6 A(*  ppo/mean_scoresl=>\Ԕ$B+M A(*  ppo/std_scoresV>P[-vG(40_ XLJA(*# ! ppo/policy/advantages_mean,WR&sO iLJA(*  ppo/returns/mean}L2k%6 zLJA(*  ppo/returns/var@K)#wC LJA(*  ppo/val/vpred8Sk#wC ɇA(*  ppo/val/errorn?N &sO ɇA(*  ppo/val/clipfrac̵>N"x= 7ˇA(*  ppo/val/mean0P!{ ˇA(*  ppo/val/var8@\#+K ͇A(*  ppo/val/var_explainedz .W ·A(*!  time/ppo/compute_rewards@:ֻ1 PχA(*$ " time/ppo/compute_advantages~;DP,E χA(*  time/ppo/optimize_step⃍?m)7_ ЇA(*  time/ppo/calc_stats|E>&$B+M WЇA(*  time/ppo/total?'%6 ЇA(*  env/reward_meanl=>RUy$B+M чA(*  env/reward_stdV>ALB2"x= A)*  objective/klD6Bk 'F A)*  objective/kl_coefQ>pC'F -A)*  objective/entropy&A |!/m]P AA)*" ppo/mean_non_score_rewardmJ%6 TA)*  ppo/mean_scoresA>N|$B+M fA)*  ppo/std_scoresΤ>Q8-ɴc$B+M A)*  ppo/loss/totalr81"x= A)*  ppo/val/meanx4<)!{ cA)*  ppo/val/varJ@(Z}+K ·A)*  ppo/val/var_explained=b?p|C'F !A)*  ppo/learning_rateo:h'y+K }A)*  time/ppo/forward_passpQ>,ո.W ܸA)*!  time/ppo/compute_rewards:?h1 7A)*$ " time/ppo/compute_advantagesk;S,E A)*  time/ppo/optimize_stepڍ?/H)7_ A)*  time/ppo/calc_stats@E>$B+M IA)*  time/ppo/totalFm?Z됸%6 A)*  env/reward_meanA>$B+M A)*  env/reward_stdΤ>{EX"x= Cz+A**  objective/klM0BDr'F z+A**  objective/kl_coefqQ>j'F z+A**  objective/entropy4A/m]P z+A**" ppo/mean_non_score_rewardYu%6 z+A**  ppo/mean_scores8^>=$B+M z+A**  ppo/std_scoresH>j-ƌs$B+M ~+A**  ppo/loss/total]x#T0_ |+A**# ! ppo/policy/advantages_mean2a&sO +A**  ppo/returns/meanG<%6 ن+A**  ppo/returns/var~@#wC J+A**  ppo/val/vpredgR=S _#wC 8+A**  ppo/val/error`&?f&sO +A**  ppo/val/clipfrac >"x= 6+A**  ppo/val/meanA^~!{ +A**  ppo/val/var@ 0u+K +A**  ppo/val/var_explainede?Q~|'F k+A**  ppo/learning_rateo:<'+K ċ+A**  time/ppo/forward_passOW>}C J.W +A**!  time/ppo/compute_rewards:xH1 t+A**$ " time/ppo/compute_advantagesw;|V,E Ό+A**  time/ppo/optimize_stepÍ?!)7_ #+A**  time/ppo/calc_statsF>O$B+M {+A**  time/ppo/total7?2%6 Ս+A**  env/reward_mean8^>e 3$B+M ++A**  env/reward_stdH>qU"x= <|A+*  objective/kl֣Bв'F |A+*  objective/kl_coefR>9H'F |A+*  objective/entropyA /m]P |A+*" ppo/mean_non_score_rewardb̾&i %6 |A+*  ppo/mean_scoresz>R4$B+M |A+*  ppo/std_scoresRK>c)-,A$B+M |A+*  ppo/loss/total"x= _|A+*  ppo/val/mean:>A!{ p|A+*  ppo/val/varĞ@f+K |A+*  ppo/val/var_explainednY?. 7E'F |A+*  ppo/learning_rateo:ٹ+K |A+*  time/ppo/forward_pass0yW>+e.W |A+*!  time/ppo/compute_rewards:}1 |A+*$ " time/ppo/compute_advantageslo;4^,E 4|A+*  time/ppo/optimize_step8ƍ?[5k)7_ |A+*  time/ppo/calc_statsp#E>Z$B+M |A+*  time/ppo/total ?y%6 >|A+*  env/reward_meanz>Dgڑ$B+M |A+*  env/reward_stdRK>=