HH }s$hA brain.Event:2R. ,tensorboard.summary.writer.event_file_writer8#"x= "8hA*  objective/klȴ'F 8hA*  objective/kl_coefL>-L'F 8hA*  objective/entropy43/I$B+M 8hA*  ppo/std_scoreskڐ>c-p$B+M 8hA*  ppo/loss/value(@U$B+M Չ8hA*  ppo/loss/totalt\??+)(pJ 8hA*  ppo/policy/entropy4@W)7_ O8hA*  ppo/policy/approxkl7A:ӈ)7_ 8hA*  ppo/policy/policyklkU@#)7_ 8hA*  ppo/policy/clipfrac>a0_ I8hA*# ! ppo/policy/advantages_mean3N&sO 8hA*  ppo/returns/mean3?%6 8hA*  ppo/returns/var>yjB#wC T8hA*  ppo/val/vpred?#wC 8hA*  ppo/val/errorYA3&sO 8hA*  ppo/val/clipfracff>ng"x= 98hA*  ppo/val/mean%N@:"!{ ߑ8hA*  ppo/val/varc@,+K 8hA*  ppo/val/var_explainedCTP'F Q8hA*  ppo/learning_rateϸ:+K 8hA*  time/ppo/forward_passP>¢.W 8hA*!  time/ppo/compute_rewards;(:Kd1 f8hA*$ " time/ppo/compute_advantagesj;55,E R8hA*  time/ppo/optimize_steps?*)7_ ʘ8hA*  time/ppo/calc_statss>J1$B+M 8hA*  time/ppo/total6?%6 8hA*  env/reward_mean42>V$B+M u8hA*  env/reward_stdkڐ>v"x= ihA*  objective/klRB P'F *ihA*  objective/kl_coef?L>5'F FihA*  objective/entropyTCn`e/m]P ZihA*" ppo/mean_non_score_rewardx1 %6 nihA*  ppo/mean_scores*f?ld$B+M ~ihA*  ppo/std_scores{?+e -{)7_ ihA*  ppo/policy/policykl39=%)7_ ĉihA*  ppo/policy/clipfrac>N@0_ ؉ihA*# ! ppo/policy/advantages_meanaбݱy&sO ihA*  ppo/returns/meant& a%6 ihA*  ppo/returns/var`@dY=#wC ihA*  ppo/val/vpred\ D#wC !ihA*  ppo/val/errorJ4'A?P&sO ihA*  ppo/val/clipfrac=?"x= ۊihA*  ppo/val/mean3I?˰R!{ =ihA*  ppo/val/var,;ι+K ihA*  ppo/val/var_explained u'F ihA*  ppo/learning_rateϸ:^+K DihA*  time/ppo/forward_passcN>vj+.W ihA*!  time/ppo/compute_rewards:g1 ihA*$ " time/ppo/compute_advantages|M;u,E NihA*  time/ppo/optimize_step9?Ed)7_ ihA*  time/ppo/calc_statspA>yw$B+M ihA*  time/ppo/total(?<=%6 MihA*  env/reward_mean*f?(*$B+M ihA*  env/reward_std{?@_"x= hA*  objective/klDUBX!'F hA*  objective/kl_coefL>R'F &hA*  objective/entropyLaCED/m]P =hA*" ppo/mean_non_score_rewardE1Z%6 OhA*  ppo/mean_scores ɩ?(-$B+M chA*  ppo/std_scores\N?P-V)7_ 9hA*  ppo/policy/policyklK">V$Ye)7_ hA*  ppo/policy/clipfrac(>0_ hA*# ! ppo/policy/advantages_meanaI3e=&sO `hA*  ppo/returns/mean_Iz%6 hA*  ppo/returns/var| A`e8#wC hA*  ppo/val/vpredȿ˔6#wC shA*  ppo/val/errorFA:)&sO ʚhA*  ppo/val/clipfrac2, ?(;"x= $hA*  ppo/val/mean|XK!{ hA*  ppo/val/var :Sʠ+K ܛhA*  ppo/val/var_explainedپŤR'F 8hA*  ppo/learning_rateϸ:E)=+K hA*  time/ppo/forward_pass0>P.W hA*!  time/ppo/compute_rewards:'$1 EhA*$ " time/ppo/compute_advantagesX.;:),E hA*  time/ppo/optimize_stepHi?^)7_ 񝕍hA*  time/ppo/calc_stats$>q$B+M FhA*  time/ppo/total?)%6 hA*  env/reward_mean ɩ?Jc~$B+M 󞕍hA*  env/reward_std\N?4 s"x= cghA*  objective/klw`B'F ghA*  objective/kl_coefUL>ׄx'F ghA*  objective/entropyT C^/m]P hhA*" ppo/mean_non_score_rewardL%6 #hhA*  ppo/mean_scores\?^$B+M 7hhA*  ppo/std_scores?kR-H00_ mhA*# ! ppo/policy/advantages_meanr&sO mhA*  ppo/returns/meanBI%6 mhA*  ppo/returns/varAk#wC OnhA*  ppo/val/vpredi#wC nhA*  ppo/val/error5WA(&sO ohA*  ppo/val/clipfracQ ?"x= wohA*  ppo/val/meanϿY!{ ohA*  ppo/val/var:pK+K BphA*  ppo/val/var_explainedDa۾'U'F phA*  ppo/learning_rateϸ:a+K qhA*  time/ppo/forward_passx{>A:.W tqhA*!  time/ppo/compute_rewards:c1 qhA*$ " time/ppo/compute_advantages;=Yl,E ;rhA*  time/ppo/optimize_stepD?_^)7_ rhA*  time/ppo/calc_stats>$B+M rhA*  time/ppo/total? #%6 jshA*  env/reward_mean\?(Qa$B+M shA*  env/reward_std? "x= OhA*  objective/klwuXBw'F hA*  objective/kl_coefM> ;'F hA*  objective/entropy5Bz/m]P hA*" ppo/mean_non_score_reward'kl\%6 hA*  ppo/mean_scoresM?p"e$B+M )hA*  ppo/std_scores ?:-~)7_ hA*  ppo/policy/policyklx=6~)7_ KhA*  ppo/policy/clipfrac>c0_ hA*# ! ppo/policy/advantages_meanv&sO hA*  ppo/returns/meanbٽ%6 fhA*  ppo/returns/var$Az_#wC hA*  ppo/val/vpred.7*##wC hA*  ppo/val/error+dVAˢ[&sO phA*  ppo/val/clipfrac?~|"x= hA*  ppo/val/meane+M0!{ !hA*  ppo/val/var4;Lu[t+K }hA*  ppo/val/var_explainedCP'F hA*  ppo/learning_rateϸ:+K -hA*  time/ppo/forward_pass8؁> S.W hA*!  time/ppo/compute_rewardsh:f\1 hA*$ " time/ppo/compute_advantages$; ,E =hA*  time/ppo/optimize_step?R`)7_ hA*  time/ppo/calc_statsv>l$B+M hA*  time/ppo/totalJ?iBwJ%6 EhA*  env/reward_meanM?ۉ~$B+M hA*  env/reward_std ?:jx"x= 8rhA*  objective/kltA}|'F rhA*  objective/kl_coef1M>Q'F rhA*  objective/entropy5Aw/m]P ϸrhA*" ppo/mean_non_score_reward}n2%6 rhA*  ppo/mean_scores\W? %$B+M rhA*  ppo/std_scores׃\?g-pl$B+M rhA*  ppo/loss/valueW=g@;s$B+M rhA*  ppo/loss/total>((pJ ̽rhA*  ppo/policy/entropy?Ir)7_ rhA*  ppo/policy/approxkl}>A[3)7_ rhA*  ppo/policy/policykl[@G@J)7_ irhA*  ppo/policy/clipfrac>F,50_ *rhA*# ! ppo/policy/advantages_mean\"&sO YrhA*  ppo/returns/mean!NN%6 rrhA*  ppo/returns/vard@]* #wC rhA*  ppo/val/vpredAk:#wC rhA*  ppo/val/errorN@؇'&sO rhA*  ppo/val/clipfrac=<$~"x= rhA*  ppo/val/mean4j;ʐ]!{ rhA*  ppo/val/var \!.W ~rhA*!  time/ppo/compute_rewards:Jc1 rhA*$ " time/ppo/compute_advantagesy;),E *rhA*  time/ppo/optimize_step?s)7_ rhA*  time/ppo/calc_stats`{t>>-!$B+M rhA*  time/ppo/totalv?%6 +rhA*  env/reward_mean\W? X$B+M rhA*  env/reward_std׃\? "x= w$hA*  objective/klx A'F w$hA*  objective/kl_coef SM>r6F'F w$hA*  objective/entropy7Q%F/m]P w$hA*" ppo/mean_non_score_rewardWI"%6 w$hA*  ppo/mean_scoresGb3?dC$B+M w$hA*  ppo/std_scores(]D=&k-T)7_ N{$hA*  ppo/policy/approxkl:)7_ {$hA*  ppo/policy/policykl =TH=)7_ |$hA*  ppo/policy/clipfrac&k0_ |$hA*# ! ppo/policy/advantages_mean4qS&sO |$hA*  ppo/returns/meanq(%6 C}$hA*  ppo/returns/varž;/8#wC }$hA*  ppo/val/vpredK#wC ~$hA*  ppo/val/errorZ9?~&sO h~$hA*  ppo/val/clipfrac@?!"x= ~$hA*  ppo/val/mean?.(2!{ $hA*  ppo/val/varh+K x$hA*  ppo/val/var_explainedúZb'F $hA*  ppo/learning_rateϸ:3+K -$hA*  time/ppo/forward_passm>z.W $hA*!  time/ppo/compute_rewards:1 $hA*$ " time/ppo/compute_advantages_; ,E 7$hA*  time/ppo/optimize_step8?-})7_ $hA*  time/ppo/calc_stats\>3$B+M $hA*  time/ppo/total?aN%6 ;$hA*  env/reward_meanGb3?;2x$B+M $hA*  env/reward_std(]D=9|"x= {hA*  objective/kl5=/Aj'F s{hA*  objective/kl_coeftM>uFk'F {hA*  objective/entropy@F/m]P {hA*" ppo/mean_non_score_rewardC~F%6 Ü{hA*  ppo/mean_scoresQV?|$B+M ՜{hA*  ppo/std_scores??㏜-{hA*  ppo/policy/policykl=?U^ )7_ P{hA*  ppo/policy/clipfrac>CY0_ d{hA*# ! ppo/policy/advantages_meanUUe3%YxT&sO z{hA*  ppo/returns/meanHfӯ%6 {hA*  ppo/returns/var̔?hP#wC {hA*  ppo/val/vpredY)6#wC {hA*  ppo/val/error?NYh&sO {hA*  ppo/val/clipfrac>g)"x= &{hA*  ppo/val/mean!{ <{hA*  ppo/val/var:SAc+K N{hA*  ppo/val/var_explainedР4ʫ'F _{hA*  ppo/learning_rateϸ: +K p{hA*  time/ppo/forward_pass0r><.W {hA*!  time/ppo/compute_rewards:GX1 {hA*$ " time/ppo/compute_advantagesj;N,E {hA*  time/ppo/optimize_step*?ߞr<)7_ {hA*  time/ppo/calc_statsg>t%h$B+M {hA*  time/ppo/totald?6U%6 n{hA*  env/reward_meanQV?*Ʀ$B+M ȧ{hA*  env/reward_std??9@"x= =hA *  objective/klr A=)'F =hA *  objective/kl_coefmM>v}'F =hA *  objective/entropy6:A!/m]P >hA *" ppo/mean_non_score_rewardR2%6 >hA *  ppo/mean_scores1?ND$B+M />hA *  ppo/std_scores<% =-hA *  tokens/queries_len_mean4CU,E R>hA *  tokens/queries_len_stdD/m]P h>hA *" tokens/responses_len_mean?jE.W y>hA *!  tokens/responses_len_std%6 >hA *  ppo/loss/policyAZ)7_ tFhA *  ppo/policy/approxkl>ܼa)7_ FhA *  ppo/policy/policykl>!N)7_ FhA *  ppo/policy/clipfracx>6 0_ #IhA *# ! ppo/policy/advantages_mean &sO QIhA *  ppo/returns/meanʨ8j %6 KhA *  ppo/returns/var :o[)#wC LhA *  ppo/val/vpredak#wC !LhA *  ppo/val/error>`h&sO LhA *  ppo/val/clipfrac>["x= *OhA *  ppo/val/mean@EO!{ UOhA *  ppo/val/vard+K oOhA *  ppo/val/var_explainedwv'F OhA *  ppo/learning_rateϸ:J$J+K OhA *  time/ppo/forward_pass@nl>Փ;.W OhA *!  time/ppo/compute_rewards:Ğd1 OhA *$ " time/ppo/compute_advantagesc;2,E OhA *  time/ppo/optimize_stepo?;)7_ OhA *  time/ppo/calc_statsPp^>yC$B+M OhA *  time/ppo/totalr?Apr%6 [PhA *  env/reward_mean1?Ƹ$B+M PhA *  env/reward_std<% =io"x= ۠hA *  objective/klk A.'F GhA *  objective/kl_coefM>fR['F dhA *  objective/entropyT;֝/m]P zhA *" ppo/mean_non_score_reward %6 hA *  ppo/mean_scores1?$B+M hA *  ppo/std_scores<% =Ur-t.W hA *!  time/ppo/compute_rewardsP:iTW1 hA *$ " time/ppo/compute_advantagesh;Mk,E (hA *  time/ppo/optimize_stepF?*`>)7_ 9hA *  time/ppo/calc_stats0|Z>:$B+M hA *  time/ppo/total?d%6 hA *  env/reward_mean1?$B+M hA *  env/reward_std<% =*,LX"x= -7'F =FhA *  objective/entropyG6EYr/m]P =FhA *" ppo/mean_non_score_rewardg M %6 =FhA *  ppo/mean_scores1?%H^$B+M =FhA *  ppo/std_scores<% =B-FhA *" tokens/responses_len_mean?n3L.W >FhA *!  tokens/responses_len_stdl%6 ,>FhA *  ppo/loss/policyعf$B+M ?FhA *  ppo/loss/value':;J"x= DFhA *  ppo/val/mean!!{ DFhA *  ppo/val/varw.W FFhA *!  time/ppo/compute_rewards:-.1 FFhA *$ " time/ppo/compute_advantagesa;w,E QGFhA *  time/ppo/optimize_step4 ?%v-)7_ GFhA *  time/ppo/calc_statsZ>X|$B+M HFhA *  time/ppo/total?# _%6 _HFhA *  env/reward_mean1?$B+M HFhA *  env/reward_std<% =|G("x= }hA *  objective/klx A!'F hA *  objective/kl_coefM>(bp'F  hA *  objective/entropy3NL!/m]P  hA *" ppo/mean_non_score_reward'%6 4 hA *  ppo/mean_scores1?”$B+M H hA *  ppo/std_scores<% = -)7_ qhA *  ppo/policy/policykl.R}\)7_ hA *  ppo/policy/clipfrac950_ hA *# ! ppo/policy/advantages_meanг&sO hA *  ppo/returns/meanS)h%6 hA *  ppo/returns/var :催#wC hA *  ppo/val/vpred;#wC hA *  ppo/val/error=!H&sO hA *  ppo/val/clipfrac6*"x= -hA *  ppo/val/meanep!{ >hA *  ppo/val/var/d+K hA *  ppo/val/var_explainedb|x'F hA *  ppo/learning_rateϸ:H+K -hA *  time/ppo/forward_passfw>j<.W XhA *!  time/ppo/compute_rewards:ނ1 ohA *$ " time/ppo/compute_advantagesy;Y,E hA *  time/ppo/optimize_stephM?olp)7_ }hA *  time/ppo/calc_stats0`>=ޥ$B+M hA *  time/ppo/totaln?źH%6 4hA *  env/reward_mean1?`$B+M hA *  env/reward_std<% =gk"x= uhA *  objective/klx A~/'F uhA *  objective/kl_coefJN>m'F uhA *  objective/entropy/m]P *uhA *" ppo/mean_non_score_reward36:j%6 =uhA *  ppo/mean_scores1? $B+M PuhA *  ppo/std_scores<% =d-U@W.W uhA *!  time/ppo/compute_rewards0:1 uhA *$ " time/ppo/compute_advantagesDn;6,E uhA *  time/ppo/optimize_stepl?Xb_)7_ uhA *  time/ppo/calc_statsZ>{$B+M "uhA *  time/ppo/total? %6 uhA *  env/reward_mean1?"}$B+M uhA *  env/reward_std<% =" "x= 2hA*  objective/klx A 'F hA*  objective/kl_coef?N>S5'F وhA*  objective/entropy/m]P hA*" ppo/mean_non_score_rewardH*+z%6 hA*  ppo/mean_scores1?$B+M hA*  ppo/std_scores<% =S#-.W hA*!  time/ppo/compute_rewards:"1 ShA*$ " time/ppo/compute_advantagesc;Q ,E hA*  time/ppo/optimize_step:?C)7_ hA*  time/ppo/calc_statse>!$B+M dhA*  time/ppo/totalM?܏%6 hA*  env/reward_mean1? U^$B+M hA*  env/reward_std<% =kū{"x= p?hA*  objective/klx A-Y'F ?hA*  objective/kl_coef`N>a'F ?hA*  objective/entropy;/m]P @hA*" ppo/mean_non_score_reward^s3%6 @hA*  ppo/mean_scores1?tg>$B+M -@hA*  ppo/std_scores<% =(-@hA*  tokens/queries_len_mean4Cm,E P@hA*  tokens/queries_len_std/m]P b@hA*" tokens/responses_len_mean?g.W t@hA*!  tokens/responses_len_stdq%6 @hA*  ppo/loss/policy37$B+M ChA*  ppo/loss/value3;:$B+M EhA*  ppo/loss/totalb:м(pJ EhA*  ppo/policy/entropy05=)7_ EhA*  ppo/policy/approxkl6)7_ HhA*  ppo/policy/policyklsr&)7_ HhA*  ppo/policy/clipfrac0_ HhA*# ! ppo/policy/advantages_meanг!S&sO HhA*  ppo/returns/meanҩ?}%6 KhA*  ppo/returns/var :#wC LhA*  ppo/val/vpred6=E,#wC 6LhA*  ppo/val/error3G.W NhA*!  time/ppo/compute_rewards:1 :OhA*$ " time/ppo/compute_advantages$k;,E OhA*  time/ppo/optimize_step?t/)7_ OhA*  time/ppo/calc_statsд_>Qqt$B+M \PhA*  time/ppo/totalH?S_%6 PhA*  env/reward_mean1?g$B+M QhA*  env/reward_std<% ="x= Ts'F 6=AhA*  objective/entropy/m]P N=AhA*" ppo/mean_non_score_reward@sq%6 b=AhA*  ppo/mean_scores1?~$B+M x=AhA*  ppo/std_scores<% =k S-AhA*  ppo/loss/valueH; $B+M ?AhA*  ppo/loss/totalj~9 q(pJ /@AhA*  ppo/policy/entropy5#m)7_ @AhA*  ppo/policy/approxklGw&)7_ @AhA*  ppo/policy/policykl)7_ aAAhA*  ppo/policy/clipfraclX0_ AAhA*# ! ppo/policy/advantages_mean2G&sO /BAhA*  ppo/returns/mean`%6 BAhA*  ppo/returns/var :>6#wC BAhA*  ppo/val/vpred.v#wC QCAhA*  ppo/val/errorH;몔&sO CAhA*  ppo/val/clipfracH@C"x= DAhA*  ppo/val/meanï#-!{ _DAhA*  ppo/val/varq! +K DAhA*  ppo/val/var_explainedED'F EAhA*  ppo/learning_rateϸ:䊟+K vEAhA*  time/ppo/forward_passZk>:@.W EAhA*!  time/ppo/compute_rewards:11 4FAhA*$ " time/ppo/compute_advantages$l;ZvT,E FAhA*  time/ppo/optimize_stepx?};)7_ GAhA*  time/ppo/calc_stats\> $B+M SHAhA*  time/ppo/totalx?c4%6 KAhA*  env/reward_mean1?Ƹ/?$B+M EKAhA*  env/reward_std<% =F"x= YGhA*  objective/klx Aװ'F GhA*  objective/kl_coefN>%h 'F GhA*  objective/entropyDt/m]P HhA*" ppo/mean_non_score_rewardvZ%6 HhA*  ppo/mean_scores1?5{$B+M ,HhA*  ppo/std_scores<% =cs-HhA*  tokens/queries_len_mean4CÀ,E OHhA*  tokens/queries_len_std/m]P aHhA*" tokens/responses_len_mean?lGT.W tHhA*!  tokens/responses_len_std) %6 HhA*  ppo/loss/policy3%$B+M qIhA*  ppo/loss/value*;"7$B+M wJhA*  ppo/loss/totalܑc9(pJ KhA*  ppo/policy/entropy5)7_ KhA*  ppo/policy/approxklzw@)7_ KhA*  ppo/policy/policykl)7_ YLhA*  ppo/policy/clipfracUS0_ LhA*# ! ppo/policy/advantages_meanгs)&sO MhA*  ppo/returns/meand':~%6 MhA*  ppo/returns/var :>6[#wC NNhA*  ppo/val/vpred եE#wC NhA*  ppo/val/error*;'m&sO OhA*  ppo/val/clipfrac(= "x= mOhA*  ppo/val/meanb!{ OhA*  ppo/val/var10+K -PhA*  ppo/val/var_explained)#'F PhA*  ppo/learning_rateϸ: +K PhA*  time/ppo/forward_passDl>I.W FQhA*!  time/ppo/compute_rewards:#1 QhA*$ " time/ppo/compute_advantagesx;Z,E RhA*  time/ppo/optimize_step ?)7_ [RhA*  time/ppo/calc_statsl,$B+M RhA*  time/ppo/total?,%6 ShA*  env/reward_mean1?@w$B+M tShA*  env/reward_std<% =(g`"x= \uhA*  objective/klx A'F uhA*  objective/kl_coef[N>3'F uhA*  objective/entropy/m]P uhA*" ppo/mean_non_score_rewardIv%6 2uhA*  ppo/mean_scores1?$B+M EuhA*  ppo/std_scores<% =DD-F%`.W [uhA*!  time/ppo/compute_rewards8:s@1 uhA*$ " time/ppo/compute_advantagesc;o,E uhA*  time/ppo/optimize_step͜?4Q)7_ ^uhA*  time/ppo/calc_statsP\>H7X$B+M uhA*  time/ppo/total̄?]7;%6 uhA*  env/reward_mean1?Iʃ$B+M [uhA*  env/reward_std<% =t"x= hA*  objective/klx A#R'F hA*  objective/kl_coefjna'F 4hA*  objective/entropyf/m]P JhA*" ppo/mean_non_score_reward%6 ^hA*  ppo/mean_scores1?s$B+M phA*  ppo/std_scores<% =5<-?&E.W hA*!  time/ppo/compute_rewards:4=1  hA*$ " time/ppo/compute_advantagesh;ձ ~,E p hA*  time/ppo/optimize_step?s)7_ hA*  time/ppo/calc_stats0e>]$$B+M S!hA*  time/ppo/total? Q%6 e!hA*  env/reward_mean1?=@$B+M !hA*  env/reward_std<% =p>J"x= hA*  objective/klx A 'F 뭚hA*  objective/kl_coef" O>%\'F hA*  objective/entropys/m]P hA*" ppo/mean_non_score_reward+3W%6 2hA*  ppo/mean_scores1?0$B+M ChA*  ppo/std_scores<% =- )7_ 䳚hA*  ppo/policy/approxkl%}=)7_ hA*  ppo/policy/policykl&o)7_ ӶhA*  ppo/policy/clipfraciԊ0_ hA*# ! ppo/policy/advantages_mean2X&sO NhA*  ppo/returns/meanΦ!DED.W hA*!  time/ppo/compute_rewards:D91 hA*$ " time/ppo/compute_advantages;,E ChA*  time/ppo/optimize_stepR'?H)7_ hA*  time/ppo/calc_statsp[>2I$B+M hA*  time/ppo/total?zė%6 =hA*  env/reward_mean1?vT$B+M hA*  env/reward_std<% =qE%"x= $hA*  objective/klx AYL'F B$hA*  objective/kl_coef,O>u''F g$hA*  objective/entropyƻ/m]P ~$hA*" ppo/mean_non_score_rewardooR%6 $hA*  ppo/mean_scores1?-$B+M $hA*  ppo/std_scores<% =AC-%6 $hA*  ppo/returns/var :F#wC $hA*  ppo/val/vpredZ]#wC u$hA*  ppo/val/error:dJ&sO $hA*  ppo/val/clipfrac"x= $hA*  ppo/val/mean篿^@!{ z$hA*  ppo/val/var+K $hA*  ppo/val/var_explained7'F /$hA*  ppo/learning_rateϸ: Z+K $hA*  time/ppo/forward_pass7m>du.W $hA*!  time/ppo/compute_rewards:aa1 ?$hA*$ " time/ppo/compute_advantages c;O.L@,E $hA*  time/ppo/optimize_step-?0I)7_ $hA*  time/ppo/calc_stats`Z>g~M$B+M B$hA*  time/ppo/total? ڴ%6 $hA*  env/reward_mean1?) $B+M $hA*  env/reward_std<% =0U"x= 6hA*  objective/klx A?'F &7hA*  objective/kl_coefMO>]'F I7hA*  objective/entropy (/m]P _7hA*" ppo/mean_non_score_reward8t%6 q7hA*  ppo/mean_scoresGb3?2$B+M 7hA*  ppo/std_scores(]D=eg-)7_ ;hA*  ppo/policy/clipfrac0_ >hA*  ppo/val/mean9!{ >hA*  ppo/val/var5;+K >hA*  ppo/val/var_explainedBL'F N?hA*  ppo/learning_rateϸ:!+K ?hA*  time/ppo/forward_passj>t.W @hA*!  time/ppo/compute_rewards:.d1 c@hA*$ " time/ppo/compute_advantagesl;ݎ,E @hA*  time/ppo/optimize_step4I?Q)7_ AhA*  time/ppo/calc_statspY>$B+M kAhA*  time/ppo/totalg?"%6 AhA*  env/reward_meanGb3?8x"$B+M BhA*  env/reward_std(]D=A"x= F}khA*  objective/klx A͌HX'F }khA*  objective/kl_coefoO>'F }khA*  objective/entropy7/m]P }khA*" ppo/mean_non_score_reward:%6 }khA*  ppo/mean_scores1?Q$B+M }khA*  ppo/std_scores<% =i-&sO khA*  ppo/val/clipfracغm"x= khA*  ppo/val/meanp!{ khA*  ppo/val/var/"U+K ʍkhA*  ppo/val/var_explained?{r'F ۍkhA*  ppo/learning_rateϸ:lB+K khA*  time/ppo/forward_passap>P.W khA*!  time/ppo/compute_rewards:1 khA*$ " time/ppo/compute_advantagesd;,E khA*  time/ppo/optimize_stepj8?b)7_ khA*  time/ppo/calc_stats\>ÓA$B+M khA*  time/ppo/total?{Пc%6 %khA*  env/reward_mean1?N? $B+M khA*  env/reward_std<% =0m{"x= !hA*  objective/klx Aaړ'F H"hA*  objective/kl_coefO> 'F d"hA*  objective/entropyêN/m]P w"hA*" ppo/mean_non_score_rewardNkx%6 "hA*  ppo/mean_scores1?-u$B+M "hA*  ppo/std_scores<% =B-/m]P "hA*" tokens/responses_len_mean?.W "hA*!  tokens/responses_len_stdl%6 #hA*  ppo/loss/policyk6$B+M $hA*  ppo/loss/value,:愴$B+M )hA*  ppo/loss/total38P(pJ ,)hA*  ppo/policy/entropy5\:)7_ D)hA*  ppo/policy/approxklgI)7_ X)hA*  ppo/policy/policyklE5o)7_ i)hA*  ppo/policy/clipfracb0_ ~)hA*# ! ppo/policy/advantages_mean2&sO )hA*  ppo/returns/meanQ-%6 )hA*  ppo/returns/var :.#wC )hA*  ppo/val/vpredX퉕#wC )hA*  ppo/val/errorŬ:)&sO ,hA*  ppo/val/clipfrack|i"x= 6,hA*  ppo/val/meanﭿ!{ M,hA*  ppo/val/varW+K .hA*  ppo/val/var_explainedlHYz'F .hA*  ppo/learning_rateϸ:9+K .hA*  time/ppo/forward_pass4m> .W k0hA*!  time/ppo/compute_rewards:s/1 2hA*$ " time/ppo/compute_advantagesPg;?,E 2hA*  time/ppo/optimize_step?)7_ 2hA*  time/ppo/calc_stats`l\>W$B+M 2hA*  time/ppo/totall?:&%6 3hA*  env/reward_mean1?C$B+M 3hA*  env/reward_std<% =m9"x= hA*  objective/klx AH9'F hA*  objective/kl_coefO>V'F hA*  objective/entropy}/m]P ʿhA*" ppo/mean_non_score_reward2j(%6 ݿhA*  ppo/mean_scoresGb3?h$B+M ￧hA*  ppo/std_scores(]D=\-.W ȧhA*!  time/ppo/compute_rewards;D 1 ȧhA*$ " time/ppo/compute_advantagesb;[^&,E PɧhA*  time/ppo/optimize_stepV ?8O)7_ ɧhA*  time/ppo/calc_stats`[>y$B+M ʧhA*  time/ppo/total?uq%6 gʧhA*  env/reward_meanGb3?1'$B+M ˧hA*  env/reward_std(]D=P`"x= GhA*  objective/klx AlD'F xGhA*  objective/kl_coefO>`'F GhA*  objective/entropyT_/m]P GhA*" ppo/mean_non_score_rewardGd\%6 GhA*  ppo/mean_scores1?UR $B+M GhA*  ppo/std_scores<% =g-k-.W )GhA*!  time/ppo/compute_rewards :1 GhA*$ " time/ppo/compute_advantagesb;,E GhA*  time/ppo/optimize_stepH?^ԕ)7_ *GhA*  time/ppo/calc_stats@#\>Q1$B+M |GhA*  time/ppo/total?%6 GhA*  env/reward_mean1?Fx$B+M #GhA*  env/reward_std<% =xP"x= hA*  objective/klx A'F #hA*  objective/kl_coef O>uH'F AhA*  objective/entropyB/m]P XhA*" ppo/mean_non_score_rewardM]璙%6 mhA*  ppo/mean_scores1?3%6 hA*  ppo/returns/var :la#wC WhA*  ppo/val/vpred6㫿X#wC hA*  ppo/val/errorᚚ:R-/!&sO hA*  ppo/val/clipfrac"x= hA*  ppo/val/meanѬ'[S!{ hA*  ppo/val/var+K hA*  ppo/val/var_explained;2#'F hA*  ppo/learning_rateϸ:C+K GhA*  time/ppo/forward_pass`n>jo.W hA*!  time/ppo/compute_rewards:1 hA*$ " time/ppo/compute_advantagesi;J,E PhA*  time/ppo/optimize_stepٟ?1u)7_ hA*  time/ppo/calc_statspZ>T$B+M hA*  time/ppo/total?pL%6 RhA*  env/reward_mean1?|$B+M hA*  env/reward_std<% =c"x= hA*  objective/klx Aܒ'F hA*  objective/kl_coefP>`ai'F hA*  objective/entropy'/m]P ,hA*" ppo/mean_non_score_rewardr%6 @hA*  ppo/mean_scores1?FNlQ$B+M RhA*  ppo/std_scores<% =_-Lo$B+M hA*  time/ppo/totalV?|Z%6 躃hA*  env/reward_mean1?4$B+M 8hA*  env/reward_std<% ="x= 1hA*  objective/klx A>h'F &2hA*  objective/kl_coef5

y'F H2hA*  objective/entropy͕/m]P ^2hA*" ppo/mean_non_score_reward%6 p2hA*  ppo/mean_scores1?Mp$B+M 2hA*  ppo/std_scores<% =J-В.W :hA*!  time/ppo/compute_rewards:X1 ;hA*$ " time/ppo/compute_advantagesa;*!,E n;hA*  time/ppo/optimize_step6֣?Js)7_ ;hA*  time/ppo/calc_statsP^>]$B+M <hA*  time/ppo/total8?2_%6 |<hA*  env/reward_mean1?}$B+M <hA*  env/reward_std<% =sh"x= ahA*  objective/klx A3l'F (bhA*  objective/kl_coefS^P>B[ 'F ObhA*  objective/entropyoI/m]P fbhA*" ppo/mean_non_score_rewardk|%6 zbhA*  ppo/mean_scoresGb3?>)r$B+M bhA*  ppo/std_scores(]D= -.W *khA*!  time/ppo/compute_rewards:EL1 khA*$ " time/ppo/compute_advantagesh;Jn\,E khA*  time/ppo/optimize_step?@I)7_ 3lhA*  time/ppo/calc_statsd>Y@$B+M lhA*  time/ppo/total?SZ %6 lhA*  env/reward_meanGb3?$p$B+M 7mhA*  env/reward_std(]D=9"x= \fhA*  objective/klx A֐'F ҮfhA*  objective/kl_coefwP>Xs'F fhA*  objective/entropyѯ/m]P fhA*" ppo/mean_non_score_rewardѲ0%6 fhA*  ppo/mean_scores1? .$B+M 3fhA*  ppo/std_scores<% =_-zw.W fhA*!  time/ppo/compute_rewardst;,1 fhA*$ " time/ppo/compute_advantagese;qu,E fhA*  time/ppo/optimize_step<ʞ?ΕlE)7_ fhA*  time/ppo/calc_stats>Z>;ؐy$B+M ifhA*  time/ppo/total,I?tq%6 fhA*  env/reward_mean1?Q{$B+M fhA*  env/reward_std<% =2("x= hA *  objective/klx AV9;'F thA *  objective/kl_coefP>NqH'F hA *  objective/entropy8/m]P hA *" ppo/mean_non_score_reward;s9%6 hA *  ppo/mean_scores1?*$B+M hA *  ppo/std_scores<% =l-k(pJ hA *  ppo/policy/entropy5`b)7_ hA *  ppo/policy/approxklG5)7_ ~hA *  ppo/policy/policykl=31)7_ hA *  ppo/policy/clipfrac0_ ChA *# ! ppo/policy/advantages_mean2Zu&sO hA *  ppo/returns/mean; ]2%6 hA *  ppo/returns/var ::p#wC rhA *  ppo/val/vpredHgWN(%#wC hA *  ppo/val/error_:&sO (hA *  ppo/val/clipfrac"x= hA *  ppo/val/mean୿@!{ hA *  ppo/val/varhߎ'+K RhA *  ppo/val/var_explained6'F hA *  ppo/learning_rateϸ:b+K hA *  time/ppo/forward_passPi>.W nhA *!  time/ppo/compute_rewards:B& 1 hA *$ " time/ppo/compute_advantagesh;3x,E &hA *  time/ppo/optimize_stepj?pz")7_ hA *  time/ppo/calc_stats f>vM*$B+M hA *  time/ppo/total? eK%6 ,hA *  env/reward_mean1?Ռ$B+M hA *  env/reward_std<% =2"x= -iA!*  objective/klx A(@ 'F .iA!*  objective/kl_coefP>'F ;.iA!*  objective/entropy(?y/m]P O.iA!*" ppo/mean_non_score_reward۝%6 a.iA!*  ppo/mean_scores1?!e$B+M r.iA!*  ppo/std_scores<% =׵-q.W <iA!*!  time/ppo/compute_rewards:u 1 <iA!*$ " time/ppo/compute_advantagesha;HU,E <iA!*  time/ppo/optimize_stepdʛ?0)7_ d>iA!*  time/ppo/calc_statspY>U1$B+M +@iA!*  time/ppo/totalQ?f2(%6 U@iA!*  env/reward_mean1?J $B+M j@iA!*  env/reward_std<% ="x= &8iA"*  objective/klx A+'F '8iA"*  objective/kl_coefP>'F B'8iA"*  objective/entropy ,/m]P X'8iA"*" ppo/mean_non_score_rewards%6 j'8iA"*  ppo/mean_scores1?*U$B+M {'8iA"*  ppo/std_scores<% =bf-u.W 08iA"*!  time/ppo/compute_rewards@: M1 `08iA"*$ " time/ppo/compute_advantagesth;?=8F,E 08iA"*  time/ppo/optimize_step?t4)7_ 18iA"*  time/ppo/calc_stats \>ۼf$B+M m18iA"*  time/ppo/total?֞%6 18iA"*  env/reward_mean1?#Z\$B+M 28iA"*  env/reward_std<% =K;{"x= iA#*  objective/klx Ak'F ,iA#*  objective/kl_coef= Q> 'F GiA#*  objective/entropyi/m]P ^iA#*" ppo/mean_non_score_reward%6 riA#*  ppo/mean_scores1?A@$B+M iA#*  ppo/std_scores<% =bb-uw.W iA#*!  time/ppo/compute_rewards:p1 iA#*$ " time/ppo/compute_advantagesp; $,E FiA#*  time/ppo/optimize_stepe?a>)7_ iA#*  time/ppo/calc_stats3a>g$B+M DiA#*  time/ppo/total?}CA%6 iA#*  env/reward_mean1?]$B+M iA#*  env/reward_std<% =o "x= ii iA$*  objective/klx A 'F ji iA$*  objective/kl_coef}+Q>P_'F ji iA$*  objective/entropyDY/m]P ji iA$*" ppo/mean_non_score_rewardA)I%6 ji iA$*  ppo/mean_scores1?ΣM$B+M ji iA$*  ppo/std_scores<% =P5&-vi iA$*  ppo/val/clipfracy"x= vi iA$*  ppo/val/meanuϬx !{ vi iA$*  ppo/val/varl|z+K Lwi iA$*  ppo/val/var_explainedh8.W `xi iA$*!  time/ppo/compute_rewards:o1 xi iA$*$ " time/ppo/compute_advantagesl;4|N,E $yi iA$*  time/ppo/optimize_step&?j )7_ ~yi iA$*  time/ppo/calc_statsb>ʾ$B+M yi iA$*  time/ppo/total6?E%6 ,zi iA$*  env/reward_mean1?H rZ$B+M zi iA$*  env/reward_std<% ="x= iD iA%*  objective/klx Ad 'F D iA%*  objective/kl_coefMQ>vBq'F E iA%*  objective/entropy݄q/m]P E iA%*" ppo/mean_non_score_reward3!%6 0E iA%*  ppo/mean_scores1? $B+M BE iA%*  ppo/std_scores<% = i-&sO lK iA%*  ppo/val/clipfrac2J"x= K iA%*  ppo/val/meanaݭ>% !{ !L iA%*  ppo/val/varĥ+K L iA%*  ppo/val/var_explained 9<[G'F L iA%*  ppo/learning_rateϸ:5I+K 5M iA%*  time/ppo/forward_passt>,"pF.W M iA%*!  time/ppo/compute_rewards:1 M iA%*$ " time/ppo/compute_advantagesLk;$,E BN iA%*  time/ppo/optimize_step?fy)7_ N iA%*  time/ppo/calc_stats`Q_>p$H$B+M N iA%*  time/ppo/total#?it%6 KO iA%*  env/reward_mean1?$B+M O iA%*  env/reward_std<% =im"x= iA&*  objective/klx APz'F <iA&*  objective/kl_coef pQ>aR!'F ZiA&*  objective/entropyν/m]P piA&*" ppo/mean_non_score_rewardIR%6 iA&*  ppo/mean_scores1?$B+M iA&*  ppo/std_scores<% =>I-.W ZiA&*!  time/ppo/compute_rewards:c|1 kiA&*$ " time/ppo/compute_advantages`j;{kk,E iA&*  time/ppo/optimize_stepPY?:bm)7_ iA&*  time/ppo/calc_stats Y>'s$B+M piA&*  time/ppo/total?{%6 iA&*  env/reward_mean1?nkU$B+M ziA&*  env/reward_std<% =o7"x= cLiA'*  objective/klx AkrC'F LiA'*  objective/kl_coef^Q>.CZ'F LiA'*  objective/entropy3/m]P LiA'*" ppo/mean_non_score_reward^k-:%6 LiA'*  ppo/mean_scores1? څ$B+M -LiA'*  ppo/std_scores<% =ԟ-O'F LiA'*  ppo/learning_rateϸ:ᾖ+K LiA'*  time/ppo/forward_passen>>?.W LiA'*!  time/ppo/compute_rewards:pDE1 LiA'*$ " time/ppo/compute_advantagesi;,E LiA'*  time/ppo/optimize_step ?5T)7_ LiA'*  time/ppo/calc_statsqf>o$B+M LiA'*  time/ppo/totalP?lD'%6 LiA'*  env/reward_mean1?$B+M eLiA'*  env/reward_std<% =0g"x= WliA(*  objective/klx A1'F liA(*  objective/kl_coefQ>/'F liA(*  objective/entropyX]*/m]P liA(*" ppo/mean_non_score_reward t7sN%6 miA(*  ppo/mean_scores1?=g$B+M miA(*  ppo/std_scores<% =?KG-.W uiA(*!  time/ppo/compute_rewards:ӽ 1 uiA(*$ " time/ppo/compute_advantages{;O,E CviA(*  time/ppo/optimize_stepD?AN)7_ viA(*  time/ppo/calc_statsn_>@2$B+M viA(*  time/ppo/total@?M%6 :wiA(*  env/reward_mean1?y0$B+M wiA(*  env/reward_std<% ="x= aiA)*  objective/klx A8'F iA)*  objective/kl_coefQ>\'F iA)*  objective/entropyz{/m]P iA)*" ppo/mean_non_score_reward%6 iA)*  ppo/mean_scores1?BÉ$B+M iA)*  ppo/std_scores<% =new-.W $iA)*!  time/ppo/compute_rewards:1 N%iA)*$ " time/ppo/compute_advantages$B+M K&iA)*  time/ppo/total?"*E%6 &iA)*  env/reward_mean1? g$B+M &iA)*  env/reward_std<% ={ҏ"x= 9iA**  objective/klx AZC#'F o 9iA**  objective/kl_coefqQ>5uH'F 9iA**  objective/entropyx&/m]P 9iA**" ppo/mean_non_score_reward  %6 9iA**  ppo/mean_scoresGb3?<$B+M 9iA**  ppo/std_scores(]D=WL-Hk)7_ 9iA**  ppo/policy/clipfrac0_ ,9iA**# ! ppo/policy/advantages_meanS{&sO 9iA**  ppo/returns/mean{%6 9iA**  ppo/returns/varž;|5j#wC b9iA**  ppo/val/vpred㞭 #wC 9iA**  ppo/val/error ; &sO 9iA**  ppo/val/clipfrac*m1"x= y9iA**  ppo/val/meanEɩ4!{ 9iA**  ppo/val/var.4Xa+K 89iA**  ppo/val/var_explained.W \9iA**!  time/ppo/compute_rewards:<1 9iA**$ " time/ppo/compute_advantages;F'/U,E 9iA**  time/ppo/optimize_stepӟ?vK)7_ v9iA**  time/ppo/calc_statsp|a>Q_$B+M 9iA**  time/ppo/totalz?*%6 09iA**  env/reward_meanGb3?̹?$B+M 9iA**  env/reward_std(]D="x= )iA+*  objective/klx AH'F iA+*  objective/kl_coefR>7'F iA+*  objective/entropy#j/m]P iA+*" ppo/mean_non_score_rewardJ%6 iA+*  ppo/mean_scores1?eY$B+M iA+*  ppo/std_scores<% =vâ-,E $iA+*  tokens/queries_len_stdȵ/m]P 6iA+*" tokens/responses_len_mean?.W GiA+*!  tokens/responses_len_stdT=%6 XiA+*  ppo/loss/policyo,$B+M BiA+*  ppo/loss/value|y:K$B+M ?iA+*  ppo/loss/total 8:(pJ iA+*  ppo/policy/entropy5 )7_ 'iA+*  ppo/policy/approxkl{aʋ)7_ iA+*  ppo/policy/policyklF})7_ iA+*  ppo/policy/clipfracLK0_ EiA+*# ! ppo/policy/advantages_mean2up&sO iA+*  ppo/returns/mean>X%6 iA+*  ppo/returns/var :||#wC YiA+*  ppo/val/vpredR@e#wC iA+*  ppo/val/error|y:ߒ&sO iA+*  ppo/val/clipfrac*>^"x= YiA+*  ppo/val/mean笿!{ iA+*  ppo/val/var-p+K iA+*  ppo/val/var_explainedɼ@'F `iA+*  ppo/learning_rateϸ:TG#+K iA+*  time/ppo/forward_passil>y.W iA+*!  time/ppo/compute_rewards:1 hiA+*$ " time/ppo/compute_advantagesb;>>,E iA+*  time/ppo/optimize_stepț?$9)7_ iA+*  time/ppo/calc_statsd^>ębO$B+M fiA+*  time/ppo/totalz?i%6 iA+*  env/reward_mean1? s$B+M iA+*  env/reward_std<% =*?"x= 0WiA,*  objective/klx AA'F WiA,*  objective/kl_coefD>R>8'F WiA,*  objective/entropy. y/m]P WiA,*" ppo/mean_non_score_rewardDۋ%6 WiA,*  ppo/mean_scores1?kj$B+M WiA,*  ppo/std_scores<% =<;&-Q.W WiA,*!  time/ppo/compute_rewardsh: lȵ1 vWiA,*$ " time/ppo/compute_advantages|j;ᝬ,E WiA,*  time/ppo/optimize_step՟?jD.l)7_ WiA,*  time/ppo/calc_statsP`>55$B+M YWiA,*  time/ppo/total?m %6 WiA,*  env/reward_mean1?$B+M WiA,*  env/reward_std<% =lD "x= o!iA-*  objective/klx Adnw'F p!iA-*  objective/kl_coef`R>){'F p!iA-*  objective/entropyAa/m]P p!iA-*" ppo/mean_non_score_reward_e%6 p!iA-*  ppo/mean_scores1?;#9$B+M p!iA-*  ppo/std_scores<% =T-u.W !iA-*!  time/ppo/compute_rewards:a1 _!iA-*$ " time/ppo/compute_advantages8h;o,E !iA-*  time/ppo/optimize_step條?)7_ !iA-*  time/ppo/calc_stats[>$B+M (!iA-*  time/ppo/totalP?b %6 ƒ!iA-*  env/reward_mean1?SE$B+M X!iA-*  env/reward_std<% =G"x= 7$iA.*  objective/klx AOic'F ]8$iA.*  objective/kl_coef.R>og'F 8$iA.*  objective/entropyoH'/m]P 8$iA.*" ppo/mean_non_score_rewardwzf%6 8$iA.*  ppo/mean_scores1?$B+M 8$iA.*  ppo/std_scores<% =۸u-c.W J$iA.*!  time/ppo/compute_rewards:9ay1 [L$iA.*$ " time/ppo/compute_advantagesi;f;,E mN$iA.*  time/ppo/optimize_step?O)7_ hP$iA.*  time/ppo/calc_statsp_>L$B+M IR$iA.*  time/ppo/total8}?%6 =S$iA.*  env/reward_mean1?R8$B+M T$iA.*  env/reward_std<% =CN"x= !'iA/*  objective/klx Aҝ'F &!'iA/*  objective/kl_coefR>fg'F D!'iA/*  objective/entropy ~>/m]P Z!'iA/*" ppo/mean_non_score_reward 7%6 p!'iA/*  ppo/mean_scores1?|*$B+M !'iA/*  ppo/std_scores<% =,hz-U.W h!'iA/*!  time/ppo/compute_rewards:11 !'iA/*$ " time/ppo/compute_advantages e;7,E !'iA/*  time/ppo/optimize_step>?)7_ 2!'iA/*  time/ppo/calc_stats\>$B+M ơ!'iA/*  time/ppo/total?.C%6 _!'iA/*  env/reward_mean1?{$B+M !'iA/*  env/reward_std<% =\"x= sS)iA0*  objective/klx A|KWI 'F T)iA0*  objective/entropy'|/m]P T)iA0*" ppo/mean_non_score_reward %6 +T)iA0*  ppo/mean_scores1?$B+M 8Yi.W a)iA0*!  time/ppo/compute_rewards:Gb1 a)iA0*$ " time/ppo/compute_advantagesXo;,E 9b)iA0*  time/ppo/optimize_stepnW?E))7_ b)iA0*  time/ppo/calc_stats@k><$B+M ec)iA0*  time/ppo/total?6%6 c)iA0*  env/reward_mean1?C$B+M d)iA0*  env/reward_std<% =r"x= Zg,iA1*  objective/klx A=Tf'F ([g,iA1*  objective/kl_coefR>k'F E[g,iA1*  objective/entropyR/m]P [[g,iA1*" ppo/mean_non_score_reward^6~@C%6 m[g,iA1*  ppo/mean_scoresGb3?2vJ$B+M [g,iA1*  ppo/std_scores(]D=Uwk-FNj.W mg,iA1*!  time/ppo/compute_rewards:1 -ng,iA1*$ " time/ppo/compute_advantagesn; ,E ng,iA1*  time/ppo/optimize_stepP?Lc)7_ _og,iA1*  time/ppo/calc_statsb>i$B+M og,iA1*  time/ppo/total6?(D%6 pg,iA1*  env/reward_meanGb3?7$B+M 'qg,iA1*  env/reward_std(]D=fs"x= /iA2*  objective/klx AW'F /iA2*  objective/kl_coefF S>b'F /iA2*  objective/entropyu/m]P Ό/iA2*" ppo/mean_non_score_rewardL%6 /iA2*  ppo/mean_scores1? F\$B+M /iA2*  ppo/std_scores<% =EƩ-R.W z/iA2*!  time/ppo/compute_rewards:SM1 /iA2*$ " time/ppo/compute_advantages8j;Ӵ,E /iA2*  time/ppo/optimize_step@+?y)7_ 9/iA2*  time/ppo/calc_statsl>G0l$B+M ̞/iA2*  time/ppo/totaly?,M%6 /iA2*  env/reward_mean1?rz$B+M /iA2*  env/reward_std<% =ē"x= ߫1iA3*  objective/klx Ap}'F 1iA3*  objective/kl_coef/S>?'F 71iA3*  objective/entropy(/m]P M1iA3*" ppo/mean_non_score_rewarda58]%6 `1iA3*  ppo/mean_scores1?P$B+M u1iA3*  ppo/std_scores<% =v-!{ 1iA3*  ppo/val/var+H*s+K 1iA3*  ppo/val/var_explained'F 1iA3*  ppo/learning_rateϸ:{+K Z1iA3*  time/ppo/forward_pass@z>\^.W 1iA3*!  time/ppo/compute_rewards@::{I1 1iA3*$ " time/ppo/compute_advantageshm;,E 71iA3*  time/ppo/optimize_step`/?P)7_ 1iA3*  time/ppo/calc_stats@b>w$B+M c1iA3*  time/ppo/total}?_~%6 &1iA3*  env/reward_mean1?;O&$B+M 1iA3*  env/reward_std<% ='F ٨H4iA4*  objective/entropy%/m]P H4iA4*" ppo/mean_non_score_rewardewQUS%6 H4iA4*  ppo/mean_scores1?w@$B+M H4iA4*  ppo/std_scores<% =O -*A.W `H4iA4*!  time/ppo/compute_rewards:?1 H4iA4*$ " time/ppo/compute_advantagesn;#,E H4iA4*  time/ppo/optimize_step"̟?GMs)7_ &H4iA4*  time/ppo/calc_stats.]>\$B+M H4iA4*  time/ppo/total? %6 RH4iA4*  env/reward_mean1?ZDM$B+M H4iA4*  env/reward_std<% =5"x= F6iA5*  objective/klx A,Y'F 6iA5*  objective/kl_coefuS>'F ӓ6iA5*  objective/entropy/m]P 6iA5*" ppo/mean_non_score_reward7%6 6iA5*  ppo/mean_scores1?OM$B+M 6iA5*  ppo/std_scores<% =!-6iA5*  ppo/val/clipfrackƘ"x= Q6iA5*  ppo/val/meanz.!{ c6iA5*  ppo/val/varAwj+K u6iA5*  ppo/val/var_explained`vJ.W -6iA5*!  time/ppo/compute_rewards$;t]1 \6iA5*$ " time/ppo/compute_advantagesk;̯,E u6iA5*  time/ppo/optimize_steph.?)7_ 6iA5*  time/ppo/calc_stats^>!$B+M 6iA5*  time/ppo/total?.7F%6 J6iA5*  env/reward_mean1?$B+M 6iA5*  env/reward_std<% =/W"x= f;iA6*  objective/klD ABqX'F ?g;iA6*  objective/kl_coefS>'F ]g;iA6*  objective/entropycC!/m]P sg;iA6*" ppo/mean_non_score_rewardy翵gp%6 g;iA6*  ppo/mean_scores>$B+M g;iA6*  ppo/std_scoresl<%-.W w;iA6*!  time/ppo/compute_rewards:6e1 $x;iA6*$ " time/ppo/compute_advantages|;\,E x;iA6*  time/ppo/optimize_step/J?TT)7_ Ly;iA6*  time/ppo/calc_statsS >d]$B+M y;iA6*  time/ppo/totalʼn?Stz~%6 wz;iA6*  env/reward_mean>E$B+M {;iA6*  env/reward_stdliA7*  objective/klD AG;'F >iA7*  objective/kl_coefdS>gi'F >iA7*  objective/entropy7e/m]P .>iA7*" ppo/mean_non_score_rewardb%6 C>iA7*  ppo/mean_scores>G$B+M V>iA7*  ppo/std_scoresliA7*  tokens/queries_len_meanBrS,E {>iA7*  tokens/queries_len_stdI_/m]P >iA7*" tokens/responses_len_mean?`).W >iA7*!  tokens/responses_len_std5vG%6 >iA7*  ppo/loss/policy@F$B+M >iA7*  ppo/loss/valuef:&$B+M 3>iA7*  ppo/loss/total6?9*u(pJ c>iA7*  ppo/policy/entropy@5ph)7_ {>iA7*  ppo/policy/approxkleNtf)7_ >iA7*  ppo/policy/policykl̥\)7_ >iA7*  ppo/policy/clipfracY0_ >iA7*# ! ppo/policy/advantages_mean@2Ʈ&sO >iA7*  ppo/returns/meanvA%6 >iA7*  ppo/returns/varN[9Ԑ#wC >iA7*  ppo/val/vpred`^#wC >iA7*  ppo/val/errorfo; \&sO X>iA7*  ppo/val/clipfrac>"x= >iA7*  ppo/val/mean~;˿*D!{ >iA7*  ppo/val/var"2$~+K a>iA7*  ppo/val/var_explainedа\'F >iA7*  ppo/learning_rateϸ:i+K >iA7*  time/ppo/forward_pass>9.W =>iA7*!  time/ppo/compute_rewards:ܲ1 >iA7*$ " time/ppo/compute_advantages);bҘW,E s>iA7*  time/ppo/optimize_step\S?h)7_ >iA7*  time/ppo/calc_stats[ >zܸ$B+M >iA7*  time/ppo/total_?py|J%6 <>iA7*  env/reward_mean>ϙ $B+M >iA7*  env/reward_stdl<"x= sGAiA8*  objective/klD Aٍ'F GAiA8*  objective/kl_coefS>͙'F GAiA8*  objective/entropy^jT/m]P HAiA8*" ppo/mean_non_score_reward|%6 HAiA8*  ppo/mean_scores>h$B+M 1HAiA8*  ppo/std_scoreslŬ.%6 OAiA8*  ppo/returns/var%[9r#wC PAiA8*  ppo/val/vpredJE#wC QAiA8*  ppo/val/error\%;3p&sO QAiA8*  ppo/val/clipfrac`"x= SRAiA8*  ppo/val/mean !{ RAiA8*  ppo/val/varuy+K SAiA8*  ppo/val/var_explained61P'F VAiA8*  ppo/learning_rateϸ:fXX+K WAiA8*  time/ppo/forward_pass<>D.W .WAiA8*!  time/ppo/compute_rewards:Hx1 BWAiA8*$ " time/ppo/compute_advantagesL;-n,E TWAiA8*  time/ppo/optimize_stepEQ?bv&)7_ eWAiA8*  time/ppo/calc_stats>p$B+M xWAiA8*  time/ppo/totalh^?%6 WAiA8*  env/reward_mean>P$B+M WAiA8*  env/reward_stdli"x= 1NDiA9*  objective/klD AA'F NDiA9*  objective/kl_coefS>S"'F NDiA9*  objective/entropyCz\/m]P NDiA9*" ppo/mean_non_score_rewardp%6 NDiA9*  ppo/mean_scores>Ӽ$B+M ODiA9*  ppo/std_scoresl\.W iDiA9*!  time/ppo/compute_rewards!;(TU1 iDiA9*$ " time/ppo/compute_advantagesT;6,,E OjDiA9*  time/ppo/optimize_stepXX?k)7_ jDiA9*  time/ppo/calc_stats+>$B+M kDiA9*  time/ppo/totalJ?)Y%6 YkDiA9*  env/reward_mean>2$B+M kDiA9*  env/reward_stdl<)"x= ʰGiA:*  objective/klD AQX"'F ʰGiA:*  objective/kl_coef"T>'nn'F ˰GiA:*  objective/entropy>t/m]P ˰GiA:*" ppo/mean_non_score_reward迍%6 (˰GiA:*  ppo/mean_scores>+/j$B+M 9˰GiA:*  ppo/std_scoresl/g(pJ аGiA:*  ppo/policy/entropy`5t)7_ pѰGiA:*  ppo/policy/approxkl)7_ ѰGiA:*  ppo/policy/policyklO.)7_ ӰGiA:*  ppo/policy/clipfrac$E0_ ӰGiA:*# ! ppo/policy/advantages_mean;3&sO ԰GiA:*  ppo/returns/meanbڲ-%6 ԰GiA:*  ppo/returns/var&[9#wC *԰GiA:*  ppo/val/vpredU&#wC =԰GiA:*  ppo/val/errorz:SB&sO N԰GiA:*  ppo/val/clipfracqT"x= ^԰GiA:*  ppo/val/meansgM!{ ԰GiA:*  ppo/val/var0b:+K հGiA:*  ppo/val/var_explaineda'F lհGiA:*  ppo/learning_rateϸ:Rm+K հGiA:*  time/ppo/forward_pass>ay.W ְGiA:*!  time/ppo/compute_rewards:1 oְGiA:*$ " time/ppo/compute_advantagesp;=,E ְGiA:*  time/ppo/optimize_step,MG?f@)7_ װGiA:*  time/ppo/calc_statsP >$B$B+M aװGiA:*  time/ppo/totalN ?)7%6 װGiA:*  env/reward_mean>?A/y$B+M ذGiA:*  env/reward_stdl<"x= JiA;*  objective/klD Ad'F (JiA;*  objective/kl_coefHET>Vj'F CJiA;*  objective/entropymu/m]P WJiA;*" ppo/mean_non_score_reward75N%6 jJiA;*  ppo/mean_scores>a.$B+M {JiA;*  ppo/std_scoresl<]-..W ࡡJiA;*!  time/ppo/compute_rewards:ڀ1 8JiA;*$ " time/ppo/compute_advantages;6M,E JiA;*  time/ppo/optimize_stepPG?֠)7_ JiA;*  time/ppo/calc_stats0 >涱A$B+M AJiA;*  time/ppo/total2n?ߗC%6 JiA;*  env/reward_mean>(91$B+M 룡JiA;*  env/reward_stdlz'F MiA<*  objective/entropy-‰/m]P #MiA<*" ppo/mean_non_score_reward]!n| %6 5MiA<*  ppo/mean_scores>'$B+M FMiA<*  ppo/std_scoresl76<.W "‚MiA<*!  time/ppo/compute_rewards:DR<1 }‚MiA<*$ " time/ppo/compute_advantages|;tX,E ‚MiA<*  time/ppo/optimize_stepPG?E)7_ 'ÂMiA<*  time/ppo/calc_stats^ >2p$B+M }ÂMiA<*  time/ppo/totalp?_S%6 ÂMiA<*  env/reward_mean>[$B+M 6ĂMiA<*  env/reward_stdlz'F }PiA=*  objective/entropyR+b/m]P }PiA=*" ppo/mean_non_score_rewardN)%6 }PiA=*  ppo/mean_scoresΜ>$B+M }PiA=*  ppo/std_scoresն<-PoI6%6 Z}PiA=*  ppo/returns/varn9x#wC }PiA=*  ppo/val/vpredÿk#wC }PiA=*  ppo/val/error(:A&sO q}PiA=*  ppo/val/clipfrac "x= }PiA=*  ppo/val/meanwÿZCAr!{ %}PiA=*  ppo/val/var8N+K }PiA=*  ppo/val/var_explainedJ S]'F }PiA=*  ppo/learning_rateϸ:Vmy+K C}PiA=*  time/ppo/forward_passL>H!@.W }PiA=*!  time/ppo/compute_rewards :61 }PiA=*$ " time/ppo/compute_advantagesx;798,E W}PiA=*  time/ppo/optimize_steptiI?)7_ }PiA=*  time/ppo/calc_stats`J >O ]$B+M }PiA=*  time/ppo/total?y$B+M }PiA=*  env/reward_stdն<k"x= nSiA>*  objective/klD AE´'F wnSiA>*  objective/kl_coefT>!'F nSiA>*  objective/entropy҂~/m]P nSiA>*" ppo/mean_non_score_reward [%6 nSiA>*  ppo/mean_scores>/=$B+M nSiA>*  ppo/std_scoresl<-*  tokens/queries_len_meanBu,E nSiA>*  tokens/queries_len_stdn;6,/m]P nSiA>*" tokens/responses_len_mean?h.W nSiA>*!  tokens/responses_len_std%6 $nSiA>*  ppo/loss/policyL$B+M JnSiA>*  ppo/loss/valueȁ:9ku&$B+M oSiA>*  ppo/loss/total7_(pJ +oSiA>*  ppo/policy/entropy5&-)7_ CoSiA>*  ppo/policy/approxkl,Y)7_ UoSiA>*  ppo/policy/policykl+IO)7_ foSiA>*  ppo/policy/clipfrac0_ zoSiA>*# ! ppo/policy/advantages_mean2ڥ&sO oSiA>*  ppo/returns/mean"I3u%6 oSiA>*  ppo/returns/varN[9r~b#wC oSiA>*  ppo/val/vpred]'=#wC oSiA>*  ppo/val/errorȁ9&sO oSiA>*  ppo/val/clipfracDG1"x= }oSiA>*  ppo/val/mean䐿O !{ oSiA>*  ppo/val/varYh؏+K oSiA>*  ppo/val/var_explainedJ3齐'F  oSiA>*  ppo/learning_rateϸ:+K oSiA>*  time/ppo/forward_pass@8>:K.W  oSiA>*!  time/ppo/compute_rewards@: 1 U oSiA>*$ " time/ppo/compute_advantages,;I"n,E oSiA>*  time/ppo/optimize_steppI?j)7_ oSiA>*  time/ppo/calc_statsp >ʑ$B+M I oSiA>*  time/ppo/totalFR?^k%6 oSiA>*  env/reward_mean>) $B+M oSiA>*  env/reward_stdl<8)"x= ZViA?*  objective/klD A@s'F fZViA?*  objective/kl_coefT>#'F ZViA?*  objective/entropy}/m]P ZViA?*" ppo/mean_non_score_reward(C1'%6 ZViA?*  ppo/mean_scores>RQC$B+M ZViA?*  ppo/std_scoresl.W ZViA?*!  time/ppo/compute_rewardsX:: H1 ZViA?*$ " time/ppo/compute_advantages;qv,E 5ZViA?*  time/ppo/optimize_stepH?#)7_ ZViA?*  time/ppo/calc_stats >.]$B+M ZViA?*  time/ppo/total^?tR%6 3ZViA?*  env/reward_mean>F!!$B+M ZViA?*  env/reward_stdl<w4"x= 5(AYiA@*  objective/klD Aj)'F (AYiA@*  objective/kl_coefeT>}4|'F (AYiA@*  objective/entropyl/m]P (AYiA@*" ppo/mean_non_score_rewardM违%6 (AYiA@*  ppo/mean_scores>'2$B+M (AYiA@*  ppo/std_scoresl<-ߒ#wC 2AYiA@*  ppo/val/error98|&sO 2AYiA@*  ppo/val/clipfracd"x= 2AYiA@*  ppo/val/meanneо!{ 2AYiA@*  ppo/val/var8S+K F4AYiA@*  ppo/val/var_explained'F 4AYiA@*  ppo/learning_rateϸ:;%+K =5AYiA@*  time/ppo/forward_pass> .W 5AYiA@*!  time/ppo/compute_rewards:k`1 5AYiA@*$ " time/ppo/compute_advantagesh;wGF,E G6AYiA@*  time/ppo/optimize_stepG?4)7_ 6AYiA@*  time/ppo/calc_stats >Rg$B+M 6AYiA@*  time/ppo/totalއ?&%6 O7AYiA@*  env/reward_mean>3"$B+M 7AYiA@*  env/reward_stdlB'F 0\iAA*  objective/entropyX:D/m]P 0\iAA*" ppo/mean_non_score_rewardx鿎_U %6 0\iAA*  ppo/mean_scoresΜ>8$B+M 0\iAA*  ppo/std_scoresն< -.W *0\iAA*!  time/ppo/compute_rewards:X 1 0\iAA*$ " time/ppo/compute_advantages;Mn,E 0\iAA*  time/ppo/optimize_stepH?U)7_ 9 0\iAA*  time/ppo/calc_statsp > Wz$B+M 0\iAA*  time/ppo/total?V +%6 0\iAA*  env/reward_meanΜ>"$B+M ?!0\iAA*  env/reward_stdն 'F >c_iAB*  objective/entropyެ /m]P Sc_iAB*" ppo/mean_non_score_rewardB鿑M9|%6 hc_iAB*  ppo/mean_scoresΜ>8$B+M zc_iAB*  ppo/std_scoresն<0-.W c_iAB*!  tokens/responses_len_std4%6 c_iAB*  ppo/loss/policy2*)F$B+M d_iAB*  ppo/loss/valueZ9]G$B+M e_iAB*  ppo/loss/total7[h=(pJ Sf_iAB*  ppo/policy/entropy5x$!)7_ f_iAB*  ppo/policy/approxkl6)7_ /g_iAB*  ppo/policy/policyklqY)7_ g_iAB*  ppo/policy/clipfrac0_ g_iAB*# ! ppo/policy/advantages_mean~&sO Sh_iAB*  ppo/returns/mean¿C5%6 h_iAB*  ppo/returns/var9Q#wC i_iAB*  ppo/val/vpred|#wC fi_iAB*  ppo/val/error9F?.&sO i_iAB*  ppo/val/clipfrac%B "x= j_iAB*  ppo/val/mean,n!{ ij_iAB*  ppo/val/varC+K j_iAB*  ppo/val/var_explained d5'F k_iAB*  ppo/learning_rateϸ:3+K tk_iAB*  time/ppo/forward_passp>c"$B+M 0m_iAB*  time/ppo/total?CW%6 m_iAB*  env/reward_meanΜ>߼$B+M m_iAB*  env/reward_stdն<`T"x= aiAC*  objective/klD A*^'F NaiAC*  objective/kl_coef"\U>]l'F vaiAC*  objective/entropy/m]P aiAC*" ppo/mean_non_score_rewardhd%6 aiAC*  ppo/mean_scores>$B+M aiAC*  ppo/std_scoresl.W g%aiAC*!  time/ppo/compute_rewards:MsG1 %aiAC*$ " time/ppo/compute_advantages;Ǥ,E !&aiAC*  time/ppo/optimize_stepL?i)7_ &aiAC*  time/ppo/calc_stats >l=$B+M &aiAC*  time/ppo/total`8?d+%6 5'aiAC*  env/reward_mean>&"$B+M 'aiAC*  env/reward_stdl< ^{"x= diAD*  objective/klD A=u'F fdiAD*  objective/kl_coefU>M4='F diAD*  objective/entropy4/m]P diAD*" ppo/mean_non_score_reward N Z%6 diAD*  ppo/mean_scores>dmu$B+M ȜdiAD*  ppo/std_scoresl<r-*(pJ diAD*  ppo/policy/entropy5dU)7_ ,diAD*  ppo/policy/approxkl)7_ diAD*  ppo/policy/policyklFs)7_ diAD*  ppo/policy/clipfracho0_ OdiAD*# ! ppo/policy/advantages_mean2G0&sO diAD*  ppo/returns/mean¿)ɸ%6 diAD*  ppo/returns/varN[9Ȉ#wC zdiAD*  ppo/val/vpredפ;T#wC ֢diAD*  ppo/val/error3E_9̙&sO 0diAD*  ppo/val/clipfracn6'"x= diAD*  ppo/val/meana[!{ diAD*  ppo/val/varF-+K DdiAD*  ppo/val/var_explained@̞'F diAD*  ppo/learning_rateϸ:f:+9+K diAD*  time/ppo/forward_pass>L.W `diAD*!  time/ppo/compute_rewards@:Zs1 diAD*$ " time/ppo/compute_advantages;F},E diAD*  time/ppo/optimize_step R?&2)7_ rdiAD*  time/ppo/calc_statsz>Qq$B+M ϦdiAD*  time/ppo/totald-?j%6 )diAD*  env/reward_mean>)=$B+M diAD*  env/reward_stdl'F RgiAE*  objective/entropyںn/m]P fgiAE*" ppo/mean_non_score_rewardd鿅*%6 zgiAE*  ppo/mean_scores>]$B+M giAE*  ppo/std_scoresl<+-=.W *giAE*!  time/ppo/compute_rewards:xAh1 giAE*$ " time/ppo/compute_advantages0;wi,E ֺgiAE*  time/ppo/optimize_stepS?)7_ )giAE*  time/ppo/calc_statsi>5&e$B+M |giAE*  time/ppo/totalI?@֨%6 ͻgiAE*  env/reward_mean>6B$B+M giAE*  env/reward_stdl'F jiAF*  objective/entropyr/m]P jiAF*" ppo/mean_non_score_reward#%6 jiAF*  ppo/mean_scores>c$B+M *jiAF*  ppo/std_scoreslުC.W jiAF*!  time/ppo/compute_rewards`:Ę1 jiAF*$ " time/ppo/compute_advantages,(;P4,E jiAF*  time/ppo/optimize_stepS?+$)7_ jiAF*  time/ppo/calc_statsp>;Yrl$B+M jiAF*  time/ppo/total?%6  jiAF*  env/reward_mean>&x$B+M  jiAF*  env/reward_stdl<.,"x= ymiAG*  objective/klD And'F miAG*  objective/kl_coefU>q'F miAG*  objective/entropy (/m]P miAG*" ppo/mean_non_score_rewardt%6 miAG*  ppo/mean_scores>!$B+M :miAG*  ppo/std_scoreslGR.W }miAG*!  time/ppo/compute_rewards:NIw1 DmiAG*$ " time/ppo/compute_advantages8;#.,E miAG*  time/ppo/optimize_stepR?z)7_ miAG*  time/ppo/calc_stats>1dH$B+M LmiAG*  time/ppo/total? u%6 OmiAG*  env/reward_mean>篼$B+M "miAG*  env/reward_stdl#k'F piAH*  objective/entropyV/m]P piAH*" ppo/mean_non_score_rewardV(T%6 piAH*  ppo/mean_scores>|$B+M piAH*  ppo/std_scoresl< :-+.W piAH*!  time/ppo/compute_rewards:1 @piAH*$ " time/ppo/compute_advantages;8kB,E piAH*  time/ppo/optimize_stepT?6+)7_ piAH*  time/ppo/calc_stats 1>{$B+M EpiAH*  time/ppo/total ː?mɋ]%6 piAH*  env/reward_mean>@$B+M piAH*  env/reward_stdl<"x= siAI*  objective/klD An'F (siAI*  objective/kl_coef6.V>B88'F GsiAI*  objective/entropy٫/m]P ]siAI*" ppo/mean_non_score_rewardN%6 psiAI*  ppo/mean_scores>#Fr$B+M siAI*  ppo/std_scoresl-W.W siAI*!  time/ppo/compute_rewards0:sb1 \siAI*$ " time/ppo/compute_advantages;j>,E siAI*  time/ppo/optimize_stepPT?y|D)7_ siAI*  time/ppo/calc_statsK>N$B+M _siAI*  time/ppo/total?hg%6 siAI*  env/reward_mean>$B+M siAI*  env/reward_stdl<{/a]"x= viAJ*  objective/klD A^'F QviAJ*  objective/kl_coefMQV>F'F qviAJ*  objective/entropy/m]P viAJ*" ppo/mean_non_score_rewardu꿊 ?%6 viAJ*  ppo/mean_scores>/n$B+M viAJ*  ppo/std_scoresl< -ۖ.W viAJ*!  time/ppo/compute_rewards:1 viAJ*$ " time/ppo/compute_advantages ;Z,E HviAJ*  time/ppo/optimize_stepXNT?K)7_ viAJ*  time/ppo/calc_stats`>6n$B+M viAJ*  time/ppo/total2?tg>%6 HviAJ*  env/reward_mean>Щ$B+M viAJ*  env/reward_stdl<)[B"x= yiAK*  objective/klD ArB'F yiAK*  objective/kl_coefjtV>p'F yiAK*  objective/entropyɱ8/m]P yiAK*" ppo/mean_non_score_rewardW+dn%6 yiAK*  ppo/mean_scores>ϐ ]$B+M yiAK*  ppo/std_scoresl?)7_ yiAK*  ppo/policy/policykl@a)7_ yiAK*  ppo/policy/clipfrac 0_ yiAK*# ! ppo/policy/advantages_mean@2&sO yiAK*  ppo/returns/meanÿcM%6 yiAK*  ppo/returns/varN[9 #wC yiAK*  ppo/val/vpred`(ÿZ#wC yiAK*  ppo/val/errorT9R~.W byiAK*!  time/ppo/compute_rewards:1 yiAK*$ " time/ppo/compute_advantages<;ֹ,E yiAK*  time/ppo/optimize_step W?)7_ \yiAK*  time/ppo/calc_stats0~>Gv$B+M yiAK*  time/ppo/total?%6 yiAK*  env/reward_mean>{C$B+M RyiAK*  env/reward_stdl/'F r|iAL*  objective/entropy/m]P |iAL*" ppo/mean_non_score_rewardXû%6 |iAL*  ppo/mean_scores>F/b$B+M |iAL*  ppo/std_scoresl< R(-Be.W %|iAL*!  time/ppo/compute_rewards:ҽP1 K&|iAL*$ " time/ppo/compute_advantages ;l,E &|iAL*  time/ppo/optimize_step[S?E `)7_ '|iAL*  time/ppo/calc_statsl>F}$B+M '|iAL*  time/ppo/total"?Z%6 '|iAL*  env/reward_mean>< $B+M '|iAL*  env/reward_stdl< "x= ,iAM*  objective/klD Aӧf'F ,iAM*  objective/kl_coefV>q'F -iAM*  objective/entropyq/m]P 7-iAM*" ppo/mean_non_score_rewardgILhy%6 L-iAM*  ppo/mean_scores>J$B+M `-iAM*  ppo/std_scoresl<.4-G.W 5iAM*!  time/ppo/compute_rewards:{i1 F6iAM*$ " time/ppo/compute_advantages@&;0,E 6iAM*  time/ppo/optimize_stepQX?YmB)7_ 6iAM*  time/ppo/calc_stats>0$B+M E7iAM*  time/ppo/totalɓ?V%6 7iAM*  env/reward_mean>$B+M 7iAM*  env/reward_stdl<"x= wpiAN*  objective/klD A}'F piAN*  objective/kl_coefV>Z 'F qiAN*  objective/entropyH6/m]P qiAN*" ppo/mean_non_score_reward뿲n%6 -qiAN*  ppo/mean_scores>%Z$B+M @qiAN*  ppo/std_scoresl< -.W ziAN*!  time/ppo/compute_rewards0:a1 fziAN*$ " time/ppo/compute_advantages ;,E ziAN*  time/ppo/optimize_stepmT?y)7_ {iAN*  time/ppo/calc_stats>77$B+M e{iAN*  time/ppo/total?۷!%6 {iAN*  env/reward_mean>2$B+M |iAN*  env/reward_stdle,'F iAO*  objective/entropy0/m]P %iAO*" ppo/mean_non_score_rewardg5"%6 8iAO*  ppo/mean_scores>mp-$B+M JiAO*  ppo/std_scoresl< B-#wC iAO*  ppo/val/errortkT9'{&sO iAO*  ppo/val/clipfracc/"x= iAO*  ppo/val/meanÿ4!{ !iAO*  ppo/val/varW.+K iAO*  ppo/val/var_explained(T,X.W iAO*!  time/ppo/compute_rewards(:jT1 8iAO*$ " time/ppo/compute_advantages<;x3,E iAO*  time/ppo/optimize_stepDS??])7_ iAO*  time/ppo/calc_statsW>ݘ$B+M _iAO*  time/ppo/totalfʐ?l|%6 iAO*  env/reward_mean>Ri$B+M iAO*  env/reward_stdl< "x= iAP*  objective/klD A|'F iAP*  objective/kl_coefR$W>8@j'F =iAP*  objective/entropyI/m]P TiAP*" ppo/mean_non_score_reward[U?%6 jiAP*  ppo/mean_scores>!$B+M |iAP*  ppo/std_scoresl<%-`0_ iAP*# ! ppo/policy/advantages_mean2"4&sO iAP*  ppo/returns/meanÿZ%6 iAP*  ppo/returns/varN[9 #wC iAP*  ppo/val/vpredlÿpQ#wC >#iAP*  ppo/val/errorT9 &sO k#iAP*  ppo/val/clipfracHǛ"x= %iAP*  ppo/val/meanÿR.!{ %iAP*  ppo/val/var~+K %iAP*  ppo/val/var_explained<='F %iAP*  ppo/learning_rateϸ: Gn+K %iAP*  time/ppo/forward_pass!>W,%.W &iAP*!  time/ppo/compute_rewards:1 &iAP*$ " time/ppo/compute_advantages;H,E .&iAP*  time/ppo/optimize_stepW?])7_ &iAP*  time/ppo/calc_stats> }$B+M &iAP*  time/ppo/totalL?L %6 K'iAP*  env/reward_mean>˹$B+M 'iAP*  env/reward_stdl~n'F TiAQ*  objective/entropyC1/m]P jiAQ*" ppo/mean_non_score_rewardƛy%6 iAQ*  ppo/mean_scores>XAX$B+M iAQ*  ppo/std_scoresl<2- iAQ*  ppo/policy/entropy5g)7_ iAQ*  ppo/policy/approxkl)7_ !iAQ*  ppo/policy/policykl=)7_ \!iAQ*  ppo/policy/clipfracw 0_ !iAQ*# ! ppo/policy/advantages_mean@2mo>&sO "iAQ*  ppo/returns/meanÿ@4%6 u"iAQ*  ppo/returns/varN[9bj#wC "iAQ*  ppo/val/vpreddÿB#wC *#iAQ*  ppo/val/errorYT9g&sO #iAQ*  ppo/val/clipfracb"x= #iAQ*  ppo/val/meanÿ`!{ P$iAQ*  ppo/val/vardX+K $iAQ*  ppo/val/var_explained@ VC.W %iAQ*!  time/ppo/compute_rewards(:2g1 &iAQ*$ " time/ppo/compute_advantagesL;,E p&iAQ*  time/ppo/optimize_stepDX?E)7_ &iAQ*  time/ppo/calc_statsC>e:)$B+M W'iAQ*  time/ppo/totalND?E%6 j'iAQ*  env/reward_mean>A$B+M 'iAQ*  env/reward_stdl޻'F iAR*  objective/entropy!P/m]P iAR*" ppo/mean_non_score_reward6 %6 iAR*  ppo/mean_scores>Fl$B+M iAR*  ppo/std_scoresl.W ="iAR*!  time/ppo/compute_rewardsp:"l1 "iAR*$ " time/ppo/compute_advantages$";),E #iAR*  time/ppo/optimize_stepS?'>[)7_ [#iAR*  time/ppo/calc_stats >֤$B+M #iAR*  time/ppo/total?j|%6 #iAR*  env/reward_mean>c)$B+M P$iAR*  env/reward_stdl,'F iAS*  objective/entropy /m]P iAS*" ppo/mean_non_score_reward/%6 iAS*  ppo/mean_scores>hQ$B+M iAS*  ppo/std_scoresl<.W iAS*!  time/ppo/compute_rewardsX:1 iAS*$ " time/ppo/compute_advantages;pPx,E 2iAS*  time/ppo/optimize_stepS?UK)7_ iAS*  time/ppo/calc_statsp>x$B+M iAS*  time/ppo/total?(%6 :iAS*  env/reward_mean>06$B+M iAS*  env/reward_stdlP'F YiAT*  objective/entropyb/m]P YiAT*" ppo/mean_non_score_rewardU0B%6 YiAT*  ppo/mean_scores>U1$B+M ZiAT*  ppo/std_scoreslu].W MiiAT*!  time/ppo/compute_rewards:=&1 diiAT*$ " time/ppo/compute_advantagesp;,E iiAT*  time/ppo/optimize_steplR?R+q$B+M jiAT*  time/ppo/totalb?xpl%6 jiAT*  env/reward_mean>2]d$B+M okiAT*  env/reward_stdl ,Vdd$B+M jiAU*  ppo/std_scoresն<.h-/"x= >iAU*  ppo/val/meanĿPk!{ QiAU*  ppo/val/var+K diAU*  ppo/val/var_explained/<''F viAU*  ppo/learning_rateϸ:ȇ%#+K iAU*  time/ppo/forward_pass >EJ].W iAU*!  time/ppo/compute_rewards:H1 iAU*$ " time/ppo/compute_advantagesh;?m3,E IiAU*  time/ppo/optimize_stepuV? ()7_ \iAU*  time/ppo/calc_stats">tK$B+M iAU*  time/ppo/total7?^%6 iAU*  env/reward_meanΜ>0$B+M liAU*  env/reward_stdն<9("x= _iAV*  objective/klD AK9h'F `iAV*  objective/kl_coef'W>q'F 3`iAV*  objective/entropyl/m]P G`iAV*" ppo/mean_non_score_rewardCO7%6 Y`iAV*  ppo/mean_scores>~$B+M i`iAV*  ppo/std_scoresl<2é-R,.W miAV*!  time/ppo/compute_rewardsx:B#1 niAV*$ " time/ppo/compute_advantages;OI,E uniAV*  time/ppo/optimize_stepS?(Yƾ)7_ niAV*  time/ppo/calc_statsЂ>hw$B+M oiAV*  time/ppo/totalX?#B%6 xoiAV*  env/reward_mean>O8$B+M oiAV*  env/reward_stdl<]p"x= iAW*  objective/klD A}!'F > iAW*  objective/kl_coefX>'F ] iAW*  objective/entropy9*/m]P u iAW*" ppo/mean_non_score_rewardcj쿔x%6 iAW*  ppo/mean_scores>% H$B+M iAW*  ppo/std_scoresl</-?U)7_ hiAW*  ppo/policy/approxklqF)7_ |iAW*  ppo/policy/policyklKR)7_ iAW*  ppo/policy/clipfrac20_ iAW*# ! ppo/policy/advantages_meanoQ&&sO iAW*  ppo/returns/meanĿ'9u %6 iAW*  ppo/returns/var&[9%#wC iAW*  ppo/val/vpredĿ'0#wC iAW*  ppo/val/error`]9&sO iAW*  ppo/val/clipfracT"x= iAW*  ppo/val/meanieĿzN!{ iAW*  ppo/val/varʝ+K iAW*  ppo/val/var_explainedVw@o'F iAW*  ppo/learning_rateϸ:tb+K iAW*  time/ppo/forward_passP>YNT.W iAW*!  time/ppo/compute_rewards:Ax81 EiAW*$ " time/ppo/compute_advantages; j+,E iAW*  time/ppo/optimize_stepQ?,$)7_ iAW*  time/ppo/calc_stats @>Ckx6$B+M OiAW*  time/ppo/totalx?4%6 iAW*  env/reward_mean>$B+M iAW*  env/reward_stdlX>\'F {iAX*  objective/entropyT9/m]P {iAX*" ppo/mean_non_score_reward쿜%6 {iAX*  ppo/mean_scores>_$B+M |iAX*  ppo/std_scoresl<$hp-I(pJ ~iAX*  ppo/policy/entropy5v`#)7_ (iAX*  ppo/policy/approxkld[&)7_ iAX*  ppo/policy/policykl2)7_ iAX*  ppo/policy/clipfracM0_ 5iAX*# ! ppo/policy/advantages_meanv&sO iAX*  ppo/returns/mean ſVG%6 iAX*  ppo/returns/var&[9Y#wC GiAX*  ppo/val/vpredĿj#wC iAX*  ppo/val/errorW9L&sO iAX*  ppo/val/clipfracj "x= FiAX*  ppo/val/mean9ſ!5!{ iAX*  ppo/val/var;^1+K 0iAX*  ppo/val/var_explainedV<['F CiAX*  ppo/learning_rateϸ:x+K iAX*  time/ppo/forward_pass >e .W iAX*!  time/ppo/compute_rewards`:9h1 RiAX*$ " time/ppo/compute_advantagesT;;^W,E iAX*  time/ppo/optimize_stepȥR?_#L)7_ iAX*  time/ppo/calc_stats>J$B+M WiAX*  time/ppo/totalJ?ꚙ%6 iAX*  env/reward_mean>Dh$$B+M iAX*  env/reward_stdl,'F LiAY*  objective/entropy~P/m]P 'LiAY*" ppo/mean_non_score_rewardo%6 8LiAY*  ppo/mean_scores>|0+$B+M JLiAY*  ppo/std_scoresl<*-.W uXiAY*!  time/ppo/compute_rewardsd;.{1 XiAY*$ " time/ppo/compute_advantagesp%;_,E +YiAY*  time/ppo/optimize_stephAR?rZ)7_ ~YiAY*  time/ppo/calc_stats >0R$B+M YiAY*  time/ppo/totala?u%6 $ZiAY*  env/reward_mean>=y$B+M |ZiAY*  env/reward_stdl<^|"x= ZiAZ*  objective/klD AJ5'F &[iAZ*  objective/kl_coefԅX>#['F F[iAZ*  objective/entropy/m]P ][iAZ*" ppo/mean_non_score_reward쿆l%6 s[iAZ*  ppo/mean_scores>%$.$B+M [iAZ*  ppo/std_scoresl<Ğm-Wſ v%6 v`iAZ*  ppo/returns/varN[9zZb#wC `iAZ*  ppo/val/vpred7ſ7#wC biAZ*  ppo/val/errorXU9Le&sO biAZ*  ppo/val/clipfracpU"x= biAZ*  ppo/val/meanUſC!{ ciAZ*  ppo/val/varHʏ+K ciAZ*  ppo/val/var_explained`JciAZ*  time/ppo/forward_pass$>Vm>a.W OciAZ*!  time/ppo/compute_rewards@:o1 aciAZ*$ " time/ppo/compute_advantages;*Y,E ciAZ*  time/ppo/optimize_stepS?h>pt)7_ diAZ*  time/ppo/calc_stats06>+$B+M gdiAZ*  time/ppo/totalj.?}a%6 diAZ*  env/reward_mean>.`$B+M eiAZ*  env/reward_stdlsh'F 3iA[*  objective/entropyRv/m]P GiA[*" ppo/mean_non_score_rewardy'u%6 ZiA[*  ppo/mean_scores>4!t$B+M kiA[*  ppo/std_scoresl<-Gz-($B+M iA[*  ppo/loss/valueS8Fr$B+M iA[*  ppo/loss/totalB@+7(pJ iA[*  ppo/policy/entropy59)7_ eiA[*  ppo/policy/approxkl6")7_ ȥiA[*  ppo/policy/policyklߝ)7_ iA[*  ppo/policy/clipfrac,80_ iA[*# ! ppo/policy/advantages_meanrK&sO ĦiA[*  ppo/returns/mean~ſY%6 iA[*  ppo/returns/var%[9#wC xiA[*  ppo/val/vpredſX#wC ϧiA[*  ppo/val/errorSU9N&sO %iA[*  ppo/val/clipfrac\"x= }iA[*  ppo/val/meancſm=!{ ۨiA[*  ppo/val/var̡&+K 4iA[*  ppo/val/var_explained@^<'F iA[*  ppo/learning_rateϸ:r+K 멇iA[*  time/ppo/forward_pass@>".W GiA[*!  time/ppo/compute_rewards:J!1 iA[*$ " time/ppo/compute_advantages;ҦB,E iA[*  time/ppo/optimize_stepT?cC-)7_ iA[*  time/ppo/calc_stats>v$B+M iA[*  time/ppo/totalː?H[%6 iA[*  env/reward_mean>_$B+M diA[*  env/reward_stdl<_"x= FŋiA\*  objective/klD AU'F ŋiA\*  objective/kl_coefX>'F ŋiA\*  objective/entropyZ/m]P ŋiA\*" ppo/mean_non_score_rewardO, %6 ŋiA\*  ppo/mean_scores>i $B+M ƋiA\*  ppo/std_scoreslƋiA\*" tokens/responses_len_mean?q.W QƋiA\*!  tokens/responses_len_std4%6 dƋiA\*  ppo/loss/policy1`$B+M ʋiA\*  ppo/loss/valuez+8G$B+M ,ʋiA\*  ppo/loss/total.*7g(pJ DʋiA\*  ppo/policy/entropy5\)7_ YʋiA\*  ppo/policy/approxklwf-)7_ mʋiA\*  ppo/policy/policyklcD&)7_ ʋiA\*  ppo/policy/clipfracα0_ ʋiA\*# ! ppo/policy/advantages_meanKT&sO ʋiA\*  ppo/returns/meanſ`m%9%6 ʋiA\*  ppo/returns/var&[9).#wC $ˋiA\*  ppo/val/vpredſ2#wC ˋiA\*  ppo/val/errorz+U9h-Y&sO ˋiA\*  ppo/val/clipfracɁ"x= 4̋iA\*  ppo/val/meanſ C!{ ̋iA\*  ppo/val/var*-+K ̋iA\*  ppo/val/var_explained<(.'F @͋iA\*  ppo/learning_rateϸ:V+K ͋iA\*  time/ppo/forward_pass>1<.W ͋iA\*!  time/ppo/compute_rewards :51 U΋iA\*$ " time/ppo/compute_advantagesX;$y6h,E ΋iA\*  time/ppo/optimize_step0T?3[v)7_ ϋiA\*  time/ppo/calc_stats>C$B+M WϋiA\*  time/ppo/total?%6 ϋiA\*  env/reward_mean>U9*$B+M ЋiA\*  env/reward_stdlCY~'F ~8iA]*  objective/entropy&r/m]P 8iA]*" ppo/mean_non_score_reward*S,kz%6 8iA]*  ppo/mean_scores>n|i$B+M 8iA]*  ppo/std_scoresliA]*  ppo/loss/total)7#'(pJ @iA]*  ppo/policy/entropy5)7_ @iA]*  ppo/policy/approxkle)7_ @iA]*  ppo/policy/policyklO)7_ BiA]*  ppo/policy/clipfrac0_ CiA]*# ! ppo/policy/advantages_mean2&sO %CiA]*  ppo/returns/meanſ3%6 CiA]*  ppo/returns/varN[9o#wC DiA]*  ppo/val/vpredſD#wC &EiA]*  ppo/val/errorT9 &sO EiA]*  ppo/val/clipfrac'@"x= FiA]*  ppo/val/meanſ(!{ CGiA]*  ppo/val/varxI+K GiA]*  ppo/val/var_explained <>' 'F HiA]*  ppo/learning_rateϸ:q+K 3IiA]*  time/ppo/forward_pass`"> |.W IiA]*!  time/ppo/compute_rewards`:F*1 xJiA]*$ " time/ppo/compute_advantagesO; ,E KiA]*  time/ppo/optimize_stepE$B+M LLiA]*  time/ppo/totalб?'%6 LiA]*  env/reward_mean>p$B+M MiA]*  env/reward_stdl<_"x= զiA^*  objective/klD Aڙ'F DiA^*  objective/kl_coefY>r'F aiA^*  objective/entropyfM/m]P viA^*" ppo/mean_non_score_reward zu%6 iA^*  ppo/mean_scores>'1"$B+M iA^*  ppo/std_scoreslwl..W LiA^*!  time/ppo/compute_rewardsX:1 麀iA^*$ " time/ppo/compute_advantagesD;,E iA^*  time/ppo/optimize_stepT?)7_ "iA^*  time/ppo/calc_statsp >e($B+M iA^*  time/ppo/totalɐ??%6 ZiA^*  env/reward_mean>I>$B+M iA^*  env/reward_stdl'F iA_*  objective/entropyݺ/m]P iA_*" ppo/mean_non_score_rewarde%6 iA_*  ppo/mean_scores>$B+M iA_*  ppo/std_scoresl<:!-xc6.W iA_*!  time/ppo/compute_rewards:\M1 SiA_*$ " time/ppo/compute_advantages;3,E iA_*  time/ppo/optimize_stepxS? )7_ iA_*  time/ppo/calc_statsП>ְ#$B+M iA_*  time/ppo/total ?.%6 iA_*  env/reward_mean>u$B+M N iA_*  env/reward_stdl<VH"x= :viA`*  objective/klD Al+'F viA`*  objective/kl_coef[Y><'F viA`*  objective/entropynq/m]P viA`*" ppo/mean_non_score_reward"%6 viA`*  ppo/mean_scores>ڄt$B+M viA`*  ppo/std_scoreslO)7_ viA`*  ppo/policy/approxkl 9z)7_ }viA`*  ppo/policy/policyklD )7_ viA`*  ppo/policy/clipfracz6W0_ viA`*# ! ppo/policy/advantages_mean@2-&sO \viA`*  ppo/returns/meanx@ƿ}%6 viA`*  ppo/returns/varN[9š #wC viA`*  ppo/val/vpred8ƿ#wC 4viA`*  ppo/val/errorT9Ƹd&sO viA`*  ppo/val/clipfracC"x= ` viA`*  ppo/val/meanƿ=@!{ viA`*  ppo/val/var *K+K !viA`*  ppo/val/var_explained@<'F 1"viA`*  ppo/learning_rateϸ:8%+K "viA`*  time/ppo/forward_pass`>W.W m#viA`*!  time/ppo/compute_rewards:1 $viA`*$ " time/ppo/compute_advantages;n4,E $viA`*  time/ppo/optimize_step`T?`G)7_ :%viA`*  time/ppo/calc_stats`w>\$B+M %viA`*  time/ppo/totalِ?,%6 t&viA`*  env/reward_mean>$B+M 'viA`*  env/reward_stdl<) "x= /niAa*  objective/klD A>'F N0niAa*  objective/kl_coef~Y>r'F {0niAa*  objective/entropyeQc/m]P 0niAa*" ppo/mean_non_score_reward!%6 0niAa*  ppo/mean_scores>V $B+M 0niAa*  ppo/std_scoresl<`:-#wC :niAa*  ppo/val/vpredhƿzZ#wC ;niAa*  ppo/val/error@6T9hHl\&sO i;niAa*  ppo/val/clipfrac] "x= ;niAa*  ppo/val/mean`ƿ!!{ .W =niAa*!  time/ppo/compute_rewards:O{1 =niAa*$ " time/ppo/compute_advantages9;k_,E 6>niAa*  time/ppo/optimize_stepOT??x|)7_ >niAa*  time/ppo/calc_stats>r$B+M >niAa*  time/ppo/totala?;v[%6 =?niAa*  env/reward_mean>G$B+M ?niAa*  env/reward_stdl<+2"x= jiAb*  objective/klD A^,'F jiAb*  objective/kl_coefDY>vb'F jiAb*  objective/entropy?c/m]P jiAb*" ppo/mean_non_score_rewardw[%6 HjiAb*  ppo/mean_scores>]}x$B+M ljiAb*  ppo/std_scoreslv`.W jiAb*!  time/ppo/compute_rewards:N1 NjiAb*$ " time/ppo/compute_advantages;(,E jiAb*  time/ppo/optimize_steplU?/)7_ jiAb*  time/ppo/calc_stats >ٌ$B+M OjiAb*  time/ppo/total ?%6 jiAb*  env/reward_mean>%G$B+M jiAb*  env/reward_stdl<&f"x= ٮ\iAc*  objective/klD A'F H\iAc*  objective/kl_coefY>H'F d\iAc*  objective/entropy:/m]P |\iAc*" ppo/mean_non_score_rewardBu$B+M \iAc*  ppo/std_scoresl)7_ ٴ\iAc*  ppo/policy/clipfrac>M0_ \iAc*# ! ppo/policy/advantages_mean?&sO \iAc*  ppo/returns/meanlƿ>y%6 \iAc*  ppo/returns/var&[9!ߡ#wC \iAc*  ppo/val/vpred,ƿ?#wC \iAc*  ppo/val/errorZHT9i&sO d\iAc*  ppo/val/clipfracf3"x= \iAc*  ppo/val/meanƿb!{ "\iAc*  ppo/val/var4W=G+K \iAc*  ppo/val/var_explained$Љ.W \iAc*!  time/ppo/compute_rewards:$a;1 \iAc*$ " time/ppo/compute_advantages';Ƣ,E W\iAc*  time/ppo/optimize_stepHO?TY)7_ \iAc*  time/ppo/calc_stats`>YQ$B+M \iAc*  time/ppo/totalx?L%6 c\iAc*  env/reward_mean>ܝ$B+M \iAc*  env/reward_stdl< "x= JiAd*  objective/klD A{_('F 0JiAd*  objective/kl_coefY>K'F MJiAd*  objective/entropy~/m]P dJiAd*" ppo/mean_non_score_rewardcц:%6 xJiAd*  ppo/mean_scores>\}$B+M JiAd*  ppo/std_scoresl .W JiAd*!  time/ppo/compute_rewards:%,<1 `JiAd*$ " time/ppo/compute_advantagesl;5 F0,E JiAd*  time/ppo/optimize_step\-Q?B^J)7_ JiAd*  time/ppo/calc_stats>a_$B+M qJiAd*  time/ppo/total:B?TX%6 JiAd*  env/reward_mean>y$B+M JiAd*  env/reward_stdl<_a#"x= >iAe*  objective/klD Ao'F ->iAe*  objective/kl_coefN Z>>_O#'F N>iAe*  objective/entropy%0/m]P e>iAe*" ppo/mean_non_score_rewardd|%6 y>iAe*  ppo/mean_scores>N8$B+M >iAe*  ppo/std_scoresl<8~-iAe*  tokens/queries_len_meanB\=,E >iAe*  tokens/queries_len_stdBŸ/m]P >iAe*" tokens/responses_len_mean??;.W >iAe*!  tokens/responses_len_stdD%6 >iAe*  ppo/loss/policy\}$B+M 4>iAe*  ppo/loss/valuePg8d.$B+M `>iAe*  ppo/loss/total@)7(pJ >iAe*  ppo/policy/entropy5 D)7_ H>iAe*  ppo/policy/approxkl[Fl)7_ >iAe*  ppo/policy/policyklgF)7_ >iAe*  ppo/policy/clipfrac1P0_ \>iAe*# ! ppo/policy/advantages_mean25.6&sO >iAe*  ppo/returns/meanǿ^m%6 >iAe*  ppo/returns/varN[9AS1#wC p>iAe*  ppo/val/vpredƿ# #wC >iAe*  ppo/val/errorPgT9T&sO >iAe*  ppo/val/clipfrac"x= t>iAe*  ppo/val/meanƿm!{ >iAe*  ppo/val/var?ħ+K R>iAe*  ppo/val/var_explained`<|9'F >iAe*  ppo/learning_rateϸ:zN+K >iAe*  time/ppo/forward_pass0T>v.W Q>iAe*!  time/ppo/compute_rewardsL;61 >iAe*$ " time/ppo/compute_advantages<%;,,E >iAe*  time/ppo/optimize_stepX?y}3)7_ T>iAe*  time/ppo/calc_stats >^8$B+M >iAe*  time/ppo/total4?%6 >iAe*  env/reward_mean>Ymj $B+M O>iAe*  env/reward_stdlٻ*'F AiAf*  objective/entropyR4 /m]P AiAf*" ppo/mean_non_score_rewardz9/%6 AiAf*  ppo/mean_scores>$B+M AiAf*  ppo/std_scoreslU|.W BiAf*!  time/ppo/compute_rewards:( 1 MBiAf*$ " time/ppo/compute_advantagesH;;M2,E BiAf*  time/ppo/optimize_stepbW?I)7_ BiAf*  time/ppo/calc_statsP>wϷ$B+M M BiAf*  time/ppo/total ?%#%6 BiAf*  env/reward_mean>6'&$B+M BiAf*  env/reward_stdl<7¬"x= "3iAg*  objective/klD A'F 3iAg*  objective/kl_coefTZ>$'F 3iAg*  objective/entropyϽ1/m]P 3iAg*" ppo/mean_non_score_rewardףp%6 3iAg*  ppo/mean_scores>n'e"$B+M 3iAg*  ppo/std_scoreslO.W {3iAg*!  time/ppo/compute_rewards:71 3iAg*$ " time/ppo/compute_advantagesH;xI,E ,3iAg*  time/ppo/optimize_step$_S?G)u)7_ 3iAg*  time/ppo/calc_stats@>E+$B+M 3iAg*  time/ppo/totalЛ?ԟ%6 J3iAg*  env/reward_mean>Ĵ$B+M 3iAg*  env/reward_stdlmw'F kk(iAh*  objective/entropyc/m]P k(iAh*" ppo/mean_non_score_reward_O枌%6 k(iAh*  ppo/mean_scores@@$B+M k(iAh*  ppo/std_scores- .W y(iAh*!  time/ppo/compute_rewards0:_j1 z(iAh*$ " time/ppo/compute_advantages/;M,E z(iAh*  time/ppo/optimize_step,d?k&)7_ z(iAh*  time/ppo/calc_stats>O$B+M p{(iAh*  time/ppo/totald"?Ҭ%6 {(iAh*  env/reward_mean@@-$B+M *|(iAh*  env/reward_std"x= tiAi*  objective/kl5A9q4'F iAi*  objective/kl_coefXZ>42'F iAi*  objective/entropynK+*/m]P iAi*" ppo/mean_non_score_reward6e'j%6 *iAi*  ppo/mean_scores@@`$B+M ;iAi*  ppo/std_scores-9.W iAi*!  tokens/responses_len_stdwdS%6 iAi*  ppo/loss/policy g$B+M 2iAi*  ppo/loss/valueb>`~$B+M `iAi*  ppo/loss/totalp<=qN(pJ yiAi*  ppo/policy/entropyA9t])7_ iAi*  ppo/policy/approxkl~E/#)7_ iAi*  ppo/policy/policykl:=7'z{)7_ iAi*  ppo/policy/clipfrac,80_ iAi*# ! ppo/policy/advantages_mean}oC&sO iAi*  ppo/returns/mean[@%6 iAi*  ppo/returns/var #wC iAi*  ppo/val/vpredam@̮a#wC iAi*  ppo/val/errork> &sO LiAi*  ppo/val/clipfrac?FU"x= iAi*  ppo/val/mean݀@ea!{ iAi*  ppo/val/var݀+K YiAi*  ppo/val/var_explained'F iAi*  ppo/learning_rateϸ:.p+K iAi*  time/ppo/forward_passO1>yq.W hiAi*!  time/ppo/compute_rewardsx:Dlc1 iAi*$ " time/ppo/compute_advantages(-;h}i,E *iAi*  time/ppo/optimize_step[e??շ)7_ iAi*  time/ppo/calc_stats0> p*$B+M iAi*  time/ppo/total5AT@Z'F uiAj*  objective/kl_coef)Z>ƽ'F uiAj*  objective/entropy 9P/m]P uiAj*" ppo/mean_non_score_rewardzz߳%6 uiAj*  ppo/mean_scores@@+$B+M viAj*  ppo/std_scores]->.W ~iAj*!  time/ppo/compute_rewards:1 iAj*$ " time/ppo/compute_advantages);pD,E viAj*  time/ppo/optimize_stepVb?Wv)7_ iAj*  time/ppo/calc_stats>$B+M !iAj*  time/ppo/total͙?QѤz%6 xiAj*  env/reward_mean@@k嵷$B+M ΀iAj*  env/reward_stdxI "x= ziAk*  objective/kl4A['F ziAk*  objective/kl_coefZ>٩'F "ziAk*  objective/entropy \9/m]P 9ziAk*" ppo/mean_non_score_reward4(%6 NziAk*  ppo/mean_scores@@$B+M dziAk*  ppo/std_scoresh-/m]P ziAk*" tokens/responses_len_mean?c.W ziAk*!  tokens/responses_len_std;"x= ziAk*  ppo/val/mean8t@!{ $ziAk*  ppo/val/varb1+K ziAk*  ppo/val/var_explainedRbԼ6u'F ziAk*  ppo/learning_rateϸ:x+K sziAk*  time/ppo/forward_pass@4>'+Q.W ziAk*!  time/ppo/compute_rewards:(1 ziAk*$ " time/ppo/compute_advantages);7,E ?ziAk*  time/ppo/optimize_stepk?(%F`)7_ ziAk*  time/ppo/calc_stats@X>'>u$B+M ziAk*  time/ppo/totalH?0N%6 NziAk*  env/reward_mean@@,x$B+M ziAk*  env/reward_std&e"x= AiAl*  objective/kl/A7'F rAiAl*  objective/kl_coef[>0L 'F AiAl*  objective/entropyҬ:k/m]P AiAl*" ppo/mean_non_score_reward1.%6 AiAl*  ppo/mean_scores@@a$B+M ȵAiAl*  ppo/std_scores! a -H/.W ϽAiAl*!  time/ppo/compute_rewards ;F1 4AiAl*$ " time/ppo/compute_advantages);K",E AiAl*  time/ppo/optimize_stepe?ϫ)7_ ھAiAl*  time/ppo/calc_stats`!>|s$B+M .AiAl*  time/ppo/totalڜ?EP%6 AiAl*  env/reward_mean@@ӫ$B+M ׿AiAl*  env/reward_stdV"x= KOYiAm*  objective/klA K'F OYiAm*  objective/kl_coef+[>}K'F OYiAm*  objective/entropy>k/m]P OYiAm*" ppo/mean_non_score_reward!˩%6 OYiAm*  ppo/mean_scores@AQ$B+M PYiAm*  ppo/std_scoresWh?-/m]P FPYiAm*" tokens/responses_len_mean?%.W XPYiAm*!  tokens/responses_len_std5>q\%6 jPYiAm*  ppo/loss/policy,Dq$B+M RYiAm*  ppo/loss/valuelz?g$B+M SYiAm*  ppo/loss/totalţ=IL(pJ SYiAm*  ppo/policy/entropy/;?)7_ ^UYiAm*  ppo/policy/approxklF?0E5)7_ UYiAm*  ppo/policy/policykl!>(6)7_ UYiAm*  ppo/policy/clipfrac.<%;0_ UYiAm*# ! ppo/policy/advantages_mean>9`&sO UYiAm*  ppo/returns/meanS4G@.WB%6 UYiAm*  ppo/returns/varƿ?0#wC UYiAm*  ppo/val/vpredz~r@#wC JVYiAm*  ppo/val/errorl?V&sO VYiAm*  ppo/val/clipfracU"x= WYiAm*  ppo/val/meanS\@:!{ XWYiAm*  ppo/val/varb[7ОF+K WYiAm*  ppo/val/var_explained@G,'F XYiAm*  ppo/learning_rateϸ:.О+K bXYiAm*  time/ppo/forward_pass0>yqh$B+M ZYiAm*  time/ppo/totalJ?sr%6 iZYiAm*  env/reward_mean@[[$B+M ZYiAm*  env/reward_stdWh?z#"x= j iAn*  objective/kl5A7{'F Uk iAn*  objective/kl_coefO[>wT'F ok iAn*  objective/entropyk/m]P k iAn*" ppo/mean_non_score_reward$G%6 k iAn*  ppo/mean_scores@@_$B+M k iAn*  ppo/std_scoresc'"-i,E k iAn*  tokens/queries_len_stdj-/m]P k iAn*" tokens/responses_len_mean?Kxaj.W k iAn*!  tokens/responses_len_std{%6 k iAn*  ppo/loss/policyy$B+M n iAn*  ppo/loss/valuer >W+"$B+M r iAn*  ppo/loss/totalb<(pJ r iAn*  ppo/policy/entropy4?3)7_ r iAn*  ppo/policy/approxkl&Ny)7_ s iAn*  ppo/policy/policykl5)7_ s iAn*  ppo/policy/clipfracRut0_ -s iAn*# ! ppo/policy/advantages_mean2ZT&sO Ds iAn*  ppo/returns/meankZ@[_%6 Vs iAn*  ppo/returns/varK5#wC hs iAn*  ppo/val/vpred*;@#wC xs iAn*  ppo/val/errorr؍> h&sO u iAn*  ppo/val/clipfracyo"x= u iAn*  ppo/val/meanM@!{ u iAn*  ppo/val/var:R+K v iAn*  ppo/val/var_explained:'F v iAn*  ppo/learning_rateϸ:+K #v iAn*  time/ppo/forward_passC(>[*-.W Py iAn*!  time/ppo/compute_rewardsH:٢c1 yy iAn*$ " time/ppo/compute_advantages#;.,E y iAn*  time/ppo/optimize_step g?0)7_ y iAn*  time/ppo/calc_statsP!>/J$B+M y iAn*  time/ppo/totalP?+K%6 y iAn*  env/reward_mean@@o6&0$B+M y iAn*  env/reward_std)"x= -iAo*  objective/kl5Aڀ'F iAo*  objective/kl_coefs[>o>/'F iAo*  objective/entropy+-/m]P iAo*" ppo/mean_non_score_reward—!%6 iAo*  ppo/mean_scores@@HS~o$B+M iAo*  ppo/std_scores y-R&sO iAo*  ppo/val/clipfrac3["x= iAo*  ppo/val/mean-fW@ e!{ yiAo*  ppo/val/var!)ot{+K iAo*  ppo/val/var_explained'F (iAo*  ppo/learning_rateϸ:M +K 2iAo*  time/ppo/forward_pass0.>3.W `iAo*!  time/ppo/compute_rewards:c1 iAo*$ " time/ppo/compute_advantages(*;F/,E )iAo*  time/ppo/optimize_step Ff?W)7_ iAo*  time/ppo/calc_statsw!>-f$B+M iAo*  time/ppo/total$?`>%6 <iAo*  env/reward_mean@@$B+M iAo*  env/reward_std)zx"x= [iAp*  objective/kl5A'F Q\iAp*  objective/kl_coef[>*og'F o\iAp*  objective/entropyx/m]P \iAp*" ppo/mean_non_score_reward|%6 \iAp*  ppo/mean_scores@@H$B+M \iAp*  ppo/std_scores*TF-*:.W eiAp*!  time/ppo/compute_rewards:[1 meiAp*$ " time/ppo/compute_advantages,;M9,E eiAp*  time/ppo/optimize_steptf?9[)7_ fiAp*  time/ppo/calc_statsP!>vFIZ$B+M rfiAp*  time/ppo/total?u%6 fiAp*  env/reward_mean@@"f$B+M %giAp*  env/reward_std_"x= )->iAq*  objective/kl5AH|)'F ->iAq*  objective/kl_coef[>'F ->iAq*  objective/entropyq/m]P ->iAq*" ppo/mean_non_score_rewardtA3%6 ->iAq*  ppo/mean_scores@@5n $B+M ->iAq*  ppo/std_scores8=-iAq*  tokens/queries_len_meanB,E .>iAq*  tokens/queries_len_stdJ"/m]P .>iAq*" tokens/responses_len_mean?].W /.>iAq*!  tokens/responses_len_std*%6 @.>iAq*  ppo/loss/policy-$B+M X/>iAq*  ppo/loss/value<;$B+M u0>iAq*  ppo/loss/total?;!Y(pJ 1>iAq*  ppo/policy/entropy_A)7_ l1>iAq*  ppo/policy/approxkla)7_ 1>iAq*  ppo/policy/policykl#)7_ 2>iAq*  ppo/policy/clipfracW.gK0_ u2>iAq*# ! ppo/policy/advantages_mean2W&sO 2>iAq*  ppo/returns/meanlZ@`7%6 ,3>iAq*  ppo/returns/varP3#wC 3>iAq*  ppo/val/vpred}^@'h#wC 3>iAq*  ppo/val/errorHj=|Ρ&sO 94>iAq*  ppo/val/clipfrac> pv"x= 4>iAq*  ppo/val/meaneH@t!{ 4>iAq*  ppo/val/var`\+K K5>iAq*  ppo/val/var_explained2'F 5>iAq*  ppo/learning_rateϸ:+K 5>iAq*  time/ppo/forward_pass)>>.W V6>iAq*!  time/ppo/compute_rewards:|}1 6>iAq*$ " time/ppo/compute_advantages);,E 7>iAq*  time/ppo/optimize_stepf?)7_ R7>iAq*  time/ppo/calc_statsp+>at$B+M 7>iAq*  time/ppo/totalۜ?"%6 7>iAq*  env/reward_mean@@,$B+M L8>iAq*  env/reward_stdrz"x= #iAr*  objective/kl5AN.'F $iAr*  objective/kl_coef[>e'F .$iAr*  objective/entropyWz/m]P E$iAr*" ppo/mean_non_score_rewardj*:,%6 Y$iAr*  ppo/mean_scores@@ u t$B+M k$iAr*  ppo/std_scoresbi-[)7_ )iAr*  ppo/policy/approxkl)7_ )iAr*  ppo/policy/policyklj0y)7_ )iAr*  ppo/policy/clipfracqÍ0_ )iAr*# ! ppo/policy/advantages_meanJ+_&sO )iAr*  ppo/returns/meanVZ@oW%6 :iAr*  ppo/returns/var #wC :iAr*  ppo/val/vpredq[@ vJ#wC ;iAr*  ppo/val/error*=U&sO U5"x= iAr*  time/ppo/forward_passc)>.W i>iAr*!  time/ppo/compute_rewards:N1 >iAr*$ " time/ppo/compute_advantagesd);0,E ?iAr*  time/ppo/optimize_stepHd?)7_ n?iAr*  time/ppo/calc_statss>i> $B+M ?iAr*  time/ppo/total?%6 @iAr*  env/reward_mean@@'Y$B+M m@iAr*  env/reward_std: "x= OiAs*  objective/kl5AAo'F iAs*  objective/kl_coef\>='F ֓iAs*  objective/entropy;/m]P iAs*" ppo/mean_non_score_rewarde@p%6 iAs*  ppo/mean_scores@@ ;$B+M iAs*  ppo/std_scoresʄ-iAs*  tokens/queries_len_stdB|T/m]P OiAs*" tokens/responses_len_mean?g.W aiAs*!  tokens/responses_len_std3%6 tiAs*  ppo/loss/policy[$B+M iAs*  ppo/loss/value$/?.W iAs*!  time/ppo/compute_rewards:ֱ11 iAs*$ " time/ppo/compute_advantages*;˧,E ciAs*  time/ppo/optimize_stepf?~:F)7_ iAs*  time/ppo/calc_stats )!>$B+M iAs*  time/ppo/totalŝ?0I%6 giAs*  env/reward_mean@@D$B+M iAs*  env/reward_std'%"x= EiAt*  objective/kl5AV'F 8FiAt*  objective/kl_coef'\>Ll'F VFiAt*  objective/entropyv/m]P mFiAt*" ppo/mean_non_score_rewardeV.o%6 FiAt*  ppo/mean_scores@@"<&C$B+M FiAt*  ppo/std_scores\-(J.W NiAt*!  time/ppo/compute_rewards:Tk%1 'OiAt*$ " time/ppo/compute_advantages(,;!,E {OiAt*  time/ppo/optimize_stepg?I)7_ OiAt*  time/ppo/calc_stats`">^$B+M )PiAt*  time/ppo/totalzr?k %6 ~PiAt*  env/reward_mean@@O&$B+M PiAt*  env/reward_stdh) "x= άOiAu*  objective/kl5A'F :OiAu*  objective/kl_coefK\>iP'F XOiAu*  objective/entropy/m]P nOiAu*" ppo/mean_non_score_rewardgl%6 OiAu*  ppo/mean_scores@@]M$B+M OiAu*  ppo/std_scores&g -..W 4OiAu*!  time/ppo/compute_rewards:,M1 OiAu*$ " time/ppo/compute_advantages\1;Mu,E OiAu*  time/ppo/optimize_step X?NUGh)7_ TOiAu*  time/ppo/calc_stats>$B+M OiAu*  time/ppo/totalF ?/ %6 OiAu*  env/reward_mean@@2$B+M FOiAu*  env/reward_stdV"x= CiAv*  objective/kl5A7'F iAv*  objective/kl_coefo\>A+'F iAv*  objective/entropy@ҙ/m]P iAv*" ppo/mean_non_score_rewardmS%6 iAv*  ppo/mean_scores@@K"$B+M iAv*  ppo/std_scores -94.W 2iAv*!  time/ppo/compute_rewards:^1 iAv*$ " time/ppo/compute_advantages*;W'-,E iAv*  time/ppo/optimize_stepT[?B))7_ DiAv*  time/ppo/calc_stats >F$B+M iAv*  time/ppo/totalV̕?%6 iAv*  env/reward_mean@@)H$B+M ^iAv*  env/reward_stdj"x= iAw*  objective/kl5A'F iAw*  objective/kl_coef\>9'F 6.W ےiAw*!  time/ppo/compute_rewards:[1 쒢iAw*$ " time/ppo/compute_advantages$; ,E iAw*  time/ppo/optimize_step,Y?`)7_ iAw*  time/ppo/calc_statsP>P $B+M !iAw*  time/ppo/total ?B:%6 piAw*  env/reward_mean@@Wڴ$B+M iAw*  env/reward_stdq"x= EiAx*  objective/kl5A1Ч'F EiAx*  objective/kl_coef\>X'F EiAx*  objective/entropyQRA/m]P )EiAx*" ppo/mean_non_score_reward%6 ;EiAx*  ppo/mean_scores@@:$B+M LEiAx*  ppo/std_scores}- (pJ ՞EiAx*  ppo/policy/entropy#W#:)7_ EiAx*  ppo/policy/approxklm?r)7_ >EiAx*  ppo/policy/policyklJ|G)7_ lEiAx*  ppo/policy/clipfracK0_ EiAx*# ! ppo/policy/advantages_mean-h&sO lEiAx*  ppo/returns/meanY@zw%6 EiAx*  ppo/returns/var#wC EiAx*  ppo/val/vpredrY@Ԅif#wC EiAx*  ppo/val/errorr;XZ&sO ˥EiAx*  ppo/val/clipfracG/"x= ܥEiAx*  ppo/val/meanS@K!{ EiAx*  ppo/val/var!) +K EiAx*  ppo/val/var_explained'F EiAx*  ppo/learning_rateϸ:Č+K EiAx*  time/ppo/forward_passq >=#.W ,EiAx*!  time/ppo/compute_rewards(:, 1 XEiAx*$ " time/ppo/compute_advantages$%;tcv\,E nEiAx*  time/ppo/optimize_stepxdX?F%)7_ EiAx*  time/ppo/calc_statsp>f>$B+M EiAx*  time/ppo/totaln?۠E%6 EiAx*  env/reward_mean@@%$B+M ;EiAx*  env/reward_std<"x= JiAy*  objective/kl5Ar[('F JiAy*  objective/kl_coef-\>ܨo 'F KiAy*  objective/entropy\/m]P 1KiAy*" ppo/mean_non_score_reward%6 CKiAy*  ppo/mean_scores@@8\$B+M TKiAy*  ppo/std_scores._}-.W 5iAy*!  time/ppo/compute_rewards:}t1 diAy*$ " time/ppo/compute_advantages4";(,E {iAy*  time/ppo/optimize_stepY?mr)7_ iAy*  time/ppo/calc_statsP>0($B+M iAy*  time/ppo/totalF?I%6 iAy*  env/reward_mean@@Pz$B+M iAy*  env/reward_std"x= TiAz*  objective/kl5A9h''F 5UiAz*  objective/kl_coef]]>as4'F TUiAz*  objective/entropy/'$/m]P jUiAz*" ppo/mean_non_score_rewardL/%6 UiAz*  ppo/mean_scores@@$B+M UiAz*  ppo/std_scoresjz-/+K !]iAz*  ppo/val/var_explainedCU'F ^iAz*  ppo/learning_rateϸ:n+K _iAz*  time/ppo/forward_pass%>[.W _iAz*!  time/ppo/compute_rewards :51 `iAz*$ " time/ppo/compute_advantages-; D,E r`iAz*  time/ppo/optimize_step Wb?>JA)7_ `iAz*  time/ppo/calc_stats]> S$B+M 7aiAz*  time/ppo/totalt?V0%6 aiAz*  env/reward_mean@@ e$B+M biAz*  env/reward_stdvp@V"x= EiA{*  objective/kl5A'F EiA{*  objective/kl_coef$]>{'F -EiA{*  objective/entropy(4o/m]P CEiA{*" ppo/mean_non_score_reward00%6 UEiA{*  ppo/mean_scores@@)K$B+M fEiA{*  ppo/std_scores 6=D-a.W EiA{*!  tokens/responses_len_stdb/%6 EiA{*  ppo/loss/policyM5$B+M EiA{*  ppo/loss/valuet:erd$B+M /EiA{*  ppo/loss/total8"q(pJ ZEiA{*  ppo/policy/entropyb)7_ EiA{*  ppo/policy/approxklO)7_ EiA{*  ppo/policy/policykl)7_ #EiA{*  ppo/policy/clipfracȸ0_ 7EiA{*# ! ppo/policy/advantages_mean9@&sO IEiA{*  ppo/returns/meanJY@%6 \EiA{*  ppo/returns/var#wC mEiA{*  ppo/val/vpred'9\@#wC }EiA{*  ppo/val/error:j&sO EiA{*  ppo/val/clipfrack"x= EiA{*  ppo/val/mean[@?B!{ EiA{*  ppo/val/vare]I+K EiA{*  ppo/val/var_explainedF@'F EiA{*  ppo/learning_rateϸ:sǽ+K EiA{*  time/ppo/forward_pass`>mӪ.W (EiA{*!  time/ppo/compute_rewards:l1 :EiA{*$ " time/ppo/compute_advantages";:",E KEiA{*  time/ppo/optimize_step4X?5j)7_ [EiA{*  time/ppo/calc_statsP>{$B+M nEiA{*  time/ppo/totaln?'9%6 EiA{*  env/reward_mean@@;xR$B+M "EiA{*  env/reward_std,e7"x= -iA|*  objective/kl5A+g'F iA|*  objective/kl_coefH]>p'F iA|*  objective/entropyL [/m]P iA|*" ppo/mean_non_score_rewardu=Ŕ%6 iA|*  ppo/mean_scores@@w$B+M iA|*  ppo/std_scoresQP-iA|*  ppo/loss/policy$B+M tiA|*  ppo/loss/valueG :-+$B+M "iA|*  ppo/loss/total b8fIy(pJ 5"iA|*  ppo/policy/entropyc)7_ K"iA|*  ppo/policy/approxkl!8)7_ _"iA|*  ppo/policy/policyklQM)7_ r"iA|*  ppo/policy/clipfrac+=f0_ "iA|*# ! ppo/policy/advantages_meansh&sO a%iA|*  ppo/returns/mean-zY@C$%6 %iA|*  ppo/returns/var!)(#wC 'iA|*  ppo/val/vpred@W@#wC R(iA|*  ppo/val/errorG:g^&sO (iA|*  ppo/val/clipfrac+{_"x= *iA|*  ppo/val/meanaY@X?!{ *iA|*  ppo/val/varm+K ^+iA|*  ppo/val/var_explained݈ f['F +iA|*  ppo/learning_rateϸ:+K ,iA|*  time/ppo/forward_passp >Cv.W f,iA|*!  time/ppo/compute_rewardsh:1 ,iA|*$ " time/ppo/compute_advantages!;hT,E -iA|*  time/ppo/optimize_step\A]?K)7_ U-iA|*  time/ppo/calc_stats;>;?$B+M -iA|*  time/ppo/totalꓖ?>%6 -iA|*  env/reward_mean@@$B+M D.iA|*  env/reward_std7"x= tiA}*  objective/kl5A:_w'F btiA}*  objective/kl_coefm]>ح`M'F ytiA}*  objective/entropyj/m]P tiA}*" ppo/mean_non_score_reward#%6 tiA}*  ppo/mean_scores@@"$B+M tiA}*  ppo/std_scoresN-e:J'F `iA~*  objective/entropy` /m]P `iA~*" ppo/mean_non_score_reward 3%6 +`iA~*  ppo/mean_scores@@I2$B+M ?`iA~*  ppo/std_scoresf^-9./$B+M `iA~*  ppo/loss/total7z(pJ `iA~*  ppo/policy/entropyL\)7_ `iA~*  ppo/policy/approxklԖY)7_ `iA~*  ppo/policy/policykl{)7_ `iA~*  ppo/policy/clipfrac,i0_ `iA~*# ! ppo/policy/advantages_mean׈&sO `iA~*  ppo/returns/meanMY@LV%6 `iA~*  ppo/returns/var!)@#wC %`iA~*  ppo/val/vpredQ1Y@#wC `iA~*  ppo/val/error9}&sO n`iA~*  ppo/val/clipfracÌ"x= `iA~*  ppo/val/mean-Z@o[V!{ `iA~*  ppo/val/var!)4I+K `iA~*  ppo/val/var_explainedC'F `iA~*  ppo/learning_rateϸ:`M+K `iA~*  time/ppo/forward_passP> P.W K`iA~*!  time/ppo/compute_rewardsP:MW1 `iA~*$ " time/ppo/compute_advantages$*;E,E `iA~*  time/ppo/optimize_stepX?ԡ)7_ T`iA~*  time/ppo/calc_stats >u'$B+M `iA~*  time/ppo/totalV?%6 `iA~*  env/reward_mean@@]*$B+M X`iA~*  env/reward_std"x= iA*  objective/kl5A );'F XiA*  objective/kl_coef]>w}'F qiA*  objective/entropyW/m]P iA*" ppo/mean_non_score_rewardGI1QT%6 iA*  ppo/mean_scores@@N$B+M iA*  ppo/std_scores7f_-r.W iA*!  time/ppo/compute_rewards:O1 iA*$ " time/ppo/compute_advantagesX";K2(,E jiA*  time/ppo/optimize_step(8X?Jc)7_ iA*  time/ppo/calc_stats>m]$B+M uiA*  time/ppo/totaln?m%6 iA*  env/reward_mean@@km$B+M !iA*  env/reward_stdBv #wC iA*  objective/kl5A(pJ /iA*  objective/kl_coef]>*&t(pJ LiA*  objective/entropyG_0_ diA*" ppo/mean_non_score_rewardr_SwA&sO ziA*  ppo/mean_scores@@v[$'%6 iA*  ppo/std_scores?v8q.W iA*  tokens/queries_len_meanB80-+/m]P iA*!  time/ppo/compute_rewards:|2$V iA*$ " time/ppo/compute_advantagesT,;N-OzH%6 iA*  time/ppo/totalN.?s|&sO iA*  env/reward_mean@@+*%6 4iA*  env/reward_stdZ3#wC ѧziA*  objective/kl5A!B(pJ HziA*  objective/kl_coefP]>no(pJ eziA*  objective/entropyЖ0_ yziA*" ppo/mean_non_score_rewarduBl&sO ziA*  ppo/mean_scores@@m7%6 ziA*  ppo/std_scoresū.W ziA*  tokens/queries_len_meanBD\-,E ziA*  time/ppo/forward_pass0> :)/m]P %ziA*!  time/ppo/compute_rewardsP:qsu32$V ziA*$ " time/ppo/compute_advantages ;y-(S%6 {ziA*  time/ppo/total|k?oݰ&sO дziA*  env/reward_mean@@J%6 !ziA*  env/reward_std D#wC d HiA*  objective/kl5A(pJ HiA*  objective/kl_coef"^>AM(pJ HiA*  objective/entropy0_ HiA*" ppo/mean_non_score_rewardыPM&sO !HiA*  ppo/mean_scores@@>%6 !HiA*  ppo/std_scores8TD.W /!HiA*  tokens/queries_len_meanBEkXU-6)7_ %HiA*  ppo/policy/entropy* %HiA*  ppo/policy/approxkl* %HiA*  ppo/policy/policyklvG* %HiA*  ppo/policy/clipfracy1 %HiA*# ! ppo/policy/advantages_mean{Ա'F %HiA*  ppo/returns/mean i/m]P (HiA*!  time/ppo/compute_rewards:i)2$V )HiA*$ " time/ppo/compute_advantagesl;8-~%6 *HiA*  time/ppo/total2[? &sO *HiA*  env/reward_mean@@a %6 *HiA*  env/reward_stdiK##wC jA*  objective/kl5AH(pJ jA*  objective/kl_coefG^>%(pJ .jA*  objective/entropy#0_ DjA*" ppo/mean_non_score_reward<׋&sO WjA*  ppo/mean_scores@@p%6 ijA*  ppo/std_scores>.W |jA*  tokens/queries_len_meanB=T~-* jA*  ppo/policy/clipfrac1 jA*# ! ppo/policy/advantages_meanDg'F !jA*  ppo/returns/meanX@T &sO 2jA*  ppo/returns/var?{$B+M CjA*  ppo/val/vpredX@»$B+M SjA*  ppo/val/errora8:['F cjA*  ppo/val/clipfracT#wC tjA*  ppo/val/mean4Y@蝒"x= jA*  ppo/val/var!)߫,E EjA*  ppo/val/var_explained(pJ $jA*  ppo/learning_rateϸ:T},E jA*  time/ppo/forward_pass>-/m]P jA*!  time/ppo/compute_rewards: 62$V HjA*$ " time/ppo/compute_advantages);}B-i%6 ;jA*  time/ppo/total?j&sO jA*  env/reward_mean@@QG@%6 jA*  env/reward_stdI_#wC ejA*  objective/kl5AI(pJ jA*  objective/kl_coefk^>r(pJ jA*  objective/entropyΉ|0_ jA*" ppo/mean_non_score_reward?u:ʻ&sO jA*  ppo/mean_scores@@]%6 ,jA*  ppo/std_scoresYe.W ?jA*  tokens/queries_len_meanBC $-$B+M #jA*  ppo/val/errorX7Tu 'F &#jA*  ppo/val/clipfrac#wC 7#jA*  ppo/val/meanX@Q"x= H#jA*  ppo/val/varq8/j,E X#jA*  ppo/val/var_explainedR͌)0(pJ #jA*  ppo/learning_rateϸ:,E )$jA*  time/ppo/forward_pass**>#t$;/m]P $jA*!  time/ppo/compute_rewards:y\c2$V $jA*$ " time/ppo/compute_advantages*;k"-,(%6 %jA*  time/ppo/total?&sO Y&jA*  env/reward_mean@@ao%6 &jA*  env/reward_std#wC rjA*  objective/kl5AQS(pJ grjA*  objective/kl_coef^>M(pJ rjA*  objective/entropy-r0_ rjA*" ppo/mean_non_score_reward{Zx&sO rjA*  ppo/mean_scores@@dB%6 rjA*  ppo/std_scoresP$.W rjA*  tokens/queries_len_meanB'U-f/m]P rjA*!  time/ppo/compute_rewards:n$& 2$V ArjA*$ " time/ppo/compute_advantages(;z2-p%6 brjA*  time/ppo/total։?=&sO zrjA*  env/reward_mean@@RB%6 rjA*  env/reward_stdW#wC `,jA*  objective/kl5Ai+(pJ ̈́,jA*  objective/kl_coefg^>(pJ ,jA*  objective/entropy0_ ,jA*" ppo/mean_non_score_reward&sO ,jA*  ppo/mean_scores@@ n%6 ",jA*  ppo/std_scores.W 4,jA*  tokens/queries_len_meanB-j#>%6 Վ,jA*  time/ppo/totalR?B-&sO ),jA*  env/reward_mean@@!b%6 y,jA*  env/reward_std1Q#wC 4jA*  objective/kl5A2(pJ 5jA*  objective/kl_coef^>Ϟ(pJ #5jA*  objective/entropy0_ :5jA*" ppo/mean_non_score_reward&sO L5jA*  ppo/mean_scores@@#u}%6 ^5jA*  ppo/std_scores9Y?.W o5jA*  tokens/queries_len_meanBDw-;jA*  ppo/policy/approxkl^* Q;jA*  ppo/policy/policyklF* b;jA*  ppo/policy/clipfracg1 v;jA*# ! ppo/policy/advantages_mean_V.'F ;jA*  ppo/returns/mean X@ 1&sO ;jA*  ppo/returns/var!)筹$B+M ;jA*  ppo/val/vpredX@ ug$B+M <jA*  ppo/val/error*h73'F g<jA*  ppo/val/clipfracQN#wC <jA*  ppo/val/meanX@z"x= =jA*  ppo/val/var!)/,E o=jA*  ppo/val/var_explained`ͤO(pJ =jA*  ppo/learning_rateϸ:,E >jA*  time/ppo/forward_passPm&>/m]P z>jA*!  time/ppo/compute_rewards:o2$V >jA*$ " time/ppo/compute_advantagesX(;t-U%6 ?jA*  time/ppo/total/?M0&sO 1@jA*  env/reward_mean@@R 3m%6 @jA*  env/reward_stdg+#wC œjA*  objective/kl5A-E(pJ ÜjA*  objective/kl_coefg^>f(pJ /ÜjA*  objective/entropyU=0_ EÜjA*" ppo/mean_non_score_rewardG5&sO YÜjA*  ppo/mean_scores@@6J%6 nÜjA*  ppo/std_scoresw8.W ÜjA*  tokens/queries_len_meanB-/m]P aќjA*!  time/ppo/compute_rewards:2$V ќjA*$ " time/ppo/compute_advantages!;%؜-eR!!%6 ҜjA*  time/ppo/total^;?&sO ӜjA*  env/reward_mean@@O^G%6 uӜjA*  env/reward_std#wC iR jA*  objective/kl5A':(pJ jR jA*  objective/kl_coef!_>,x(pJ jR jA*  objective/entropy0_ jR jA*" ppo/mean_non_score_reward'0D&sO kR jA*  ppo/mean_scores@@>Ba%6 kR jA*  ppo/std_scoresI.W %kR jA*  tokens/queries_len_meanB~'-L/m]P rR jA*!  time/ppo/compute_rewards:-o2$V isR jA*$ " time/ppo/compute_advantages);--:a%6 ztR jA*  time/ppo/total)?CHY&sO tR jA*  env/reward_mean@@kĥ%6 1uR jA*  env/reward_std.z#wC ~ jA*  objective/kl5A{ (pJ & jA*  objective/kl_coefF_>(pJ C jA*  objective/entropyo(e0_ V jA*" ppo/mean_non_score_reward=&sO h jA*  ppo/mean_scores@@K%6 x jA*  ppo/std_scores;c.W  jA*  tokens/queries_len_meanB}- /m]P  jA*!  time/ppo/compute_rewards:!2$V  jA*$ " time/ppo/compute_advantages);&)-U%6 % jA*  time/ppo/totalލ?2&sO z jA*  env/reward_mean@@7y%6 щ jA*  env/reward_stdD#wC \ jA*  objective/kl5A*HWA(pJ jA*  objective/kl_coefk_>(pJ jA*  objective/entropyPj0_ jA*" ppo/mean_non_score_reward4T l&sO  jA*  ppo/mean_scores@@n%6 # jA*  ppo/std_scores3fw.W 5 jA*  tokens/queries_len_meanBi&@-* ] jA*  ppo/policy/approxklHt* jA*  ppo/policy/policykl~* jA*  ppo/policy/clipfrac;1 4 jA*# ! ppo/policy/advantages_mean{'F ^ jA*  ppo/returns/mean,X@nn&sO v jA*  ppo/returns/var`"^$B+M jA*  ppo/val/vpred7X@W=v$B+M jA*  ppo/val/errorrh5_?'F jA*  ppo/val/clipfraci#wC jA*  ppo/val/mean&X@^,;"x= jA*  ppo/val/varͥ,E jA*  ppo/val/var_explained(pJ jA*  ppo/learning_rateϸ:[3,E w jA*  time/ppo/forward_pass@!>ى/m]P jA*!  time/ppo/compute_rewardsP:h2$V ] jA*$ " time/ppo/compute_advantages(;mæY-lD%6 ] jA*  time/ppo/total.ӓ?%M&sO jA*  env/reward_mean@@u%6 jA*  env/reward_std#wC ljjA*  objective/kl5Apx(pJ oljjA*  objective/kl_coef_>](pJ ljjA*  objective/entropy0 0_ ljjA*" ppo/mean_non_score_rewardj.&sO ljjA*  ppo/mean_scores@@ńɉ%6 ljjA*  ppo/std_scores ˆ.W ljjA*  tokens/queries_len_meanBOď[-"x= wjjA*  ppo/val/var!),E +wjjA*  ppo/val/var_explainedܢ(pJ f/m]P ]wjjA*!  time/ppo/compute_rewards:ڧ2$V nwjjA*$ " time/ppo/compute_advantages$;h-n%6 zjjA*  time/ppo/total^Y?I&sO {jjA*  env/reward_mean@@6[%6 i|jjA*  env/reward_std0R#wC T!jA*  objective/kl5A͇(pJ U!jA*  objective/kl_coefO_>س(pJ U!jA*  objective/entropy0_ 3U!jA*" ppo/mean_non_score_rewardk%&sO EU!jA*  ppo/mean_scores@@Dɞ%6 VU!jA*  ppo/std_scores(f.W gU!jA*  tokens/queries_len_meanB-&sO U!jA*  ppo/loss/policy4%6 W!jA*  ppo/loss/value13N;%6 )X!jA*  ppo/loss/total2@)7_ X!jA*  ppo/policy/entropyb0g* Y!jA*  ppo/policy/approxkl;0j* mY!jA*  ppo/policy/policyklWlP* Y!jA*  ppo/policy/clipfrac81 ,Z!jA*# ! ppo/policy/advantages_mean%Լ'F Z!jA*  ppo/returns/mean)X@2x&sO Z!jA*  ppo/returns/var[$B+M I[!jA*  ppo/val/vpredW@f$B+M [!jA*  ppo/val/error1?4s'F [!jA*  ppo/val/clipfracO=q#wC W\!jA*  ppo/val/meanW@vI"x= \!jA*  ppo/val/var p,E ]!jA*  ppo/val/var_explained3o(pJ j]!jA*  ppo/learning_rateϸ:i,E ]!jA*  time/ppo/forward_pass*>]/m]P (^!jA*!  time/ppo/compute_rewards@:z2$V ^!jA*$ " time/ppo/compute_advantagesP);]-!7n%6 _!jA*  time/ppo/total?-O&sO _!jA*  env/reward_mean@@q%6 D`!jA*  env/reward_std\#wC KjA*  objective/kl5AkpT(pJ jA*  objective/kl_coef_>=(pJ jA*  objective/entropyZi0_ jA*" ppo/mean_non_score_reward@H%R&sO jA*  ppo/mean_scores@@z>%6 jA*  ppo/std_scoresK.W $jA*  tokens/queries_len_meanBS 6- FC/m]P yjA*!  time/ppo/compute_rewards:32$V jA*$ " time/ppo/compute_advantages&; n- %6 0jA*  time/ppo/totalƟ?&sO BjA*  env/reward_mean@@y%6 SjA*  env/reward_std$<#wC êjA*  objective/kl5A7!(pJ êjA*  objective/kl_coef_><(pJ ĪjA*  objective/entropyL0_ ĪjA*" ppo/mean_non_score_rewardT43&sO *ĪjA*  ppo/mean_scores@@=)bG%6 ;ĪjA*  ppo/std_scores@.W MĪjA*  tokens/queries_len_meanB @-e]%6 ƪjA*  ppo/loss/value&4z%6 ʪjA*  ppo/loss/totalڄ2H+)7_ ʪjA*  ppo/policy/entropya* ʪjA*  ppo/policy/approxkl4* ˪jA*  ppo/policy/policykl$$w* ˪jA*  ppo/policy/clipfracƺ1 .˪jA*# ! ppo/policy/advantages_meany@ 'F @˪jA*  ppo/returns/meankW@r3v&sO Q˪jA*  ppo/returns/var%$B+M b˪jA*  ppo/val/vpredxW@_ٖ$B+M s˪jA*  ppo/val/error4tM'F ˪jA*  ppo/val/clipfrach#wC ͪjA*  ppo/val/meanW@"x= ͪjA*  ppo/val/var+,E ͪjA*  ppo/val/var_explainedV |(pJ ͪjA*  ppo/learning_rateϸ:b,E ΪjA*  time/ppo/forward_pass @->[h/m]P ΪjA*!  time/ppo/compute_rewards:2$V .ΪjA*$ " time/ppo/compute_advantages<^;Y-<%6 ѪjA*  time/ppo/total՟?F&sO yѪjA*  env/reward_mean@@d%6 ѪjA*  env/reward_std@##wC d_jA*  objective/kl5A^X(pJ ʠ_jA*  objective/kl_coefV"`>A)(pJ _jA*  objective/entropyn0_ _jA*" ppo/mean_non_score_reward}F&sO _jA*  ppo/mean_scores@@@(%6 _jA*  ppo/std_scores1.W ._jA*  tokens/queries_len_meanBw-/Q./m]P _jA*!  time/ppo/compute_rewards: 2$V p_jA*$ " time/ppo/compute_advantages;lˁ-* _jA*  time/ppo/calc_stats@>գ%6 d_jA*  time/ppo/totalѓ?7&sO _jA*  env/reward_mean@@B&(pJ z4jA*  objective/entropyԶrD0_ z4jA*" ppo/mean_non_score_rewardo_g&sO z4jA*  ppo/mean_scores@@"R.%6 z4jA*  ppo/std_scoreso.W z4jA*  tokens/queries_len_meanB@p-q/m]P 4jA*!  time/ppo/compute_rewards :Ѩi2$V o4jA*$ " time/ppo/compute_advantages6;jn-e%6 Y4jA*  time/ppo/total|?yfA&sO 4jA*  env/reward_mean@@Dy%6 4jA*  env/reward_stdG#wC ojA*  objective/kl5A(pJ jA*  objective/kl_coefk`>$X(pJ jA*  objective/entropy5F^{0_ jA*" ppo/mean_non_score_reward;#&sO jA*  ppo/mean_scores@@ ōQ%6 0jA*  ppo/std_scores)|.W BjA*  tokens/queries_len_meanBU]-T͏/m]P >jA*!  time/ppo/compute_rewards:2$V jA*$ " time/ppo/compute_advantages3;:-Nd%6 jA*  time/ppo/totalhC?#6G&sO 7jA*  env/reward_mean@@n%6 jA*  env/reward_std`#wC 7jA*  objective/kl5Ab(pJ jA*  objective/kl_coef`>rv1(pJ jA*  objective/entropy0_ jA*" ppo/mean_non_score_rewardJ .f&sO jA*  ppo/mean_scores@@<‘3%6 jA*  ppo/std_scoresY.W $jA*  tokens/queries_len_meanBS B-a2/m]P jA*!  time/ppo/compute_rewards:2$V jA*$ " time/ppo/compute_advantagesTV;D-%6 ~ jA*  time/ppo/total?.L%&sO ! jA*  env/reward_mean@@.`%6 jA*  env/reward_std #wC icjA*  objective/kl5AQ(pJ cjA*  objective/kl_coef]`>#*(pJ PcjA*  objective/entropy=š0_ gcjA*" ppo/mean_non_score_reward '"&sO {cjA*  ppo/mean_scores@@2%6 cjA*  ppo/std_scores pϭ.W cjA*  tokens/queries_len_meanBeS3 |%6 cjA*  ppo/loss/total1}JS+)7_ cjA*  ppo/policy/entropymI* cjA*  ppo/policy/approxkl n2* cjA*  ppo/policy/policykl 3* cjA*  ppo/policy/clipfracGU1 cjA*# ! ppo/policy/advantages_meanL2'F cjA*  ppo/returns/meanNcW@#4z&sO 5cjA*  ppo/returns/var!)!<$B+M ocjA*  ppo/val/vpreddW@) I$B+M cjA*  ppo/val/error>e3Z.'F cjA*  ppo/val/clipfrac#wC cjA*  ppo/val/mean%kW@#^j"x= jcjA*  ppo/val/varq,E cjA*  ppo/val/var_explained 7(pJ cjA*  ppo/learning_rateϸ:9F;,E PcjA*  time/ppo/forward_passХ&>/m]P cjA*!  time/ppo/compute_rewards(:*ݦ2$V cjA*$ " time/ppo/compute_advantages(;(-r%6 cjA*  time/ppo/total{?y&sO $cjA*  env/reward_mean@@B_%6 cjA*  env/reward_std~=#wC (jA*  objective/kl5AB(pJ H(jA*  objective/kl_coef.`>2i/m]P (jA*!  time/ppo/compute_rewards:2$V b(jA*$ " time/ppo/compute_advantagesd(;>7-e~%6 -(jA*  time/ppo/total g?NYB&sO (jA*  env/reward_mean@@%6 [)jA*  env/reward_stdh#wC ! jA*  objective/kl5AW%(pJ " jA*  objective/kl_coef`>(pJ &" jA*  objective/entropy0_ :" jA*" ppo/mean_non_score_rewardJ &sO K" jA*  ppo/mean_scores@@ &%6 ^" jA*  ppo/std_scores.W p" jA*  tokens/queries_len_meanBRp*-W@Q&+"x= H. jA*  ppo/val/var*@,E . jA*  ppo/val/var_explainedd(pJ / jA*  ppo/learning_rateϸ:5,E 0 jA*  time/ppo/forward_passA(>tc/m]P 0 jA*!  time/ppo/compute_rewards:w(2$V Y1 jA*$ " time/ppo/compute_advantages ";)C-ۣD%6 3 jA*  time/ppo/total̞?&sO 3 jA*  env/reward_mean@@ZDb%6 J4 jA*  env/reward_std׳yC(pJ "jA*  objective/entropyzr0_ S"jA*" ppo/mean_non_score_reward-a M&sO e"jA*  ppo/mean_scores@@?%6 x"jA*  ppo/std_scoresѡz.W "jA*  tokens/queries_len_meanBpՆ-Xp/m]P "jA*!  time/ppo/compute_rewards :A2$V "jA*$ " time/ppo/compute_advantages0%;-Ho%6 S"jA*  time/ppo/total?&sO "jA*  env/reward_mean@@Ǯ%6 ~"jA*  env/reward_std.#wC wQ$jA*  objective/kl5AB(pJ Q$jA*  objective/kl_coefHa>.(pJ {Q$jA*  objective/entropyل0_ Q$jA*" ppo/mean_non_score_rewardw F&sO Q$jA*  ppo/mean_scores@@r"[%6 Q$jA*  ppo/std_scoresV.W ϵQ$jA*  tokens/queries_len_meanB+-a_/m]P Q$jA*!  time/ppo/compute_rewardsH;X2$V bQ$jA*$ " time/ppo/compute_advantages-;l-b%6 $Q$jA*  time/ppo/total"K?$&sO Q$jA*  env/reward_mean@@!%6 QQ$jA*  env/reward_std#wC )3(pJ q &jA*  objective/entropy'/0_ q &jA*" ppo/mean_non_score_reward6 Ī&sO q &jA*  ppo/mean_scores@@ %6 q &jA*  ppo/std_scoresv.W r &jA*  tokens/queries_len_meanBl-ξ#wC y &jA*  ppo/val/meanV@oÅ"x=  &jA*  ppo/val/var2V,E ? &jA*  ppo/val/var_explained@!(pJ ߀ &jA*  ppo/learning_rateϸ:`-,E | &jA*  time/ppo/forward_pass@0>f/m]P  &jA*!  time/ppo/compute_rewards :2$V &jA*$ " time/ppo/compute_advantages;;*O-z$%6 &jA*  time/ppo/total?j&sO  &jA*  env/reward_mean@@0~%6 &jA*  env/reward_stdfF#wC K*jA*  objective/kl,ABUp(pJ *jA*  objective/kl_coefa>](pJ *jA*  objective/entropyf0_ *jA*" ppo/mean_non_score_reward=&sO *jA*  ppo/mean_scores2i%6 *jA*  ppo/std_scores#@.W 2*jA*  tokens/queries_len_meanBN{-'F *jA*  ppo/val/clipfrac@?#wC *jA*  ppo/val/meanV@=}"x= *jA*  ppo/val/var0 ,E K*jA*  ppo/val/var_explained2(pJ *jA*  ppo/learning_rateϸ:I8u,E *jA*  time/ppo/forward_passh>|;/m]P *jA*!  time/ppo/compute_rewards:q2$V *jA*$ " time/ppo/compute_advantages@;-j!.%6 )*jA*  time/ppo/total?pw&sO _*jA*  env/reward_mean %6 *jA*  env/reward_stdqs}#wC {+jA*  objective/kl,A<(pJ {+jA*  objective/kl_coefa>\(pJ {+jA*  objective/entropy:0_ /{+jA*" ppo/mean_non_score_reward޹"&sO B{+jA*  ppo/mean_scoresN~%6 U{+jA*  ppo/std_scores^9.W j{+jA*  tokens/queries_len_meanB4[ -^%6 ${+jA*  ppo/loss/totalv&=[11)7_ T{+jA*  ppo/policy/entropy4i* m{+jA*  ppo/policy/approxkl* i{+jA*  ppo/policy/policykl %* {+jA*  ppo/policy/clipfrac1 {+jA*# ! ppo/policy/advantages_meanIG'F {+jA*  ppo/returns/mean޹ cOf&sO ۣ{+jA*  ppo/returns/var!);g$B+M {+jA*  ppo/val/vpred`G$B+M {+jA*  ppo/val/errorTO?'F ,{+jA*  ppo/val/clipfrac#wC E{+jA*  ppo/val/meanՎbt"x= {+jA*  ppo/val/varN,E {+jA*  ppo/val/var_explained*Iն(pJ Ш{+jA*  ppo/learning_rateϸ:-3r,E {+jA*  time/ppo/forward_passP!>Jw/m]P {+jA*!  time/ppo/compute_rewards0:n2$V 6{+jA*$ " time/ppo/compute_advantages;BZ-PX%6 {+jA*  time/ppo/total?ñ&sO {+jA*  env/reward_meanRl<%6 L{+jA*  env/reward_std?#wC y,jA*  objective/kl,A"'(pJ ez,jA*  objective/kl_coefa>p(pJ z,jA*  objective/entropy_4| 0_ z,jA*" ppo/mean_non_score_rewardJ&sO {,jA*  ppo/mean_scoresb%6 &{,jA*  ppo/std_scoresU.W 9{,jA*  tokens/queries_len_meanB-u7%6 5,jA*  ppo/loss/totalpU<Ś)7_ f,jA*  ppo/policy/entropy7/̏* ~,jA*  ppo/policy/approxklˬ+LG* ,jA*  ppo/policy/policykl5* ,jA*  ppo/policy/clipfracR¸o1 ,jA*# ! ppo/policy/advantages_mean-U'F Ѓ,jA*  ppo/returns/meanPn&sO ,jA*  ppo/returns/varH@r$B+M 3,jA*  ppo/val/vpredRe$B+M _,jA*  ppo/val/errora>O:'F v,jA*  ppo/val/clipfrac?qF#wC ,jA*  ppo/val/mean㥴-%6 ,jA*  time/ppo/totalQ?d&sO ,jA*  env/reward_mean?P%6 C,jA*  env/reward_stdƼ#wC b.jA*  objective/kl,A77(pJ b.jA*  objective/kl_coefb>u+'(pJ b.jA*  objective/entropy60_ b.jA*" ppo/mean_non_score_rewardU(&sO b.jA*  ppo/mean_scores܃%6 b.jA*  ppo/std_scoresNE.W b.jA*  tokens/queries_len_meanBA(-%6 pb.jA*  ppo/loss/value,>Vj%6 7b.jA*  ppo/loss/total<ڮt)7_ eb.jA*  ppo/policy/entropy09!d* b.jA*  ppo/policy/approxkly-Lܞ9* b.jA*  ppo/policy/policykl?6ny * b.jA*  ppo/policy/clipfrac;[1 b.jA*# ! ppo/policy/advantages_mean`B'F b.jA*  ppo/returns/mean93&sO b.jA*  ppo/returns/var $B+M b.jA*  ppo/val/vpred% $B+M b.jA*  ppo/val/error,>*'F xb.jA*  ppo/val/clipfrac?D* w#wC b.jA*  ppo/val/mean ֿ "x= Gb.jA*  ppo/val/var!(r^,E b.jA*  ppo/val/var_explainedD+~(pJ b.jA*  ppo/learning_rateϸ:6?,E .b.jA*  time/ppo/forward_pass`%>$/m]P b.jA*!  time/ppo/compute_rewardsH:ؔx2$V nb.jA*$ " time/ppo/compute_advantagesx8;-9ba%6 @b.jA*  time/ppo/total.?8M&sO b.jA*  env/reward_meanIC%6 rb.jA*  env/reward_std#wC /jA*  objective/kl,A%S?(pJ //jA*  objective/kl_coef&b>.(pJ M/jA*  objective/entropy?7Ok0_ c/jA*" ppo/mean_non_score_rewardoRI&sO v/jA*  ppo/mean_scoresP[)%6 /jA*  ppo/std_scoresd*K.W /jA*  tokens/queries_len_meanBz<-:/m]P ء/jA*!  time/ppo/compute_rewards:&@2$V q/jA*$ " time/ppo/compute_advantages;\#-Zl%6 5/jA*  time/ppo/total̕?h&sO Τ/jA*  env/reward_mean5ec%6 d/jA*  env/reward_std#wC e]1jA*  objective/kl,Aqm(pJ Lf]1jA*  objective/kl_coefKb>0 (pJ jf]1jA*  objective/entropy|8yb0_ f]1jA*" ppo/mean_non_score_rewardJo1&sO f]1jA*  ppo/mean_scoresyk%6 f]1jA*  ppo/std_scores:/.W f]1jA*  tokens/queries_len_meanB ~-7pA* m]1jA*  ppo/policy/clipfracz1 m]1jA*# ! ppo/policy/advantages_mean@,'F n]1jA*  ppo/returns/meanJO&sO n]1jA*  ppo/returns/var1(۳v$B+M o]1jA*  ppo/val/vpred $Wܸ$B+M &p]1jA*  ppo/val/error`:=l'F p]1jA*  ppo/val/clipfracK#wC Wq]1jA*  ppo/val/mean#E "x= q]1jA*  ppo/val/varºL,E r]1jA*  ppo/val/var_explainedI%r(pJ Rs]1jA*  ppo/learning_rateϸ:)›,E s]1jA*  time/ppo/forward_passn>2/m]P t]1jA*!  time/ppo/compute_rewards:_<"2$V u]1jA*$ " time/ppo/compute_advantages;I-2c^%6 v]1jA*  time/ppo/totalp.?&[&sO tw]1jA*  env/reward_mean@J%6 x]1jA*  env/reward_stdY#wC 2jA*  objective/kl,A(pJ 2jA*  objective/kl_coefpb>.(pJ 2jA*  objective/entropy~8*E 0_ 2jA*" ppo/mean_non_score_reward6i}&sO 2jA*  ppo/mean_scoresvY%6 2jA*  ppo/std_scores6a.W 2jA*  tokens/queries_len_meanBa-/m]P A$2jA*!  time/ppo/compute_rewards:7NY2$V $2jA*$ " time/ppo/compute_advantages8;e-7%6 %2jA*  time/ppo/totalD?z&sO S&2jA*  env/reward_mean_|%6 &2jA*  env/reward_stdaU#wC FL4jA*  objective/kl,A](pJ L4jA*  objective/kl_coefѕb>'O(pJ DžL4jA*  objective/entropy87oj0_ ޅL4jA*" ppo/mean_non_score_rewardNom,&sO L4jA*  ppo/mean_scores2J /%6 L4jA*  ppo/std_scores&P?.W L4jA*  tokens/queries_len_meanBzM-:)7_ 6L4jA*  ppo/policy/entropy ;2* aL4jA*  ppo/policy/approxkl/-* yL4jA*  ppo/policy/policykle7,* L4jA*  ppo/policy/clipfrac?1 L4jA*# ! ppo/policy/advantages_meanYz'F L4jA*  ppo/returns/meanN &sO XL4jA*  ppo/returns/var $B+M jL4jA*  ppo/val/vpredʡ$B+M L4jA*  ppo/val/error-mV/m]P L4jA*!  time/ppo/compute_rewardsP:2$V lL4jA*$ " time/ppo/compute_advantagesT4;&-jx_%6 `L4jA*  time/ppo/totalB#?I2&sO L4jA*  env/reward_mean %6 L4jA*  env/reward_stdџ#wC 35jA*  objective/klr,A"(pJ 5jA*  objective/kl_coefb>,s(pJ 5jA*  objective/entropybm9:l0_ 5jA*" ppo/mean_non_score_rewardg&sO 5jA*  ppo/mean_scores$T%6 5jA*  ppo/std_scoresL.W 5jA*  tokens/queries_len_meanBe-v//m]P 5jA*!  time/ppo/compute_rewards:A2$V t5jA*$ " time/ppo/compute_advantages;y-F%6 r5jA*  time/ppo/totald2?&sO 5jA*  env/reward_mean3%6 !5jA*  env/reward_std7#wC @DF7jA*  objective/klz,A:9(pJ DF7jA*  objective/kl_coefb>*;(pJ DF7jA*  objective/entropy 9;7݌0_ EF7jA*" ppo/mean_non_score_rewarda9&sO 5EF7jA*  ppo/mean_scores%6 REF7jA*  ppo/std_scoresE.W nEF7jA*  tokens/queries_len_meanB%+-0/m]P PF7jA*!  time/ppo/compute_rewards:I2$V gQF7jA*$ " time/ppo/compute_advantagesT;@5-=%6 RF7jA*  time/ppo/total6#?F&sO RF7jA*  env/reward_meanZ%6 ^SF7jA*  env/reward_std*`uU#wC >8jA*  objective/klu,Af(pJ ,?8jA*  objective/kl_coefCc>"J(pJ G?8jA*  objective/entropyw-9t(}0_ [?8jA*" ppo/mean_non_score_rewardu&sO m?8jA*  ppo/mean_scoresPr%6 }?8jA*  ppo/std_scores$.W ?8jA*  tokens/queries_len_meanBPYi-lK&sO [F8jA*  ppo/returns/var g$B+M lF8jA*  ppo/val/vpredk)a$B+M |F8jA*  ppo/val/error5 ;>'F F8jA*  ppo/val/clipfracG?#wC F8jA*  ppo/val/mean6_t"x= F8jA*  ppo/val/var",E F8jA*  ppo/val/var_explainedkQ(pJ !G8jA*  ppo/learning_rateϸ::NK,E wG8jA*  time/ppo/forward_pass>/B/m]P G8jA*!  time/ppo/compute_rewards:R2$V aH8jA*$ " time/ppo/compute_advantagesl$;>-|+%6 aI8jA*  time/ppo/total(?K&sO I8jA*  env/reward_mean%6 J8jA*  env/reward_std֕V#wC :jA*  objective/klZ,A)3(pJ :jA*  objective/kl_coeft*c>pD(pJ :jA*  objective/entropyn+9j0_ :jA*" ppo/mean_non_score_reward!&sO :jA*  ppo/mean_scores7r%6 :jA*  ppo/std_scoresi*.W :jA*  tokens/queries_len_meanBr-/m]P 0:jA*!  tokens/responses_len_stdMv&sO C:jA*  ppo/loss/policy7%6 :jA*  ppo/loss/valueC*:$\ %6 :jA*  ppo/loss/total58i)7_ :jA*  ppo/policy/entropy5;2wa* :jA*  ppo/policy/approxkl(M.cn* 2:jA*  ppo/policy/policykl87d* P:jA*  ppo/policy/clipfrac=1 :jA*# ! ppo/policy/advantages_mean@h'F G:jA*  ppo/returns/meantX;&sO :jA*  ppo/returns/varjR$B+M :jA*  ppo/val/vpredd;P$B+M a:jA*  ppo/val/errorC:8S'F :jA*  ppo/val/clipfracݑ#wC :jA*  ppo/val/mean"x= m:jA*  ppo/val/var~j^,E :jA*  ppo/val/var_explainedj6(pJ :jA*  ppo/learning_rateϸ:##,E y:jA*  time/ppo/forward_pass`# >l_/m]P :jA*!  time/ppo/compute_rewards:Gd2$V /:jA*$ " time/ppo/compute_advantages);[o-9"%6 Q:jA*  time/ppo/total ?W6&sO :jA*  env/reward_mean%6 #:jA*  env/reward_stdu#wC ;jA*  objective/klJ,A|(pJ ~;jA*  objective/kl_coefOc>[(pJ ;jA*  objective/entropy;9^N0_ ;jA*" ppo/mean_non_score_reward^ڱ &sO Ȕ;jA*  ppo/mean_scoresx%6 ۔;jA*  ppo/std_scores~.W ;jA*  tokens/queries_len_meanB4-;jA*  ppo/loss/policyp4%6 =;jA*  ppo/loss/value*WI:C%6 -;jA*  ppo/loss/total8%)7_ ;jA*  ppo/policy/entropy <;\h[* ;jA*  ppo/policy/approxkl.?,q٥L* X;jA*  ppo/policy/policykl@صK* ;jA*  ppo/policy/clipfrac7o1 ;jA*# ! ppo/policy/advantages_meanJ1'F n;jA*  ppo/returns/mean"&sO Ι;jA*  ppo/returns/varbY$B+M (;jA*  ppo/val/vpredea$B+M ;jA*  ppo/val/error*W:!m'F ښ;jA*  ppo/val/clipfraccܰ7#wC .;jA*  ppo/val/mean9"x= ;jA*  ppo/val/var;:,E 曓;jA*  ppo/val/var_explainedu1_(pJ >;jA*  ppo/learning_rateϸ:ʤ,E ;jA*  time/ppo/forward_passO>/m]P ;jA*!  time/ppo/compute_rewards:]$2$V c;jA*$ " time/ppo/compute_advantagesH;=:I-6%6 j;jA*  time/ppo/total?&sO Ğ;jA*  env/reward_meanO%6 ;jA*  env/reward_stdfGD#wC =jA*  objective/klQ,A!(pJ n=jA*  objective/kl_coeftc>d(pJ =jA*  objective/entropyD49F#0_ =jA*" ppo/mean_non_score_rewardm&sO =jA*  ppo/mean_scoresd;o%6 ž=jA*  ppo/std_scoresD .W Ԟ=jA*  tokens/queries_len_meanB-F/m]P =jA*!  time/ppo/compute_rewards8:Y.2$V =jA*$ " time/ppo/compute_advantages\&;y-}G%6 A=jA*  time/ppo/total?^ &sO =jA*  env/reward_meanÚȌ%6 =jA*  env/reward_stdY"#wC l>jA*  objective/klX,A3/(pJ pl>jA*  objective/kl_coef/c>U(pJ l>jA*  objective/entropyVD-9060_ l>jA*" ppo/mean_non_score_rewardi&sO l>jA*  ppo/mean_scoresX%6 ˣl>jA*  ppo/std_scores.W ܣl>jA*  tokens/queries_len_meanBY-jA*  tokens/queries_len_std5k0_ l>jA*" tokens/responses_len_mean?Rtnx/m]P l>jA*!  tokens/responses_len_stdY&sO &l>jA*  ppo/loss/policy#%6 tl>jA*  ppo/loss/value9#%6 l>jA*  ppo/loss/totalz8!')7_ jA*  ppo/policy/entropy1;"* Sl>jA*  ppo/policy/approxklv,i* jl>jA*  ppo/policy/policykl5 ~* |l>jA*  ppo/policy/clipfracr'Nb1 l>jA*# ! ppo/policy/advantages_mean'F l>jA*  ppo/returns/mean8fv&sO l>jA*  ppo/returns/var)$B+M ɫl>jA*  ppo/val/vpredog$B+M ٫l>jA*  ppo/val/errorC:cY'F ,l>jA*  ppo/val/clipfracs3@#wC [l>jA*  ppo/val/meanFT"x= ql>jA*  ppo/val/varM\L,E l>jA*  ppo/val/var_explained(pJ 5l>jA*  ppo/learning_rateϸ:W,E Ll>jA*  time/ppo/forward_passl>pHz$/m]P _l>jA*!  time/ppo/compute_rewardsd;22$V űl>jA*$ " time/ppo/compute_advantages;V\D-jA*  time/ppo/optimize_stepN?2* l>jA*  time/ppo/calc_statsd>6d%6 l>jA*  time/ppo/total? &sO l>jA*  env/reward_mean=p%6 ͳl>jA*  env/reward_stdח#wC ?jA*  objective/klR,A (pJ ?jA*  objective/kl_coefyc>]N(pJ 7?jA*  objective/entropy39-C0_ M?jA*" ppo/mean_non_score_reward#&sO a?jA*  ppo/mean_scoresXe%6 s?jA*  ppo/std_scoress.W ?jA*  tokens/queries_len_meanBd-J-,E ?jA*  ppo/val/var_explainedNTU(pJ 9?jA*  ppo/learning_rateϸ: ,E ?jA*  time/ppo/forward_pass '>d?t/m]P ?jA*!  time/ppo/compute_rewards:V2$V J?jA*$ " time/ppo/compute_advantages8;&-Ϫ%6 H?jA*  time/ppo/total?e4&sO ?jA*  env/reward_mean\/Xv%6 ?jA*  env/reward_std/#wC X4AjA*  objective/klH,Ai((pJ 4AjA*  objective/kl_coefc>Y (pJ 4AjA*  objective/entropy<9lh0_ 4AjA*" ppo/mean_non_score_reward/|-K&sO 4AjA*  ppo/mean_scoresci%6 4AjA*  ppo/std_scores.W ,4AjA*  tokens/queries_len_meanBH-4AjA*  tokens/queries_len_std={0_ S4AjA*" tokens/responses_len_mean?w/m]P e4AjA*!  tokens/responses_len_stdn/&sO y4AjA*  ppo/loss/policyU4%6 4AjA*  ppo/loss/value7)59,1W%6 4AjA*  ppo/loss/totalB7)7_ 4AjA*  ppo/policy/entropy>;&Ԗ`* 4AjA*  ppo/policy/approxklK]*U]* 4AjA*  ppo/policy/policykl@4* 4AjA*  ppo/policy/clipfrac:1 *5AjA*# ! ppo/policy/advantages_mean20'F 5AjA*  ppo/returns/mean/&sO 5AjA*  ppo/returns/var!)e$B+M 5AjA*  ppo/val/vpred01k$B+M 45AjA*  ppo/val/error7)9K'F 5AjA*  ppo/val/clipfrac1#wC t5AjA*  ppo/val/meano"x= 5AjA*  ppo/val/var%,E 5AjA*  ppo/val/var_explained/lٍW(pJ P5AjA*  ppo/learning_rateϸ:`,E 5AjA*  time/ppo/forward_passp> e/m]P 5AjA*!  time/ppo/compute_rewards:Y2$V 0 5AjA*$ " time/ppo/compute_advantages;.B - d%6 5AjA*  time/ppo/totalz?C&sO 5AjA*  env/reward_meanwf%C%6 - 5AjA*  env/reward_std $#wC SBjA*  objective/klJ,AېK(pJ BjA*  objective/kl_coef d>)x(pJ BjA*  objective/entropy[;9q0_ BjA*" ppo/mean_non_score_rewardH+<&sO BjA*  ppo/mean_scoresH060%6 BjA*  ppo/std_scores2<.W ,BjA*  tokens/queries_len_meanBAP-#wC 'BjA*  ppo/val/meanbtcf"x= S9BjA*  ppo/val/varp,E 9BjA*  ppo/val/var_explained1yu3(pJ _:BjA*  ppo/learning_rateϸ:,E :BjA*  time/ppo/forward_pass0>y/m]P 0;BjA*!  time/ppo/compute_rewardsx:@{h2$V ;BjA*$ " time/ppo/compute_advantages;>-K%6 p9<(pJ DjA*  objective/entropy59c$0_ ϼDjA*" ppo/mean_non_score_rewardba"&sO DjA*  ppo/mean_scores %6 DjA*  ppo/std_scores1e.W DjA*  tokens/queries_len_meanB71-* DjA*  ppo/policy/clipfrac<1 WDjA*# ! ppo/policy/advantages_meanZ'F DjA*  ppo/returns/meanbAL&sO DjA*  ppo/returns/varUB $B+M `DjA*  ppo/val/vpred$B+M DjA*  ppo/val/error8'F DjA*  ppo/val/clipfracu{{#wC DjA*  ppo/val/mean f|"x= DjA*  ppo/val/var܀e,E /DjA*  ppo/val/var_explained](pJ DjA*  ppo/learning_rateϸ:jl,E DjA*  time/ppo/forward_pass0>׎/m]P 5DjA*!  time/ppo/compute_rewardsx0;CWPf%6 DjA*  time/ppo/total?mg&sO DjA*  env/reward_mean %6 0DjA*  env/reward_stdS<#wC T}EjA*  objective/klP,Ao\6(pJ }EjA*  objective/kl_coefTd>-L{@(pJ ׸}EjA*  objective/entropy590_ }EjA*" ppo/mean_non_score_reward${[٘&sO }EjA*  ppo/mean_scoresPAe%6 }EjA*  ppo/std_scores?z.W (}EjA*  tokens/queries_len_meanB5-œ/m]P }EjA*!  time/ppo/compute_rewards:h2$V }EjA*$ " time/ppo/compute_advantages4;Ŭ-y%6 }EjA*  time/ppo/total ?Lʷ&sO a}EjA*  env/reward_meanE%6 }EjA*  env/reward_std}f#wC GFjA*  objective/klL,A(pJ GFjA*  objective/kl_coefIzd>D.(pJ HFjA*  objective/entropy 99oj0_ (HFjA*" ppo/mean_non_score_rewardF;&sO DO/m]P vQFjA*!  time/ppo/compute_rewards8:x8 2$V QFjA*$ " time/ppo/compute_advantages;%^-%6 RFjA*  time/ppo/total4?DQ&sO ESFjA*  env/reward_meanGa%6 SFjA*  env/reward_std9#wC ډHjA*  objective/kl>*Ad0(pJ ډHjA*  objective/kl_coefd>p(pJ ۉHjA*  objective/entropy ?Q60_ -ۉHjA*" ppo/mean_non_score_rewardxn &sO @ۉHjA*  ppo/mean_scores3>:9%6 TۉHjA*  ppo/std_scores_W~?gTL.W fۉHjA*  tokens/queries_len_meanB&A-Rl&sO ۉHjA*  ppo/loss/policy>0yd%6 ݉HjA*  ppo/loss/valuePy>ϭ%6 0HjA*  ppo/loss/totals<)7_ `HjA*  ppo/policy/entropy@j|* lHjA*  ppo/policy/approxkl'@G%Z* HjA*  ppo/policy/policyklH>&^.* HjA*  ppo/policy/clipfrac>; 1 MHjA*# ! ppo/policy/advantages_mean>̨~'F dHjA*  ppo/returns/mean &sO xHjA*  ppo/returns/varY? $B+M HjA*  ppo/val/vpred9-"$B+M HjA*  ppo/val/errorQ??h'F HjA*  ppo/val/clipfrac.:=8tU#wC HjA*  ppo/val/mean&KS"x= HjA*  ppo/val/vary.9Y,E HjA*  ppo/val/var_explainede9~X(pJ HjA*  ppo/learning_rateϸ:C#,E 5HjA*  time/ppo/forward_pass >) /m]P HjA*!  time/ppo/compute_rewards8:x2$V HjA*$ " time/ppo/compute_advantages(;g -ii%6 HjA*  time/ppo/total!?\&sO `HjA*  env/reward_mean3>G%6 HjA*  env/reward_std_W~?pYy