{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9320081738807358, "eval_steps": 500, "global_step": 260000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000743080066877206, "grad_norm": 0.6201262474060059, "learning_rate": 3.715262297518205e-07, "loss": 0.0293, "step": 100 }, { "epoch": 0.001486160133754412, "grad_norm": 0.8341535329818726, "learning_rate": 7.43052459503641e-07, "loss": 0.024, "step": 200 }, { "epoch": 0.002229240200631618, "grad_norm": 0.41398999094963074, "learning_rate": 1.1145786892554615e-06, "loss": 0.0212, "step": 300 }, { "epoch": 0.002972320267508824, "grad_norm": 0.6459685564041138, "learning_rate": 1.486104919007282e-06, "loss": 0.0195, "step": 400 }, { "epoch": 0.00371540033438603, "grad_norm": 0.4232792854309082, "learning_rate": 1.8576311487591026e-06, "loss": 0.018, "step": 500 }, { "epoch": 0.004458480401263236, "grad_norm": 0.3187289535999298, "learning_rate": 2.229157378510923e-06, "loss": 0.0165, "step": 600 }, { "epoch": 0.005201560468140442, "grad_norm": 0.2891034185886383, "learning_rate": 2.6006836082627435e-06, "loss": 0.0168, "step": 700 }, { "epoch": 0.005944640535017648, "grad_norm": 0.3253406584262848, "learning_rate": 2.972209838014564e-06, "loss": 0.0177, "step": 800 }, { "epoch": 0.0066877206018948546, "grad_norm": 0.24529646337032318, "learning_rate": 3.3437360677663843e-06, "loss": 0.0172, "step": 900 }, { "epoch": 0.00743080066877206, "grad_norm": 0.39087575674057007, "learning_rate": 3.715262297518205e-06, "loss": 0.0169, "step": 1000 }, { "epoch": 0.008173880735649267, "grad_norm": 0.24642707407474518, "learning_rate": 4.086788527270026e-06, "loss": 0.0152, "step": 1100 }, { "epoch": 0.008916960802526472, "grad_norm": 0.3169700503349304, "learning_rate": 4.458314757021846e-06, "loss": 0.0181, "step": 1200 }, { "epoch": 0.009660040869403677, "grad_norm": 0.6863657236099243, "learning_rate": 4.8298409867736665e-06, "loss": 0.0156, "step": 1300 }, { "epoch": 0.010403120936280884, "grad_norm": 0.33670246601104736, "learning_rate": 5.201367216525487e-06, "loss": 0.0149, "step": 1400 }, { "epoch": 0.01114620100315809, "grad_norm": 0.2675531208515167, "learning_rate": 5.572893446277307e-06, "loss": 0.0165, "step": 1500 }, { "epoch": 0.011889281070035297, "grad_norm": 0.32111021876335144, "learning_rate": 5.944419676029128e-06, "loss": 0.0158, "step": 1600 }, { "epoch": 0.012632361136912502, "grad_norm": 0.18255972862243652, "learning_rate": 6.315945905780948e-06, "loss": 0.0145, "step": 1700 }, { "epoch": 0.013375441203789709, "grad_norm": 0.33162474632263184, "learning_rate": 6.687472135532769e-06, "loss": 0.0149, "step": 1800 }, { "epoch": 0.014118521270666914, "grad_norm": 0.26008930802345276, "learning_rate": 7.05899836528459e-06, "loss": 0.015, "step": 1900 }, { "epoch": 0.01486160133754412, "grad_norm": 0.23395366966724396, "learning_rate": 7.43052459503641e-06, "loss": 0.0159, "step": 2000 }, { "epoch": 0.015604681404421327, "grad_norm": 0.30649447441101074, "learning_rate": 7.80205082478823e-06, "loss": 0.0144, "step": 2100 }, { "epoch": 0.016347761471298534, "grad_norm": 0.3568742573261261, "learning_rate": 8.173577054540051e-06, "loss": 0.015, "step": 2200 }, { "epoch": 0.01709084153817574, "grad_norm": 0.1808238923549652, "learning_rate": 8.54510328429187e-06, "loss": 0.0138, "step": 2300 }, { "epoch": 0.017833921605052944, "grad_norm": 0.30208560824394226, "learning_rate": 8.916629514043692e-06, "loss": 0.0145, "step": 2400 }, { "epoch": 0.01857700167193015, "grad_norm": 0.2976946234703064, "learning_rate": 9.288155743795512e-06, "loss": 0.0145, "step": 2500 }, { "epoch": 0.019320081738807355, "grad_norm": 0.5738121867179871, "learning_rate": 9.659681973547333e-06, "loss": 0.0146, "step": 2600 }, { "epoch": 0.020063161805684564, "grad_norm": 0.506726861000061, "learning_rate": 1.0031208203299153e-05, "loss": 0.0141, "step": 2700 }, { "epoch": 0.02080624187256177, "grad_norm": 0.17522376775741577, "learning_rate": 1.0402734433050974e-05, "loss": 0.0131, "step": 2800 }, { "epoch": 0.021549321939438974, "grad_norm": 1.1992942094802856, "learning_rate": 1.0774260662802795e-05, "loss": 0.0147, "step": 2900 }, { "epoch": 0.02229240200631618, "grad_norm": 0.29072946310043335, "learning_rate": 1.1145786892554615e-05, "loss": 0.0142, "step": 3000 }, { "epoch": 0.023035482073193385, "grad_norm": 0.19813032448291779, "learning_rate": 1.1517313122306434e-05, "loss": 0.0115, "step": 3100 }, { "epoch": 0.023778562140070594, "grad_norm": 0.15734168887138367, "learning_rate": 1.1888839352058256e-05, "loss": 0.0115, "step": 3200 }, { "epoch": 0.0245216422069478, "grad_norm": 0.9513083100318909, "learning_rate": 1.2260365581810077e-05, "loss": 0.0139, "step": 3300 }, { "epoch": 0.025264722273825004, "grad_norm": 0.12281644344329834, "learning_rate": 1.2631891811561896e-05, "loss": 0.0127, "step": 3400 }, { "epoch": 0.02600780234070221, "grad_norm": 0.288326233625412, "learning_rate": 1.3003418041313718e-05, "loss": 0.0121, "step": 3500 }, { "epoch": 0.026750882407579418, "grad_norm": 0.1910688728094101, "learning_rate": 1.3374944271065537e-05, "loss": 0.0124, "step": 3600 }, { "epoch": 0.027493962474456624, "grad_norm": 0.3038184642791748, "learning_rate": 1.3746470500817357e-05, "loss": 0.0136, "step": 3700 }, { "epoch": 0.02823704254133383, "grad_norm": 0.4833832085132599, "learning_rate": 1.411799673056918e-05, "loss": 0.0131, "step": 3800 }, { "epoch": 0.028980122608211034, "grad_norm": 0.5049723982810974, "learning_rate": 1.4489522960321001e-05, "loss": 0.0118, "step": 3900 }, { "epoch": 0.02972320267508824, "grad_norm": 0.18751120567321777, "learning_rate": 1.486104919007282e-05, "loss": 0.0121, "step": 4000 }, { "epoch": 0.030466282741965448, "grad_norm": 0.20638391375541687, "learning_rate": 1.523257541982464e-05, "loss": 0.0132, "step": 4100 }, { "epoch": 0.031209362808842653, "grad_norm": 0.1307753175497055, "learning_rate": 1.560410164957646e-05, "loss": 0.0111, "step": 4200 }, { "epoch": 0.03195244287571986, "grad_norm": 0.1824759691953659, "learning_rate": 1.597562787932828e-05, "loss": 0.0118, "step": 4300 }, { "epoch": 0.03269552294259707, "grad_norm": 0.15876320004463196, "learning_rate": 1.6347154109080102e-05, "loss": 0.0132, "step": 4400 }, { "epoch": 0.03343860300947427, "grad_norm": 1.4908363819122314, "learning_rate": 1.671868033883192e-05, "loss": 0.0113, "step": 4500 }, { "epoch": 0.03418168307635148, "grad_norm": 0.1806405484676361, "learning_rate": 1.709020656858374e-05, "loss": 0.0114, "step": 4600 }, { "epoch": 0.03492476314322868, "grad_norm": 0.09762364625930786, "learning_rate": 1.7461732798335563e-05, "loss": 0.0101, "step": 4700 }, { "epoch": 0.03566784321010589, "grad_norm": 0.17264674603939056, "learning_rate": 1.7833259028087384e-05, "loss": 0.0135, "step": 4800 }, { "epoch": 0.036410923276983094, "grad_norm": 0.06826501339673996, "learning_rate": 1.8204785257839205e-05, "loss": 0.0124, "step": 4900 }, { "epoch": 0.0371540033438603, "grad_norm": 0.14499710500240326, "learning_rate": 1.8576311487591023e-05, "loss": 0.0115, "step": 5000 }, { "epoch": 0.037897083410737505, "grad_norm": 0.11885485053062439, "learning_rate": 1.8947837717342845e-05, "loss": 0.0121, "step": 5100 }, { "epoch": 0.03864016347761471, "grad_norm": 0.127430722117424, "learning_rate": 1.9319363947094666e-05, "loss": 0.0105, "step": 5200 }, { "epoch": 0.03938324354449192, "grad_norm": 0.24366237223148346, "learning_rate": 1.9690890176846484e-05, "loss": 0.0111, "step": 5300 }, { "epoch": 0.04012632361136913, "grad_norm": 0.21700197458267212, "learning_rate": 2.0062416406598305e-05, "loss": 0.0124, "step": 5400 }, { "epoch": 0.04086940367824633, "grad_norm": 0.21410606801509857, "learning_rate": 2.043394263635013e-05, "loss": 0.0119, "step": 5500 }, { "epoch": 0.04161248374512354, "grad_norm": 0.22886519134044647, "learning_rate": 2.0805468866101948e-05, "loss": 0.0113, "step": 5600 }, { "epoch": 0.04235556381200074, "grad_norm": 0.1471068561077118, "learning_rate": 2.117699509585377e-05, "loss": 0.0108, "step": 5700 }, { "epoch": 0.04309864387887795, "grad_norm": 0.10926610976457596, "learning_rate": 2.154852132560559e-05, "loss": 0.0126, "step": 5800 }, { "epoch": 0.043841723945755154, "grad_norm": 0.2959732413291931, "learning_rate": 2.1920047555357408e-05, "loss": 0.0115, "step": 5900 }, { "epoch": 0.04458480401263236, "grad_norm": 0.25589028000831604, "learning_rate": 2.229157378510923e-05, "loss": 0.01, "step": 6000 }, { "epoch": 0.045327884079509564, "grad_norm": 0.15893419086933136, "learning_rate": 2.266310001486105e-05, "loss": 0.0118, "step": 6100 }, { "epoch": 0.04607096414638677, "grad_norm": 0.1573810577392578, "learning_rate": 2.303462624461287e-05, "loss": 0.0108, "step": 6200 }, { "epoch": 0.04681404421326398, "grad_norm": 0.3414229154586792, "learning_rate": 2.3406152474364693e-05, "loss": 0.0096, "step": 6300 }, { "epoch": 0.04755712428014119, "grad_norm": 0.15525777637958527, "learning_rate": 2.377767870411651e-05, "loss": 0.009, "step": 6400 }, { "epoch": 0.04830020434701839, "grad_norm": 0.11353523284196854, "learning_rate": 2.4149204933868332e-05, "loss": 0.0134, "step": 6500 }, { "epoch": 0.0490432844138956, "grad_norm": 0.15114864706993103, "learning_rate": 2.4520731163620154e-05, "loss": 0.0098, "step": 6600 }, { "epoch": 0.0497863644807728, "grad_norm": 0.09296689927577972, "learning_rate": 2.4892257393371975e-05, "loss": 0.01, "step": 6700 }, { "epoch": 0.05052944454765001, "grad_norm": 0.3114944398403168, "learning_rate": 2.5263783623123793e-05, "loss": 0.0102, "step": 6800 }, { "epoch": 0.051272524614527214, "grad_norm": 0.12343640625476837, "learning_rate": 2.5635309852875618e-05, "loss": 0.0103, "step": 6900 }, { "epoch": 0.05201560468140442, "grad_norm": 0.05941719561815262, "learning_rate": 2.6006836082627435e-05, "loss": 0.0106, "step": 7000 }, { "epoch": 0.052758684748281624, "grad_norm": 0.3176872134208679, "learning_rate": 2.6378362312379257e-05, "loss": 0.0094, "step": 7100 }, { "epoch": 0.053501764815158837, "grad_norm": 0.13310806453227997, "learning_rate": 2.6749888542131075e-05, "loss": 0.0109, "step": 7200 }, { "epoch": 0.05424484488203604, "grad_norm": 0.08222562819719315, "learning_rate": 2.7121414771882896e-05, "loss": 0.0101, "step": 7300 }, { "epoch": 0.05498792494891325, "grad_norm": 0.38013893365859985, "learning_rate": 2.7492941001634714e-05, "loss": 0.0101, "step": 7400 }, { "epoch": 0.05573100501579045, "grad_norm": 0.19492092728614807, "learning_rate": 2.786446723138654e-05, "loss": 0.0104, "step": 7500 }, { "epoch": 0.05647408508266766, "grad_norm": 0.09033786505460739, "learning_rate": 2.823599346113836e-05, "loss": 0.0111, "step": 7600 }, { "epoch": 0.05721716514954486, "grad_norm": 0.13311225175857544, "learning_rate": 2.8607519690890178e-05, "loss": 0.0122, "step": 7700 }, { "epoch": 0.05796024521642207, "grad_norm": 0.5991498231887817, "learning_rate": 2.8979045920642002e-05, "loss": 0.0095, "step": 7800 }, { "epoch": 0.058703325283299274, "grad_norm": 0.21358543634414673, "learning_rate": 2.9350572150393817e-05, "loss": 0.0097, "step": 7900 }, { "epoch": 0.05944640535017648, "grad_norm": 0.17616549134254456, "learning_rate": 2.972209838014564e-05, "loss": 0.0114, "step": 8000 }, { "epoch": 0.060189485417053684, "grad_norm": 0.12915846705436707, "learning_rate": 3.009362460989746e-05, "loss": 0.0091, "step": 8100 }, { "epoch": 0.060932565483930896, "grad_norm": 0.08484867215156555, "learning_rate": 3.046515083964928e-05, "loss": 0.0098, "step": 8200 }, { "epoch": 0.0616756455508081, "grad_norm": 0.08068235963582993, "learning_rate": 3.08366770694011e-05, "loss": 0.0092, "step": 8300 }, { "epoch": 0.06241872561768531, "grad_norm": 0.16312555968761444, "learning_rate": 3.120820329915292e-05, "loss": 0.0109, "step": 8400 }, { "epoch": 0.0631618056845625, "grad_norm": 0.5776621699333191, "learning_rate": 3.1579729528904744e-05, "loss": 0.0104, "step": 8500 }, { "epoch": 0.06390488575143972, "grad_norm": 0.11295618116855621, "learning_rate": 3.195125575865656e-05, "loss": 0.0098, "step": 8600 }, { "epoch": 0.06464796581831693, "grad_norm": 0.15732234716415405, "learning_rate": 3.232278198840839e-05, "loss": 0.0111, "step": 8700 }, { "epoch": 0.06539104588519414, "grad_norm": 0.08376754820346832, "learning_rate": 3.2694308218160205e-05, "loss": 0.0092, "step": 8800 }, { "epoch": 0.06613412595207134, "grad_norm": 0.10953645408153534, "learning_rate": 3.306583444791202e-05, "loss": 0.0085, "step": 8900 }, { "epoch": 0.06687720601894855, "grad_norm": 0.24214927852153778, "learning_rate": 3.343736067766384e-05, "loss": 0.0113, "step": 9000 }, { "epoch": 0.06762028608582575, "grad_norm": 0.05712306499481201, "learning_rate": 3.3808886907415665e-05, "loss": 0.0113, "step": 9100 }, { "epoch": 0.06836336615270296, "grad_norm": 0.20050524175167084, "learning_rate": 3.418041313716748e-05, "loss": 0.0104, "step": 9200 }, { "epoch": 0.06910644621958016, "grad_norm": 0.12058404833078384, "learning_rate": 3.455193936691931e-05, "loss": 0.0104, "step": 9300 }, { "epoch": 0.06984952628645737, "grad_norm": 0.5149028897285461, "learning_rate": 3.4923465596671126e-05, "loss": 0.0105, "step": 9400 }, { "epoch": 0.07059260635333457, "grad_norm": 0.08235427737236023, "learning_rate": 3.5294991826422944e-05, "loss": 0.009, "step": 9500 }, { "epoch": 0.07133568642021178, "grad_norm": 0.08316874504089355, "learning_rate": 3.566651805617477e-05, "loss": 0.0115, "step": 9600 }, { "epoch": 0.07207876648708898, "grad_norm": 0.2917763888835907, "learning_rate": 3.6038044285926586e-05, "loss": 0.0098, "step": 9700 }, { "epoch": 0.07282184655396619, "grad_norm": 0.11247112601995468, "learning_rate": 3.640957051567841e-05, "loss": 0.0088, "step": 9800 }, { "epoch": 0.0735649266208434, "grad_norm": 0.18118980526924133, "learning_rate": 3.678109674543023e-05, "loss": 0.0109, "step": 9900 }, { "epoch": 0.0743080066877206, "grad_norm": 0.5826623439788818, "learning_rate": 3.715262297518205e-05, "loss": 0.0094, "step": 10000 }, { "epoch": 0.0750510867545978, "grad_norm": 0.10135869681835175, "learning_rate": 3.752414920493387e-05, "loss": 0.0075, "step": 10100 }, { "epoch": 0.07579416682147501, "grad_norm": 0.02961345575749874, "learning_rate": 3.789567543468569e-05, "loss": 0.0101, "step": 10200 }, { "epoch": 0.07653724688835221, "grad_norm": 0.0805927962064743, "learning_rate": 3.8267201664437514e-05, "loss": 0.0084, "step": 10300 }, { "epoch": 0.07728032695522942, "grad_norm": 0.3558717370033264, "learning_rate": 3.863872789418933e-05, "loss": 0.0106, "step": 10400 }, { "epoch": 0.07802340702210664, "grad_norm": 0.18754470348358154, "learning_rate": 3.9010254123941157e-05, "loss": 0.0098, "step": 10500 }, { "epoch": 0.07876648708898384, "grad_norm": 0.1944398432970047, "learning_rate": 3.938178035369297e-05, "loss": 0.0088, "step": 10600 }, { "epoch": 0.07950956715586105, "grad_norm": 0.16282789409160614, "learning_rate": 3.975330658344479e-05, "loss": 0.0095, "step": 10700 }, { "epoch": 0.08025264722273825, "grad_norm": 0.08002127707004547, "learning_rate": 4.012483281319661e-05, "loss": 0.0106, "step": 10800 }, { "epoch": 0.08099572728961546, "grad_norm": 0.1011476218700409, "learning_rate": 4.0496359042948435e-05, "loss": 0.0095, "step": 10900 }, { "epoch": 0.08173880735649267, "grad_norm": 0.05073559656739235, "learning_rate": 4.086788527270026e-05, "loss": 0.0093, "step": 11000 }, { "epoch": 0.08248188742336987, "grad_norm": 0.7299156785011292, "learning_rate": 4.123941150245208e-05, "loss": 0.01, "step": 11100 }, { "epoch": 0.08322496749024708, "grad_norm": 0.03845445066690445, "learning_rate": 4.1610937732203895e-05, "loss": 0.0102, "step": 11200 }, { "epoch": 0.08396804755712428, "grad_norm": 0.13008517026901245, "learning_rate": 4.198246396195571e-05, "loss": 0.0099, "step": 11300 }, { "epoch": 0.08471112762400149, "grad_norm": 0.05953648313879967, "learning_rate": 4.235399019170754e-05, "loss": 0.0081, "step": 11400 }, { "epoch": 0.08545420769087869, "grad_norm": 0.08770621567964554, "learning_rate": 4.2725516421459356e-05, "loss": 0.0094, "step": 11500 }, { "epoch": 0.0861972877577559, "grad_norm": 0.04865841940045357, "learning_rate": 4.309704265121118e-05, "loss": 0.0078, "step": 11600 }, { "epoch": 0.0869403678246331, "grad_norm": 0.09762655198574066, "learning_rate": 4.3468568880963e-05, "loss": 0.0101, "step": 11700 }, { "epoch": 0.08768344789151031, "grad_norm": 0.1134525015950203, "learning_rate": 4.3840095110714816e-05, "loss": 0.01, "step": 11800 }, { "epoch": 0.08842652795838751, "grad_norm": 0.0925632119178772, "learning_rate": 4.421162134046664e-05, "loss": 0.0094, "step": 11900 }, { "epoch": 0.08916960802526472, "grad_norm": 0.1400783807039261, "learning_rate": 4.458314757021846e-05, "loss": 0.0088, "step": 12000 }, { "epoch": 0.08991268809214192, "grad_norm": 0.06742055714130402, "learning_rate": 4.4954673799970283e-05, "loss": 0.0078, "step": 12100 }, { "epoch": 0.09065576815901913, "grad_norm": 0.09896427392959595, "learning_rate": 4.53262000297221e-05, "loss": 0.009, "step": 12200 }, { "epoch": 0.09139884822589633, "grad_norm": 0.07769208401441574, "learning_rate": 4.569772625947392e-05, "loss": 0.009, "step": 12300 }, { "epoch": 0.09214192829277354, "grad_norm": 0.12094446271657944, "learning_rate": 4.606925248922574e-05, "loss": 0.0098, "step": 12400 }, { "epoch": 0.09288500835965076, "grad_norm": 0.06585867702960968, "learning_rate": 4.644077871897756e-05, "loss": 0.0114, "step": 12500 }, { "epoch": 0.09362808842652796, "grad_norm": 0.297781765460968, "learning_rate": 4.6812304948729386e-05, "loss": 0.0094, "step": 12600 }, { "epoch": 0.09437116849340517, "grad_norm": 0.12166259437799454, "learning_rate": 4.7183831178481204e-05, "loss": 0.0075, "step": 12700 }, { "epoch": 0.09511424856028237, "grad_norm": 0.0373976044356823, "learning_rate": 4.755535740823302e-05, "loss": 0.0087, "step": 12800 }, { "epoch": 0.09585732862715958, "grad_norm": 0.27907776832580566, "learning_rate": 4.792688363798484e-05, "loss": 0.0093, "step": 12900 }, { "epoch": 0.09660040869403679, "grad_norm": 0.0380702018737793, "learning_rate": 4.8298409867736665e-05, "loss": 0.0082, "step": 13000 }, { "epoch": 0.09734348876091399, "grad_norm": 0.0955420434474945, "learning_rate": 4.866993609748848e-05, "loss": 0.0085, "step": 13100 }, { "epoch": 0.0980865688277912, "grad_norm": 0.14812317490577698, "learning_rate": 4.904146232724031e-05, "loss": 0.0085, "step": 13200 }, { "epoch": 0.0988296488946684, "grad_norm": 0.07400522381067276, "learning_rate": 4.9412988556992125e-05, "loss": 0.0088, "step": 13300 }, { "epoch": 0.0995727289615456, "grad_norm": 0.0549309141933918, "learning_rate": 4.978451478674395e-05, "loss": 0.0082, "step": 13400 }, { "epoch": 0.10031580902842281, "grad_norm": 0.08723131567239761, "learning_rate": 4.999178699372683e-05, "loss": 0.0109, "step": 13500 }, { "epoch": 0.10105888909530002, "grad_norm": 0.07263633608818054, "learning_rate": 4.997223221688595e-05, "loss": 0.0079, "step": 13600 }, { "epoch": 0.10180196916217722, "grad_norm": 0.30761680006980896, "learning_rate": 4.995267744004506e-05, "loss": 0.0093, "step": 13700 }, { "epoch": 0.10254504922905443, "grad_norm": 0.10250383615493774, "learning_rate": 4.993312266320417e-05, "loss": 0.0091, "step": 13800 }, { "epoch": 0.10328812929593163, "grad_norm": 0.37729084491729736, "learning_rate": 4.991356788636328e-05, "loss": 0.0094, "step": 13900 }, { "epoch": 0.10403120936280884, "grad_norm": 0.08499758690595627, "learning_rate": 4.98940131095224e-05, "loss": 0.0086, "step": 14000 }, { "epoch": 0.10477428942968604, "grad_norm": 0.3610725998878479, "learning_rate": 4.987445833268151e-05, "loss": 0.0075, "step": 14100 }, { "epoch": 0.10551736949656325, "grad_norm": 0.05794951692223549, "learning_rate": 4.985490355584062e-05, "loss": 0.0096, "step": 14200 }, { "epoch": 0.10626044956344045, "grad_norm": 0.104873888194561, "learning_rate": 4.9835348778999734e-05, "loss": 0.0086, "step": 14300 }, { "epoch": 0.10700352963031767, "grad_norm": 0.10652638226747513, "learning_rate": 4.981579400215885e-05, "loss": 0.0088, "step": 14400 }, { "epoch": 0.10774660969719488, "grad_norm": 0.03338725492358208, "learning_rate": 4.979623922531796e-05, "loss": 0.0086, "step": 14500 }, { "epoch": 0.10848968976407208, "grad_norm": 0.3040962219238281, "learning_rate": 4.977668444847708e-05, "loss": 0.0095, "step": 14600 }, { "epoch": 0.10923276983094929, "grad_norm": 0.1617971509695053, "learning_rate": 4.9757129671636186e-05, "loss": 0.0079, "step": 14700 }, { "epoch": 0.1099758498978265, "grad_norm": 0.0969538614153862, "learning_rate": 4.9737574894795304e-05, "loss": 0.0082, "step": 14800 }, { "epoch": 0.1107189299647037, "grad_norm": 0.08921152353286743, "learning_rate": 4.9718020117954415e-05, "loss": 0.0093, "step": 14900 }, { "epoch": 0.1114620100315809, "grad_norm": 0.10723528265953064, "learning_rate": 4.9698465341113534e-05, "loss": 0.0086, "step": 15000 }, { "epoch": 0.11220509009845811, "grad_norm": 0.045507241040468216, "learning_rate": 4.967891056427264e-05, "loss": 0.0085, "step": 15100 }, { "epoch": 0.11294817016533532, "grad_norm": 0.07472371309995651, "learning_rate": 4.9659355787431756e-05, "loss": 0.0094, "step": 15200 }, { "epoch": 0.11369125023221252, "grad_norm": 0.1268874555826187, "learning_rate": 4.963980101059087e-05, "loss": 0.009, "step": 15300 }, { "epoch": 0.11443433029908973, "grad_norm": 0.09821897745132446, "learning_rate": 4.9620246233749986e-05, "loss": 0.0089, "step": 15400 }, { "epoch": 0.11517741036596693, "grad_norm": 0.13954854011535645, "learning_rate": 4.96006914569091e-05, "loss": 0.0078, "step": 15500 }, { "epoch": 0.11592049043284414, "grad_norm": 0.41612350940704346, "learning_rate": 4.958113668006821e-05, "loss": 0.0095, "step": 15600 }, { "epoch": 0.11666357049972134, "grad_norm": 0.0604860894382, "learning_rate": 4.956158190322732e-05, "loss": 0.0087, "step": 15700 }, { "epoch": 0.11740665056659855, "grad_norm": 0.052573274821043015, "learning_rate": 4.954202712638644e-05, "loss": 0.0085, "step": 15800 }, { "epoch": 0.11814973063347575, "grad_norm": 0.049111757427453995, "learning_rate": 4.952247234954555e-05, "loss": 0.0076, "step": 15900 }, { "epoch": 0.11889281070035296, "grad_norm": 0.0461510606110096, "learning_rate": 4.950291757270466e-05, "loss": 0.0081, "step": 16000 }, { "epoch": 0.11963589076723016, "grad_norm": 0.05603969469666481, "learning_rate": 4.948336279586377e-05, "loss": 0.0098, "step": 16100 }, { "epoch": 0.12037897083410737, "grad_norm": 0.10832766443490982, "learning_rate": 4.946380801902289e-05, "loss": 0.0094, "step": 16200 }, { "epoch": 0.12112205090098459, "grad_norm": 0.06822163611650467, "learning_rate": 4.9444253242182e-05, "loss": 0.0079, "step": 16300 }, { "epoch": 0.12186513096786179, "grad_norm": 0.09804270416498184, "learning_rate": 4.942469846534112e-05, "loss": 0.0075, "step": 16400 }, { "epoch": 0.122608211034739, "grad_norm": 0.1705053597688675, "learning_rate": 4.9405143688500225e-05, "loss": 0.0103, "step": 16500 }, { "epoch": 0.1233512911016162, "grad_norm": 0.026959968730807304, "learning_rate": 4.938558891165934e-05, "loss": 0.0082, "step": 16600 }, { "epoch": 0.12409437116849341, "grad_norm": 0.10127568244934082, "learning_rate": 4.9366034134818454e-05, "loss": 0.0093, "step": 16700 }, { "epoch": 0.12483745123537061, "grad_norm": 0.05387401208281517, "learning_rate": 4.934647935797757e-05, "loss": 0.0083, "step": 16800 }, { "epoch": 0.1255805313022478, "grad_norm": 0.08786871284246445, "learning_rate": 4.932692458113668e-05, "loss": 0.0094, "step": 16900 }, { "epoch": 0.126323611369125, "grad_norm": 0.06967432796955109, "learning_rate": 4.9307369804295795e-05, "loss": 0.0085, "step": 17000 }, { "epoch": 0.12706669143600222, "grad_norm": 0.07188351452350616, "learning_rate": 4.9287815027454906e-05, "loss": 0.0091, "step": 17100 }, { "epoch": 0.12780977150287945, "grad_norm": 0.16708698868751526, "learning_rate": 4.9268260250614025e-05, "loss": 0.0107, "step": 17200 }, { "epoch": 0.12855285156975665, "grad_norm": 0.03477710485458374, "learning_rate": 4.9248705473773136e-05, "loss": 0.0081, "step": 17300 }, { "epoch": 0.12929593163663386, "grad_norm": 0.09288498759269714, "learning_rate": 4.922915069693225e-05, "loss": 0.0089, "step": 17400 }, { "epoch": 0.13003901170351106, "grad_norm": 0.05606740713119507, "learning_rate": 4.920959592009136e-05, "loss": 0.007, "step": 17500 }, { "epoch": 0.13078209177038827, "grad_norm": 0.263423889875412, "learning_rate": 4.919004114325048e-05, "loss": 0.0102, "step": 17600 }, { "epoch": 0.13152517183726548, "grad_norm": 0.18583562970161438, "learning_rate": 4.917048636640959e-05, "loss": 0.0102, "step": 17700 }, { "epoch": 0.13226825190414268, "grad_norm": 0.04414902627468109, "learning_rate": 4.91509315895687e-05, "loss": 0.0102, "step": 17800 }, { "epoch": 0.13301133197101989, "grad_norm": 0.07003758102655411, "learning_rate": 4.913137681272781e-05, "loss": 0.0093, "step": 17900 }, { "epoch": 0.1337544120378971, "grad_norm": 0.28599071502685547, "learning_rate": 4.911182203588693e-05, "loss": 0.0095, "step": 18000 }, { "epoch": 0.1344974921047743, "grad_norm": 0.0706838071346283, "learning_rate": 4.909226725904604e-05, "loss": 0.0092, "step": 18100 }, { "epoch": 0.1352405721716515, "grad_norm": 0.0730166882276535, "learning_rate": 4.907271248220516e-05, "loss": 0.0083, "step": 18200 }, { "epoch": 0.1359836522385287, "grad_norm": 0.0927768275141716, "learning_rate": 4.905315770536426e-05, "loss": 0.0081, "step": 18300 }, { "epoch": 0.1367267323054059, "grad_norm": 0.1223670095205307, "learning_rate": 4.903360292852338e-05, "loss": 0.009, "step": 18400 }, { "epoch": 0.13746981237228312, "grad_norm": 0.20416951179504395, "learning_rate": 4.901404815168249e-05, "loss": 0.0088, "step": 18500 }, { "epoch": 0.13821289243916032, "grad_norm": 0.04638587683439255, "learning_rate": 4.899449337484161e-05, "loss": 0.0082, "step": 18600 }, { "epoch": 0.13895597250603753, "grad_norm": 0.08867622166872025, "learning_rate": 4.897493859800072e-05, "loss": 0.0076, "step": 18700 }, { "epoch": 0.13969905257291473, "grad_norm": 0.06598735600709915, "learning_rate": 4.8955383821159834e-05, "loss": 0.0083, "step": 18800 }, { "epoch": 0.14044213263979194, "grad_norm": 0.16323970258235931, "learning_rate": 4.8935829044318945e-05, "loss": 0.0088, "step": 18900 }, { "epoch": 0.14118521270666914, "grad_norm": 0.0871758684515953, "learning_rate": 4.891627426747806e-05, "loss": 0.0086, "step": 19000 }, { "epoch": 0.14192829277354635, "grad_norm": 0.055200815200805664, "learning_rate": 4.8896719490637175e-05, "loss": 0.0075, "step": 19100 }, { "epoch": 0.14267137284042355, "grad_norm": 0.09537665545940399, "learning_rate": 4.8877164713796286e-05, "loss": 0.0084, "step": 19200 }, { "epoch": 0.14341445290730076, "grad_norm": 0.03909111022949219, "learning_rate": 4.8857609936955404e-05, "loss": 0.0079, "step": 19300 }, { "epoch": 0.14415753297417797, "grad_norm": 0.058872614055871964, "learning_rate": 4.8838055160114515e-05, "loss": 0.0088, "step": 19400 }, { "epoch": 0.14490061304105517, "grad_norm": 0.10204733908176422, "learning_rate": 4.881850038327363e-05, "loss": 0.0077, "step": 19500 }, { "epoch": 0.14564369310793238, "grad_norm": 0.116595558822155, "learning_rate": 4.8798945606432745e-05, "loss": 0.0097, "step": 19600 }, { "epoch": 0.14638677317480958, "grad_norm": 0.10328437387943268, "learning_rate": 4.8779390829591856e-05, "loss": 0.0087, "step": 19700 }, { "epoch": 0.1471298532416868, "grad_norm": 0.053475573658943176, "learning_rate": 4.875983605275097e-05, "loss": 0.0076, "step": 19800 }, { "epoch": 0.147872933308564, "grad_norm": 0.03302578628063202, "learning_rate": 4.874028127591008e-05, "loss": 0.0102, "step": 19900 }, { "epoch": 0.1486160133754412, "grad_norm": 0.08769609034061432, "learning_rate": 4.87207264990692e-05, "loss": 0.0092, "step": 20000 }, { "epoch": 0.1493590934423184, "grad_norm": 0.11664148420095444, "learning_rate": 4.870117172222831e-05, "loss": 0.0084, "step": 20100 }, { "epoch": 0.1501021735091956, "grad_norm": 0.11319153010845184, "learning_rate": 4.868161694538742e-05, "loss": 0.0095, "step": 20200 }, { "epoch": 0.1508452535760728, "grad_norm": 0.1903834342956543, "learning_rate": 4.866206216854654e-05, "loss": 0.0089, "step": 20300 }, { "epoch": 0.15158833364295002, "grad_norm": 0.09050220996141434, "learning_rate": 4.864250739170565e-05, "loss": 0.0084, "step": 20400 }, { "epoch": 0.15233141370982722, "grad_norm": 0.05532703548669815, "learning_rate": 4.862295261486476e-05, "loss": 0.0072, "step": 20500 }, { "epoch": 0.15307449377670443, "grad_norm": 0.06160612776875496, "learning_rate": 4.860339783802387e-05, "loss": 0.0085, "step": 20600 }, { "epoch": 0.15381757384358163, "grad_norm": 0.14603731036186218, "learning_rate": 4.858384306118299e-05, "loss": 0.0085, "step": 20700 }, { "epoch": 0.15456065391045884, "grad_norm": 0.16069570183753967, "learning_rate": 4.85642882843421e-05, "loss": 0.0084, "step": 20800 }, { "epoch": 0.15530373397733604, "grad_norm": 0.038735128939151764, "learning_rate": 4.854473350750122e-05, "loss": 0.0077, "step": 20900 }, { "epoch": 0.15604681404421328, "grad_norm": 0.03752874210476875, "learning_rate": 4.8525178730660325e-05, "loss": 0.0083, "step": 21000 }, { "epoch": 0.15678989411109048, "grad_norm": 0.0574977770447731, "learning_rate": 4.850562395381944e-05, "loss": 0.0079, "step": 21100 }, { "epoch": 0.1575329741779677, "grad_norm": 0.07614196091890335, "learning_rate": 4.8486069176978554e-05, "loss": 0.0076, "step": 21200 }, { "epoch": 0.1582760542448449, "grad_norm": 0.053198881447315216, "learning_rate": 4.846651440013767e-05, "loss": 0.0081, "step": 21300 }, { "epoch": 0.1590191343117221, "grad_norm": 0.03271758183836937, "learning_rate": 4.8446959623296784e-05, "loss": 0.0082, "step": 21400 }, { "epoch": 0.1597622143785993, "grad_norm": 0.053189072757959366, "learning_rate": 4.8427404846455895e-05, "loss": 0.0092, "step": 21500 }, { "epoch": 0.1605052944454765, "grad_norm": 0.09766688942909241, "learning_rate": 4.8407850069615006e-05, "loss": 0.008, "step": 21600 }, { "epoch": 0.16124837451235371, "grad_norm": 0.047550469636917114, "learning_rate": 4.8388295292774125e-05, "loss": 0.009, "step": 21700 }, { "epoch": 0.16199145457923092, "grad_norm": 0.15216177701950073, "learning_rate": 4.8368740515933236e-05, "loss": 0.0076, "step": 21800 }, { "epoch": 0.16273453464610813, "grad_norm": 0.07907580584287643, "learning_rate": 4.834918573909235e-05, "loss": 0.0067, "step": 21900 }, { "epoch": 0.16347761471298533, "grad_norm": 0.05231738090515137, "learning_rate": 4.832963096225146e-05, "loss": 0.0073, "step": 22000 }, { "epoch": 0.16422069477986254, "grad_norm": 0.05589268356561661, "learning_rate": 4.831007618541058e-05, "loss": 0.0081, "step": 22100 }, { "epoch": 0.16496377484673974, "grad_norm": 0.08138907700777054, "learning_rate": 4.829052140856969e-05, "loss": 0.0082, "step": 22200 }, { "epoch": 0.16570685491361695, "grad_norm": 0.10957285016775131, "learning_rate": 4.8270966631728806e-05, "loss": 0.0076, "step": 22300 }, { "epoch": 0.16644993498049415, "grad_norm": 0.13167788088321686, "learning_rate": 4.825141185488791e-05, "loss": 0.0067, "step": 22400 }, { "epoch": 0.16719301504737136, "grad_norm": 0.2840295135974884, "learning_rate": 4.823185707804703e-05, "loss": 0.0087, "step": 22500 }, { "epoch": 0.16793609511424856, "grad_norm": 0.06660717725753784, "learning_rate": 4.821230230120614e-05, "loss": 0.0084, "step": 22600 }, { "epoch": 0.16867917518112577, "grad_norm": 0.16290557384490967, "learning_rate": 4.819274752436526e-05, "loss": 0.0091, "step": 22700 }, { "epoch": 0.16942225524800297, "grad_norm": 0.06779647618532181, "learning_rate": 4.817319274752436e-05, "loss": 0.0076, "step": 22800 }, { "epoch": 0.17016533531488018, "grad_norm": 0.2867468595504761, "learning_rate": 4.815363797068348e-05, "loss": 0.0074, "step": 22900 }, { "epoch": 0.17090841538175738, "grad_norm": 0.07553325593471527, "learning_rate": 4.813408319384259e-05, "loss": 0.0087, "step": 23000 }, { "epoch": 0.1716514954486346, "grad_norm": 0.19428539276123047, "learning_rate": 4.811452841700171e-05, "loss": 0.0078, "step": 23100 }, { "epoch": 0.1723945755155118, "grad_norm": 0.06864582747220993, "learning_rate": 4.809497364016082e-05, "loss": 0.0075, "step": 23200 }, { "epoch": 0.173137655582389, "grad_norm": 0.10910695791244507, "learning_rate": 4.8075418863319934e-05, "loss": 0.0098, "step": 23300 }, { "epoch": 0.1738807356492662, "grad_norm": 0.12006833404302597, "learning_rate": 4.8055864086479045e-05, "loss": 0.0085, "step": 23400 }, { "epoch": 0.1746238157161434, "grad_norm": 0.1084110289812088, "learning_rate": 4.803630930963816e-05, "loss": 0.0078, "step": 23500 }, { "epoch": 0.17536689578302062, "grad_norm": 0.10104592889547348, "learning_rate": 4.8016754532797275e-05, "loss": 0.0075, "step": 23600 }, { "epoch": 0.17610997584989782, "grad_norm": 0.1579158753156662, "learning_rate": 4.7997199755956386e-05, "loss": 0.0081, "step": 23700 }, { "epoch": 0.17685305591677503, "grad_norm": 0.07632194459438324, "learning_rate": 4.79776449791155e-05, "loss": 0.0083, "step": 23800 }, { "epoch": 0.17759613598365223, "grad_norm": 0.05469130724668503, "learning_rate": 4.7958090202274616e-05, "loss": 0.0081, "step": 23900 }, { "epoch": 0.17833921605052944, "grad_norm": 0.12723760306835175, "learning_rate": 4.793853542543373e-05, "loss": 0.0065, "step": 24000 }, { "epoch": 0.17908229611740664, "grad_norm": 0.18581490218639374, "learning_rate": 4.7918980648592845e-05, "loss": 0.0071, "step": 24100 }, { "epoch": 0.17982537618428385, "grad_norm": 0.054505642503499985, "learning_rate": 4.789942587175195e-05, "loss": 0.0097, "step": 24200 }, { "epoch": 0.18056845625116105, "grad_norm": 0.0994088351726532, "learning_rate": 4.787987109491107e-05, "loss": 0.0081, "step": 24300 }, { "epoch": 0.18131153631803826, "grad_norm": 0.08386176824569702, "learning_rate": 4.786031631807018e-05, "loss": 0.008, "step": 24400 }, { "epoch": 0.18205461638491546, "grad_norm": 0.0655982494354248, "learning_rate": 4.78407615412293e-05, "loss": 0.0086, "step": 24500 }, { "epoch": 0.18279769645179267, "grad_norm": 0.061167161911726, "learning_rate": 4.782120676438841e-05, "loss": 0.0077, "step": 24600 }, { "epoch": 0.18354077651866987, "grad_norm": 0.04353417456150055, "learning_rate": 4.780165198754752e-05, "loss": 0.0075, "step": 24700 }, { "epoch": 0.18428385658554708, "grad_norm": 0.04162757098674774, "learning_rate": 4.778209721070663e-05, "loss": 0.0106, "step": 24800 }, { "epoch": 0.1850269366524243, "grad_norm": 0.22756856679916382, "learning_rate": 4.776254243386575e-05, "loss": 0.0085, "step": 24900 }, { "epoch": 0.18577001671930152, "grad_norm": 0.023471467196941376, "learning_rate": 4.774298765702486e-05, "loss": 0.007, "step": 25000 }, { "epoch": 0.18651309678617872, "grad_norm": 0.08421024680137634, "learning_rate": 4.772343288018397e-05, "loss": 0.0077, "step": 25100 }, { "epoch": 0.18725617685305593, "grad_norm": 0.07162146270275116, "learning_rate": 4.7703878103343084e-05, "loss": 0.0082, "step": 25200 }, { "epoch": 0.18799925691993313, "grad_norm": 0.03949804604053497, "learning_rate": 4.76843233265022e-05, "loss": 0.01, "step": 25300 }, { "epoch": 0.18874233698681034, "grad_norm": 0.04739699512720108, "learning_rate": 4.766476854966131e-05, "loss": 0.0075, "step": 25400 }, { "epoch": 0.18948541705368754, "grad_norm": 0.09161044657230377, "learning_rate": 4.764521377282043e-05, "loss": 0.0075, "step": 25500 }, { "epoch": 0.19022849712056475, "grad_norm": 0.05208253115415573, "learning_rate": 4.7625658995979536e-05, "loss": 0.0087, "step": 25600 }, { "epoch": 0.19097157718744195, "grad_norm": 0.12654227018356323, "learning_rate": 4.7606104219138654e-05, "loss": 0.0076, "step": 25700 }, { "epoch": 0.19171465725431916, "grad_norm": 0.06890890747308731, "learning_rate": 4.7586549442297766e-05, "loss": 0.0081, "step": 25800 }, { "epoch": 0.19245773732119636, "grad_norm": 0.03244900330901146, "learning_rate": 4.7566994665456884e-05, "loss": 0.0065, "step": 25900 }, { "epoch": 0.19320081738807357, "grad_norm": 0.17691504955291748, "learning_rate": 4.754743988861599e-05, "loss": 0.0079, "step": 26000 }, { "epoch": 0.19394389745495078, "grad_norm": 0.09120947867631912, "learning_rate": 4.7527885111775106e-05, "loss": 0.0068, "step": 26100 }, { "epoch": 0.19468697752182798, "grad_norm": 0.049786727875471115, "learning_rate": 4.750833033493422e-05, "loss": 0.0088, "step": 26200 }, { "epoch": 0.19543005758870519, "grad_norm": 0.03292001411318779, "learning_rate": 4.7488775558093336e-05, "loss": 0.0108, "step": 26300 }, { "epoch": 0.1961731376555824, "grad_norm": 0.036403972655534744, "learning_rate": 4.746922078125245e-05, "loss": 0.0073, "step": 26400 }, { "epoch": 0.1969162177224596, "grad_norm": 0.06817327439785004, "learning_rate": 4.744966600441156e-05, "loss": 0.0087, "step": 26500 }, { "epoch": 0.1976592977893368, "grad_norm": 0.08967740833759308, "learning_rate": 4.743011122757067e-05, "loss": 0.0087, "step": 26600 }, { "epoch": 0.198402377856214, "grad_norm": 0.02972276508808136, "learning_rate": 4.741055645072979e-05, "loss": 0.0076, "step": 26700 }, { "epoch": 0.1991454579230912, "grad_norm": 0.06339988112449646, "learning_rate": 4.73910016738889e-05, "loss": 0.0066, "step": 26800 }, { "epoch": 0.19988853798996842, "grad_norm": 0.0548376701772213, "learning_rate": 4.737144689704801e-05, "loss": 0.0084, "step": 26900 }, { "epoch": 0.20063161805684562, "grad_norm": 0.03224008530378342, "learning_rate": 4.735189212020712e-05, "loss": 0.0075, "step": 27000 }, { "epoch": 0.20137469812372283, "grad_norm": 0.21506637334823608, "learning_rate": 4.733233734336624e-05, "loss": 0.009, "step": 27100 }, { "epoch": 0.20211777819060003, "grad_norm": 0.053990256041288376, "learning_rate": 4.731278256652535e-05, "loss": 0.0077, "step": 27200 }, { "epoch": 0.20286085825747724, "grad_norm": 0.09623909741640091, "learning_rate": 4.729322778968447e-05, "loss": 0.0077, "step": 27300 }, { "epoch": 0.20360393832435444, "grad_norm": 0.032970551401376724, "learning_rate": 4.7273673012843575e-05, "loss": 0.0072, "step": 27400 }, { "epoch": 0.20434701839123165, "grad_norm": 0.18009595572948456, "learning_rate": 4.725411823600269e-05, "loss": 0.007, "step": 27500 }, { "epoch": 0.20509009845810885, "grad_norm": 0.05002117156982422, "learning_rate": 4.7234563459161804e-05, "loss": 0.0081, "step": 27600 }, { "epoch": 0.20583317852498606, "grad_norm": 0.0266043022274971, "learning_rate": 4.721500868232092e-05, "loss": 0.0067, "step": 27700 }, { "epoch": 0.20657625859186327, "grad_norm": 0.0331169068813324, "learning_rate": 4.7195453905480034e-05, "loss": 0.0079, "step": 27800 }, { "epoch": 0.20731933865874047, "grad_norm": 0.02557242661714554, "learning_rate": 4.7175899128639145e-05, "loss": 0.0076, "step": 27900 }, { "epoch": 0.20806241872561768, "grad_norm": 0.08954159915447235, "learning_rate": 4.7156344351798256e-05, "loss": 0.0079, "step": 28000 }, { "epoch": 0.20880549879249488, "grad_norm": 0.06708449125289917, "learning_rate": 4.7136789574957375e-05, "loss": 0.0079, "step": 28100 }, { "epoch": 0.2095485788593721, "grad_norm": 0.13038304448127747, "learning_rate": 4.7117234798116486e-05, "loss": 0.0069, "step": 28200 }, { "epoch": 0.2102916589262493, "grad_norm": 0.03928164765238762, "learning_rate": 4.70976800212756e-05, "loss": 0.0083, "step": 28300 }, { "epoch": 0.2110347389931265, "grad_norm": 0.16576159000396729, "learning_rate": 4.707812524443471e-05, "loss": 0.0075, "step": 28400 }, { "epoch": 0.2117778190600037, "grad_norm": 0.10476215928792953, "learning_rate": 4.705857046759383e-05, "loss": 0.0082, "step": 28500 }, { "epoch": 0.2125208991268809, "grad_norm": 0.06988445669412613, "learning_rate": 4.703901569075294e-05, "loss": 0.0076, "step": 28600 }, { "epoch": 0.21326397919375814, "grad_norm": 0.04484279081225395, "learning_rate": 4.701946091391205e-05, "loss": 0.0077, "step": 28700 }, { "epoch": 0.21400705926063535, "grad_norm": 0.13506504893302917, "learning_rate": 4.699990613707117e-05, "loss": 0.0083, "step": 28800 }, { "epoch": 0.21475013932751255, "grad_norm": 0.08723632991313934, "learning_rate": 4.698035136023028e-05, "loss": 0.0085, "step": 28900 }, { "epoch": 0.21549321939438976, "grad_norm": 0.13729853928089142, "learning_rate": 4.696079658338939e-05, "loss": 0.0097, "step": 29000 }, { "epoch": 0.21623629946126696, "grad_norm": 0.04882131516933441, "learning_rate": 4.694124180654851e-05, "loss": 0.0073, "step": 29100 }, { "epoch": 0.21697937952814417, "grad_norm": 0.0922296866774559, "learning_rate": 4.692168702970762e-05, "loss": 0.0071, "step": 29200 }, { "epoch": 0.21772245959502137, "grad_norm": 0.12605391442775726, "learning_rate": 4.690213225286673e-05, "loss": 0.0078, "step": 29300 }, { "epoch": 0.21846553966189858, "grad_norm": 0.03871968761086464, "learning_rate": 4.688257747602585e-05, "loss": 0.0074, "step": 29400 }, { "epoch": 0.21920861972877578, "grad_norm": 0.03335995972156525, "learning_rate": 4.686302269918496e-05, "loss": 0.0078, "step": 29500 }, { "epoch": 0.219951699795653, "grad_norm": 0.08447706699371338, "learning_rate": 4.684346792234407e-05, "loss": 0.0067, "step": 29600 }, { "epoch": 0.2206947798625302, "grad_norm": 0.06754235178232193, "learning_rate": 4.6823913145503184e-05, "loss": 0.0074, "step": 29700 }, { "epoch": 0.2214378599294074, "grad_norm": 0.04381962865591049, "learning_rate": 4.68043583686623e-05, "loss": 0.0071, "step": 29800 }, { "epoch": 0.2221809399962846, "grad_norm": 0.05081590637564659, "learning_rate": 4.678480359182141e-05, "loss": 0.0084, "step": 29900 }, { "epoch": 0.2229240200631618, "grad_norm": 0.06901688873767853, "learning_rate": 4.6765248814980525e-05, "loss": 0.0091, "step": 30000 }, { "epoch": 0.22366710013003901, "grad_norm": 0.11142611503601074, "learning_rate": 4.6745694038139636e-05, "loss": 0.0072, "step": 30100 }, { "epoch": 0.22441018019691622, "grad_norm": 0.03770258277654648, "learning_rate": 4.6726139261298754e-05, "loss": 0.0072, "step": 30200 }, { "epoch": 0.22515326026379343, "grad_norm": 0.032420217990875244, "learning_rate": 4.6706584484457866e-05, "loss": 0.007, "step": 30300 }, { "epoch": 0.22589634033067063, "grad_norm": 0.04543491080403328, "learning_rate": 4.6687029707616984e-05, "loss": 0.0078, "step": 30400 }, { "epoch": 0.22663942039754784, "grad_norm": 0.06744536757469177, "learning_rate": 4.666747493077609e-05, "loss": 0.0091, "step": 30500 }, { "epoch": 0.22738250046442504, "grad_norm": 0.13421909511089325, "learning_rate": 4.6647920153935206e-05, "loss": 0.008, "step": 30600 }, { "epoch": 0.22812558053130225, "grad_norm": 0.22119101881980896, "learning_rate": 4.662836537709432e-05, "loss": 0.0099, "step": 30700 }, { "epoch": 0.22886866059817945, "grad_norm": 0.15917912125587463, "learning_rate": 4.6608810600253436e-05, "loss": 0.0079, "step": 30800 }, { "epoch": 0.22961174066505666, "grad_norm": 0.0467543788254261, "learning_rate": 4.658925582341255e-05, "loss": 0.0077, "step": 30900 }, { "epoch": 0.23035482073193386, "grad_norm": 0.046549465507268906, "learning_rate": 4.656970104657166e-05, "loss": 0.0071, "step": 31000 }, { "epoch": 0.23109790079881107, "grad_norm": 0.10883781313896179, "learning_rate": 4.655014626973077e-05, "loss": 0.0086, "step": 31100 }, { "epoch": 0.23184098086568827, "grad_norm": 0.04284190014004707, "learning_rate": 4.653059149288989e-05, "loss": 0.0096, "step": 31200 }, { "epoch": 0.23258406093256548, "grad_norm": 0.02957095392048359, "learning_rate": 4.6511036716049e-05, "loss": 0.0069, "step": 31300 }, { "epoch": 0.23332714099944268, "grad_norm": 0.08585850149393082, "learning_rate": 4.649148193920812e-05, "loss": 0.0074, "step": 31400 }, { "epoch": 0.2340702210663199, "grad_norm": 0.12542138993740082, "learning_rate": 4.647192716236722e-05, "loss": 0.0074, "step": 31500 }, { "epoch": 0.2348133011331971, "grad_norm": 0.09031056612730026, "learning_rate": 4.645237238552634e-05, "loss": 0.0072, "step": 31600 }, { "epoch": 0.2355563812000743, "grad_norm": 0.04341573268175125, "learning_rate": 4.643281760868545e-05, "loss": 0.0072, "step": 31700 }, { "epoch": 0.2362994612669515, "grad_norm": 0.25583985447883606, "learning_rate": 4.641326283184457e-05, "loss": 0.0076, "step": 31800 }, { "epoch": 0.2370425413338287, "grad_norm": 0.043617378920316696, "learning_rate": 4.6393708055003675e-05, "loss": 0.0072, "step": 31900 }, { "epoch": 0.23778562140070592, "grad_norm": 0.08679509907960892, "learning_rate": 4.637415327816279e-05, "loss": 0.0077, "step": 32000 }, { "epoch": 0.23852870146758312, "grad_norm": 0.04775851219892502, "learning_rate": 4.6354598501321904e-05, "loss": 0.0094, "step": 32100 }, { "epoch": 0.23927178153446033, "grad_norm": 0.051511697471141815, "learning_rate": 4.633504372448102e-05, "loss": 0.007, "step": 32200 }, { "epoch": 0.24001486160133753, "grad_norm": 0.03577471151947975, "learning_rate": 4.6315488947640134e-05, "loss": 0.0081, "step": 32300 }, { "epoch": 0.24075794166821474, "grad_norm": 0.10956818610429764, "learning_rate": 4.6295934170799245e-05, "loss": 0.0066, "step": 32400 }, { "epoch": 0.24150102173509197, "grad_norm": 0.04652298986911774, "learning_rate": 4.6276379393958357e-05, "loss": 0.0092, "step": 32500 }, { "epoch": 0.24224410180196917, "grad_norm": 0.19181059300899506, "learning_rate": 4.6256824617117475e-05, "loss": 0.0081, "step": 32600 }, { "epoch": 0.24298718186884638, "grad_norm": 0.07437797635793686, "learning_rate": 4.6237269840276586e-05, "loss": 0.0075, "step": 32700 }, { "epoch": 0.24373026193572359, "grad_norm": 0.07017706334590912, "learning_rate": 4.62177150634357e-05, "loss": 0.0086, "step": 32800 }, { "epoch": 0.2444733420026008, "grad_norm": 0.037252262234687805, "learning_rate": 4.619816028659481e-05, "loss": 0.007, "step": 32900 }, { "epoch": 0.245216422069478, "grad_norm": 0.05362368002533913, "learning_rate": 4.617860550975393e-05, "loss": 0.0075, "step": 33000 }, { "epoch": 0.2459595021363552, "grad_norm": 0.03547364100813866, "learning_rate": 4.615905073291304e-05, "loss": 0.0065, "step": 33100 }, { "epoch": 0.2467025822032324, "grad_norm": 0.09201129525899887, "learning_rate": 4.6139495956072156e-05, "loss": 0.007, "step": 33200 }, { "epoch": 0.2474456622701096, "grad_norm": 0.17725998163223267, "learning_rate": 4.611994117923126e-05, "loss": 0.0066, "step": 33300 }, { "epoch": 0.24818874233698682, "grad_norm": 0.05252450332045555, "learning_rate": 4.610038640239038e-05, "loss": 0.0082, "step": 33400 }, { "epoch": 0.24893182240386402, "grad_norm": 0.06371159106492996, "learning_rate": 4.608083162554949e-05, "loss": 0.0073, "step": 33500 }, { "epoch": 0.24967490247074123, "grad_norm": 0.07518325746059418, "learning_rate": 4.606127684870861e-05, "loss": 0.0077, "step": 33600 }, { "epoch": 0.25041798253761843, "grad_norm": 0.056805819272994995, "learning_rate": 4.604172207186771e-05, "loss": 0.0068, "step": 33700 }, { "epoch": 0.2511610626044956, "grad_norm": 0.11422194540500641, "learning_rate": 4.602216729502683e-05, "loss": 0.0071, "step": 33800 }, { "epoch": 0.25190414267137284, "grad_norm": 0.04415952041745186, "learning_rate": 4.600261251818594e-05, "loss": 0.0077, "step": 33900 }, { "epoch": 0.25264722273825, "grad_norm": 0.45062729716300964, "learning_rate": 4.598305774134506e-05, "loss": 0.0074, "step": 34000 }, { "epoch": 0.25339030280512725, "grad_norm": 0.15658558905124664, "learning_rate": 4.596350296450417e-05, "loss": 0.0084, "step": 34100 }, { "epoch": 0.25413338287200443, "grad_norm": 0.07771284133195877, "learning_rate": 4.5943948187663284e-05, "loss": 0.0066, "step": 34200 }, { "epoch": 0.25487646293888166, "grad_norm": 0.10516870766878128, "learning_rate": 4.5924393410822395e-05, "loss": 0.0079, "step": 34300 }, { "epoch": 0.2556195430057589, "grad_norm": 0.17560672760009766, "learning_rate": 4.590483863398151e-05, "loss": 0.0066, "step": 34400 }, { "epoch": 0.2563626230726361, "grad_norm": 0.07207468897104263, "learning_rate": 4.5885283857140625e-05, "loss": 0.0068, "step": 34500 }, { "epoch": 0.2571057031395133, "grad_norm": 0.10144203901290894, "learning_rate": 4.5865729080299736e-05, "loss": 0.0079, "step": 34600 }, { "epoch": 0.2578487832063905, "grad_norm": 0.254544734954834, "learning_rate": 4.584617430345885e-05, "loss": 0.0067, "step": 34700 }, { "epoch": 0.2585918632732677, "grad_norm": 0.09579799324274063, "learning_rate": 4.5826619526617966e-05, "loss": 0.0081, "step": 34800 }, { "epoch": 0.2593349433401449, "grad_norm": 0.04340220242738724, "learning_rate": 4.580706474977708e-05, "loss": 0.008, "step": 34900 }, { "epoch": 0.26007802340702213, "grad_norm": 0.06964358687400818, "learning_rate": 4.5787509972936195e-05, "loss": 0.0077, "step": 35000 }, { "epoch": 0.2608211034738993, "grad_norm": 0.03875363618135452, "learning_rate": 4.57679551960953e-05, "loss": 0.0065, "step": 35100 }, { "epoch": 0.26156418354077654, "grad_norm": 0.09474475681781769, "learning_rate": 4.574840041925442e-05, "loss": 0.0085, "step": 35200 }, { "epoch": 0.2623072636076537, "grad_norm": 0.1127619668841362, "learning_rate": 4.572884564241353e-05, "loss": 0.0065, "step": 35300 }, { "epoch": 0.26305034367453095, "grad_norm": 0.08995424211025238, "learning_rate": 4.570929086557265e-05, "loss": 0.0076, "step": 35400 }, { "epoch": 0.26379342374140813, "grad_norm": 0.07925679534673691, "learning_rate": 4.568973608873175e-05, "loss": 0.0068, "step": 35500 }, { "epoch": 0.26453650380828536, "grad_norm": 0.041282784193754196, "learning_rate": 4.567018131189087e-05, "loss": 0.0072, "step": 35600 }, { "epoch": 0.26527958387516254, "grad_norm": 0.038397178053855896, "learning_rate": 4.565062653504998e-05, "loss": 0.0076, "step": 35700 }, { "epoch": 0.26602266394203977, "grad_norm": 0.07533842325210571, "learning_rate": 4.56310717582091e-05, "loss": 0.0076, "step": 35800 }, { "epoch": 0.26676574400891695, "grad_norm": 0.24094197154045105, "learning_rate": 4.561151698136821e-05, "loss": 0.0072, "step": 35900 }, { "epoch": 0.2675088240757942, "grad_norm": 0.05044126510620117, "learning_rate": 4.559196220452732e-05, "loss": 0.007, "step": 36000 }, { "epoch": 0.26825190414267136, "grad_norm": 0.08822425454854965, "learning_rate": 4.5572407427686434e-05, "loss": 0.0074, "step": 36100 }, { "epoch": 0.2689949842095486, "grad_norm": 0.04467346519231796, "learning_rate": 4.555285265084555e-05, "loss": 0.0085, "step": 36200 }, { "epoch": 0.26973806427642577, "grad_norm": 0.0346435122191906, "learning_rate": 4.553329787400466e-05, "loss": 0.0097, "step": 36300 }, { "epoch": 0.270481144343303, "grad_norm": 0.22399814426898956, "learning_rate": 4.5513743097163775e-05, "loss": 0.0079, "step": 36400 }, { "epoch": 0.2712242244101802, "grad_norm": 0.06078993156552315, "learning_rate": 4.5494188320322886e-05, "loss": 0.0064, "step": 36500 }, { "epoch": 0.2719673044770574, "grad_norm": 0.06910208612680435, "learning_rate": 4.5474633543482004e-05, "loss": 0.0073, "step": 36600 }, { "epoch": 0.2727103845439346, "grad_norm": 0.04372115060687065, "learning_rate": 4.5455078766641116e-05, "loss": 0.0075, "step": 36700 }, { "epoch": 0.2734534646108118, "grad_norm": 0.08668945729732513, "learning_rate": 4.5435523989800234e-05, "loss": 0.0077, "step": 36800 }, { "epoch": 0.274196544677689, "grad_norm": 0.10852037370204926, "learning_rate": 4.541596921295934e-05, "loss": 0.0081, "step": 36900 }, { "epoch": 0.27493962474456624, "grad_norm": 0.16517165303230286, "learning_rate": 4.5396414436118457e-05, "loss": 0.007, "step": 37000 }, { "epoch": 0.2756827048114434, "grad_norm": 0.03173983469605446, "learning_rate": 4.537685965927757e-05, "loss": 0.0069, "step": 37100 }, { "epoch": 0.27642578487832065, "grad_norm": 0.1380838006734848, "learning_rate": 4.5357304882436686e-05, "loss": 0.0073, "step": 37200 }, { "epoch": 0.2771688649451978, "grad_norm": 0.03931738808751106, "learning_rate": 4.53377501055958e-05, "loss": 0.008, "step": 37300 }, { "epoch": 0.27791194501207506, "grad_norm": 0.03865446895360947, "learning_rate": 4.531819532875491e-05, "loss": 0.0079, "step": 37400 }, { "epoch": 0.27865502507895223, "grad_norm": 0.054791491478681564, "learning_rate": 4.529864055191402e-05, "loss": 0.0078, "step": 37500 }, { "epoch": 0.27939810514582947, "grad_norm": 0.09200329333543777, "learning_rate": 4.527908577507314e-05, "loss": 0.0084, "step": 37600 }, { "epoch": 0.28014118521270664, "grad_norm": 0.04821917414665222, "learning_rate": 4.525953099823225e-05, "loss": 0.0072, "step": 37700 }, { "epoch": 0.2808842652795839, "grad_norm": 0.05916968360543251, "learning_rate": 4.523997622139136e-05, "loss": 0.0055, "step": 37800 }, { "epoch": 0.28162734534646106, "grad_norm": 0.06297438591718674, "learning_rate": 4.522042144455048e-05, "loss": 0.0083, "step": 37900 }, { "epoch": 0.2823704254133383, "grad_norm": 0.0889165997505188, "learning_rate": 4.520086666770959e-05, "loss": 0.0063, "step": 38000 }, { "epoch": 0.28311350548021547, "grad_norm": 0.059232063591480255, "learning_rate": 4.51813118908687e-05, "loss": 0.0069, "step": 38100 }, { "epoch": 0.2838565855470927, "grad_norm": 0.052629828453063965, "learning_rate": 4.516175711402782e-05, "loss": 0.0084, "step": 38200 }, { "epoch": 0.28459966561396993, "grad_norm": 0.058771658688783646, "learning_rate": 4.514220233718693e-05, "loss": 0.0077, "step": 38300 }, { "epoch": 0.2853427456808471, "grad_norm": 0.21613828837871552, "learning_rate": 4.512264756034604e-05, "loss": 0.0078, "step": 38400 }, { "epoch": 0.28608582574772434, "grad_norm": 0.10147778689861298, "learning_rate": 4.5103092783505154e-05, "loss": 0.0073, "step": 38500 }, { "epoch": 0.2868289058146015, "grad_norm": 0.057812999933958054, "learning_rate": 4.508353800666427e-05, "loss": 0.0075, "step": 38600 }, { "epoch": 0.28757198588147875, "grad_norm": 0.034779440611600876, "learning_rate": 4.5063983229823384e-05, "loss": 0.0075, "step": 38700 }, { "epoch": 0.28831506594835593, "grad_norm": 0.04720171540975571, "learning_rate": 4.5044428452982495e-05, "loss": 0.0082, "step": 38800 }, { "epoch": 0.28905814601523316, "grad_norm": 0.05001204460859299, "learning_rate": 4.502487367614161e-05, "loss": 0.0072, "step": 38900 }, { "epoch": 0.28980122608211034, "grad_norm": 0.04086703434586525, "learning_rate": 4.5005318899300725e-05, "loss": 0.0069, "step": 39000 }, { "epoch": 0.2905443061489876, "grad_norm": 0.05680302157998085, "learning_rate": 4.4985764122459836e-05, "loss": 0.0061, "step": 39100 }, { "epoch": 0.29128738621586475, "grad_norm": 0.09445053339004517, "learning_rate": 4.496620934561895e-05, "loss": 0.007, "step": 39200 }, { "epoch": 0.292030466282742, "grad_norm": 0.050108518451452255, "learning_rate": 4.4946654568778066e-05, "loss": 0.0067, "step": 39300 }, { "epoch": 0.29277354634961916, "grad_norm": 0.13177146017551422, "learning_rate": 4.492709979193718e-05, "loss": 0.0074, "step": 39400 }, { "epoch": 0.2935166264164964, "grad_norm": 0.03201936185359955, "learning_rate": 4.490754501509629e-05, "loss": 0.0068, "step": 39500 }, { "epoch": 0.2942597064833736, "grad_norm": 0.08135665953159332, "learning_rate": 4.48879902382554e-05, "loss": 0.0073, "step": 39600 }, { "epoch": 0.2950027865502508, "grad_norm": 0.16888895630836487, "learning_rate": 4.486843546141452e-05, "loss": 0.0074, "step": 39700 }, { "epoch": 0.295745866617128, "grad_norm": 0.14506329596042633, "learning_rate": 4.484888068457363e-05, "loss": 0.0062, "step": 39800 }, { "epoch": 0.2964889466840052, "grad_norm": 0.09832495450973511, "learning_rate": 4.482932590773275e-05, "loss": 0.0077, "step": 39900 }, { "epoch": 0.2972320267508824, "grad_norm": 0.058222539722919464, "learning_rate": 4.480977113089186e-05, "loss": 0.0071, "step": 40000 }, { "epoch": 0.2979751068177596, "grad_norm": 0.07162114977836609, "learning_rate": 4.479021635405097e-05, "loss": 0.0074, "step": 40100 }, { "epoch": 0.2987181868846368, "grad_norm": 0.047953493893146515, "learning_rate": 4.477066157721008e-05, "loss": 0.0081, "step": 40200 }, { "epoch": 0.29946126695151404, "grad_norm": 0.07063088566064835, "learning_rate": 4.47511068003692e-05, "loss": 0.0068, "step": 40300 }, { "epoch": 0.3002043470183912, "grad_norm": 0.13576781749725342, "learning_rate": 4.473155202352831e-05, "loss": 0.0074, "step": 40400 }, { "epoch": 0.30094742708526845, "grad_norm": 0.06388365477323532, "learning_rate": 4.471199724668742e-05, "loss": 0.0069, "step": 40500 }, { "epoch": 0.3016905071521456, "grad_norm": 0.0779954120516777, "learning_rate": 4.4692442469846534e-05, "loss": 0.0071, "step": 40600 }, { "epoch": 0.30243358721902286, "grad_norm": 0.02623029798269272, "learning_rate": 4.467288769300565e-05, "loss": 0.0092, "step": 40700 }, { "epoch": 0.30317666728590004, "grad_norm": 0.1712590456008911, "learning_rate": 4.4653332916164763e-05, "loss": 0.0072, "step": 40800 }, { "epoch": 0.30391974735277727, "grad_norm": 0.08809200674295425, "learning_rate": 4.463377813932388e-05, "loss": 0.0074, "step": 40900 }, { "epoch": 0.30466282741965445, "grad_norm": 0.11307816207408905, "learning_rate": 4.4614223362482986e-05, "loss": 0.0061, "step": 41000 }, { "epoch": 0.3054059074865317, "grad_norm": 0.039982136338949203, "learning_rate": 4.4594668585642104e-05, "loss": 0.0069, "step": 41100 }, { "epoch": 0.30614898755340886, "grad_norm": 0.2672407627105713, "learning_rate": 4.4575113808801216e-05, "loss": 0.0083, "step": 41200 }, { "epoch": 0.3068920676202861, "grad_norm": 0.04570712894201279, "learning_rate": 4.4555559031960334e-05, "loss": 0.0077, "step": 41300 }, { "epoch": 0.30763514768716327, "grad_norm": 0.0578707754611969, "learning_rate": 4.453600425511944e-05, "loss": 0.0074, "step": 41400 }, { "epoch": 0.3083782277540405, "grad_norm": 0.11470286548137665, "learning_rate": 4.4516449478278557e-05, "loss": 0.0076, "step": 41500 }, { "epoch": 0.3091213078209177, "grad_norm": 0.07640943676233292, "learning_rate": 4.449689470143767e-05, "loss": 0.0089, "step": 41600 }, { "epoch": 0.3098643878877949, "grad_norm": 0.27790024876594543, "learning_rate": 4.4477339924596786e-05, "loss": 0.0072, "step": 41700 }, { "epoch": 0.3106074679546721, "grad_norm": 0.07412257045507431, "learning_rate": 4.44577851477559e-05, "loss": 0.0076, "step": 41800 }, { "epoch": 0.3113505480215493, "grad_norm": 0.06599798053503036, "learning_rate": 4.443823037091501e-05, "loss": 0.0074, "step": 41900 }, { "epoch": 0.31209362808842656, "grad_norm": 0.03567137196660042, "learning_rate": 4.441867559407412e-05, "loss": 0.0077, "step": 42000 }, { "epoch": 0.31283670815530373, "grad_norm": 0.0697321817278862, "learning_rate": 4.439912081723324e-05, "loss": 0.007, "step": 42100 }, { "epoch": 0.31357978822218097, "grad_norm": 0.031224340200424194, "learning_rate": 4.437956604039235e-05, "loss": 0.0064, "step": 42200 }, { "epoch": 0.31432286828905814, "grad_norm": 0.057371530681848526, "learning_rate": 4.436001126355146e-05, "loss": 0.0074, "step": 42300 }, { "epoch": 0.3150659483559354, "grad_norm": 0.044043853878974915, "learning_rate": 4.434045648671057e-05, "loss": 0.007, "step": 42400 }, { "epoch": 0.31580902842281255, "grad_norm": 0.04665425047278404, "learning_rate": 4.432090170986969e-05, "loss": 0.0077, "step": 42500 }, { "epoch": 0.3165521084896898, "grad_norm": 0.11235306411981583, "learning_rate": 4.43013469330288e-05, "loss": 0.0073, "step": 42600 }, { "epoch": 0.31729518855656696, "grad_norm": 0.02958083525300026, "learning_rate": 4.428179215618792e-05, "loss": 0.006, "step": 42700 }, { "epoch": 0.3180382686234442, "grad_norm": 0.01547887921333313, "learning_rate": 4.4262237379347025e-05, "loss": 0.0062, "step": 42800 }, { "epoch": 0.3187813486903214, "grad_norm": 0.05708913877606392, "learning_rate": 4.424268260250614e-05, "loss": 0.007, "step": 42900 }, { "epoch": 0.3195244287571986, "grad_norm": 0.16802677512168884, "learning_rate": 4.4223127825665254e-05, "loss": 0.0074, "step": 43000 }, { "epoch": 0.3202675088240758, "grad_norm": 0.05716625601053238, "learning_rate": 4.420357304882437e-05, "loss": 0.0071, "step": 43100 }, { "epoch": 0.321010588890953, "grad_norm": 0.09568345546722412, "learning_rate": 4.4184018271983484e-05, "loss": 0.0073, "step": 43200 }, { "epoch": 0.3217536689578302, "grad_norm": 0.10568996518850327, "learning_rate": 4.4164463495142595e-05, "loss": 0.0065, "step": 43300 }, { "epoch": 0.32249674902470743, "grad_norm": 0.1316547840833664, "learning_rate": 4.414490871830171e-05, "loss": 0.0068, "step": 43400 }, { "epoch": 0.3232398290915846, "grad_norm": 0.07729732245206833, "learning_rate": 4.4125353941460825e-05, "loss": 0.0072, "step": 43500 }, { "epoch": 0.32398290915846184, "grad_norm": 0.10743196308612823, "learning_rate": 4.4105799164619936e-05, "loss": 0.0075, "step": 43600 }, { "epoch": 0.324725989225339, "grad_norm": 0.025578688830137253, "learning_rate": 4.408624438777905e-05, "loss": 0.0072, "step": 43700 }, { "epoch": 0.32546906929221625, "grad_norm": 0.046454060822725296, "learning_rate": 4.406668961093816e-05, "loss": 0.0062, "step": 43800 }, { "epoch": 0.32621214935909343, "grad_norm": 0.039890214800834656, "learning_rate": 4.404713483409728e-05, "loss": 0.0071, "step": 43900 }, { "epoch": 0.32695522942597066, "grad_norm": 0.04856781288981438, "learning_rate": 4.402758005725639e-05, "loss": 0.0069, "step": 44000 }, { "epoch": 0.32769830949284784, "grad_norm": 0.08272068947553635, "learning_rate": 4.4008025280415507e-05, "loss": 0.0074, "step": 44100 }, { "epoch": 0.32844138955972507, "grad_norm": 0.07336621731519699, "learning_rate": 4.398847050357461e-05, "loss": 0.0073, "step": 44200 }, { "epoch": 0.32918446962660225, "grad_norm": 0.04202836751937866, "learning_rate": 4.396891572673373e-05, "loss": 0.0086, "step": 44300 }, { "epoch": 0.3299275496934795, "grad_norm": 0.04038666561245918, "learning_rate": 4.394936094989284e-05, "loss": 0.0074, "step": 44400 }, { "epoch": 0.33067062976035666, "grad_norm": 0.05853787809610367, "learning_rate": 4.392980617305196e-05, "loss": 0.0072, "step": 44500 }, { "epoch": 0.3314137098272339, "grad_norm": 0.020443668588995934, "learning_rate": 4.3910251396211063e-05, "loss": 0.0071, "step": 44600 }, { "epoch": 0.33215678989411107, "grad_norm": 0.0470191091299057, "learning_rate": 4.389069661937018e-05, "loss": 0.0079, "step": 44700 }, { "epoch": 0.3328998699609883, "grad_norm": 0.02600996196269989, "learning_rate": 4.387114184252929e-05, "loss": 0.0079, "step": 44800 }, { "epoch": 0.3336429500278655, "grad_norm": 0.04765189066529274, "learning_rate": 4.385158706568841e-05, "loss": 0.0067, "step": 44900 }, { "epoch": 0.3343860300947427, "grad_norm": 0.0503423735499382, "learning_rate": 4.383203228884752e-05, "loss": 0.0075, "step": 45000 }, { "epoch": 0.3351291101616199, "grad_norm": 0.07852368801832199, "learning_rate": 4.3812477512006634e-05, "loss": 0.0076, "step": 45100 }, { "epoch": 0.3358721902284971, "grad_norm": 0.05286919325590134, "learning_rate": 4.3792922735165745e-05, "loss": 0.0071, "step": 45200 }, { "epoch": 0.3366152702953743, "grad_norm": 0.24307796359062195, "learning_rate": 4.3773367958324863e-05, "loss": 0.0072, "step": 45300 }, { "epoch": 0.33735835036225154, "grad_norm": 0.11231080442667007, "learning_rate": 4.3753813181483975e-05, "loss": 0.0076, "step": 45400 }, { "epoch": 0.3381014304291287, "grad_norm": 0.10302214324474335, "learning_rate": 4.3734258404643086e-05, "loss": 0.0066, "step": 45500 }, { "epoch": 0.33884451049600595, "grad_norm": 0.045526422560214996, "learning_rate": 4.37147036278022e-05, "loss": 0.008, "step": 45600 }, { "epoch": 0.3395875905628831, "grad_norm": 0.06366898119449615, "learning_rate": 4.3695148850961316e-05, "loss": 0.0084, "step": 45700 }, { "epoch": 0.34033067062976036, "grad_norm": 0.09426689893007278, "learning_rate": 4.367559407412043e-05, "loss": 0.0065, "step": 45800 }, { "epoch": 0.3410737506966376, "grad_norm": 0.04165596142411232, "learning_rate": 4.3656039297279545e-05, "loss": 0.0069, "step": 45900 }, { "epoch": 0.34181683076351477, "grad_norm": 0.05287677049636841, "learning_rate": 4.363648452043865e-05, "loss": 0.0067, "step": 46000 }, { "epoch": 0.342559910830392, "grad_norm": 0.03504342958331108, "learning_rate": 4.361692974359777e-05, "loss": 0.0074, "step": 46100 }, { "epoch": 0.3433029908972692, "grad_norm": 0.22839584946632385, "learning_rate": 4.359737496675688e-05, "loss": 0.0074, "step": 46200 }, { "epoch": 0.3440460709641464, "grad_norm": 0.15492674708366394, "learning_rate": 4.3577820189916e-05, "loss": 0.0066, "step": 46300 }, { "epoch": 0.3447891510310236, "grad_norm": 0.07831539213657379, "learning_rate": 4.355826541307511e-05, "loss": 0.007, "step": 46400 }, { "epoch": 0.3455322310979008, "grad_norm": 0.045418355613946915, "learning_rate": 4.353871063623422e-05, "loss": 0.009, "step": 46500 }, { "epoch": 0.346275311164778, "grad_norm": 0.04851536452770233, "learning_rate": 4.351915585939333e-05, "loss": 0.0072, "step": 46600 }, { "epoch": 0.34701839123165523, "grad_norm": 0.025820812210440636, "learning_rate": 4.349960108255245e-05, "loss": 0.0068, "step": 46700 }, { "epoch": 0.3477614712985324, "grad_norm": 0.3972889482975006, "learning_rate": 4.348004630571156e-05, "loss": 0.0068, "step": 46800 }, { "epoch": 0.34850455136540964, "grad_norm": 0.3437056839466095, "learning_rate": 4.346049152887067e-05, "loss": 0.007, "step": 46900 }, { "epoch": 0.3492476314322868, "grad_norm": 0.19884435832500458, "learning_rate": 4.3440936752029784e-05, "loss": 0.0077, "step": 47000 }, { "epoch": 0.34999071149916405, "grad_norm": 0.12222936004400253, "learning_rate": 4.34213819751889e-05, "loss": 0.0073, "step": 47100 }, { "epoch": 0.35073379156604123, "grad_norm": 0.1213407963514328, "learning_rate": 4.3401827198348013e-05, "loss": 0.0072, "step": 47200 }, { "epoch": 0.35147687163291846, "grad_norm": 0.1353810727596283, "learning_rate": 4.3382272421507125e-05, "loss": 0.0069, "step": 47300 }, { "epoch": 0.35221995169979564, "grad_norm": 0.04720344766974449, "learning_rate": 4.336271764466624e-05, "loss": 0.0065, "step": 47400 }, { "epoch": 0.3529630317666729, "grad_norm": 0.0543699786067009, "learning_rate": 4.3343162867825354e-05, "loss": 0.008, "step": 47500 }, { "epoch": 0.35370611183355005, "grad_norm": 0.03440045565366745, "learning_rate": 4.3323608090984466e-05, "loss": 0.0078, "step": 47600 }, { "epoch": 0.3544491919004273, "grad_norm": 0.06126417592167854, "learning_rate": 4.3304053314143584e-05, "loss": 0.0078, "step": 47700 }, { "epoch": 0.35519227196730446, "grad_norm": 0.03525172546505928, "learning_rate": 4.3284498537302695e-05, "loss": 0.0082, "step": 47800 }, { "epoch": 0.3559353520341817, "grad_norm": 0.04023478552699089, "learning_rate": 4.326494376046181e-05, "loss": 0.0067, "step": 47900 }, { "epoch": 0.3566784321010589, "grad_norm": 0.08829139918088913, "learning_rate": 4.324538898362092e-05, "loss": 0.007, "step": 48000 }, { "epoch": 0.3574215121679361, "grad_norm": 0.08259564638137817, "learning_rate": 4.3225834206780036e-05, "loss": 0.0091, "step": 48100 }, { "epoch": 0.3581645922348133, "grad_norm": 0.07022551447153091, "learning_rate": 4.320627942993915e-05, "loss": 0.0066, "step": 48200 }, { "epoch": 0.3589076723016905, "grad_norm": 0.03341663256287575, "learning_rate": 4.318672465309826e-05, "loss": 0.007, "step": 48300 }, { "epoch": 0.3596507523685677, "grad_norm": 0.043397437781095505, "learning_rate": 4.316716987625738e-05, "loss": 0.007, "step": 48400 }, { "epoch": 0.3603938324354449, "grad_norm": 0.08786457777023315, "learning_rate": 4.314761509941649e-05, "loss": 0.0068, "step": 48500 }, { "epoch": 0.3611369125023221, "grad_norm": 0.03787624463438988, "learning_rate": 4.31280603225756e-05, "loss": 0.0075, "step": 48600 }, { "epoch": 0.36187999256919934, "grad_norm": 0.1954130232334137, "learning_rate": 4.310850554573471e-05, "loss": 0.0074, "step": 48700 }, { "epoch": 0.3626230726360765, "grad_norm": 0.12462051957845688, "learning_rate": 4.308895076889383e-05, "loss": 0.0058, "step": 48800 }, { "epoch": 0.36336615270295375, "grad_norm": 0.018308240920305252, "learning_rate": 4.306939599205294e-05, "loss": 0.0072, "step": 48900 }, { "epoch": 0.3641092327698309, "grad_norm": 0.1488741934299469, "learning_rate": 4.304984121521206e-05, "loss": 0.0067, "step": 49000 }, { "epoch": 0.36485231283670816, "grad_norm": 0.04374194145202637, "learning_rate": 4.303028643837117e-05, "loss": 0.0063, "step": 49100 }, { "epoch": 0.36559539290358534, "grad_norm": 0.08652815222740173, "learning_rate": 4.301073166153028e-05, "loss": 0.0063, "step": 49200 }, { "epoch": 0.36633847297046257, "grad_norm": 0.4489465355873108, "learning_rate": 4.299117688468939e-05, "loss": 0.0059, "step": 49300 }, { "epoch": 0.36708155303733975, "grad_norm": 0.14833860099315643, "learning_rate": 4.297162210784851e-05, "loss": 0.0064, "step": 49400 }, { "epoch": 0.367824633104217, "grad_norm": 0.06946191191673279, "learning_rate": 4.295206733100762e-05, "loss": 0.0075, "step": 49500 }, { "epoch": 0.36856771317109416, "grad_norm": 0.08675149828195572, "learning_rate": 4.2932512554166734e-05, "loss": 0.0066, "step": 49600 }, { "epoch": 0.3693107932379714, "grad_norm": 0.2135254144668579, "learning_rate": 4.2912957777325845e-05, "loss": 0.0059, "step": 49700 }, { "epoch": 0.3700538733048486, "grad_norm": 0.16049957275390625, "learning_rate": 4.2893403000484964e-05, "loss": 0.0065, "step": 49800 }, { "epoch": 0.3707969533717258, "grad_norm": 0.10920880734920502, "learning_rate": 4.2873848223644075e-05, "loss": 0.0062, "step": 49900 }, { "epoch": 0.37154003343860303, "grad_norm": 0.025067072361707687, "learning_rate": 4.285429344680319e-05, "loss": 0.0061, "step": 50000 }, { "epoch": 0.3722831135054802, "grad_norm": 0.036388590931892395, "learning_rate": 4.28347386699623e-05, "loss": 0.0073, "step": 50100 }, { "epoch": 0.37302619357235745, "grad_norm": 0.058134887367486954, "learning_rate": 4.2815183893121416e-05, "loss": 0.0079, "step": 50200 }, { "epoch": 0.3737692736392346, "grad_norm": 0.05767570063471794, "learning_rate": 4.279562911628053e-05, "loss": 0.0068, "step": 50300 }, { "epoch": 0.37451235370611186, "grad_norm": 0.07306674122810364, "learning_rate": 4.2776074339439645e-05, "loss": 0.0065, "step": 50400 }, { "epoch": 0.37525543377298903, "grad_norm": 0.03788163512945175, "learning_rate": 4.275651956259875e-05, "loss": 0.0065, "step": 50500 }, { "epoch": 0.37599851383986627, "grad_norm": 0.07209587097167969, "learning_rate": 4.273696478575787e-05, "loss": 0.0069, "step": 50600 }, { "epoch": 0.37674159390674344, "grad_norm": 0.02345472201704979, "learning_rate": 4.271741000891698e-05, "loss": 0.0075, "step": 50700 }, { "epoch": 0.3774846739736207, "grad_norm": 0.04644273594021797, "learning_rate": 4.26978552320761e-05, "loss": 0.0072, "step": 50800 }, { "epoch": 0.37822775404049785, "grad_norm": 0.03101653791964054, "learning_rate": 4.267830045523521e-05, "loss": 0.0081, "step": 50900 }, { "epoch": 0.3789708341073751, "grad_norm": 0.04607825726270676, "learning_rate": 4.265874567839432e-05, "loss": 0.0071, "step": 51000 }, { "epoch": 0.37971391417425226, "grad_norm": 0.0712171122431755, "learning_rate": 4.263919090155343e-05, "loss": 0.008, "step": 51100 }, { "epoch": 0.3804569942411295, "grad_norm": 0.18553027510643005, "learning_rate": 4.261963612471255e-05, "loss": 0.0077, "step": 51200 }, { "epoch": 0.3812000743080067, "grad_norm": 0.06862063705921173, "learning_rate": 4.260008134787166e-05, "loss": 0.0068, "step": 51300 }, { "epoch": 0.3819431543748839, "grad_norm": 0.04700983315706253, "learning_rate": 4.258052657103077e-05, "loss": 0.0064, "step": 51400 }, { "epoch": 0.3826862344417611, "grad_norm": 0.030787477269768715, "learning_rate": 4.2560971794189884e-05, "loss": 0.0081, "step": 51500 }, { "epoch": 0.3834293145086383, "grad_norm": 0.02073969691991806, "learning_rate": 4.2541417017349e-05, "loss": 0.007, "step": 51600 }, { "epoch": 0.3841723945755155, "grad_norm": 0.16130737960338593, "learning_rate": 4.2521862240508114e-05, "loss": 0.0063, "step": 51700 }, { "epoch": 0.38491547464239273, "grad_norm": 0.11296936869621277, "learning_rate": 4.250230746366723e-05, "loss": 0.0062, "step": 51800 }, { "epoch": 0.3856585547092699, "grad_norm": 0.060501374304294586, "learning_rate": 4.2482752686826336e-05, "loss": 0.0069, "step": 51900 }, { "epoch": 0.38640163477614714, "grad_norm": 0.10431472212076187, "learning_rate": 4.2463197909985454e-05, "loss": 0.0067, "step": 52000 }, { "epoch": 0.3871447148430243, "grad_norm": 0.2196529656648636, "learning_rate": 4.2443643133144566e-05, "loss": 0.0073, "step": 52100 }, { "epoch": 0.38788779490990155, "grad_norm": 0.11308343708515167, "learning_rate": 4.2424088356303684e-05, "loss": 0.0073, "step": 52200 }, { "epoch": 0.38863087497677873, "grad_norm": 0.34784701466560364, "learning_rate": 4.240453357946279e-05, "loss": 0.0065, "step": 52300 }, { "epoch": 0.38937395504365596, "grad_norm": 0.09705357253551483, "learning_rate": 4.238497880262191e-05, "loss": 0.0074, "step": 52400 }, { "epoch": 0.39011703511053314, "grad_norm": 0.06265726685523987, "learning_rate": 4.236542402578102e-05, "loss": 0.0082, "step": 52500 }, { "epoch": 0.39086011517741037, "grad_norm": 0.05951327458024025, "learning_rate": 4.2345869248940136e-05, "loss": 0.0064, "step": 52600 }, { "epoch": 0.39160319524428755, "grad_norm": 0.022820688784122467, "learning_rate": 4.232631447209925e-05, "loss": 0.0071, "step": 52700 }, { "epoch": 0.3923462753111648, "grad_norm": 0.04885343089699745, "learning_rate": 4.230675969525836e-05, "loss": 0.0058, "step": 52800 }, { "epoch": 0.39308935537804196, "grad_norm": 0.030405908823013306, "learning_rate": 4.228720491841747e-05, "loss": 0.0071, "step": 52900 }, { "epoch": 0.3938324354449192, "grad_norm": 0.04036644846200943, "learning_rate": 4.226765014157659e-05, "loss": 0.0069, "step": 53000 }, { "epoch": 0.39457551551179637, "grad_norm": 0.06133032590150833, "learning_rate": 4.22480953647357e-05, "loss": 0.008, "step": 53100 }, { "epoch": 0.3953185955786736, "grad_norm": 0.06812535226345062, "learning_rate": 4.222854058789481e-05, "loss": 0.0077, "step": 53200 }, { "epoch": 0.3960616756455508, "grad_norm": 0.03652730956673622, "learning_rate": 4.220898581105392e-05, "loss": 0.0061, "step": 53300 }, { "epoch": 0.396804755712428, "grad_norm": 0.035384487360715866, "learning_rate": 4.218943103421304e-05, "loss": 0.0064, "step": 53400 }, { "epoch": 0.39754783577930525, "grad_norm": 0.08146490156650543, "learning_rate": 4.216987625737215e-05, "loss": 0.0068, "step": 53500 }, { "epoch": 0.3982909158461824, "grad_norm": 0.06371843814849854, "learning_rate": 4.215032148053127e-05, "loss": 0.006, "step": 53600 }, { "epoch": 0.39903399591305966, "grad_norm": 0.5855499505996704, "learning_rate": 4.2130766703690375e-05, "loss": 0.0069, "step": 53700 }, { "epoch": 0.39977707597993684, "grad_norm": 0.10433755815029144, "learning_rate": 4.211121192684949e-05, "loss": 0.0059, "step": 53800 }, { "epoch": 0.40052015604681407, "grad_norm": 0.032108113169670105, "learning_rate": 4.2091657150008604e-05, "loss": 0.0064, "step": 53900 }, { "epoch": 0.40126323611369125, "grad_norm": 0.15570470690727234, "learning_rate": 4.207210237316772e-05, "loss": 0.0054, "step": 54000 }, { "epoch": 0.4020063161805685, "grad_norm": 0.05345800146460533, "learning_rate": 4.205254759632683e-05, "loss": 0.0069, "step": 54100 }, { "epoch": 0.40274939624744566, "grad_norm": 0.08762531727552414, "learning_rate": 4.2032992819485945e-05, "loss": 0.0069, "step": 54200 }, { "epoch": 0.4034924763143229, "grad_norm": 0.03382888808846474, "learning_rate": 4.201343804264506e-05, "loss": 0.0063, "step": 54300 }, { "epoch": 0.40423555638120007, "grad_norm": 0.10578560084104538, "learning_rate": 4.1993883265804175e-05, "loss": 0.0069, "step": 54400 }, { "epoch": 0.4049786364480773, "grad_norm": 0.06943291425704956, "learning_rate": 4.1974328488963286e-05, "loss": 0.0071, "step": 54500 }, { "epoch": 0.4057217165149545, "grad_norm": 0.039991047233343124, "learning_rate": 4.19547737121224e-05, "loss": 0.007, "step": 54600 }, { "epoch": 0.4064647965818317, "grad_norm": 0.03006046451628208, "learning_rate": 4.193521893528151e-05, "loss": 0.0069, "step": 54700 }, { "epoch": 0.4072078766487089, "grad_norm": 0.0805739238858223, "learning_rate": 4.191566415844063e-05, "loss": 0.0064, "step": 54800 }, { "epoch": 0.4079509567155861, "grad_norm": 0.049078069627285004, "learning_rate": 4.189610938159974e-05, "loss": 0.0081, "step": 54900 }, { "epoch": 0.4086940367824633, "grad_norm": 0.07219412177801132, "learning_rate": 4.187655460475885e-05, "loss": 0.0067, "step": 55000 }, { "epoch": 0.40943711684934053, "grad_norm": 0.04891228675842285, "learning_rate": 4.185699982791796e-05, "loss": 0.0065, "step": 55100 }, { "epoch": 0.4101801969162177, "grad_norm": 0.025021158158779144, "learning_rate": 4.183744505107708e-05, "loss": 0.0068, "step": 55200 }, { "epoch": 0.41092327698309494, "grad_norm": 0.09310584515333176, "learning_rate": 4.181789027423619e-05, "loss": 0.0068, "step": 55300 }, { "epoch": 0.4116663570499721, "grad_norm": 0.09805582463741302, "learning_rate": 4.179833549739531e-05, "loss": 0.0073, "step": 55400 }, { "epoch": 0.41240943711684935, "grad_norm": 0.04602217674255371, "learning_rate": 4.1778780720554414e-05, "loss": 0.0065, "step": 55500 }, { "epoch": 0.41315251718372653, "grad_norm": 0.07918443530797958, "learning_rate": 4.175922594371353e-05, "loss": 0.0068, "step": 55600 }, { "epoch": 0.41389559725060376, "grad_norm": 0.032946448773145676, "learning_rate": 4.173967116687264e-05, "loss": 0.0065, "step": 55700 }, { "epoch": 0.41463867731748094, "grad_norm": 0.050301749259233475, "learning_rate": 4.172011639003176e-05, "loss": 0.0071, "step": 55800 }, { "epoch": 0.4153817573843582, "grad_norm": 0.08164072781801224, "learning_rate": 4.170056161319087e-05, "loss": 0.0067, "step": 55900 }, { "epoch": 0.41612483745123535, "grad_norm": 0.06266114115715027, "learning_rate": 4.1681006836349984e-05, "loss": 0.0078, "step": 56000 }, { "epoch": 0.4168679175181126, "grad_norm": 0.035891663283109665, "learning_rate": 4.1661452059509095e-05, "loss": 0.0062, "step": 56100 }, { "epoch": 0.41761099758498976, "grad_norm": 0.021806631237268448, "learning_rate": 4.1641897282668214e-05, "loss": 0.0068, "step": 56200 }, { "epoch": 0.418354077651867, "grad_norm": 0.1567000299692154, "learning_rate": 4.1622342505827325e-05, "loss": 0.0076, "step": 56300 }, { "epoch": 0.4190971577187442, "grad_norm": 0.04162755608558655, "learning_rate": 4.1602787728986436e-05, "loss": 0.007, "step": 56400 }, { "epoch": 0.4198402377856214, "grad_norm": 0.28211840987205505, "learning_rate": 4.158323295214555e-05, "loss": 0.0086, "step": 56500 }, { "epoch": 0.4205833178524986, "grad_norm": 0.04723413288593292, "learning_rate": 4.1563678175304666e-05, "loss": 0.0067, "step": 56600 }, { "epoch": 0.4213263979193758, "grad_norm": 0.03256762772798538, "learning_rate": 4.154412339846378e-05, "loss": 0.0069, "step": 56700 }, { "epoch": 0.422069477986253, "grad_norm": 0.060872144997119904, "learning_rate": 4.1524568621622895e-05, "loss": 0.0063, "step": 56800 }, { "epoch": 0.4228125580531302, "grad_norm": 0.062318675220012665, "learning_rate": 4.150501384478201e-05, "loss": 0.0075, "step": 56900 }, { "epoch": 0.4235556381200074, "grad_norm": 0.13166898488998413, "learning_rate": 4.148545906794112e-05, "loss": 0.0064, "step": 57000 }, { "epoch": 0.42429871818688464, "grad_norm": 0.10760905593633652, "learning_rate": 4.146590429110023e-05, "loss": 0.0068, "step": 57100 }, { "epoch": 0.4250417982537618, "grad_norm": 0.050046175718307495, "learning_rate": 4.144634951425935e-05, "loss": 0.0075, "step": 57200 }, { "epoch": 0.42578487832063905, "grad_norm": 0.04182514175772667, "learning_rate": 4.142679473741846e-05, "loss": 0.0079, "step": 57300 }, { "epoch": 0.4265279583875163, "grad_norm": 0.07931292057037354, "learning_rate": 4.140723996057757e-05, "loss": 0.0074, "step": 57400 }, { "epoch": 0.42727103845439346, "grad_norm": 0.06941182166337967, "learning_rate": 4.138768518373669e-05, "loss": 0.0071, "step": 57500 }, { "epoch": 0.4280141185212707, "grad_norm": 0.02331538312137127, "learning_rate": 4.13681304068958e-05, "loss": 0.0067, "step": 57600 }, { "epoch": 0.42875719858814787, "grad_norm": 0.06622225046157837, "learning_rate": 4.134857563005491e-05, "loss": 0.0076, "step": 57700 }, { "epoch": 0.4295002786550251, "grad_norm": 0.034822117537260056, "learning_rate": 4.132902085321402e-05, "loss": 0.0063, "step": 57800 }, { "epoch": 0.4302433587219023, "grad_norm": 0.022033903747797012, "learning_rate": 4.130946607637314e-05, "loss": 0.0071, "step": 57900 }, { "epoch": 0.4309864387887795, "grad_norm": 0.034019142389297485, "learning_rate": 4.128991129953225e-05, "loss": 0.0075, "step": 58000 }, { "epoch": 0.4317295188556567, "grad_norm": 0.016568219289183617, "learning_rate": 4.1270356522691364e-05, "loss": 0.0085, "step": 58100 }, { "epoch": 0.4324725989225339, "grad_norm": 0.062086474150419235, "learning_rate": 4.1250801745850475e-05, "loss": 0.0058, "step": 58200 }, { "epoch": 0.4332156789894111, "grad_norm": 0.08158089220523834, "learning_rate": 4.123124696900959e-05, "loss": 0.0067, "step": 58300 }, { "epoch": 0.43395875905628833, "grad_norm": 0.02223905920982361, "learning_rate": 4.1211692192168705e-05, "loss": 0.0061, "step": 58400 }, { "epoch": 0.4347018391231655, "grad_norm": 0.06254424899816513, "learning_rate": 4.119213741532782e-05, "loss": 0.0069, "step": 58500 }, { "epoch": 0.43544491919004275, "grad_norm": 0.08781812340021133, "learning_rate": 4.1172582638486934e-05, "loss": 0.0067, "step": 58600 }, { "epoch": 0.4361879992569199, "grad_norm": 0.03437701240181923, "learning_rate": 4.1153027861646045e-05, "loss": 0.0059, "step": 58700 }, { "epoch": 0.43693107932379716, "grad_norm": 0.1108015701174736, "learning_rate": 4.113347308480516e-05, "loss": 0.0069, "step": 58800 }, { "epoch": 0.43767415939067433, "grad_norm": 0.15791569650173187, "learning_rate": 4.1113918307964275e-05, "loss": 0.0081, "step": 58900 }, { "epoch": 0.43841723945755157, "grad_norm": 0.03537975251674652, "learning_rate": 4.1094363531123386e-05, "loss": 0.0076, "step": 59000 }, { "epoch": 0.43916031952442874, "grad_norm": 0.1926647275686264, "learning_rate": 4.10748087542825e-05, "loss": 0.0066, "step": 59100 }, { "epoch": 0.439903399591306, "grad_norm": 0.0397714264690876, "learning_rate": 4.105525397744161e-05, "loss": 0.007, "step": 59200 }, { "epoch": 0.44064647965818315, "grad_norm": 0.038487717509269714, "learning_rate": 4.103569920060073e-05, "loss": 0.0064, "step": 59300 }, { "epoch": 0.4413895597250604, "grad_norm": 0.06089175492525101, "learning_rate": 4.101614442375984e-05, "loss": 0.0066, "step": 59400 }, { "epoch": 0.44213263979193757, "grad_norm": 0.20144471526145935, "learning_rate": 4.099658964691896e-05, "loss": 0.0077, "step": 59500 }, { "epoch": 0.4428757198588148, "grad_norm": 0.04498005285859108, "learning_rate": 4.097703487007806e-05, "loss": 0.0069, "step": 59600 }, { "epoch": 0.443618799925692, "grad_norm": 0.04998084157705307, "learning_rate": 4.095748009323718e-05, "loss": 0.0065, "step": 59700 }, { "epoch": 0.4443618799925692, "grad_norm": 0.06072353944182396, "learning_rate": 4.093792531639629e-05, "loss": 0.0067, "step": 59800 }, { "epoch": 0.4451049600594464, "grad_norm": 0.03491585701704025, "learning_rate": 4.091837053955541e-05, "loss": 0.0082, "step": 59900 }, { "epoch": 0.4458480401263236, "grad_norm": 0.03201987221837044, "learning_rate": 4.0898815762714514e-05, "loss": 0.006, "step": 60000 }, { "epoch": 0.4465911201932008, "grad_norm": 0.05235864967107773, "learning_rate": 4.087926098587363e-05, "loss": 0.0058, "step": 60100 }, { "epoch": 0.44733420026007803, "grad_norm": 0.2086254358291626, "learning_rate": 4.085970620903274e-05, "loss": 0.008, "step": 60200 }, { "epoch": 0.4480772803269552, "grad_norm": 0.0335107259452343, "learning_rate": 4.084015143219186e-05, "loss": 0.0069, "step": 60300 }, { "epoch": 0.44882036039383244, "grad_norm": 0.07208480685949326, "learning_rate": 4.082059665535097e-05, "loss": 0.0068, "step": 60400 }, { "epoch": 0.4495634404607096, "grad_norm": 0.05190690606832504, "learning_rate": 4.0801041878510084e-05, "loss": 0.0067, "step": 60500 }, { "epoch": 0.45030652052758685, "grad_norm": 0.06418979912996292, "learning_rate": 4.0781487101669195e-05, "loss": 0.0073, "step": 60600 }, { "epoch": 0.45104960059446403, "grad_norm": 0.04282882809638977, "learning_rate": 4.0761932324828314e-05, "loss": 0.0067, "step": 60700 }, { "epoch": 0.45179268066134126, "grad_norm": 0.04149607568979263, "learning_rate": 4.0742377547987425e-05, "loss": 0.007, "step": 60800 }, { "epoch": 0.45253576072821844, "grad_norm": 0.03559328615665436, "learning_rate": 4.0722822771146536e-05, "loss": 0.0073, "step": 60900 }, { "epoch": 0.45327884079509567, "grad_norm": 0.06256083399057388, "learning_rate": 4.070326799430565e-05, "loss": 0.0062, "step": 61000 }, { "epoch": 0.4540219208619729, "grad_norm": 0.07688995450735092, "learning_rate": 4.0683713217464766e-05, "loss": 0.0086, "step": 61100 }, { "epoch": 0.4547650009288501, "grad_norm": 0.2052735835313797, "learning_rate": 4.066415844062388e-05, "loss": 0.0078, "step": 61200 }, { "epoch": 0.4555080809957273, "grad_norm": 0.03859034925699234, "learning_rate": 4.0644603663782995e-05, "loss": 0.0066, "step": 61300 }, { "epoch": 0.4562511610626045, "grad_norm": 0.030222175642848015, "learning_rate": 4.06250488869421e-05, "loss": 0.0077, "step": 61400 }, { "epoch": 0.4569942411294817, "grad_norm": 0.04808099567890167, "learning_rate": 4.060549411010122e-05, "loss": 0.0079, "step": 61500 }, { "epoch": 0.4577373211963589, "grad_norm": 0.02149675413966179, "learning_rate": 4.058593933326033e-05, "loss": 0.0077, "step": 61600 }, { "epoch": 0.45848040126323614, "grad_norm": 0.19837604463100433, "learning_rate": 4.056638455641945e-05, "loss": 0.0062, "step": 61700 }, { "epoch": 0.4592234813301133, "grad_norm": 0.045484255999326706, "learning_rate": 4.054682977957856e-05, "loss": 0.0077, "step": 61800 }, { "epoch": 0.45996656139699055, "grad_norm": 0.15475772321224213, "learning_rate": 4.052727500273767e-05, "loss": 0.0068, "step": 61900 }, { "epoch": 0.4607096414638677, "grad_norm": 0.038013920187950134, "learning_rate": 4.050772022589678e-05, "loss": 0.0073, "step": 62000 }, { "epoch": 0.46145272153074496, "grad_norm": 0.09236283600330353, "learning_rate": 4.04881654490559e-05, "loss": 0.0068, "step": 62100 }, { "epoch": 0.46219580159762214, "grad_norm": 0.04556288570165634, "learning_rate": 4.046861067221501e-05, "loss": 0.0071, "step": 62200 }, { "epoch": 0.46293888166449937, "grad_norm": 0.04730541631579399, "learning_rate": 4.044905589537412e-05, "loss": 0.0071, "step": 62300 }, { "epoch": 0.46368196173137655, "grad_norm": 0.03856947645545006, "learning_rate": 4.0429501118533234e-05, "loss": 0.007, "step": 62400 }, { "epoch": 0.4644250417982538, "grad_norm": 0.06590918451547623, "learning_rate": 4.040994634169235e-05, "loss": 0.0072, "step": 62500 }, { "epoch": 0.46516812186513096, "grad_norm": 0.07670744508504868, "learning_rate": 4.0390391564851464e-05, "loss": 0.0078, "step": 62600 }, { "epoch": 0.4659112019320082, "grad_norm": 0.015557796694338322, "learning_rate": 4.037083678801058e-05, "loss": 0.0069, "step": 62700 }, { "epoch": 0.46665428199888537, "grad_norm": 0.253831684589386, "learning_rate": 4.0351282011169686e-05, "loss": 0.0053, "step": 62800 }, { "epoch": 0.4673973620657626, "grad_norm": 0.043671976774930954, "learning_rate": 4.0331727234328805e-05, "loss": 0.0066, "step": 62900 }, { "epoch": 0.4681404421326398, "grad_norm": 0.07195639610290527, "learning_rate": 4.0312172457487916e-05, "loss": 0.0068, "step": 63000 }, { "epoch": 0.468883522199517, "grad_norm": 0.03793943300843239, "learning_rate": 4.0292617680647034e-05, "loss": 0.0072, "step": 63100 }, { "epoch": 0.4696266022663942, "grad_norm": 0.01745627634227276, "learning_rate": 4.027306290380614e-05, "loss": 0.0074, "step": 63200 }, { "epoch": 0.4703696823332714, "grad_norm": 0.20033705234527588, "learning_rate": 4.025350812696526e-05, "loss": 0.0078, "step": 63300 }, { "epoch": 0.4711127624001486, "grad_norm": 0.08661852777004242, "learning_rate": 4.023395335012437e-05, "loss": 0.0062, "step": 63400 }, { "epoch": 0.47185584246702583, "grad_norm": 0.07728056609630585, "learning_rate": 4.0214398573283486e-05, "loss": 0.0065, "step": 63500 }, { "epoch": 0.472598922533903, "grad_norm": 0.11773434281349182, "learning_rate": 4.01948437964426e-05, "loss": 0.0066, "step": 63600 }, { "epoch": 0.47334200260078024, "grad_norm": 0.028942395001649857, "learning_rate": 4.017528901960171e-05, "loss": 0.0076, "step": 63700 }, { "epoch": 0.4740850826676574, "grad_norm": 0.050476543605327606, "learning_rate": 4.015573424276082e-05, "loss": 0.0071, "step": 63800 }, { "epoch": 0.47482816273453465, "grad_norm": 0.04953029379248619, "learning_rate": 4.013617946591994e-05, "loss": 0.0078, "step": 63900 }, { "epoch": 0.47557124280141183, "grad_norm": 0.03061947226524353, "learning_rate": 4.011662468907905e-05, "loss": 0.0067, "step": 64000 }, { "epoch": 0.47631432286828906, "grad_norm": 0.08053992688655853, "learning_rate": 4.009706991223816e-05, "loss": 0.0069, "step": 64100 }, { "epoch": 0.47705740293516624, "grad_norm": 0.04122145101428032, "learning_rate": 4.007751513539727e-05, "loss": 0.0058, "step": 64200 }, { "epoch": 0.4778004830020435, "grad_norm": 0.04518519714474678, "learning_rate": 4.005796035855639e-05, "loss": 0.0073, "step": 64300 }, { "epoch": 0.47854356306892065, "grad_norm": 0.0653819814324379, "learning_rate": 4.00384055817155e-05, "loss": 0.0069, "step": 64400 }, { "epoch": 0.4792866431357979, "grad_norm": 0.18149197101593018, "learning_rate": 4.001885080487462e-05, "loss": 0.0057, "step": 64500 }, { "epoch": 0.48002972320267506, "grad_norm": 0.06164724752306938, "learning_rate": 3.9999296028033725e-05, "loss": 0.0067, "step": 64600 }, { "epoch": 0.4807728032695523, "grad_norm": 0.038935013115406036, "learning_rate": 3.997974125119284e-05, "loss": 0.0061, "step": 64700 }, { "epoch": 0.4815158833364295, "grad_norm": 0.29689714312553406, "learning_rate": 3.9960186474351955e-05, "loss": 0.0073, "step": 64800 }, { "epoch": 0.4822589634033067, "grad_norm": 0.09419193863868713, "learning_rate": 3.994063169751107e-05, "loss": 0.007, "step": 64900 }, { "epoch": 0.48300204347018394, "grad_norm": 0.026655124500393867, "learning_rate": 3.992107692067018e-05, "loss": 0.0072, "step": 65000 }, { "epoch": 0.4837451235370611, "grad_norm": 0.7815685272216797, "learning_rate": 3.9901522143829296e-05, "loss": 0.0066, "step": 65100 }, { "epoch": 0.48448820360393835, "grad_norm": 0.09732484072446823, "learning_rate": 3.988196736698841e-05, "loss": 0.0068, "step": 65200 }, { "epoch": 0.4852312836708155, "grad_norm": 0.17973355948925018, "learning_rate": 3.9862412590147525e-05, "loss": 0.0066, "step": 65300 }, { "epoch": 0.48597436373769276, "grad_norm": 0.02819853648543358, "learning_rate": 3.9842857813306636e-05, "loss": 0.0065, "step": 65400 }, { "epoch": 0.48671744380456994, "grad_norm": 0.049151696264743805, "learning_rate": 3.982330303646575e-05, "loss": 0.0058, "step": 65500 }, { "epoch": 0.48746052387144717, "grad_norm": 0.13917094469070435, "learning_rate": 3.980374825962486e-05, "loss": 0.0066, "step": 65600 }, { "epoch": 0.48820360393832435, "grad_norm": 0.037208326160907745, "learning_rate": 3.978419348278398e-05, "loss": 0.0074, "step": 65700 }, { "epoch": 0.4889466840052016, "grad_norm": 0.06454883515834808, "learning_rate": 3.976463870594309e-05, "loss": 0.0063, "step": 65800 }, { "epoch": 0.48968976407207876, "grad_norm": 0.03547971323132515, "learning_rate": 3.97450839291022e-05, "loss": 0.0068, "step": 65900 }, { "epoch": 0.490432844138956, "grad_norm": 1.3615156412124634, "learning_rate": 3.972552915226132e-05, "loss": 0.0069, "step": 66000 }, { "epoch": 0.49117592420583317, "grad_norm": 0.08487857133150101, "learning_rate": 3.970597437542043e-05, "loss": 0.0078, "step": 66100 }, { "epoch": 0.4919190042727104, "grad_norm": 0.029045216739177704, "learning_rate": 3.968641959857954e-05, "loss": 0.0061, "step": 66200 }, { "epoch": 0.4926620843395876, "grad_norm": 0.06188974902033806, "learning_rate": 3.966686482173866e-05, "loss": 0.0069, "step": 66300 }, { "epoch": 0.4934051644064648, "grad_norm": 0.040328025817871094, "learning_rate": 3.964731004489777e-05, "loss": 0.0063, "step": 66400 }, { "epoch": 0.494148244473342, "grad_norm": 0.060150083154439926, "learning_rate": 3.962775526805688e-05, "loss": 0.0078, "step": 66500 }, { "epoch": 0.4948913245402192, "grad_norm": 0.062278784811496735, "learning_rate": 3.960820049121599e-05, "loss": 0.0077, "step": 66600 }, { "epoch": 0.4956344046070964, "grad_norm": 0.03264576196670532, "learning_rate": 3.958864571437511e-05, "loss": 0.0071, "step": 66700 }, { "epoch": 0.49637748467397363, "grad_norm": 0.0662609115242958, "learning_rate": 3.956909093753422e-05, "loss": 0.0067, "step": 66800 }, { "epoch": 0.4971205647408508, "grad_norm": 0.06604277342557907, "learning_rate": 3.9549536160693334e-05, "loss": 0.0063, "step": 66900 }, { "epoch": 0.49786364480772805, "grad_norm": 0.23896543681621552, "learning_rate": 3.952998138385245e-05, "loss": 0.0071, "step": 67000 }, { "epoch": 0.4986067248746052, "grad_norm": 0.07176953554153442, "learning_rate": 3.9510426607011564e-05, "loss": 0.0069, "step": 67100 }, { "epoch": 0.49934980494148246, "grad_norm": 0.06186622381210327, "learning_rate": 3.9490871830170675e-05, "loss": 0.0072, "step": 67200 }, { "epoch": 0.5000928850083597, "grad_norm": 0.048317890614271164, "learning_rate": 3.9471317053329786e-05, "loss": 0.0071, "step": 67300 }, { "epoch": 0.5008359650752369, "grad_norm": 0.0677412897348404, "learning_rate": 3.9451762276488905e-05, "loss": 0.0058, "step": 67400 }, { "epoch": 0.501579045142114, "grad_norm": 0.056058160960674286, "learning_rate": 3.9432207499648016e-05, "loss": 0.0074, "step": 67500 }, { "epoch": 0.5023221252089912, "grad_norm": 0.05127792805433273, "learning_rate": 3.941265272280713e-05, "loss": 0.0064, "step": 67600 }, { "epoch": 0.5030652052758685, "grad_norm": 0.2445414960384369, "learning_rate": 3.9393097945966246e-05, "loss": 0.0064, "step": 67700 }, { "epoch": 0.5038082853427457, "grad_norm": 0.05038780719041824, "learning_rate": 3.937354316912536e-05, "loss": 0.0075, "step": 67800 }, { "epoch": 0.5045513654096229, "grad_norm": 0.02819100022315979, "learning_rate": 3.935398839228447e-05, "loss": 0.0054, "step": 67900 }, { "epoch": 0.5052944454765, "grad_norm": 0.03669000044465065, "learning_rate": 3.9334433615443586e-05, "loss": 0.0071, "step": 68000 }, { "epoch": 0.5060375255433773, "grad_norm": 0.23090702295303345, "learning_rate": 3.93148788386027e-05, "loss": 0.0075, "step": 68100 }, { "epoch": 0.5067806056102545, "grad_norm": 0.03818681836128235, "learning_rate": 3.929532406176181e-05, "loss": 0.0065, "step": 68200 }, { "epoch": 0.5075236856771317, "grad_norm": 0.053252410143613815, "learning_rate": 3.927576928492092e-05, "loss": 0.0059, "step": 68300 }, { "epoch": 0.5082667657440089, "grad_norm": 0.06621071696281433, "learning_rate": 3.925621450808004e-05, "loss": 0.0072, "step": 68400 }, { "epoch": 0.5090098458108862, "grad_norm": 0.022596845403313637, "learning_rate": 3.923665973123915e-05, "loss": 0.0079, "step": 68500 }, { "epoch": 0.5097529258777633, "grad_norm": 0.09641772508621216, "learning_rate": 3.921710495439827e-05, "loss": 0.0076, "step": 68600 }, { "epoch": 0.5104960059446405, "grad_norm": 0.03792642802000046, "learning_rate": 3.919755017755737e-05, "loss": 0.008, "step": 68700 }, { "epoch": 0.5112390860115178, "grad_norm": 0.14900176227092743, "learning_rate": 3.917799540071649e-05, "loss": 0.0069, "step": 68800 }, { "epoch": 0.511982166078395, "grad_norm": 0.009287470951676369, "learning_rate": 3.91584406238756e-05, "loss": 0.006, "step": 68900 }, { "epoch": 0.5127252461452722, "grad_norm": 0.05562896654009819, "learning_rate": 3.913888584703472e-05, "loss": 0.0074, "step": 69000 }, { "epoch": 0.5134683262121493, "grad_norm": 0.032463397830724716, "learning_rate": 3.9119331070193825e-05, "loss": 0.0068, "step": 69100 }, { "epoch": 0.5142114062790266, "grad_norm": 0.0895235687494278, "learning_rate": 3.909977629335294e-05, "loss": 0.0073, "step": 69200 }, { "epoch": 0.5149544863459038, "grad_norm": 0.04971130192279816, "learning_rate": 3.9080221516512055e-05, "loss": 0.0057, "step": 69300 }, { "epoch": 0.515697566412781, "grad_norm": 0.04046616330742836, "learning_rate": 3.906066673967117e-05, "loss": 0.0067, "step": 69400 }, { "epoch": 0.5164406464796581, "grad_norm": 0.11941362172365189, "learning_rate": 3.9041111962830284e-05, "loss": 0.006, "step": 69500 }, { "epoch": 0.5171837265465354, "grad_norm": 0.042669184505939484, "learning_rate": 3.9021557185989396e-05, "loss": 0.0076, "step": 69600 }, { "epoch": 0.5179268066134126, "grad_norm": 0.04774431884288788, "learning_rate": 3.900200240914851e-05, "loss": 0.0062, "step": 69700 }, { "epoch": 0.5186698866802898, "grad_norm": 0.21886157989501953, "learning_rate": 3.8982447632307625e-05, "loss": 0.0062, "step": 69800 }, { "epoch": 0.519412966747167, "grad_norm": 0.37722277641296387, "learning_rate": 3.8962892855466736e-05, "loss": 0.0073, "step": 69900 }, { "epoch": 0.5201560468140443, "grad_norm": 0.07759550213813782, "learning_rate": 3.894333807862585e-05, "loss": 0.0066, "step": 70000 }, { "epoch": 0.5208991268809214, "grad_norm": 0.12113677710294724, "learning_rate": 3.892378330178496e-05, "loss": 0.0065, "step": 70100 }, { "epoch": 0.5216422069477986, "grad_norm": 0.11159736663103104, "learning_rate": 3.890422852494408e-05, "loss": 0.0065, "step": 70200 }, { "epoch": 0.5223852870146758, "grad_norm": 0.06360876560211182, "learning_rate": 3.888467374810319e-05, "loss": 0.0068, "step": 70300 }, { "epoch": 0.5231283670815531, "grad_norm": 0.03688746690750122, "learning_rate": 3.886511897126231e-05, "loss": 0.0067, "step": 70400 }, { "epoch": 0.5238714471484303, "grad_norm": 0.07631249725818634, "learning_rate": 3.884556419442141e-05, "loss": 0.0061, "step": 70500 }, { "epoch": 0.5246145272153074, "grad_norm": 0.10997401922941208, "learning_rate": 3.882600941758053e-05, "loss": 0.0067, "step": 70600 }, { "epoch": 0.5253576072821846, "grad_norm": 0.18161103129386902, "learning_rate": 3.880645464073964e-05, "loss": 0.0076, "step": 70700 }, { "epoch": 0.5261006873490619, "grad_norm": 0.14414627850055695, "learning_rate": 3.878689986389876e-05, "loss": 0.0077, "step": 70800 }, { "epoch": 0.5268437674159391, "grad_norm": 0.038566283881664276, "learning_rate": 3.8767345087057864e-05, "loss": 0.006, "step": 70900 }, { "epoch": 0.5275868474828163, "grad_norm": 0.03068181499838829, "learning_rate": 3.874779031021698e-05, "loss": 0.0066, "step": 71000 }, { "epoch": 0.5283299275496934, "grad_norm": 0.050066541880369186, "learning_rate": 3.872823553337609e-05, "loss": 0.0053, "step": 71100 }, { "epoch": 0.5290730076165707, "grad_norm": 0.04796002060174942, "learning_rate": 3.870868075653521e-05, "loss": 0.0083, "step": 71200 }, { "epoch": 0.5298160876834479, "grad_norm": 0.04470355808734894, "learning_rate": 3.868912597969432e-05, "loss": 0.0068, "step": 71300 }, { "epoch": 0.5305591677503251, "grad_norm": 0.10867596417665482, "learning_rate": 3.8669571202853434e-05, "loss": 0.0061, "step": 71400 }, { "epoch": 0.5313022478172023, "grad_norm": 0.131153404712677, "learning_rate": 3.8650016426012546e-05, "loss": 0.0064, "step": 71500 }, { "epoch": 0.5320453278840795, "grad_norm": 0.021755604073405266, "learning_rate": 3.8630461649171664e-05, "loss": 0.0066, "step": 71600 }, { "epoch": 0.5327884079509567, "grad_norm": 0.11654971539974213, "learning_rate": 3.8610906872330775e-05, "loss": 0.0071, "step": 71700 }, { "epoch": 0.5335314880178339, "grad_norm": 0.08312838524580002, "learning_rate": 3.8591352095489886e-05, "loss": 0.0074, "step": 71800 }, { "epoch": 0.5342745680847111, "grad_norm": 0.0574042908847332, "learning_rate": 3.8571797318649e-05, "loss": 0.0061, "step": 71900 }, { "epoch": 0.5350176481515884, "grad_norm": 0.10169748216867447, "learning_rate": 3.8552242541808116e-05, "loss": 0.0069, "step": 72000 }, { "epoch": 0.5357607282184655, "grad_norm": 0.02821221947669983, "learning_rate": 3.853268776496723e-05, "loss": 0.0062, "step": 72100 }, { "epoch": 0.5365038082853427, "grad_norm": 0.1680557280778885, "learning_rate": 3.8513132988126346e-05, "loss": 0.0067, "step": 72200 }, { "epoch": 0.5372468883522199, "grad_norm": 0.06369991600513458, "learning_rate": 3.849357821128545e-05, "loss": 0.007, "step": 72300 }, { "epoch": 0.5379899684190972, "grad_norm": 0.04380596801638603, "learning_rate": 3.847402343444457e-05, "loss": 0.0066, "step": 72400 }, { "epoch": 0.5387330484859744, "grad_norm": 0.0793602466583252, "learning_rate": 3.845446865760368e-05, "loss": 0.0064, "step": 72500 }, { "epoch": 0.5394761285528515, "grad_norm": 0.06712214648723602, "learning_rate": 3.84349138807628e-05, "loss": 0.0073, "step": 72600 }, { "epoch": 0.5402192086197288, "grad_norm": 0.037750229239463806, "learning_rate": 3.84153591039219e-05, "loss": 0.006, "step": 72700 }, { "epoch": 0.540962288686606, "grad_norm": 0.0335860401391983, "learning_rate": 3.839580432708102e-05, "loss": 0.0075, "step": 72800 }, { "epoch": 0.5417053687534832, "grad_norm": 0.0425746776163578, "learning_rate": 3.837624955024013e-05, "loss": 0.0058, "step": 72900 }, { "epoch": 0.5424484488203604, "grad_norm": 0.03574962168931961, "learning_rate": 3.835669477339925e-05, "loss": 0.0056, "step": 73000 }, { "epoch": 0.5431915288872377, "grad_norm": 0.0517202764749527, "learning_rate": 3.833713999655836e-05, "loss": 0.0059, "step": 73100 }, { "epoch": 0.5439346089541148, "grad_norm": 0.0615508109331131, "learning_rate": 3.831758521971747e-05, "loss": 0.0067, "step": 73200 }, { "epoch": 0.544677689020992, "grad_norm": 0.10389608889818192, "learning_rate": 3.8298030442876584e-05, "loss": 0.0073, "step": 73300 }, { "epoch": 0.5454207690878692, "grad_norm": 0.33818569779396057, "learning_rate": 3.82784756660357e-05, "loss": 0.0068, "step": 73400 }, { "epoch": 0.5461638491547465, "grad_norm": 0.11588051170110703, "learning_rate": 3.8258920889194814e-05, "loss": 0.0055, "step": 73500 }, { "epoch": 0.5469069292216236, "grad_norm": 0.04014882445335388, "learning_rate": 3.823936611235393e-05, "loss": 0.0061, "step": 73600 }, { "epoch": 0.5476500092885008, "grad_norm": 0.031074797734618187, "learning_rate": 3.8219811335513037e-05, "loss": 0.0068, "step": 73700 }, { "epoch": 0.548393089355378, "grad_norm": 0.11280274391174316, "learning_rate": 3.8200256558672155e-05, "loss": 0.006, "step": 73800 }, { "epoch": 0.5491361694222553, "grad_norm": 0.028469381853938103, "learning_rate": 3.8180701781831266e-05, "loss": 0.0071, "step": 73900 }, { "epoch": 0.5498792494891325, "grad_norm": 0.11646797508001328, "learning_rate": 3.8161147004990384e-05, "loss": 0.0066, "step": 74000 }, { "epoch": 0.5506223295560096, "grad_norm": 0.050070106983184814, "learning_rate": 3.814159222814949e-05, "loss": 0.0071, "step": 74100 }, { "epoch": 0.5513654096228868, "grad_norm": 0.08478367328643799, "learning_rate": 3.812203745130861e-05, "loss": 0.0066, "step": 74200 }, { "epoch": 0.5521084896897641, "grad_norm": 0.04030213505029678, "learning_rate": 3.810248267446772e-05, "loss": 0.0061, "step": 74300 }, { "epoch": 0.5528515697566413, "grad_norm": 0.06712201237678528, "learning_rate": 3.8082927897626836e-05, "loss": 0.0059, "step": 74400 }, { "epoch": 0.5535946498235185, "grad_norm": 0.03344298154115677, "learning_rate": 3.806337312078595e-05, "loss": 0.0069, "step": 74500 }, { "epoch": 0.5543377298903956, "grad_norm": 0.04861340671777725, "learning_rate": 3.804381834394506e-05, "loss": 0.0072, "step": 74600 }, { "epoch": 0.5550808099572729, "grad_norm": 0.05839976668357849, "learning_rate": 3.802426356710417e-05, "loss": 0.0064, "step": 74700 }, { "epoch": 0.5558238900241501, "grad_norm": 0.05467729642987251, "learning_rate": 3.800470879026329e-05, "loss": 0.0066, "step": 74800 }, { "epoch": 0.5565669700910273, "grad_norm": 0.038264308124780655, "learning_rate": 3.79851540134224e-05, "loss": 0.0059, "step": 74900 }, { "epoch": 0.5573100501579045, "grad_norm": 0.07648076862096786, "learning_rate": 3.796559923658151e-05, "loss": 0.0073, "step": 75000 }, { "epoch": 0.5580531302247818, "grad_norm": 0.06901638209819794, "learning_rate": 3.794604445974062e-05, "loss": 0.0072, "step": 75100 }, { "epoch": 0.5587962102916589, "grad_norm": 0.029322953894734383, "learning_rate": 3.792648968289974e-05, "loss": 0.0066, "step": 75200 }, { "epoch": 0.5595392903585361, "grad_norm": 0.04787376523017883, "learning_rate": 3.790693490605885e-05, "loss": 0.0064, "step": 75300 }, { "epoch": 0.5602823704254133, "grad_norm": 0.029377618804574013, "learning_rate": 3.788738012921797e-05, "loss": 0.007, "step": 75400 }, { "epoch": 0.5610254504922906, "grad_norm": 0.019534962251782417, "learning_rate": 3.786782535237708e-05, "loss": 0.0066, "step": 75500 }, { "epoch": 0.5617685305591678, "grad_norm": 0.04995133727788925, "learning_rate": 3.784827057553619e-05, "loss": 0.0062, "step": 75600 }, { "epoch": 0.5625116106260449, "grad_norm": 0.06099386513233185, "learning_rate": 3.7828715798695305e-05, "loss": 0.0067, "step": 75700 }, { "epoch": 0.5632546906929221, "grad_norm": 0.05053669959306717, "learning_rate": 3.780916102185442e-05, "loss": 0.0065, "step": 75800 }, { "epoch": 0.5639977707597994, "grad_norm": 0.04671207070350647, "learning_rate": 3.7789606245013534e-05, "loss": 0.0067, "step": 75900 }, { "epoch": 0.5647408508266766, "grad_norm": 0.15050657093524933, "learning_rate": 3.7770051468172646e-05, "loss": 0.0069, "step": 76000 }, { "epoch": 0.5654839308935538, "grad_norm": 0.071275994181633, "learning_rate": 3.775049669133176e-05, "loss": 0.0071, "step": 76100 }, { "epoch": 0.5662270109604309, "grad_norm": 0.08324550092220306, "learning_rate": 3.7730941914490875e-05, "loss": 0.0065, "step": 76200 }, { "epoch": 0.5669700910273082, "grad_norm": 0.048320431262254715, "learning_rate": 3.7711387137649987e-05, "loss": 0.007, "step": 76300 }, { "epoch": 0.5677131710941854, "grad_norm": 0.019057622179389, "learning_rate": 3.76918323608091e-05, "loss": 0.0067, "step": 76400 }, { "epoch": 0.5684562511610626, "grad_norm": 0.05270780250430107, "learning_rate": 3.7672277583968216e-05, "loss": 0.0064, "step": 76500 }, { "epoch": 0.5691993312279399, "grad_norm": 0.1253121793270111, "learning_rate": 3.765272280712733e-05, "loss": 0.0069, "step": 76600 }, { "epoch": 0.569942411294817, "grad_norm": 0.04551755636930466, "learning_rate": 3.763316803028644e-05, "loss": 0.0065, "step": 76700 }, { "epoch": 0.5706854913616942, "grad_norm": 0.08351539075374603, "learning_rate": 3.761361325344555e-05, "loss": 0.0064, "step": 76800 }, { "epoch": 0.5714285714285714, "grad_norm": 0.024084163829684258, "learning_rate": 3.759405847660467e-05, "loss": 0.0073, "step": 76900 }, { "epoch": 0.5721716514954487, "grad_norm": 0.2050718367099762, "learning_rate": 3.757450369976378e-05, "loss": 0.0064, "step": 77000 }, { "epoch": 0.5729147315623259, "grad_norm": 0.06663031131029129, "learning_rate": 3.75549489229229e-05, "loss": 0.0063, "step": 77100 }, { "epoch": 0.573657811629203, "grad_norm": 0.06490831077098846, "learning_rate": 3.753539414608201e-05, "loss": 0.007, "step": 77200 }, { "epoch": 0.5744008916960802, "grad_norm": 0.33138546347618103, "learning_rate": 3.751583936924112e-05, "loss": 0.006, "step": 77300 }, { "epoch": 0.5751439717629575, "grad_norm": 0.23077349364757538, "learning_rate": 3.749628459240023e-05, "loss": 0.0062, "step": 77400 }, { "epoch": 0.5758870518298347, "grad_norm": 0.03562178090214729, "learning_rate": 3.747672981555935e-05, "loss": 0.0077, "step": 77500 }, { "epoch": 0.5766301318967119, "grad_norm": 0.027963368222117424, "learning_rate": 3.745717503871846e-05, "loss": 0.0072, "step": 77600 }, { "epoch": 0.577373211963589, "grad_norm": 0.07369949668645859, "learning_rate": 3.743762026187757e-05, "loss": 0.0061, "step": 77700 }, { "epoch": 0.5781162920304663, "grad_norm": 0.0403934046626091, "learning_rate": 3.7418065485036684e-05, "loss": 0.007, "step": 77800 }, { "epoch": 0.5788593720973435, "grad_norm": 0.07512319833040237, "learning_rate": 3.73985107081958e-05, "loss": 0.0068, "step": 77900 }, { "epoch": 0.5796024521642207, "grad_norm": 0.10490557551383972, "learning_rate": 3.7378955931354914e-05, "loss": 0.0064, "step": 78000 }, { "epoch": 0.5803455322310979, "grad_norm": 0.0216389037668705, "learning_rate": 3.735940115451403e-05, "loss": 0.0064, "step": 78100 }, { "epoch": 0.5810886122979751, "grad_norm": 0.10643122345209122, "learning_rate": 3.7339846377673137e-05, "loss": 0.0076, "step": 78200 }, { "epoch": 0.5818316923648523, "grad_norm": 0.040842846035957336, "learning_rate": 3.7320291600832255e-05, "loss": 0.0064, "step": 78300 }, { "epoch": 0.5825747724317295, "grad_norm": 0.03898189589381218, "learning_rate": 3.7300736823991366e-05, "loss": 0.0058, "step": 78400 }, { "epoch": 0.5833178524986067, "grad_norm": 0.06246241554617882, "learning_rate": 3.7281182047150484e-05, "loss": 0.0065, "step": 78500 }, { "epoch": 0.584060932565484, "grad_norm": 0.027250947430729866, "learning_rate": 3.726162727030959e-05, "loss": 0.0071, "step": 78600 }, { "epoch": 0.5848040126323611, "grad_norm": 0.01998763345181942, "learning_rate": 3.724207249346871e-05, "loss": 0.0068, "step": 78700 }, { "epoch": 0.5855470926992383, "grad_norm": 0.039847228676080704, "learning_rate": 3.722251771662782e-05, "loss": 0.0066, "step": 78800 }, { "epoch": 0.5862901727661155, "grad_norm": 0.06855083256959915, "learning_rate": 3.7202962939786937e-05, "loss": 0.0063, "step": 78900 }, { "epoch": 0.5870332528329928, "grad_norm": 0.0495438352227211, "learning_rate": 3.718340816294605e-05, "loss": 0.0057, "step": 79000 }, { "epoch": 0.58777633289987, "grad_norm": 0.051476895809173584, "learning_rate": 3.716385338610516e-05, "loss": 0.0081, "step": 79100 }, { "epoch": 0.5885194129667471, "grad_norm": 0.2565764784812927, "learning_rate": 3.714429860926427e-05, "loss": 0.0058, "step": 79200 }, { "epoch": 0.5892624930336243, "grad_norm": 0.04290062189102173, "learning_rate": 3.712474383242339e-05, "loss": 0.0065, "step": 79300 }, { "epoch": 0.5900055731005016, "grad_norm": 0.05738433450460434, "learning_rate": 3.71051890555825e-05, "loss": 0.0057, "step": 79400 }, { "epoch": 0.5907486531673788, "grad_norm": 0.08283357322216034, "learning_rate": 3.708563427874161e-05, "loss": 0.0063, "step": 79500 }, { "epoch": 0.591491733234256, "grad_norm": 0.029125187546014786, "learning_rate": 3.706607950190072e-05, "loss": 0.008, "step": 79600 }, { "epoch": 0.5922348133011331, "grad_norm": 0.02318490855395794, "learning_rate": 3.704652472505984e-05, "loss": 0.0064, "step": 79700 }, { "epoch": 0.5929778933680104, "grad_norm": 0.044088348746299744, "learning_rate": 3.702696994821895e-05, "loss": 0.0061, "step": 79800 }, { "epoch": 0.5937209734348876, "grad_norm": 0.026523584499955177, "learning_rate": 3.700741517137807e-05, "loss": 0.0057, "step": 79900 }, { "epoch": 0.5944640535017648, "grad_norm": 0.09515058994293213, "learning_rate": 3.6987860394537175e-05, "loss": 0.007, "step": 80000 }, { "epoch": 0.595207133568642, "grad_norm": 0.057157207280397415, "learning_rate": 3.6968305617696293e-05, "loss": 0.0059, "step": 80100 }, { "epoch": 0.5959502136355193, "grad_norm": 0.11583970487117767, "learning_rate": 3.6948750840855405e-05, "loss": 0.0065, "step": 80200 }, { "epoch": 0.5966932937023964, "grad_norm": 0.02544792741537094, "learning_rate": 3.692919606401452e-05, "loss": 0.0062, "step": 80300 }, { "epoch": 0.5974363737692736, "grad_norm": 0.02506929822266102, "learning_rate": 3.6909641287173634e-05, "loss": 0.0076, "step": 80400 }, { "epoch": 0.5981794538361509, "grad_norm": 0.04937093332409859, "learning_rate": 3.6890086510332746e-05, "loss": 0.006, "step": 80500 }, { "epoch": 0.5989225339030281, "grad_norm": 0.06194600090384483, "learning_rate": 3.687053173349186e-05, "loss": 0.0066, "step": 80600 }, { "epoch": 0.5996656139699053, "grad_norm": 0.022812608629465103, "learning_rate": 3.6850976956650975e-05, "loss": 0.0079, "step": 80700 }, { "epoch": 0.6004086940367824, "grad_norm": 0.1491018384695053, "learning_rate": 3.6831422179810087e-05, "loss": 0.0069, "step": 80800 }, { "epoch": 0.6011517741036597, "grad_norm": 0.02633541077375412, "learning_rate": 3.68118674029692e-05, "loss": 0.0062, "step": 80900 }, { "epoch": 0.6018948541705369, "grad_norm": 0.27069804072380066, "learning_rate": 3.679231262612831e-05, "loss": 0.0082, "step": 81000 }, { "epoch": 0.6026379342374141, "grad_norm": 0.08121224492788315, "learning_rate": 3.677275784928743e-05, "loss": 0.0064, "step": 81100 }, { "epoch": 0.6033810143042913, "grad_norm": 0.03438674658536911, "learning_rate": 3.675320307244654e-05, "loss": 0.006, "step": 81200 }, { "epoch": 0.6041240943711685, "grad_norm": 0.033248528838157654, "learning_rate": 3.673364829560566e-05, "loss": 0.0061, "step": 81300 }, { "epoch": 0.6048671744380457, "grad_norm": 0.060492534190416336, "learning_rate": 3.671409351876476e-05, "loss": 0.0066, "step": 81400 }, { "epoch": 0.6056102545049229, "grad_norm": 0.043629564344882965, "learning_rate": 3.669453874192388e-05, "loss": 0.006, "step": 81500 }, { "epoch": 0.6063533345718001, "grad_norm": 0.027690380811691284, "learning_rate": 3.667498396508299e-05, "loss": 0.0071, "step": 81600 }, { "epoch": 0.6070964146386774, "grad_norm": 0.0386318601667881, "learning_rate": 3.665542918824211e-05, "loss": 0.006, "step": 81700 }, { "epoch": 0.6078394947055545, "grad_norm": 0.0509650781750679, "learning_rate": 3.6635874411401214e-05, "loss": 0.007, "step": 81800 }, { "epoch": 0.6085825747724317, "grad_norm": 0.08680703490972519, "learning_rate": 3.661631963456033e-05, "loss": 0.0065, "step": 81900 }, { "epoch": 0.6093256548393089, "grad_norm": 0.05884992331266403, "learning_rate": 3.6596764857719443e-05, "loss": 0.0074, "step": 82000 }, { "epoch": 0.6100687349061862, "grad_norm": 0.07556629925966263, "learning_rate": 3.657721008087856e-05, "loss": 0.0061, "step": 82100 }, { "epoch": 0.6108118149730634, "grad_norm": 0.09777213633060455, "learning_rate": 3.655765530403767e-05, "loss": 0.0059, "step": 82200 }, { "epoch": 0.6115548950399405, "grad_norm": 0.043701108545064926, "learning_rate": 3.6538100527196784e-05, "loss": 0.0064, "step": 82300 }, { "epoch": 0.6122979751068177, "grad_norm": 0.03626338765025139, "learning_rate": 3.6518545750355896e-05, "loss": 0.0066, "step": 82400 }, { "epoch": 0.613041055173695, "grad_norm": 0.06993650645017624, "learning_rate": 3.6498990973515014e-05, "loss": 0.0063, "step": 82500 }, { "epoch": 0.6137841352405722, "grad_norm": 0.10026181489229202, "learning_rate": 3.6479436196674125e-05, "loss": 0.0058, "step": 82600 }, { "epoch": 0.6145272153074494, "grad_norm": 0.185280442237854, "learning_rate": 3.645988141983324e-05, "loss": 0.0064, "step": 82700 }, { "epoch": 0.6152702953743265, "grad_norm": 0.09437177330255508, "learning_rate": 3.644032664299235e-05, "loss": 0.0058, "step": 82800 }, { "epoch": 0.6160133754412038, "grad_norm": 0.06788663566112518, "learning_rate": 3.6420771866151466e-05, "loss": 0.0059, "step": 82900 }, { "epoch": 0.616756455508081, "grad_norm": 0.021169152110815048, "learning_rate": 3.640121708931058e-05, "loss": 0.0066, "step": 83000 }, { "epoch": 0.6174995355749582, "grad_norm": 0.08427069336175919, "learning_rate": 3.6381662312469696e-05, "loss": 0.0075, "step": 83100 }, { "epoch": 0.6182426156418354, "grad_norm": 0.0374024398624897, "learning_rate": 3.63621075356288e-05, "loss": 0.0063, "step": 83200 }, { "epoch": 0.6189856957087126, "grad_norm": 0.03982897102832794, "learning_rate": 3.634255275878792e-05, "loss": 0.0073, "step": 83300 }, { "epoch": 0.6197287757755898, "grad_norm": 0.09790994226932526, "learning_rate": 3.632299798194703e-05, "loss": 0.0064, "step": 83400 }, { "epoch": 0.620471855842467, "grad_norm": 0.05136306583881378, "learning_rate": 3.630344320510615e-05, "loss": 0.0064, "step": 83500 }, { "epoch": 0.6212149359093442, "grad_norm": 0.14014025032520294, "learning_rate": 3.628388842826525e-05, "loss": 0.007, "step": 83600 }, { "epoch": 0.6219580159762215, "grad_norm": 0.028016263619065285, "learning_rate": 3.626433365142437e-05, "loss": 0.0067, "step": 83700 }, { "epoch": 0.6227010960430986, "grad_norm": 0.1596749871969223, "learning_rate": 3.624477887458348e-05, "loss": 0.0061, "step": 83800 }, { "epoch": 0.6234441761099758, "grad_norm": 0.032186128199100494, "learning_rate": 3.62252240977426e-05, "loss": 0.006, "step": 83900 }, { "epoch": 0.6241872561768531, "grad_norm": 0.1675434708595276, "learning_rate": 3.620566932090171e-05, "loss": 0.0067, "step": 84000 }, { "epoch": 0.6249303362437303, "grad_norm": 0.08103535324335098, "learning_rate": 3.618611454406082e-05, "loss": 0.0077, "step": 84100 }, { "epoch": 0.6256734163106075, "grad_norm": 0.010908874683082104, "learning_rate": 3.6166559767219934e-05, "loss": 0.0054, "step": 84200 }, { "epoch": 0.6264164963774846, "grad_norm": 0.042283590883016586, "learning_rate": 3.614700499037905e-05, "loss": 0.0065, "step": 84300 }, { "epoch": 0.6271595764443619, "grad_norm": 0.048543937504291534, "learning_rate": 3.6127450213538164e-05, "loss": 0.0073, "step": 84400 }, { "epoch": 0.6279026565112391, "grad_norm": 0.05129114165902138, "learning_rate": 3.6107895436697275e-05, "loss": 0.0064, "step": 84500 }, { "epoch": 0.6286457365781163, "grad_norm": 0.03513653203845024, "learning_rate": 3.608834065985639e-05, "loss": 0.0071, "step": 84600 }, { "epoch": 0.6293888166449935, "grad_norm": 0.03375072404742241, "learning_rate": 3.6068785883015505e-05, "loss": 0.0058, "step": 84700 }, { "epoch": 0.6301318967118708, "grad_norm": 0.20123067498207092, "learning_rate": 3.6049231106174616e-05, "loss": 0.0058, "step": 84800 }, { "epoch": 0.6308749767787479, "grad_norm": 0.05587746948003769, "learning_rate": 3.6029676329333734e-05, "loss": 0.0065, "step": 84900 }, { "epoch": 0.6316180568456251, "grad_norm": 0.04399985447525978, "learning_rate": 3.6010121552492846e-05, "loss": 0.0071, "step": 85000 }, { "epoch": 0.6323611369125023, "grad_norm": 0.031752388924360275, "learning_rate": 3.599056677565196e-05, "loss": 0.0071, "step": 85100 }, { "epoch": 0.6331042169793796, "grad_norm": 0.06719953566789627, "learning_rate": 3.597101199881107e-05, "loss": 0.0062, "step": 85200 }, { "epoch": 0.6338472970462568, "grad_norm": 0.04719695821404457, "learning_rate": 3.595145722197019e-05, "loss": 0.0063, "step": 85300 }, { "epoch": 0.6345903771131339, "grad_norm": 0.04174108803272247, "learning_rate": 3.59319024451293e-05, "loss": 0.0065, "step": 85400 }, { "epoch": 0.6353334571800111, "grad_norm": 0.06959343701601028, "learning_rate": 3.591234766828841e-05, "loss": 0.0063, "step": 85500 }, { "epoch": 0.6360765372468884, "grad_norm": 0.150008887052536, "learning_rate": 3.589279289144753e-05, "loss": 0.0061, "step": 85600 }, { "epoch": 0.6368196173137656, "grad_norm": 0.11431419849395752, "learning_rate": 3.587323811460664e-05, "loss": 0.0067, "step": 85700 }, { "epoch": 0.6375626973806428, "grad_norm": 0.06842583417892456, "learning_rate": 3.585368333776575e-05, "loss": 0.0069, "step": 85800 }, { "epoch": 0.6383057774475199, "grad_norm": 0.02928929217159748, "learning_rate": 3.583412856092486e-05, "loss": 0.006, "step": 85900 }, { "epoch": 0.6390488575143972, "grad_norm": 0.06702953577041626, "learning_rate": 3.581457378408398e-05, "loss": 0.0078, "step": 86000 }, { "epoch": 0.6397919375812744, "grad_norm": 0.0410122387111187, "learning_rate": 3.579501900724309e-05, "loss": 0.0059, "step": 86100 }, { "epoch": 0.6405350176481516, "grad_norm": 0.03292258456349373, "learning_rate": 3.57754642304022e-05, "loss": 0.0072, "step": 86200 }, { "epoch": 0.6412780977150287, "grad_norm": 0.13459153473377228, "learning_rate": 3.575590945356132e-05, "loss": 0.0064, "step": 86300 }, { "epoch": 0.642021177781906, "grad_norm": 0.07568851858377457, "learning_rate": 3.573635467672043e-05, "loss": 0.0064, "step": 86400 }, { "epoch": 0.6427642578487832, "grad_norm": 0.07055988162755966, "learning_rate": 3.5716799899879543e-05, "loss": 0.0068, "step": 86500 }, { "epoch": 0.6435073379156604, "grad_norm": 0.03178887069225311, "learning_rate": 3.569724512303866e-05, "loss": 0.0063, "step": 86600 }, { "epoch": 0.6442504179825376, "grad_norm": 0.021333517506718636, "learning_rate": 3.567769034619777e-05, "loss": 0.0079, "step": 86700 }, { "epoch": 0.6449934980494149, "grad_norm": 0.030071459710597992, "learning_rate": 3.5658135569356884e-05, "loss": 0.0064, "step": 86800 }, { "epoch": 0.645736578116292, "grad_norm": 0.056355055421590805, "learning_rate": 3.5638580792515996e-05, "loss": 0.0068, "step": 86900 }, { "epoch": 0.6464796581831692, "grad_norm": 0.11301963776350021, "learning_rate": 3.5619026015675114e-05, "loss": 0.0067, "step": 87000 }, { "epoch": 0.6472227382500464, "grad_norm": 0.04488132521510124, "learning_rate": 3.5599471238834225e-05, "loss": 0.0067, "step": 87100 }, { "epoch": 0.6479658183169237, "grad_norm": 0.020405517891049385, "learning_rate": 3.5579916461993343e-05, "loss": 0.0063, "step": 87200 }, { "epoch": 0.6487088983838009, "grad_norm": 0.03566555678844452, "learning_rate": 3.556036168515245e-05, "loss": 0.0061, "step": 87300 }, { "epoch": 0.649451978450678, "grad_norm": 0.07072973996400833, "learning_rate": 3.5540806908311566e-05, "loss": 0.0067, "step": 87400 }, { "epoch": 0.6501950585175552, "grad_norm": 0.22543004155158997, "learning_rate": 3.552125213147068e-05, "loss": 0.0063, "step": 87500 }, { "epoch": 0.6509381385844325, "grad_norm": 0.06148708984255791, "learning_rate": 3.5501697354629796e-05, "loss": 0.0078, "step": 87600 }, { "epoch": 0.6516812186513097, "grad_norm": 0.1072181686758995, "learning_rate": 3.54821425777889e-05, "loss": 0.0059, "step": 87700 }, { "epoch": 0.6524242987181869, "grad_norm": 0.15133073925971985, "learning_rate": 3.546258780094802e-05, "loss": 0.0067, "step": 87800 }, { "epoch": 0.6531673787850641, "grad_norm": 0.1838243007659912, "learning_rate": 3.544303302410713e-05, "loss": 0.0055, "step": 87900 }, { "epoch": 0.6539104588519413, "grad_norm": 0.045693330466747284, "learning_rate": 3.542347824726625e-05, "loss": 0.0053, "step": 88000 }, { "epoch": 0.6546535389188185, "grad_norm": 0.027483217418193817, "learning_rate": 3.540392347042536e-05, "loss": 0.0069, "step": 88100 }, { "epoch": 0.6553966189856957, "grad_norm": 0.07440247386693954, "learning_rate": 3.538436869358447e-05, "loss": 0.0065, "step": 88200 }, { "epoch": 0.656139699052573, "grad_norm": 0.10762964189052582, "learning_rate": 3.536481391674358e-05, "loss": 0.0054, "step": 88300 }, { "epoch": 0.6568827791194501, "grad_norm": 0.04020560160279274, "learning_rate": 3.53452591399027e-05, "loss": 0.0061, "step": 88400 }, { "epoch": 0.6576258591863273, "grad_norm": 0.0629815086722374, "learning_rate": 3.532570436306181e-05, "loss": 0.0071, "step": 88500 }, { "epoch": 0.6583689392532045, "grad_norm": 0.052937667816877365, "learning_rate": 3.530614958622092e-05, "loss": 0.0078, "step": 88600 }, { "epoch": 0.6591120193200818, "grad_norm": 0.028042234480381012, "learning_rate": 3.5286594809380034e-05, "loss": 0.0067, "step": 88700 }, { "epoch": 0.659855099386959, "grad_norm": 0.04339709132909775, "learning_rate": 3.526704003253915e-05, "loss": 0.0067, "step": 88800 }, { "epoch": 0.6605981794538361, "grad_norm": 0.055476486682891846, "learning_rate": 3.5247485255698264e-05, "loss": 0.0057, "step": 88900 }, { "epoch": 0.6613412595207133, "grad_norm": 0.03407488763332367, "learning_rate": 3.522793047885738e-05, "loss": 0.0068, "step": 89000 }, { "epoch": 0.6620843395875906, "grad_norm": 0.09843266010284424, "learning_rate": 3.520837570201649e-05, "loss": 0.0068, "step": 89100 }, { "epoch": 0.6628274196544678, "grad_norm": 0.06433319300413132, "learning_rate": 3.5188820925175605e-05, "loss": 0.0071, "step": 89200 }, { "epoch": 0.663570499721345, "grad_norm": 0.032195206731557846, "learning_rate": 3.5169266148334716e-05, "loss": 0.0065, "step": 89300 }, { "epoch": 0.6643135797882221, "grad_norm": 0.0706305280327797, "learning_rate": 3.5149711371493834e-05, "loss": 0.0056, "step": 89400 }, { "epoch": 0.6650566598550994, "grad_norm": 0.03372344747185707, "learning_rate": 3.513015659465294e-05, "loss": 0.0068, "step": 89500 }, { "epoch": 0.6657997399219766, "grad_norm": 0.20770859718322754, "learning_rate": 3.511060181781206e-05, "loss": 0.0085, "step": 89600 }, { "epoch": 0.6665428199888538, "grad_norm": 0.031605448573827744, "learning_rate": 3.509104704097117e-05, "loss": 0.0069, "step": 89700 }, { "epoch": 0.667285900055731, "grad_norm": 0.019227096810936928, "learning_rate": 3.507149226413029e-05, "loss": 0.0069, "step": 89800 }, { "epoch": 0.6680289801226083, "grad_norm": 0.04627182334661484, "learning_rate": 3.50519374872894e-05, "loss": 0.0064, "step": 89900 }, { "epoch": 0.6687720601894854, "grad_norm": 0.036982838064432144, "learning_rate": 3.503238271044851e-05, "loss": 0.0069, "step": 90000 }, { "epoch": 0.6695151402563626, "grad_norm": 0.02531169354915619, "learning_rate": 3.501282793360762e-05, "loss": 0.0066, "step": 90100 }, { "epoch": 0.6702582203232398, "grad_norm": 0.07996533811092377, "learning_rate": 3.499327315676674e-05, "loss": 0.0073, "step": 90200 }, { "epoch": 0.6710013003901171, "grad_norm": 0.031867947429418564, "learning_rate": 3.497371837992585e-05, "loss": 0.0055, "step": 90300 }, { "epoch": 0.6717443804569943, "grad_norm": 0.0333222970366478, "learning_rate": 3.495416360308496e-05, "loss": 0.0075, "step": 90400 }, { "epoch": 0.6724874605238714, "grad_norm": 0.06809353828430176, "learning_rate": 3.493460882624407e-05, "loss": 0.0072, "step": 90500 }, { "epoch": 0.6732305405907486, "grad_norm": 0.030337151139974594, "learning_rate": 3.491505404940319e-05, "loss": 0.0067, "step": 90600 }, { "epoch": 0.6739736206576259, "grad_norm": 0.03327574208378792, "learning_rate": 3.48954992725623e-05, "loss": 0.0071, "step": 90700 }, { "epoch": 0.6747167007245031, "grad_norm": 0.0618930347263813, "learning_rate": 3.487594449572142e-05, "loss": 0.006, "step": 90800 }, { "epoch": 0.6754597807913802, "grad_norm": 0.16700859367847443, "learning_rate": 3.4856389718880525e-05, "loss": 0.0057, "step": 90900 }, { "epoch": 0.6762028608582574, "grad_norm": 0.2687858045101166, "learning_rate": 3.4836834942039644e-05, "loss": 0.0064, "step": 91000 }, { "epoch": 0.6769459409251347, "grad_norm": 0.051657866686582565, "learning_rate": 3.4817280165198755e-05, "loss": 0.0063, "step": 91100 }, { "epoch": 0.6776890209920119, "grad_norm": 0.05352407321333885, "learning_rate": 3.479772538835787e-05, "loss": 0.0061, "step": 91200 }, { "epoch": 0.6784321010588891, "grad_norm": 0.02180003747344017, "learning_rate": 3.477817061151698e-05, "loss": 0.0055, "step": 91300 }, { "epoch": 0.6791751811257662, "grad_norm": 0.15057185292243958, "learning_rate": 3.4758615834676096e-05, "loss": 0.0073, "step": 91400 }, { "epoch": 0.6799182611926435, "grad_norm": 0.05810995027422905, "learning_rate": 3.473906105783521e-05, "loss": 0.007, "step": 91500 }, { "epoch": 0.6806613412595207, "grad_norm": 0.044675521552562714, "learning_rate": 3.4719506280994325e-05, "loss": 0.0063, "step": 91600 }, { "epoch": 0.6814044213263979, "grad_norm": 0.0578312985599041, "learning_rate": 3.469995150415344e-05, "loss": 0.0074, "step": 91700 }, { "epoch": 0.6821475013932752, "grad_norm": 0.03594698756933212, "learning_rate": 3.468039672731255e-05, "loss": 0.0081, "step": 91800 }, { "epoch": 0.6828905814601524, "grad_norm": 0.022044429555535316, "learning_rate": 3.466084195047166e-05, "loss": 0.0067, "step": 91900 }, { "epoch": 0.6836336615270295, "grad_norm": 0.02994769997894764, "learning_rate": 3.464128717363078e-05, "loss": 0.0062, "step": 92000 }, { "epoch": 0.6843767415939067, "grad_norm": 0.04821077734231949, "learning_rate": 3.462173239678989e-05, "loss": 0.0065, "step": 92100 }, { "epoch": 0.685119821660784, "grad_norm": 0.04815341532230377, "learning_rate": 3.460217761994901e-05, "loss": 0.0058, "step": 92200 }, { "epoch": 0.6858629017276612, "grad_norm": 0.014437396079301834, "learning_rate": 3.458262284310811e-05, "loss": 0.0076, "step": 92300 }, { "epoch": 0.6866059817945384, "grad_norm": 0.05083870515227318, "learning_rate": 3.456306806626723e-05, "loss": 0.0062, "step": 92400 }, { "epoch": 0.6873490618614155, "grad_norm": 0.0693231076002121, "learning_rate": 3.454351328942634e-05, "loss": 0.0064, "step": 92500 }, { "epoch": 0.6880921419282928, "grad_norm": 0.03957217559218407, "learning_rate": 3.452395851258546e-05, "loss": 0.0068, "step": 92600 }, { "epoch": 0.68883522199517, "grad_norm": 0.03965717554092407, "learning_rate": 3.4504403735744564e-05, "loss": 0.0065, "step": 92700 }, { "epoch": 0.6895783020620472, "grad_norm": 0.09057923406362534, "learning_rate": 3.448484895890368e-05, "loss": 0.0061, "step": 92800 }, { "epoch": 0.6903213821289244, "grad_norm": 0.035107094794511795, "learning_rate": 3.4465294182062794e-05, "loss": 0.0059, "step": 92900 }, { "epoch": 0.6910644621958016, "grad_norm": 0.07109538465738297, "learning_rate": 3.444573940522191e-05, "loss": 0.0056, "step": 93000 }, { "epoch": 0.6918075422626788, "grad_norm": 0.03408554568886757, "learning_rate": 3.442618462838102e-05, "loss": 0.006, "step": 93100 }, { "epoch": 0.692550622329556, "grad_norm": 0.039063431322574615, "learning_rate": 3.4406629851540134e-05, "loss": 0.0057, "step": 93200 }, { "epoch": 0.6932937023964332, "grad_norm": 0.04761190712451935, "learning_rate": 3.4387075074699246e-05, "loss": 0.0063, "step": 93300 }, { "epoch": 0.6940367824633105, "grad_norm": 0.10074614733457565, "learning_rate": 3.4367520297858364e-05, "loss": 0.0068, "step": 93400 }, { "epoch": 0.6947798625301876, "grad_norm": 0.05666917935013771, "learning_rate": 3.4347965521017475e-05, "loss": 0.0071, "step": 93500 }, { "epoch": 0.6955229425970648, "grad_norm": 0.05065885931253433, "learning_rate": 3.432841074417659e-05, "loss": 0.0063, "step": 93600 }, { "epoch": 0.696266022663942, "grad_norm": 0.10709994286298752, "learning_rate": 3.43088559673357e-05, "loss": 0.0065, "step": 93700 }, { "epoch": 0.6970091027308193, "grad_norm": 0.027238914743065834, "learning_rate": 3.4289301190494816e-05, "loss": 0.007, "step": 93800 }, { "epoch": 0.6977521827976965, "grad_norm": 0.03197634965181351, "learning_rate": 3.426974641365393e-05, "loss": 0.0064, "step": 93900 }, { "epoch": 0.6984952628645736, "grad_norm": 0.05432217940688133, "learning_rate": 3.4250191636813046e-05, "loss": 0.0067, "step": 94000 }, { "epoch": 0.6992383429314508, "grad_norm": 0.04275660589337349, "learning_rate": 3.423063685997216e-05, "loss": 0.0075, "step": 94100 }, { "epoch": 0.6999814229983281, "grad_norm": 0.0674133226275444, "learning_rate": 3.421108208313127e-05, "loss": 0.0063, "step": 94200 }, { "epoch": 0.7007245030652053, "grad_norm": 0.2420826554298401, "learning_rate": 3.419152730629038e-05, "loss": 0.0063, "step": 94300 }, { "epoch": 0.7014675831320825, "grad_norm": 0.11363525688648224, "learning_rate": 3.41719725294495e-05, "loss": 0.0056, "step": 94400 }, { "epoch": 0.7022106631989596, "grad_norm": 0.1013726070523262, "learning_rate": 3.415241775260861e-05, "loss": 0.0065, "step": 94500 }, { "epoch": 0.7029537432658369, "grad_norm": 0.02509518526494503, "learning_rate": 3.413286297576772e-05, "loss": 0.0066, "step": 94600 }, { "epoch": 0.7036968233327141, "grad_norm": 0.04764735698699951, "learning_rate": 3.411330819892683e-05, "loss": 0.0064, "step": 94700 }, { "epoch": 0.7044399033995913, "grad_norm": 0.08878335356712341, "learning_rate": 3.409375342208595e-05, "loss": 0.007, "step": 94800 }, { "epoch": 0.7051829834664685, "grad_norm": 0.3652065396308899, "learning_rate": 3.407419864524506e-05, "loss": 0.0056, "step": 94900 }, { "epoch": 0.7059260635333457, "grad_norm": 0.09314832091331482, "learning_rate": 3.405464386840417e-05, "loss": 0.0068, "step": 95000 }, { "epoch": 0.7066691436002229, "grad_norm": 0.04099220409989357, "learning_rate": 3.403508909156329e-05, "loss": 0.007, "step": 95100 }, { "epoch": 0.7074122236671001, "grad_norm": 0.06356357783079147, "learning_rate": 3.40155343147224e-05, "loss": 0.0068, "step": 95200 }, { "epoch": 0.7081553037339773, "grad_norm": 0.17001478374004364, "learning_rate": 3.3995979537881514e-05, "loss": 0.0053, "step": 95300 }, { "epoch": 0.7088983838008546, "grad_norm": 0.03735348954796791, "learning_rate": 3.3976424761040625e-05, "loss": 0.0067, "step": 95400 }, { "epoch": 0.7096414638677317, "grad_norm": 0.16057884693145752, "learning_rate": 3.3956869984199744e-05, "loss": 0.0084, "step": 95500 }, { "epoch": 0.7103845439346089, "grad_norm": 0.06293246150016785, "learning_rate": 3.3937315207358855e-05, "loss": 0.0078, "step": 95600 }, { "epoch": 0.7111276240014862, "grad_norm": 0.07786872982978821, "learning_rate": 3.391776043051797e-05, "loss": 0.0065, "step": 95700 }, { "epoch": 0.7118707040683634, "grad_norm": 0.037619397044181824, "learning_rate": 3.3898205653677084e-05, "loss": 0.006, "step": 95800 }, { "epoch": 0.7126137841352406, "grad_norm": 0.03258482366800308, "learning_rate": 3.3878650876836196e-05, "loss": 0.0071, "step": 95900 }, { "epoch": 0.7133568642021177, "grad_norm": 0.03448516130447388, "learning_rate": 3.385909609999531e-05, "loss": 0.0062, "step": 96000 }, { "epoch": 0.714099944268995, "grad_norm": 0.16463284194469452, "learning_rate": 3.3839541323154425e-05, "loss": 0.0073, "step": 96100 }, { "epoch": 0.7148430243358722, "grad_norm": 0.035245850682258606, "learning_rate": 3.381998654631354e-05, "loss": 0.0075, "step": 96200 }, { "epoch": 0.7155861044027494, "grad_norm": 0.031211234629154205, "learning_rate": 3.380043176947265e-05, "loss": 0.0064, "step": 96300 }, { "epoch": 0.7163291844696266, "grad_norm": 0.04479566588997841, "learning_rate": 3.378087699263176e-05, "loss": 0.0059, "step": 96400 }, { "epoch": 0.7170722645365039, "grad_norm": 0.03747931495308876, "learning_rate": 3.376132221579088e-05, "loss": 0.0069, "step": 96500 }, { "epoch": 0.717815344603381, "grad_norm": 0.056844692677259445, "learning_rate": 3.374176743894999e-05, "loss": 0.0065, "step": 96600 }, { "epoch": 0.7185584246702582, "grad_norm": 0.02942519076168537, "learning_rate": 3.372221266210911e-05, "loss": 0.0067, "step": 96700 }, { "epoch": 0.7193015047371354, "grad_norm": 0.0663616955280304, "learning_rate": 3.370265788526821e-05, "loss": 0.0064, "step": 96800 }, { "epoch": 0.7200445848040127, "grad_norm": 0.2324199378490448, "learning_rate": 3.368310310842733e-05, "loss": 0.0066, "step": 96900 }, { "epoch": 0.7207876648708899, "grad_norm": 0.10291825979948044, "learning_rate": 3.366354833158644e-05, "loss": 0.0059, "step": 97000 }, { "epoch": 0.721530744937767, "grad_norm": 0.03113536722958088, "learning_rate": 3.364399355474556e-05, "loss": 0.0074, "step": 97100 }, { "epoch": 0.7222738250046442, "grad_norm": 0.05275307595729828, "learning_rate": 3.3624438777904664e-05, "loss": 0.0072, "step": 97200 }, { "epoch": 0.7230169050715215, "grad_norm": 0.03769388794898987, "learning_rate": 3.360488400106378e-05, "loss": 0.0061, "step": 97300 }, { "epoch": 0.7237599851383987, "grad_norm": 0.02600817009806633, "learning_rate": 3.3585329224222894e-05, "loss": 0.0066, "step": 97400 }, { "epoch": 0.7245030652052759, "grad_norm": 0.04954436793923378, "learning_rate": 3.356577444738201e-05, "loss": 0.0066, "step": 97500 }, { "epoch": 0.725246145272153, "grad_norm": 0.10566754639148712, "learning_rate": 3.354621967054112e-05, "loss": 0.0065, "step": 97600 }, { "epoch": 0.7259892253390303, "grad_norm": 0.14290623366832733, "learning_rate": 3.3526664893700234e-05, "loss": 0.0069, "step": 97700 }, { "epoch": 0.7267323054059075, "grad_norm": 0.058128975331783295, "learning_rate": 3.3507110116859346e-05, "loss": 0.0071, "step": 97800 }, { "epoch": 0.7274753854727847, "grad_norm": 0.038245782256126404, "learning_rate": 3.3487555340018464e-05, "loss": 0.0063, "step": 97900 }, { "epoch": 0.7282184655396619, "grad_norm": 0.031173225492239, "learning_rate": 3.3468000563177575e-05, "loss": 0.0074, "step": 98000 }, { "epoch": 0.7289615456065391, "grad_norm": 0.08169999718666077, "learning_rate": 3.3448445786336694e-05, "loss": 0.007, "step": 98100 }, { "epoch": 0.7297046256734163, "grad_norm": 0.06899525970220566, "learning_rate": 3.34288910094958e-05, "loss": 0.0058, "step": 98200 }, { "epoch": 0.7304477057402935, "grad_norm": 0.09821130335330963, "learning_rate": 3.3409336232654916e-05, "loss": 0.007, "step": 98300 }, { "epoch": 0.7311907858071707, "grad_norm": 0.2154228538274765, "learning_rate": 3.338978145581403e-05, "loss": 0.007, "step": 98400 }, { "epoch": 0.731933865874048, "grad_norm": 0.05544310808181763, "learning_rate": 3.3370226678973146e-05, "loss": 0.0067, "step": 98500 }, { "epoch": 0.7326769459409251, "grad_norm": 0.034354615956544876, "learning_rate": 3.335067190213225e-05, "loss": 0.0069, "step": 98600 }, { "epoch": 0.7334200260078023, "grad_norm": 0.039293549954891205, "learning_rate": 3.333111712529137e-05, "loss": 0.0063, "step": 98700 }, { "epoch": 0.7341631060746795, "grad_norm": 0.03999043256044388, "learning_rate": 3.331156234845048e-05, "loss": 0.0065, "step": 98800 }, { "epoch": 0.7349061861415568, "grad_norm": 0.03497440367937088, "learning_rate": 3.32920075716096e-05, "loss": 0.0059, "step": 98900 }, { "epoch": 0.735649266208434, "grad_norm": 0.02828408218920231, "learning_rate": 3.327245279476871e-05, "loss": 0.006, "step": 99000 }, { "epoch": 0.7363923462753111, "grad_norm": 0.05637276917695999, "learning_rate": 3.325289801792782e-05, "loss": 0.0068, "step": 99100 }, { "epoch": 0.7371354263421883, "grad_norm": 0.07254074513912201, "learning_rate": 3.323334324108693e-05, "loss": 0.0067, "step": 99200 }, { "epoch": 0.7378785064090656, "grad_norm": 0.05682109668850899, "learning_rate": 3.321378846424605e-05, "loss": 0.0053, "step": 99300 }, { "epoch": 0.7386215864759428, "grad_norm": 0.06408507376909256, "learning_rate": 3.319423368740516e-05, "loss": 0.006, "step": 99400 }, { "epoch": 0.73936466654282, "grad_norm": 0.14018365740776062, "learning_rate": 3.317467891056427e-05, "loss": 0.005, "step": 99500 }, { "epoch": 0.7401077466096972, "grad_norm": 0.04723042622208595, "learning_rate": 3.3155124133723385e-05, "loss": 0.0059, "step": 99600 }, { "epoch": 0.7408508266765744, "grad_norm": 0.018479060381650925, "learning_rate": 3.31355693568825e-05, "loss": 0.006, "step": 99700 }, { "epoch": 0.7415939067434516, "grad_norm": 0.4316589832305908, "learning_rate": 3.3116014580041614e-05, "loss": 0.0068, "step": 99800 }, { "epoch": 0.7423369868103288, "grad_norm": 0.044118013232946396, "learning_rate": 3.309645980320073e-05, "loss": 0.0066, "step": 99900 }, { "epoch": 0.7430800668772061, "grad_norm": 0.03065306879580021, "learning_rate": 3.307690502635984e-05, "loss": 0.0081, "step": 100000 }, { "epoch": 0.7438231469440832, "grad_norm": 0.068635955452919, "learning_rate": 3.3057350249518955e-05, "loss": 0.0065, "step": 100100 }, { "epoch": 0.7445662270109604, "grad_norm": 0.03909127786755562, "learning_rate": 3.3037795472678066e-05, "loss": 0.0065, "step": 100200 }, { "epoch": 0.7453093070778376, "grad_norm": 0.019512400031089783, "learning_rate": 3.3018240695837185e-05, "loss": 0.0062, "step": 100300 }, { "epoch": 0.7460523871447149, "grad_norm": 0.03361629322171211, "learning_rate": 3.299868591899629e-05, "loss": 0.0064, "step": 100400 }, { "epoch": 0.7467954672115921, "grad_norm": 0.16772803664207458, "learning_rate": 3.297913114215541e-05, "loss": 0.0071, "step": 100500 }, { "epoch": 0.7475385472784692, "grad_norm": 0.044461384415626526, "learning_rate": 3.295957636531452e-05, "loss": 0.005, "step": 100600 }, { "epoch": 0.7482816273453464, "grad_norm": 0.04803602397441864, "learning_rate": 3.294002158847364e-05, "loss": 0.0069, "step": 100700 }, { "epoch": 0.7490247074122237, "grad_norm": 0.04461536929011345, "learning_rate": 3.292046681163275e-05, "loss": 0.0061, "step": 100800 }, { "epoch": 0.7497677874791009, "grad_norm": 0.06720276176929474, "learning_rate": 3.290091203479186e-05, "loss": 0.0059, "step": 100900 }, { "epoch": 0.7505108675459781, "grad_norm": 0.03081546351313591, "learning_rate": 3.288135725795097e-05, "loss": 0.0063, "step": 101000 }, { "epoch": 0.7512539476128552, "grad_norm": 0.03153671324253082, "learning_rate": 3.286180248111009e-05, "loss": 0.0053, "step": 101100 }, { "epoch": 0.7519970276797325, "grad_norm": 0.014290716499090195, "learning_rate": 3.28422477042692e-05, "loss": 0.006, "step": 101200 }, { "epoch": 0.7527401077466097, "grad_norm": 0.06006433442234993, "learning_rate": 3.282269292742831e-05, "loss": 0.0056, "step": 101300 }, { "epoch": 0.7534831878134869, "grad_norm": 0.11329374462366104, "learning_rate": 3.280313815058742e-05, "loss": 0.0054, "step": 101400 }, { "epoch": 0.7542262678803641, "grad_norm": 0.04265223816037178, "learning_rate": 3.278358337374654e-05, "loss": 0.0057, "step": 101500 }, { "epoch": 0.7549693479472414, "grad_norm": 0.04358424246311188, "learning_rate": 3.276402859690565e-05, "loss": 0.0057, "step": 101600 }, { "epoch": 0.7557124280141185, "grad_norm": 0.09236228466033936, "learning_rate": 3.274447382006477e-05, "loss": 0.0057, "step": 101700 }, { "epoch": 0.7564555080809957, "grad_norm": 0.0772443637251854, "learning_rate": 3.2724919043223875e-05, "loss": 0.0072, "step": 101800 }, { "epoch": 0.7571985881478729, "grad_norm": 0.02888018824160099, "learning_rate": 3.2705364266382994e-05, "loss": 0.0066, "step": 101900 }, { "epoch": 0.7579416682147502, "grad_norm": 0.059490542858839035, "learning_rate": 3.2685809489542105e-05, "loss": 0.0055, "step": 102000 }, { "epoch": 0.7586847482816274, "grad_norm": 0.03580286726355553, "learning_rate": 3.266625471270122e-05, "loss": 0.0062, "step": 102100 }, { "epoch": 0.7594278283485045, "grad_norm": 0.10320024192333221, "learning_rate": 3.264669993586033e-05, "loss": 0.0067, "step": 102200 }, { "epoch": 0.7601709084153817, "grad_norm": 0.027698146179318428, "learning_rate": 3.2627145159019446e-05, "loss": 0.0069, "step": 102300 }, { "epoch": 0.760913988482259, "grad_norm": 0.0631444901227951, "learning_rate": 3.260759038217856e-05, "loss": 0.0064, "step": 102400 }, { "epoch": 0.7616570685491362, "grad_norm": 0.07676903158426285, "learning_rate": 3.2588035605337675e-05, "loss": 0.0053, "step": 102500 }, { "epoch": 0.7624001486160134, "grad_norm": 0.04025654494762421, "learning_rate": 3.256848082849679e-05, "loss": 0.0055, "step": 102600 }, { "epoch": 0.7631432286828905, "grad_norm": 0.061624012887477875, "learning_rate": 3.25489260516559e-05, "loss": 0.0067, "step": 102700 }, { "epoch": 0.7638863087497678, "grad_norm": 0.052770234644412994, "learning_rate": 3.252937127481501e-05, "loss": 0.0065, "step": 102800 }, { "epoch": 0.764629388816645, "grad_norm": 0.029357800260186195, "learning_rate": 3.250981649797413e-05, "loss": 0.0067, "step": 102900 }, { "epoch": 0.7653724688835222, "grad_norm": 0.09723399579524994, "learning_rate": 3.249026172113324e-05, "loss": 0.0076, "step": 103000 }, { "epoch": 0.7661155489503995, "grad_norm": 0.05525946244597435, "learning_rate": 3.247070694429235e-05, "loss": 0.0063, "step": 103100 }, { "epoch": 0.7668586290172766, "grad_norm": 0.017887825146317482, "learning_rate": 3.245115216745146e-05, "loss": 0.0055, "step": 103200 }, { "epoch": 0.7676017090841538, "grad_norm": 0.0247503574937582, "learning_rate": 3.243159739061058e-05, "loss": 0.0062, "step": 103300 }, { "epoch": 0.768344789151031, "grad_norm": 0.09833431243896484, "learning_rate": 3.241204261376969e-05, "loss": 0.0071, "step": 103400 }, { "epoch": 0.7690878692179083, "grad_norm": 0.029891351237893105, "learning_rate": 3.239248783692881e-05, "loss": 0.007, "step": 103500 }, { "epoch": 0.7698309492847855, "grad_norm": 0.038249559700489044, "learning_rate": 3.237293306008792e-05, "loss": 0.0061, "step": 103600 }, { "epoch": 0.7705740293516626, "grad_norm": 0.053829796612262726, "learning_rate": 3.235337828324703e-05, "loss": 0.0059, "step": 103700 }, { "epoch": 0.7713171094185398, "grad_norm": 0.09326806664466858, "learning_rate": 3.2333823506406144e-05, "loss": 0.0071, "step": 103800 }, { "epoch": 0.7720601894854171, "grad_norm": 0.10540059208869934, "learning_rate": 3.231426872956526e-05, "loss": 0.0062, "step": 103900 }, { "epoch": 0.7728032695522943, "grad_norm": 0.08925174176692963, "learning_rate": 3.229471395272437e-05, "loss": 0.0062, "step": 104000 }, { "epoch": 0.7735463496191715, "grad_norm": 0.05753634497523308, "learning_rate": 3.2275159175883485e-05, "loss": 0.0063, "step": 104100 }, { "epoch": 0.7742894296860486, "grad_norm": 0.024254625663161278, "learning_rate": 3.22556043990426e-05, "loss": 0.0064, "step": 104200 }, { "epoch": 0.7750325097529259, "grad_norm": 0.044000301510095596, "learning_rate": 3.2236049622201714e-05, "loss": 0.0058, "step": 104300 }, { "epoch": 0.7757755898198031, "grad_norm": 0.029884913936257362, "learning_rate": 3.2216494845360825e-05, "loss": 0.006, "step": 104400 }, { "epoch": 0.7765186698866803, "grad_norm": 0.034837640821933746, "learning_rate": 3.219694006851994e-05, "loss": 0.0066, "step": 104500 }, { "epoch": 0.7772617499535575, "grad_norm": 0.052576303482055664, "learning_rate": 3.2177385291679055e-05, "loss": 0.0071, "step": 104600 }, { "epoch": 0.7780048300204347, "grad_norm": 0.08560702949762344, "learning_rate": 3.2157830514838166e-05, "loss": 0.0063, "step": 104700 }, { "epoch": 0.7787479100873119, "grad_norm": 0.17175813019275665, "learning_rate": 3.213827573799728e-05, "loss": 0.006, "step": 104800 }, { "epoch": 0.7794909901541891, "grad_norm": 0.027089327573776245, "learning_rate": 3.2118720961156396e-05, "loss": 0.0057, "step": 104900 }, { "epoch": 0.7802340702210663, "grad_norm": 0.045576732605695724, "learning_rate": 3.209916618431551e-05, "loss": 0.0057, "step": 105000 }, { "epoch": 0.7809771502879436, "grad_norm": 0.03752901405096054, "learning_rate": 3.207961140747462e-05, "loss": 0.0066, "step": 105100 }, { "epoch": 0.7817202303548207, "grad_norm": 0.051144734025001526, "learning_rate": 3.206005663063374e-05, "loss": 0.0059, "step": 105200 }, { "epoch": 0.7824633104216979, "grad_norm": 0.10436317324638367, "learning_rate": 3.204050185379285e-05, "loss": 0.0064, "step": 105300 }, { "epoch": 0.7832063904885751, "grad_norm": 0.042949654161930084, "learning_rate": 3.202094707695196e-05, "loss": 0.0053, "step": 105400 }, { "epoch": 0.7839494705554524, "grad_norm": 0.019532397389411926, "learning_rate": 3.200139230011107e-05, "loss": 0.0057, "step": 105500 }, { "epoch": 0.7846925506223296, "grad_norm": 0.2989101707935333, "learning_rate": 3.198183752327019e-05, "loss": 0.0056, "step": 105600 }, { "epoch": 0.7854356306892067, "grad_norm": 0.047879137098789215, "learning_rate": 3.19622827464293e-05, "loss": 0.0056, "step": 105700 }, { "epoch": 0.7861787107560839, "grad_norm": 0.22436703741550446, "learning_rate": 3.194272796958841e-05, "loss": 0.0063, "step": 105800 }, { "epoch": 0.7869217908229612, "grad_norm": 0.2309117168188095, "learning_rate": 3.192317319274752e-05, "loss": 0.0064, "step": 105900 }, { "epoch": 0.7876648708898384, "grad_norm": 0.0556345209479332, "learning_rate": 3.190361841590664e-05, "loss": 0.0062, "step": 106000 }, { "epoch": 0.7884079509567156, "grad_norm": 0.04221750795841217, "learning_rate": 3.188406363906575e-05, "loss": 0.0062, "step": 106100 }, { "epoch": 0.7891510310235927, "grad_norm": 0.15252208709716797, "learning_rate": 3.186450886222487e-05, "loss": 0.0067, "step": 106200 }, { "epoch": 0.78989411109047, "grad_norm": 0.08618135005235672, "learning_rate": 3.1844954085383976e-05, "loss": 0.0063, "step": 106300 }, { "epoch": 0.7906371911573472, "grad_norm": 0.09586931020021439, "learning_rate": 3.1825399308543094e-05, "loss": 0.0068, "step": 106400 }, { "epoch": 0.7913802712242244, "grad_norm": 0.03233255073428154, "learning_rate": 3.1805844531702205e-05, "loss": 0.0063, "step": 106500 }, { "epoch": 0.7921233512911016, "grad_norm": 0.024492453783750534, "learning_rate": 3.178628975486132e-05, "loss": 0.0071, "step": 106600 }, { "epoch": 0.7928664313579789, "grad_norm": 0.04290681332349777, "learning_rate": 3.1766734978020435e-05, "loss": 0.0055, "step": 106700 }, { "epoch": 0.793609511424856, "grad_norm": 0.04963888227939606, "learning_rate": 3.1747180201179546e-05, "loss": 0.0059, "step": 106800 }, { "epoch": 0.7943525914917332, "grad_norm": 0.07590314000844955, "learning_rate": 3.172762542433866e-05, "loss": 0.0066, "step": 106900 }, { "epoch": 0.7950956715586105, "grad_norm": 0.052557770162820816, "learning_rate": 3.1708070647497775e-05, "loss": 0.0064, "step": 107000 }, { "epoch": 0.7958387516254877, "grad_norm": 0.03719668090343475, "learning_rate": 3.168851587065689e-05, "loss": 0.0057, "step": 107100 }, { "epoch": 0.7965818316923649, "grad_norm": 0.0635104849934578, "learning_rate": 3.1668961093816e-05, "loss": 0.0055, "step": 107200 }, { "epoch": 0.797324911759242, "grad_norm": 0.05139888450503349, "learning_rate": 3.164940631697511e-05, "loss": 0.0067, "step": 107300 }, { "epoch": 0.7980679918261193, "grad_norm": 0.08006453514099121, "learning_rate": 3.162985154013423e-05, "loss": 0.0067, "step": 107400 }, { "epoch": 0.7988110718929965, "grad_norm": 0.02387141063809395, "learning_rate": 3.161029676329334e-05, "loss": 0.0054, "step": 107500 }, { "epoch": 0.7995541519598737, "grad_norm": 0.04144104942679405, "learning_rate": 3.159074198645246e-05, "loss": 0.0076, "step": 107600 }, { "epoch": 0.8002972320267508, "grad_norm": 0.029028529301285744, "learning_rate": 3.157118720961156e-05, "loss": 0.0058, "step": 107700 }, { "epoch": 0.8010403120936281, "grad_norm": 0.04836150258779526, "learning_rate": 3.155163243277068e-05, "loss": 0.0065, "step": 107800 }, { "epoch": 0.8017833921605053, "grad_norm": 0.05697936192154884, "learning_rate": 3.153207765592979e-05, "loss": 0.0068, "step": 107900 }, { "epoch": 0.8025264722273825, "grad_norm": 0.01834951341152191, "learning_rate": 3.151252287908891e-05, "loss": 0.0071, "step": 108000 }, { "epoch": 0.8032695522942597, "grad_norm": 0.05598684027791023, "learning_rate": 3.1492968102248014e-05, "loss": 0.0057, "step": 108100 }, { "epoch": 0.804012632361137, "grad_norm": 0.040742773562669754, "learning_rate": 3.147341332540713e-05, "loss": 0.0069, "step": 108200 }, { "epoch": 0.8047557124280141, "grad_norm": 0.044210854917764664, "learning_rate": 3.1453858548566244e-05, "loss": 0.0061, "step": 108300 }, { "epoch": 0.8054987924948913, "grad_norm": 0.03336597606539726, "learning_rate": 3.143430377172536e-05, "loss": 0.0058, "step": 108400 }, { "epoch": 0.8062418725617685, "grad_norm": 0.053590767085552216, "learning_rate": 3.141474899488447e-05, "loss": 0.0068, "step": 108500 }, { "epoch": 0.8069849526286458, "grad_norm": 0.04109320044517517, "learning_rate": 3.1395194218043585e-05, "loss": 0.0069, "step": 108600 }, { "epoch": 0.807728032695523, "grad_norm": 0.024044660851359367, "learning_rate": 3.1375639441202696e-05, "loss": 0.0059, "step": 108700 }, { "epoch": 0.8084711127624001, "grad_norm": 0.06568124145269394, "learning_rate": 3.1356084664361814e-05, "loss": 0.0062, "step": 108800 }, { "epoch": 0.8092141928292773, "grad_norm": 0.03477349132299423, "learning_rate": 3.1336529887520926e-05, "loss": 0.0055, "step": 108900 }, { "epoch": 0.8099572728961546, "grad_norm": 0.03696465119719505, "learning_rate": 3.131697511068004e-05, "loss": 0.007, "step": 109000 }, { "epoch": 0.8107003529630318, "grad_norm": 0.08681201189756393, "learning_rate": 3.129742033383915e-05, "loss": 0.0068, "step": 109100 }, { "epoch": 0.811443433029909, "grad_norm": 0.04944351688027382, "learning_rate": 3.1277865556998266e-05, "loss": 0.0067, "step": 109200 }, { "epoch": 0.8121865130967861, "grad_norm": 0.04247577488422394, "learning_rate": 3.125831078015738e-05, "loss": 0.0058, "step": 109300 }, { "epoch": 0.8129295931636634, "grad_norm": 0.15919862687587738, "learning_rate": 3.1238756003316496e-05, "loss": 0.0065, "step": 109400 }, { "epoch": 0.8136726732305406, "grad_norm": 0.03389531001448631, "learning_rate": 3.12192012264756e-05, "loss": 0.0053, "step": 109500 }, { "epoch": 0.8144157532974178, "grad_norm": 0.03440622612833977, "learning_rate": 3.119964644963472e-05, "loss": 0.0073, "step": 109600 }, { "epoch": 0.815158833364295, "grad_norm": 0.057851821184158325, "learning_rate": 3.118009167279383e-05, "loss": 0.0068, "step": 109700 }, { "epoch": 0.8159019134311722, "grad_norm": 0.05478671193122864, "learning_rate": 3.116053689595295e-05, "loss": 0.0065, "step": 109800 }, { "epoch": 0.8166449934980494, "grad_norm": 0.07140800356864929, "learning_rate": 3.114098211911205e-05, "loss": 0.0062, "step": 109900 }, { "epoch": 0.8173880735649266, "grad_norm": 0.026401204988360405, "learning_rate": 3.112142734227117e-05, "loss": 0.0046, "step": 110000 }, { "epoch": 0.8181311536318038, "grad_norm": 0.06907286494970322, "learning_rate": 3.110187256543028e-05, "loss": 0.0056, "step": 110100 }, { "epoch": 0.8188742336986811, "grad_norm": 0.28904491662979126, "learning_rate": 3.10823177885894e-05, "loss": 0.0062, "step": 110200 }, { "epoch": 0.8196173137655582, "grad_norm": 0.060815900564193726, "learning_rate": 3.106276301174851e-05, "loss": 0.0061, "step": 110300 }, { "epoch": 0.8203603938324354, "grad_norm": 0.0635075569152832, "learning_rate": 3.104320823490762e-05, "loss": 0.0058, "step": 110400 }, { "epoch": 0.8211034738993126, "grad_norm": 0.028261402621865273, "learning_rate": 3.1023653458066735e-05, "loss": 0.0061, "step": 110500 }, { "epoch": 0.8218465539661899, "grad_norm": 0.10022028535604477, "learning_rate": 3.100409868122585e-05, "loss": 0.0065, "step": 110600 }, { "epoch": 0.8225896340330671, "grad_norm": 0.05299481377005577, "learning_rate": 3.0984543904384964e-05, "loss": 0.0069, "step": 110700 }, { "epoch": 0.8233327140999442, "grad_norm": 0.09829452633857727, "learning_rate": 3.096498912754408e-05, "loss": 0.0058, "step": 110800 }, { "epoch": 0.8240757941668215, "grad_norm": 0.02933032251894474, "learning_rate": 3.094543435070319e-05, "loss": 0.0061, "step": 110900 }, { "epoch": 0.8248188742336987, "grad_norm": 0.02067357487976551, "learning_rate": 3.0925879573862305e-05, "loss": 0.0059, "step": 111000 }, { "epoch": 0.8255619543005759, "grad_norm": 0.19346797466278076, "learning_rate": 3.0906324797021416e-05, "loss": 0.0062, "step": 111100 }, { "epoch": 0.8263050343674531, "grad_norm": 0.044643282890319824, "learning_rate": 3.0886770020180535e-05, "loss": 0.0059, "step": 111200 }, { "epoch": 0.8270481144343304, "grad_norm": 0.036223504692316055, "learning_rate": 3.086721524333964e-05, "loss": 0.0065, "step": 111300 }, { "epoch": 0.8277911945012075, "grad_norm": 0.09298253059387207, "learning_rate": 3.084766046649876e-05, "loss": 0.0063, "step": 111400 }, { "epoch": 0.8285342745680847, "grad_norm": 0.042920805513858795, "learning_rate": 3.082810568965787e-05, "loss": 0.0063, "step": 111500 }, { "epoch": 0.8292773546349619, "grad_norm": 0.06851477175951004, "learning_rate": 3.080855091281699e-05, "loss": 0.0066, "step": 111600 }, { "epoch": 0.8300204347018392, "grad_norm": 0.09208864718675613, "learning_rate": 3.07889961359761e-05, "loss": 0.0055, "step": 111700 }, { "epoch": 0.8307635147687163, "grad_norm": 0.040274105966091156, "learning_rate": 3.076944135913521e-05, "loss": 0.006, "step": 111800 }, { "epoch": 0.8315065948355935, "grad_norm": 0.04038352146744728, "learning_rate": 3.074988658229432e-05, "loss": 0.0065, "step": 111900 }, { "epoch": 0.8322496749024707, "grad_norm": 0.038999609649181366, "learning_rate": 3.073033180545344e-05, "loss": 0.0065, "step": 112000 }, { "epoch": 0.832992754969348, "grad_norm": 0.04587159305810928, "learning_rate": 3.071077702861255e-05, "loss": 0.0053, "step": 112100 }, { "epoch": 0.8337358350362252, "grad_norm": 0.17746977508068085, "learning_rate": 3.069122225177166e-05, "loss": 0.007, "step": 112200 }, { "epoch": 0.8344789151031023, "grad_norm": 0.0348338782787323, "learning_rate": 3.067166747493077e-05, "loss": 0.0062, "step": 112300 }, { "epoch": 0.8352219951699795, "grad_norm": 0.02965143881738186, "learning_rate": 3.065211269808989e-05, "loss": 0.0066, "step": 112400 }, { "epoch": 0.8359650752368568, "grad_norm": 0.04646526649594307, "learning_rate": 3.0632557921249e-05, "loss": 0.0056, "step": 112500 }, { "epoch": 0.836708155303734, "grad_norm": 0.032307349145412445, "learning_rate": 3.061300314440812e-05, "loss": 0.0067, "step": 112600 }, { "epoch": 0.8374512353706112, "grad_norm": 0.05828447639942169, "learning_rate": 3.0593448367567226e-05, "loss": 0.006, "step": 112700 }, { "epoch": 0.8381943154374883, "grad_norm": 0.07058124989271164, "learning_rate": 3.0573893590726344e-05, "loss": 0.005, "step": 112800 }, { "epoch": 0.8389373955043656, "grad_norm": 0.028138762339949608, "learning_rate": 3.0554338813885455e-05, "loss": 0.0066, "step": 112900 }, { "epoch": 0.8396804755712428, "grad_norm": 0.04968423768877983, "learning_rate": 3.053478403704457e-05, "loss": 0.0064, "step": 113000 }, { "epoch": 0.84042355563812, "grad_norm": 0.02797107584774494, "learning_rate": 3.051522926020368e-05, "loss": 0.0056, "step": 113100 }, { "epoch": 0.8411666357049972, "grad_norm": 0.04483191296458244, "learning_rate": 3.0495674483362796e-05, "loss": 0.0065, "step": 113200 }, { "epoch": 0.8419097157718745, "grad_norm": 0.06275834143161774, "learning_rate": 3.047611970652191e-05, "loss": 0.0063, "step": 113300 }, { "epoch": 0.8426527958387516, "grad_norm": 0.125113382935524, "learning_rate": 3.0456564929681026e-05, "loss": 0.0062, "step": 113400 }, { "epoch": 0.8433958759056288, "grad_norm": 0.04724840447306633, "learning_rate": 3.043701015284014e-05, "loss": 0.0059, "step": 113500 }, { "epoch": 0.844138955972506, "grad_norm": 0.04281444847583771, "learning_rate": 3.041745537599925e-05, "loss": 0.0059, "step": 113600 }, { "epoch": 0.8448820360393833, "grad_norm": 0.03714798018336296, "learning_rate": 3.0397900599158363e-05, "loss": 0.0063, "step": 113700 }, { "epoch": 0.8456251161062605, "grad_norm": 0.017769157886505127, "learning_rate": 3.0378345822317478e-05, "loss": 0.006, "step": 113800 }, { "epoch": 0.8463681961731376, "grad_norm": 0.07786805182695389, "learning_rate": 3.0358791045476593e-05, "loss": 0.0066, "step": 113900 }, { "epoch": 0.8471112762400148, "grad_norm": 0.02498606964945793, "learning_rate": 3.03392362686357e-05, "loss": 0.0062, "step": 114000 }, { "epoch": 0.8478543563068921, "grad_norm": 0.03650170564651489, "learning_rate": 3.0319681491794815e-05, "loss": 0.0066, "step": 114100 }, { "epoch": 0.8485974363737693, "grad_norm": 0.4203026592731476, "learning_rate": 3.030012671495393e-05, "loss": 0.0073, "step": 114200 }, { "epoch": 0.8493405164406465, "grad_norm": 0.0662989392876625, "learning_rate": 3.0280571938113045e-05, "loss": 0.0057, "step": 114300 }, { "epoch": 0.8500835965075236, "grad_norm": 0.030153153464198112, "learning_rate": 3.026101716127216e-05, "loss": 0.0059, "step": 114400 }, { "epoch": 0.8508266765744009, "grad_norm": 0.04885231703519821, "learning_rate": 3.0241462384431268e-05, "loss": 0.006, "step": 114500 }, { "epoch": 0.8515697566412781, "grad_norm": 0.03838369995355606, "learning_rate": 3.0221907607590382e-05, "loss": 0.0059, "step": 114600 }, { "epoch": 0.8523128367081553, "grad_norm": 0.06227879971265793, "learning_rate": 3.0202352830749497e-05, "loss": 0.0073, "step": 114700 }, { "epoch": 0.8530559167750326, "grad_norm": 0.03245691955089569, "learning_rate": 3.0182798053908612e-05, "loss": 0.0052, "step": 114800 }, { "epoch": 0.8537989968419097, "grad_norm": 0.09926444292068481, "learning_rate": 3.016324327706772e-05, "loss": 0.0061, "step": 114900 }, { "epoch": 0.8545420769087869, "grad_norm": 0.020898571237921715, "learning_rate": 3.0143688500226835e-05, "loss": 0.0065, "step": 115000 }, { "epoch": 0.8552851569756641, "grad_norm": 0.0328318290412426, "learning_rate": 3.012413372338595e-05, "loss": 0.006, "step": 115100 }, { "epoch": 0.8560282370425414, "grad_norm": 0.03578108921647072, "learning_rate": 3.0104578946545064e-05, "loss": 0.0066, "step": 115200 }, { "epoch": 0.8567713171094186, "grad_norm": 0.041066817939281464, "learning_rate": 3.008502416970418e-05, "loss": 0.0062, "step": 115300 }, { "epoch": 0.8575143971762957, "grad_norm": 0.025720179080963135, "learning_rate": 3.0065469392863287e-05, "loss": 0.0065, "step": 115400 }, { "epoch": 0.8582574772431729, "grad_norm": 0.028604835271835327, "learning_rate": 3.0045914616022402e-05, "loss": 0.0057, "step": 115500 }, { "epoch": 0.8590005573100502, "grad_norm": 0.1544080227613449, "learning_rate": 3.0026359839181517e-05, "loss": 0.0064, "step": 115600 }, { "epoch": 0.8597436373769274, "grad_norm": 0.04885988309979439, "learning_rate": 3.000680506234063e-05, "loss": 0.0058, "step": 115700 }, { "epoch": 0.8604867174438046, "grad_norm": 0.043574512004852295, "learning_rate": 2.998725028549974e-05, "loss": 0.0054, "step": 115800 }, { "epoch": 0.8612297975106817, "grad_norm": 0.0658397451043129, "learning_rate": 2.9967695508658854e-05, "loss": 0.006, "step": 115900 }, { "epoch": 0.861972877577559, "grad_norm": 0.06348127871751785, "learning_rate": 2.994814073181797e-05, "loss": 0.0053, "step": 116000 }, { "epoch": 0.8627159576444362, "grad_norm": 0.047632865607738495, "learning_rate": 2.9928585954977084e-05, "loss": 0.0062, "step": 116100 }, { "epoch": 0.8634590377113134, "grad_norm": 0.14865779876708984, "learning_rate": 2.99090311781362e-05, "loss": 0.0064, "step": 116200 }, { "epoch": 0.8642021177781906, "grad_norm": 0.06362463533878326, "learning_rate": 2.9889476401295306e-05, "loss": 0.0069, "step": 116300 }, { "epoch": 0.8649451978450678, "grad_norm": 0.028446676209568977, "learning_rate": 2.986992162445442e-05, "loss": 0.0065, "step": 116400 }, { "epoch": 0.865688277911945, "grad_norm": 0.030784308910369873, "learning_rate": 2.9850366847613536e-05, "loss": 0.0073, "step": 116500 }, { "epoch": 0.8664313579788222, "grad_norm": 0.02654445357620716, "learning_rate": 2.983081207077265e-05, "loss": 0.0059, "step": 116600 }, { "epoch": 0.8671744380456994, "grad_norm": 0.05801524966955185, "learning_rate": 2.9811257293931765e-05, "loss": 0.0058, "step": 116700 }, { "epoch": 0.8679175181125767, "grad_norm": 0.04940108209848404, "learning_rate": 2.9791702517090873e-05, "loss": 0.0062, "step": 116800 }, { "epoch": 0.8686605981794538, "grad_norm": 0.032187290489673615, "learning_rate": 2.9772147740249988e-05, "loss": 0.006, "step": 116900 }, { "epoch": 0.869403678246331, "grad_norm": 0.04729245975613594, "learning_rate": 2.9752592963409103e-05, "loss": 0.0057, "step": 117000 }, { "epoch": 0.8701467583132082, "grad_norm": 0.05030679330229759, "learning_rate": 2.9733038186568218e-05, "loss": 0.0063, "step": 117100 }, { "epoch": 0.8708898383800855, "grad_norm": 0.2055511772632599, "learning_rate": 2.971348340972733e-05, "loss": 0.0061, "step": 117200 }, { "epoch": 0.8716329184469627, "grad_norm": 0.10542912036180496, "learning_rate": 2.969392863288644e-05, "loss": 0.0065, "step": 117300 }, { "epoch": 0.8723759985138398, "grad_norm": 0.07299454510211945, "learning_rate": 2.9674373856045555e-05, "loss": 0.0063, "step": 117400 }, { "epoch": 0.873119078580717, "grad_norm": 0.18185946345329285, "learning_rate": 2.965481907920467e-05, "loss": 0.0059, "step": 117500 }, { "epoch": 0.8738621586475943, "grad_norm": 0.05560184642672539, "learning_rate": 2.9635264302363785e-05, "loss": 0.006, "step": 117600 }, { "epoch": 0.8746052387144715, "grad_norm": 0.02205006405711174, "learning_rate": 2.9615709525522896e-05, "loss": 0.0056, "step": 117700 }, { "epoch": 0.8753483187813487, "grad_norm": 0.17120927572250366, "learning_rate": 2.9596154748682007e-05, "loss": 0.0079, "step": 117800 }, { "epoch": 0.8760913988482258, "grad_norm": 0.05219429358839989, "learning_rate": 2.9576599971841122e-05, "loss": 0.0068, "step": 117900 }, { "epoch": 0.8768344789151031, "grad_norm": 0.050953593105077744, "learning_rate": 2.9557045195000237e-05, "loss": 0.0064, "step": 118000 }, { "epoch": 0.8775775589819803, "grad_norm": 0.04456860199570656, "learning_rate": 2.953749041815935e-05, "loss": 0.0059, "step": 118100 }, { "epoch": 0.8783206390488575, "grad_norm": 0.0739816203713417, "learning_rate": 2.9517935641318463e-05, "loss": 0.0065, "step": 118200 }, { "epoch": 0.8790637191157347, "grad_norm": 0.28489455580711365, "learning_rate": 2.9498380864477575e-05, "loss": 0.0057, "step": 118300 }, { "epoch": 0.879806799182612, "grad_norm": 0.04354569688439369, "learning_rate": 2.947882608763669e-05, "loss": 0.0064, "step": 118400 }, { "epoch": 0.8805498792494891, "grad_norm": 0.04973948746919632, "learning_rate": 2.9459271310795804e-05, "loss": 0.0057, "step": 118500 }, { "epoch": 0.8812929593163663, "grad_norm": 0.039978835731744766, "learning_rate": 2.9439716533954915e-05, "loss": 0.0057, "step": 118600 }, { "epoch": 0.8820360393832436, "grad_norm": 0.03124953806400299, "learning_rate": 2.942016175711403e-05, "loss": 0.0057, "step": 118700 }, { "epoch": 0.8827791194501208, "grad_norm": 1.261185646057129, "learning_rate": 2.940060698027314e-05, "loss": 0.006, "step": 118800 }, { "epoch": 0.883522199516998, "grad_norm": 0.049218133091926575, "learning_rate": 2.9381052203432256e-05, "loss": 0.0056, "step": 118900 }, { "epoch": 0.8842652795838751, "grad_norm": 0.06272688508033752, "learning_rate": 2.9361497426591368e-05, "loss": 0.0058, "step": 119000 }, { "epoch": 0.8850083596507524, "grad_norm": 0.04884091392159462, "learning_rate": 2.9341942649750482e-05, "loss": 0.0066, "step": 119100 }, { "epoch": 0.8857514397176296, "grad_norm": 0.04460691288113594, "learning_rate": 2.9322387872909597e-05, "loss": 0.0067, "step": 119200 }, { "epoch": 0.8864945197845068, "grad_norm": 0.03255524858832359, "learning_rate": 2.9302833096068712e-05, "loss": 0.0065, "step": 119300 }, { "epoch": 0.887237599851384, "grad_norm": 0.030200837180018425, "learning_rate": 2.9283278319227823e-05, "loss": 0.0067, "step": 119400 }, { "epoch": 0.8879806799182612, "grad_norm": 0.1784888505935669, "learning_rate": 2.9263723542386935e-05, "loss": 0.0071, "step": 119500 }, { "epoch": 0.8887237599851384, "grad_norm": 0.04317291080951691, "learning_rate": 2.924416876554605e-05, "loss": 0.0057, "step": 119600 }, { "epoch": 0.8894668400520156, "grad_norm": 0.012780030257999897, "learning_rate": 2.9224613988705164e-05, "loss": 0.0057, "step": 119700 }, { "epoch": 0.8902099201188928, "grad_norm": 0.05232267081737518, "learning_rate": 2.920505921186428e-05, "loss": 0.0056, "step": 119800 }, { "epoch": 0.8909530001857701, "grad_norm": 0.015664201229810715, "learning_rate": 2.9185504435023387e-05, "loss": 0.006, "step": 119900 }, { "epoch": 0.8916960802526472, "grad_norm": 0.02204192616045475, "learning_rate": 2.9165949658182502e-05, "loss": 0.0059, "step": 120000 }, { "epoch": 0.8924391603195244, "grad_norm": 0.0442570298910141, "learning_rate": 2.9146394881341617e-05, "loss": 0.0069, "step": 120100 }, { "epoch": 0.8931822403864016, "grad_norm": 0.04999970272183418, "learning_rate": 2.912684010450073e-05, "loss": 0.006, "step": 120200 }, { "epoch": 0.8939253204532789, "grad_norm": 0.04870384931564331, "learning_rate": 2.9107285327659846e-05, "loss": 0.005, "step": 120300 }, { "epoch": 0.8946684005201561, "grad_norm": 0.04402406886219978, "learning_rate": 2.9087730550818954e-05, "loss": 0.0068, "step": 120400 }, { "epoch": 0.8954114805870332, "grad_norm": 0.06529901176691055, "learning_rate": 2.906817577397807e-05, "loss": 0.0069, "step": 120500 }, { "epoch": 0.8961545606539104, "grad_norm": 0.035527054220438004, "learning_rate": 2.9048620997137184e-05, "loss": 0.0054, "step": 120600 }, { "epoch": 0.8968976407207877, "grad_norm": 0.09738994389772415, "learning_rate": 2.90290662202963e-05, "loss": 0.0056, "step": 120700 }, { "epoch": 0.8976407207876649, "grad_norm": 0.05401282757520676, "learning_rate": 2.9009511443455406e-05, "loss": 0.0053, "step": 120800 }, { "epoch": 0.8983838008545421, "grad_norm": 0.04170982912182808, "learning_rate": 2.898995666661452e-05, "loss": 0.0062, "step": 120900 }, { "epoch": 0.8991268809214192, "grad_norm": 0.03495937958359718, "learning_rate": 2.8970401889773636e-05, "loss": 0.0065, "step": 121000 }, { "epoch": 0.8998699609882965, "grad_norm": 0.03193104267120361, "learning_rate": 2.895084711293275e-05, "loss": 0.0054, "step": 121100 }, { "epoch": 0.9006130410551737, "grad_norm": 0.07597536593675613, "learning_rate": 2.8931292336091865e-05, "loss": 0.0065, "step": 121200 }, { "epoch": 0.9013561211220509, "grad_norm": 0.035020627081394196, "learning_rate": 2.8911737559250973e-05, "loss": 0.0063, "step": 121300 }, { "epoch": 0.9020992011889281, "grad_norm": 0.09660215675830841, "learning_rate": 2.8892182782410088e-05, "loss": 0.0063, "step": 121400 }, { "epoch": 0.9028422812558053, "grad_norm": 0.0824936181306839, "learning_rate": 2.8872628005569203e-05, "loss": 0.0069, "step": 121500 }, { "epoch": 0.9035853613226825, "grad_norm": 0.19923260807991028, "learning_rate": 2.8853073228728318e-05, "loss": 0.0061, "step": 121600 }, { "epoch": 0.9043284413895597, "grad_norm": 0.1586022526025772, "learning_rate": 2.8833518451887426e-05, "loss": 0.0062, "step": 121700 }, { "epoch": 0.9050715214564369, "grad_norm": 0.1038069799542427, "learning_rate": 2.881396367504654e-05, "loss": 0.0063, "step": 121800 }, { "epoch": 0.9058146015233142, "grad_norm": 0.02295359969139099, "learning_rate": 2.8794408898205655e-05, "loss": 0.0067, "step": 121900 }, { "epoch": 0.9065576815901913, "grad_norm": 0.13322241604328156, "learning_rate": 2.877485412136477e-05, "loss": 0.0066, "step": 122000 }, { "epoch": 0.9073007616570685, "grad_norm": 0.22144819796085358, "learning_rate": 2.8755299344523885e-05, "loss": 0.0057, "step": 122100 }, { "epoch": 0.9080438417239458, "grad_norm": 0.06306442618370056, "learning_rate": 2.8735744567682993e-05, "loss": 0.006, "step": 122200 }, { "epoch": 0.908786921790823, "grad_norm": 0.05856137350201607, "learning_rate": 2.8716189790842107e-05, "loss": 0.0055, "step": 122300 }, { "epoch": 0.9095300018577002, "grad_norm": 0.08096302300691605, "learning_rate": 2.8696635014001222e-05, "loss": 0.0054, "step": 122400 }, { "epoch": 0.9102730819245773, "grad_norm": 0.025557823479175568, "learning_rate": 2.8677080237160337e-05, "loss": 0.0078, "step": 122500 }, { "epoch": 0.9110161619914546, "grad_norm": 0.05159449949860573, "learning_rate": 2.8657525460319452e-05, "loss": 0.0059, "step": 122600 }, { "epoch": 0.9117592420583318, "grad_norm": 0.25890082120895386, "learning_rate": 2.863797068347856e-05, "loss": 0.005, "step": 122700 }, { "epoch": 0.912502322125209, "grad_norm": 0.0413227304816246, "learning_rate": 2.8618415906637675e-05, "loss": 0.0063, "step": 122800 }, { "epoch": 0.9132454021920862, "grad_norm": 0.026128780096769333, "learning_rate": 2.859886112979679e-05, "loss": 0.0062, "step": 122900 }, { "epoch": 0.9139884822589635, "grad_norm": 0.05068661645054817, "learning_rate": 2.8579306352955904e-05, "loss": 0.0056, "step": 123000 }, { "epoch": 0.9147315623258406, "grad_norm": 0.02674134261906147, "learning_rate": 2.8559751576115012e-05, "loss": 0.007, "step": 123100 }, { "epoch": 0.9154746423927178, "grad_norm": 0.03687332943081856, "learning_rate": 2.8540196799274127e-05, "loss": 0.007, "step": 123200 }, { "epoch": 0.916217722459595, "grad_norm": 0.04590151831507683, "learning_rate": 2.852064202243324e-05, "loss": 0.0065, "step": 123300 }, { "epoch": 0.9169608025264723, "grad_norm": 0.02323243021965027, "learning_rate": 2.8501087245592356e-05, "loss": 0.0055, "step": 123400 }, { "epoch": 0.9177038825933495, "grad_norm": 0.03177846223115921, "learning_rate": 2.848153246875147e-05, "loss": 0.0067, "step": 123500 }, { "epoch": 0.9184469626602266, "grad_norm": 0.06014629825949669, "learning_rate": 2.846197769191058e-05, "loss": 0.0059, "step": 123600 }, { "epoch": 0.9191900427271038, "grad_norm": 0.06965506076812744, "learning_rate": 2.8442422915069694e-05, "loss": 0.0063, "step": 123700 }, { "epoch": 0.9199331227939811, "grad_norm": 0.026711126789450645, "learning_rate": 2.842286813822881e-05, "loss": 0.0059, "step": 123800 }, { "epoch": 0.9206762028608583, "grad_norm": 0.09764640033245087, "learning_rate": 2.8403313361387923e-05, "loss": 0.0062, "step": 123900 }, { "epoch": 0.9214192829277355, "grad_norm": 0.07304485142230988, "learning_rate": 2.838375858454703e-05, "loss": 0.0072, "step": 124000 }, { "epoch": 0.9221623629946126, "grad_norm": 0.030334781855344772, "learning_rate": 2.8364203807706146e-05, "loss": 0.0073, "step": 124100 }, { "epoch": 0.9229054430614899, "grad_norm": 0.04824782907962799, "learning_rate": 2.834464903086526e-05, "loss": 0.0067, "step": 124200 }, { "epoch": 0.9236485231283671, "grad_norm": 0.14464250206947327, "learning_rate": 2.8325094254024376e-05, "loss": 0.0064, "step": 124300 }, { "epoch": 0.9243916031952443, "grad_norm": 0.03516273573040962, "learning_rate": 2.830553947718349e-05, "loss": 0.0053, "step": 124400 }, { "epoch": 0.9251346832621214, "grad_norm": 0.07263218611478806, "learning_rate": 2.82859847003426e-05, "loss": 0.0065, "step": 124500 }, { "epoch": 0.9258777633289987, "grad_norm": 0.1956225037574768, "learning_rate": 2.8266429923501713e-05, "loss": 0.0058, "step": 124600 }, { "epoch": 0.9266208433958759, "grad_norm": 0.03815372288227081, "learning_rate": 2.8246875146660828e-05, "loss": 0.0063, "step": 124700 }, { "epoch": 0.9273639234627531, "grad_norm": 0.05140213668346405, "learning_rate": 2.8227320369819943e-05, "loss": 0.0071, "step": 124800 }, { "epoch": 0.9281070035296303, "grad_norm": 0.01739925518631935, "learning_rate": 2.820776559297905e-05, "loss": 0.0076, "step": 124900 }, { "epoch": 0.9288500835965076, "grad_norm": 0.02743924781680107, "learning_rate": 2.8188210816138165e-05, "loss": 0.0062, "step": 125000 }, { "epoch": 0.9295931636633847, "grad_norm": 0.013137863017618656, "learning_rate": 2.816865603929728e-05, "loss": 0.0053, "step": 125100 }, { "epoch": 0.9303362437302619, "grad_norm": 0.027032459154725075, "learning_rate": 2.8149101262456395e-05, "loss": 0.0069, "step": 125200 }, { "epoch": 0.9310793237971391, "grad_norm": 0.062423866242170334, "learning_rate": 2.812954648561551e-05, "loss": 0.0054, "step": 125300 }, { "epoch": 0.9318224038640164, "grad_norm": 0.0339629203081131, "learning_rate": 2.8109991708774618e-05, "loss": 0.0074, "step": 125400 }, { "epoch": 0.9325654839308936, "grad_norm": 0.02332276664674282, "learning_rate": 2.8090436931933733e-05, "loss": 0.0067, "step": 125500 }, { "epoch": 0.9333085639977707, "grad_norm": 0.042948391288518906, "learning_rate": 2.8070882155092847e-05, "loss": 0.0059, "step": 125600 }, { "epoch": 0.9340516440646479, "grad_norm": 0.07286813110113144, "learning_rate": 2.8051327378251962e-05, "loss": 0.0056, "step": 125700 }, { "epoch": 0.9347947241315252, "grad_norm": 0.05022014304995537, "learning_rate": 2.803177260141107e-05, "loss": 0.0056, "step": 125800 }, { "epoch": 0.9355378041984024, "grad_norm": 0.06850142031908035, "learning_rate": 2.8012217824570185e-05, "loss": 0.0051, "step": 125900 }, { "epoch": 0.9362808842652796, "grad_norm": 0.06550432741641998, "learning_rate": 2.79926630477293e-05, "loss": 0.0054, "step": 126000 }, { "epoch": 0.9370239643321568, "grad_norm": 0.07907065749168396, "learning_rate": 2.7973108270888414e-05, "loss": 0.0068, "step": 126100 }, { "epoch": 0.937767044399034, "grad_norm": 0.42190802097320557, "learning_rate": 2.795355349404753e-05, "loss": 0.006, "step": 126200 }, { "epoch": 0.9385101244659112, "grad_norm": 0.024219976738095284, "learning_rate": 2.7933998717206637e-05, "loss": 0.0051, "step": 126300 }, { "epoch": 0.9392532045327884, "grad_norm": 0.07315164804458618, "learning_rate": 2.7914443940365752e-05, "loss": 0.006, "step": 126400 }, { "epoch": 0.9399962845996657, "grad_norm": 0.05630774796009064, "learning_rate": 2.7894889163524867e-05, "loss": 0.0066, "step": 126500 }, { "epoch": 0.9407393646665428, "grad_norm": 0.03974086791276932, "learning_rate": 2.787533438668398e-05, "loss": 0.0064, "step": 126600 }, { "epoch": 0.94148244473342, "grad_norm": 0.2864297032356262, "learning_rate": 2.7855779609843093e-05, "loss": 0.0055, "step": 126700 }, { "epoch": 0.9422255248002972, "grad_norm": 0.054144278168678284, "learning_rate": 2.7836224833002204e-05, "loss": 0.0057, "step": 126800 }, { "epoch": 0.9429686048671745, "grad_norm": 0.04591516777873039, "learning_rate": 2.781667005616132e-05, "loss": 0.0059, "step": 126900 }, { "epoch": 0.9437116849340517, "grad_norm": 0.07841447740793228, "learning_rate": 2.7797115279320434e-05, "loss": 0.0057, "step": 127000 }, { "epoch": 0.9444547650009288, "grad_norm": 0.0236077681183815, "learning_rate": 2.777756050247955e-05, "loss": 0.0051, "step": 127100 }, { "epoch": 0.945197845067806, "grad_norm": 0.038887109607458115, "learning_rate": 2.775800572563866e-05, "loss": 0.0059, "step": 127200 }, { "epoch": 0.9459409251346833, "grad_norm": 0.021334482356905937, "learning_rate": 2.773845094879777e-05, "loss": 0.0069, "step": 127300 }, { "epoch": 0.9466840052015605, "grad_norm": 0.020466864109039307, "learning_rate": 2.7718896171956886e-05, "loss": 0.0061, "step": 127400 }, { "epoch": 0.9474270852684377, "grad_norm": 0.040943924337625504, "learning_rate": 2.7699341395116e-05, "loss": 0.0066, "step": 127500 }, { "epoch": 0.9481701653353148, "grad_norm": 0.07737269997596741, "learning_rate": 2.7679786618275112e-05, "loss": 0.0056, "step": 127600 }, { "epoch": 0.9489132454021921, "grad_norm": 0.03779221698641777, "learning_rate": 2.7660231841434227e-05, "loss": 0.0057, "step": 127700 }, { "epoch": 0.9496563254690693, "grad_norm": 0.026844507083296776, "learning_rate": 2.764067706459334e-05, "loss": 0.005, "step": 127800 }, { "epoch": 0.9503994055359465, "grad_norm": 0.023007653653621674, "learning_rate": 2.7621122287752453e-05, "loss": 0.0061, "step": 127900 }, { "epoch": 0.9511424856028237, "grad_norm": 0.03240470588207245, "learning_rate": 2.7601567510911568e-05, "loss": 0.0049, "step": 128000 }, { "epoch": 0.951885565669701, "grad_norm": 0.07020842283964157, "learning_rate": 2.758201273407068e-05, "loss": 0.0067, "step": 128100 }, { "epoch": 0.9526286457365781, "grad_norm": 0.038528673350811005, "learning_rate": 2.7562457957229794e-05, "loss": 0.0068, "step": 128200 }, { "epoch": 0.9533717258034553, "grad_norm": 0.04519226402044296, "learning_rate": 2.754290318038891e-05, "loss": 0.0059, "step": 128300 }, { "epoch": 0.9541148058703325, "grad_norm": 0.0281106885522604, "learning_rate": 2.752334840354802e-05, "loss": 0.007, "step": 128400 }, { "epoch": 0.9548578859372098, "grad_norm": 0.1392526477575302, "learning_rate": 2.7503793626707135e-05, "loss": 0.0049, "step": 128500 }, { "epoch": 0.955600966004087, "grad_norm": 0.18400056660175323, "learning_rate": 2.7484238849866246e-05, "loss": 0.0065, "step": 128600 }, { "epoch": 0.9563440460709641, "grad_norm": 0.11384258419275284, "learning_rate": 2.746468407302536e-05, "loss": 0.0058, "step": 128700 }, { "epoch": 0.9570871261378413, "grad_norm": 0.03577343747019768, "learning_rate": 2.7445129296184476e-05, "loss": 0.007, "step": 128800 }, { "epoch": 0.9578302062047186, "grad_norm": 0.022150063887238503, "learning_rate": 2.7425574519343587e-05, "loss": 0.0064, "step": 128900 }, { "epoch": 0.9585732862715958, "grad_norm": 0.0987548902630806, "learning_rate": 2.74060197425027e-05, "loss": 0.0067, "step": 129000 }, { "epoch": 0.959316366338473, "grad_norm": 0.026291247457265854, "learning_rate": 2.7386464965661813e-05, "loss": 0.0061, "step": 129100 }, { "epoch": 0.9600594464053501, "grad_norm": 0.128851056098938, "learning_rate": 2.7366910188820928e-05, "loss": 0.0065, "step": 129200 }, { "epoch": 0.9608025264722274, "grad_norm": 0.06189007684588432, "learning_rate": 2.7347355411980043e-05, "loss": 0.0059, "step": 129300 }, { "epoch": 0.9615456065391046, "grad_norm": 0.08025570958852768, "learning_rate": 2.7327800635139154e-05, "loss": 0.0056, "step": 129400 }, { "epoch": 0.9622886866059818, "grad_norm": 0.043466612696647644, "learning_rate": 2.7308245858298266e-05, "loss": 0.0052, "step": 129500 }, { "epoch": 0.963031766672859, "grad_norm": 0.19805411994457245, "learning_rate": 2.728869108145738e-05, "loss": 0.0057, "step": 129600 }, { "epoch": 0.9637748467397362, "grad_norm": 0.07436662167310715, "learning_rate": 2.7269136304616495e-05, "loss": 0.0059, "step": 129700 }, { "epoch": 0.9645179268066134, "grad_norm": 0.05976375937461853, "learning_rate": 2.724958152777561e-05, "loss": 0.0057, "step": 129800 }, { "epoch": 0.9652610068734906, "grad_norm": 0.11782792210578918, "learning_rate": 2.7230026750934718e-05, "loss": 0.0074, "step": 129900 }, { "epoch": 0.9660040869403679, "grad_norm": 0.023921027779579163, "learning_rate": 2.7210471974093833e-05, "loss": 0.0058, "step": 130000 }, { "epoch": 0.9667471670072451, "grad_norm": 0.051907505840063095, "learning_rate": 2.7190917197252947e-05, "loss": 0.0059, "step": 130100 }, { "epoch": 0.9674902470741222, "grad_norm": 0.06810475140810013, "learning_rate": 2.7171362420412062e-05, "loss": 0.0053, "step": 130200 }, { "epoch": 0.9682333271409994, "grad_norm": 0.061557333916425705, "learning_rate": 2.7151807643571177e-05, "loss": 0.005, "step": 130300 }, { "epoch": 0.9689764072078767, "grad_norm": 0.01622900366783142, "learning_rate": 2.7132252866730285e-05, "loss": 0.0072, "step": 130400 }, { "epoch": 0.9697194872747539, "grad_norm": 0.03100135177373886, "learning_rate": 2.71126980898894e-05, "loss": 0.0067, "step": 130500 }, { "epoch": 0.970462567341631, "grad_norm": 0.059463635087013245, "learning_rate": 2.7093143313048514e-05, "loss": 0.0059, "step": 130600 }, { "epoch": 0.9712056474085082, "grad_norm": 0.06476590037345886, "learning_rate": 2.707358853620763e-05, "loss": 0.0065, "step": 130700 }, { "epoch": 0.9719487274753855, "grad_norm": 0.05556076020002365, "learning_rate": 2.7054033759366737e-05, "loss": 0.007, "step": 130800 }, { "epoch": 0.9726918075422627, "grad_norm": 0.1929199993610382, "learning_rate": 2.7034478982525852e-05, "loss": 0.0074, "step": 130900 }, { "epoch": 0.9734348876091399, "grad_norm": 0.057607006281614304, "learning_rate": 2.7014924205684967e-05, "loss": 0.0055, "step": 131000 }, { "epoch": 0.974177967676017, "grad_norm": 0.01977013610303402, "learning_rate": 2.699536942884408e-05, "loss": 0.0059, "step": 131100 }, { "epoch": 0.9749210477428943, "grad_norm": 0.06885409355163574, "learning_rate": 2.6975814652003196e-05, "loss": 0.0056, "step": 131200 }, { "epoch": 0.9756641278097715, "grad_norm": 0.03336046263575554, "learning_rate": 2.6956259875162304e-05, "loss": 0.0059, "step": 131300 }, { "epoch": 0.9764072078766487, "grad_norm": 0.05651996284723282, "learning_rate": 2.693670509832142e-05, "loss": 0.0055, "step": 131400 }, { "epoch": 0.9771502879435259, "grad_norm": 0.09750483185052872, "learning_rate": 2.6917150321480534e-05, "loss": 0.0065, "step": 131500 }, { "epoch": 0.9778933680104032, "grad_norm": 0.2095053791999817, "learning_rate": 2.689759554463965e-05, "loss": 0.006, "step": 131600 }, { "epoch": 0.9786364480772803, "grad_norm": 0.026306407526135445, "learning_rate": 2.6878040767798756e-05, "loss": 0.0055, "step": 131700 }, { "epoch": 0.9793795281441575, "grad_norm": 0.12247926741838455, "learning_rate": 2.685848599095787e-05, "loss": 0.0058, "step": 131800 }, { "epoch": 0.9801226082110347, "grad_norm": 0.026288801804184914, "learning_rate": 2.6838931214116986e-05, "loss": 0.0054, "step": 131900 }, { "epoch": 0.980865688277912, "grad_norm": 0.06531048566102982, "learning_rate": 2.68193764372761e-05, "loss": 0.006, "step": 132000 }, { "epoch": 0.9816087683447892, "grad_norm": 0.11530356854200363, "learning_rate": 2.6799821660435216e-05, "loss": 0.0059, "step": 132100 }, { "epoch": 0.9823518484116663, "grad_norm": 0.049980200827121735, "learning_rate": 2.6780266883594324e-05, "loss": 0.0053, "step": 132200 }, { "epoch": 0.9830949284785435, "grad_norm": 0.035161446779966354, "learning_rate": 2.6760712106753438e-05, "loss": 0.0055, "step": 132300 }, { "epoch": 0.9838380085454208, "grad_norm": 0.022060973569750786, "learning_rate": 2.6741157329912553e-05, "loss": 0.006, "step": 132400 }, { "epoch": 0.984581088612298, "grad_norm": 0.1505025029182434, "learning_rate": 2.6721602553071668e-05, "loss": 0.0054, "step": 132500 }, { "epoch": 0.9853241686791752, "grad_norm": 0.029236070811748505, "learning_rate": 2.6702047776230776e-05, "loss": 0.0056, "step": 132600 }, { "epoch": 0.9860672487460523, "grad_norm": 0.06198332458734512, "learning_rate": 2.668249299938989e-05, "loss": 0.0054, "step": 132700 }, { "epoch": 0.9868103288129296, "grad_norm": 0.035058461129665375, "learning_rate": 2.6662938222549005e-05, "loss": 0.0063, "step": 132800 }, { "epoch": 0.9875534088798068, "grad_norm": 0.021489892154932022, "learning_rate": 2.664338344570812e-05, "loss": 0.0067, "step": 132900 }, { "epoch": 0.988296488946684, "grad_norm": 0.021746812388300896, "learning_rate": 2.6623828668867235e-05, "loss": 0.0057, "step": 133000 }, { "epoch": 0.9890395690135612, "grad_norm": 0.034413132816553116, "learning_rate": 2.6604273892026343e-05, "loss": 0.0057, "step": 133100 }, { "epoch": 0.9897826490804384, "grad_norm": 0.13059857487678528, "learning_rate": 2.6584719115185458e-05, "loss": 0.005, "step": 133200 }, { "epoch": 0.9905257291473156, "grad_norm": 0.27728551626205444, "learning_rate": 2.6565164338344572e-05, "loss": 0.0064, "step": 133300 }, { "epoch": 0.9912688092141928, "grad_norm": 0.03905882313847542, "learning_rate": 2.6545609561503687e-05, "loss": 0.0055, "step": 133400 }, { "epoch": 0.99201188928107, "grad_norm": 0.11128711700439453, "learning_rate": 2.6526054784662795e-05, "loss": 0.0065, "step": 133500 }, { "epoch": 0.9927549693479473, "grad_norm": 0.017552727833390236, "learning_rate": 2.650650000782191e-05, "loss": 0.0053, "step": 133600 }, { "epoch": 0.9934980494148244, "grad_norm": 0.033906906843185425, "learning_rate": 2.6486945230981025e-05, "loss": 0.0061, "step": 133700 }, { "epoch": 0.9942411294817016, "grad_norm": 0.1645585149526596, "learning_rate": 2.646739045414014e-05, "loss": 0.0071, "step": 133800 }, { "epoch": 0.9949842095485789, "grad_norm": 0.1793511062860489, "learning_rate": 2.6447835677299254e-05, "loss": 0.0067, "step": 133900 }, { "epoch": 0.9957272896154561, "grad_norm": 0.03598960489034653, "learning_rate": 2.6428280900458362e-05, "loss": 0.0054, "step": 134000 }, { "epoch": 0.9964703696823333, "grad_norm": 0.04526710882782936, "learning_rate": 2.6408726123617477e-05, "loss": 0.0059, "step": 134100 }, { "epoch": 0.9972134497492104, "grad_norm": 0.05793256312608719, "learning_rate": 2.6389171346776592e-05, "loss": 0.0048, "step": 134200 }, { "epoch": 0.9979565298160877, "grad_norm": 0.04955807700753212, "learning_rate": 2.6369616569935706e-05, "loss": 0.0062, "step": 134300 }, { "epoch": 0.9986996098829649, "grad_norm": 0.07895401865243912, "learning_rate": 2.6350061793094814e-05, "loss": 0.0058, "step": 134400 }, { "epoch": 0.9994426899498421, "grad_norm": 0.020841335877776146, "learning_rate": 2.633050701625393e-05, "loss": 0.0064, "step": 134500 }, { "epoch": 1.0, "eval_accuracy": 0.6731159033307861, "eval_f1": 0.5474057853216613, "eval_loss": 0.005794953089207411, "eval_precision": 0.46312202879992004, "eval_recall": 0.6731159033307861, "eval_runtime": 1018.9021, "eval_samples_per_second": 117.403, "eval_steps_per_second": 14.676, "step": 134575 }, { "epoch": 1.0001857700167194, "grad_norm": 0.0858224555850029, "learning_rate": 2.6310952239413044e-05, "loss": 0.0064, "step": 134600 }, { "epoch": 1.0009288500835964, "grad_norm": 0.05276142433285713, "learning_rate": 2.629139746257216e-05, "loss": 0.006, "step": 134700 }, { "epoch": 1.0016719301504737, "grad_norm": 0.012105818837881088, "learning_rate": 2.6271842685731274e-05, "loss": 0.0055, "step": 134800 }, { "epoch": 1.002415010217351, "grad_norm": 0.025852937251329422, "learning_rate": 2.625228790889038e-05, "loss": 0.0051, "step": 134900 }, { "epoch": 1.003158090284228, "grad_norm": 0.035376325249671936, "learning_rate": 2.6232733132049496e-05, "loss": 0.0052, "step": 135000 }, { "epoch": 1.0039011703511054, "grad_norm": 0.03559981659054756, "learning_rate": 2.621317835520861e-05, "loss": 0.0044, "step": 135100 }, { "epoch": 1.0046442504179824, "grad_norm": 0.06627248227596283, "learning_rate": 2.6193623578367726e-05, "loss": 0.0065, "step": 135200 }, { "epoch": 1.0053873304848597, "grad_norm": 0.045760080218315125, "learning_rate": 2.617406880152684e-05, "loss": 0.0054, "step": 135300 }, { "epoch": 1.006130410551737, "grad_norm": 0.023312244564294815, "learning_rate": 2.615451402468595e-05, "loss": 0.0057, "step": 135400 }, { "epoch": 1.006873490618614, "grad_norm": 0.10208668559789658, "learning_rate": 2.6134959247845063e-05, "loss": 0.0065, "step": 135500 }, { "epoch": 1.0076165706854914, "grad_norm": 0.02769012376666069, "learning_rate": 2.6115404471004178e-05, "loss": 0.005, "step": 135600 }, { "epoch": 1.0083596507523687, "grad_norm": 0.026774238795042038, "learning_rate": 2.6095849694163293e-05, "loss": 0.0056, "step": 135700 }, { "epoch": 1.0091027308192457, "grad_norm": 0.04702996462583542, "learning_rate": 2.60762949173224e-05, "loss": 0.0054, "step": 135800 }, { "epoch": 1.009845810886123, "grad_norm": 0.035499636083841324, "learning_rate": 2.6056740140481516e-05, "loss": 0.0061, "step": 135900 }, { "epoch": 1.010588890953, "grad_norm": 0.04537290707230568, "learning_rate": 2.603718536364063e-05, "loss": 0.0054, "step": 136000 }, { "epoch": 1.0113319710198774, "grad_norm": 0.1308145970106125, "learning_rate": 2.6017630586799745e-05, "loss": 0.0053, "step": 136100 }, { "epoch": 1.0120750510867547, "grad_norm": 0.03806699067354202, "learning_rate": 2.599807580995886e-05, "loss": 0.0054, "step": 136200 }, { "epoch": 1.0128181311536317, "grad_norm": 0.034982986748218536, "learning_rate": 2.597852103311797e-05, "loss": 0.0048, "step": 136300 }, { "epoch": 1.013561211220509, "grad_norm": 0.05997470021247864, "learning_rate": 2.5958966256277083e-05, "loss": 0.0051, "step": 136400 }, { "epoch": 1.0143042912873863, "grad_norm": 0.0342450775206089, "learning_rate": 2.5939411479436197e-05, "loss": 0.0056, "step": 136500 }, { "epoch": 1.0150473713542634, "grad_norm": 0.48782026767730713, "learning_rate": 2.5919856702595312e-05, "loss": 0.0064, "step": 136600 }, { "epoch": 1.0157904514211407, "grad_norm": 0.20307183265686035, "learning_rate": 2.5900301925754424e-05, "loss": 0.0054, "step": 136700 }, { "epoch": 1.0165335314880177, "grad_norm": 0.0456731840968132, "learning_rate": 2.588074714891354e-05, "loss": 0.0056, "step": 136800 }, { "epoch": 1.017276611554895, "grad_norm": 0.022497955709695816, "learning_rate": 2.586119237207265e-05, "loss": 0.0052, "step": 136900 }, { "epoch": 1.0180196916217723, "grad_norm": 0.02106841839849949, "learning_rate": 2.5841637595231764e-05, "loss": 0.0057, "step": 137000 }, { "epoch": 1.0187627716886494, "grad_norm": 0.0491965189576149, "learning_rate": 2.582208281839088e-05, "loss": 0.0052, "step": 137100 }, { "epoch": 1.0195058517555267, "grad_norm": 0.12519213557243347, "learning_rate": 2.580252804154999e-05, "loss": 0.0046, "step": 137200 }, { "epoch": 1.020248931822404, "grad_norm": 0.049826931208372116, "learning_rate": 2.5782973264709105e-05, "loss": 0.0058, "step": 137300 }, { "epoch": 1.020992011889281, "grad_norm": 0.31960776448249817, "learning_rate": 2.5763418487868217e-05, "loss": 0.006, "step": 137400 }, { "epoch": 1.0217350919561583, "grad_norm": 0.0274798721075058, "learning_rate": 2.574386371102733e-05, "loss": 0.0054, "step": 137500 }, { "epoch": 1.0224781720230354, "grad_norm": 0.04314976558089256, "learning_rate": 2.5724308934186443e-05, "loss": 0.0062, "step": 137600 }, { "epoch": 1.0232212520899127, "grad_norm": 0.03322748467326164, "learning_rate": 2.5704754157345558e-05, "loss": 0.006, "step": 137700 }, { "epoch": 1.02396433215679, "grad_norm": 0.04182017222046852, "learning_rate": 2.5685199380504672e-05, "loss": 0.0057, "step": 137800 }, { "epoch": 1.024707412223667, "grad_norm": 0.19041411578655243, "learning_rate": 2.5665644603663784e-05, "loss": 0.0066, "step": 137900 }, { "epoch": 1.0254504922905443, "grad_norm": 0.01477507408708334, "learning_rate": 2.56460898268229e-05, "loss": 0.0054, "step": 138000 }, { "epoch": 1.0261935723574216, "grad_norm": 0.07443055510520935, "learning_rate": 2.562653504998201e-05, "loss": 0.0053, "step": 138100 }, { "epoch": 1.0269366524242987, "grad_norm": 0.08909869194030762, "learning_rate": 2.5606980273141125e-05, "loss": 0.0062, "step": 138200 }, { "epoch": 1.027679732491176, "grad_norm": 0.04067017883062363, "learning_rate": 2.558742549630024e-05, "loss": 0.005, "step": 138300 }, { "epoch": 1.0284228125580532, "grad_norm": 0.042003631591796875, "learning_rate": 2.556787071945935e-05, "loss": 0.0051, "step": 138400 }, { "epoch": 1.0291658926249303, "grad_norm": 0.047142427414655685, "learning_rate": 2.5548315942618462e-05, "loss": 0.0059, "step": 138500 }, { "epoch": 1.0299089726918076, "grad_norm": 0.04573028162121773, "learning_rate": 2.5528761165777577e-05, "loss": 0.0064, "step": 138600 }, { "epoch": 1.0306520527586847, "grad_norm": 0.06867179274559021, "learning_rate": 2.5509206388936692e-05, "loss": 0.0062, "step": 138700 }, { "epoch": 1.031395132825562, "grad_norm": 0.10114894062280655, "learning_rate": 2.5489651612095807e-05, "loss": 0.0053, "step": 138800 }, { "epoch": 1.0321382128924392, "grad_norm": 0.02809225767850876, "learning_rate": 2.547009683525492e-05, "loss": 0.005, "step": 138900 }, { "epoch": 1.0328812929593163, "grad_norm": 0.07249823957681656, "learning_rate": 2.545054205841403e-05, "loss": 0.0056, "step": 139000 }, { "epoch": 1.0336243730261936, "grad_norm": 0.09648074209690094, "learning_rate": 2.5430987281573144e-05, "loss": 0.0056, "step": 139100 }, { "epoch": 1.0343674530930709, "grad_norm": 0.044182706624269485, "learning_rate": 2.541143250473226e-05, "loss": 0.0062, "step": 139200 }, { "epoch": 1.035110533159948, "grad_norm": 0.19457797706127167, "learning_rate": 2.5391877727891374e-05, "loss": 0.006, "step": 139300 }, { "epoch": 1.0358536132268252, "grad_norm": 0.02229858562350273, "learning_rate": 2.537232295105048e-05, "loss": 0.006, "step": 139400 }, { "epoch": 1.0365966932937023, "grad_norm": 0.09211944043636322, "learning_rate": 2.5352768174209596e-05, "loss": 0.0062, "step": 139500 }, { "epoch": 1.0373397733605796, "grad_norm": 0.03515355661511421, "learning_rate": 2.533321339736871e-05, "loss": 0.0056, "step": 139600 }, { "epoch": 1.0380828534274569, "grad_norm": 0.09555108845233917, "learning_rate": 2.5313658620527826e-05, "loss": 0.0067, "step": 139700 }, { "epoch": 1.038825933494334, "grad_norm": 0.11455704271793365, "learning_rate": 2.529410384368694e-05, "loss": 0.0055, "step": 139800 }, { "epoch": 1.0395690135612112, "grad_norm": 0.028819721192121506, "learning_rate": 2.527454906684605e-05, "loss": 0.0063, "step": 139900 }, { "epoch": 1.0403120936280885, "grad_norm": 0.03169624134898186, "learning_rate": 2.5254994290005163e-05, "loss": 0.0056, "step": 140000 }, { "epoch": 1.0410551736949656, "grad_norm": 0.04586144909262657, "learning_rate": 2.5235439513164278e-05, "loss": 0.0048, "step": 140100 }, { "epoch": 1.0417982537618429, "grad_norm": 0.0369187630712986, "learning_rate": 2.5215884736323393e-05, "loss": 0.0048, "step": 140200 }, { "epoch": 1.04254133382872, "grad_norm": 0.02590985596179962, "learning_rate": 2.51963299594825e-05, "loss": 0.0077, "step": 140300 }, { "epoch": 1.0432844138955972, "grad_norm": 0.13892929255962372, "learning_rate": 2.5176775182641616e-05, "loss": 0.0065, "step": 140400 }, { "epoch": 1.0440274939624745, "grad_norm": 0.06113473325967789, "learning_rate": 2.515722040580073e-05, "loss": 0.0058, "step": 140500 }, { "epoch": 1.0447705740293516, "grad_norm": 0.05201324075460434, "learning_rate": 2.5137665628959845e-05, "loss": 0.0062, "step": 140600 }, { "epoch": 1.0455136540962289, "grad_norm": 0.041800606995821, "learning_rate": 2.511811085211896e-05, "loss": 0.0049, "step": 140700 }, { "epoch": 1.0462567341631062, "grad_norm": 0.010624642483890057, "learning_rate": 2.5098556075278068e-05, "loss": 0.0056, "step": 140800 }, { "epoch": 1.0469998142299832, "grad_norm": 0.01670183427631855, "learning_rate": 2.5079001298437183e-05, "loss": 0.0067, "step": 140900 }, { "epoch": 1.0477428942968605, "grad_norm": 0.02684764564037323, "learning_rate": 2.5059446521596297e-05, "loss": 0.0064, "step": 141000 }, { "epoch": 1.0484859743637376, "grad_norm": 0.03405214101076126, "learning_rate": 2.5039891744755412e-05, "loss": 0.0054, "step": 141100 }, { "epoch": 1.0492290544306149, "grad_norm": 0.031502846628427505, "learning_rate": 2.5020336967914527e-05, "loss": 0.0052, "step": 141200 }, { "epoch": 1.0499721344974922, "grad_norm": 0.07030481845140457, "learning_rate": 2.5000782191073635e-05, "loss": 0.0048, "step": 141300 }, { "epoch": 1.0507152145643692, "grad_norm": 0.022197427228093147, "learning_rate": 2.498122741423275e-05, "loss": 0.0047, "step": 141400 }, { "epoch": 1.0514582946312465, "grad_norm": 0.049207139760255814, "learning_rate": 2.4961672637391865e-05, "loss": 0.0056, "step": 141500 }, { "epoch": 1.0522013746981238, "grad_norm": 0.02588878758251667, "learning_rate": 2.4942117860550976e-05, "loss": 0.005, "step": 141600 }, { "epoch": 1.0529444547650009, "grad_norm": 0.10278034210205078, "learning_rate": 2.492256308371009e-05, "loss": 0.0055, "step": 141700 }, { "epoch": 1.0536875348318782, "grad_norm": 0.028837259858846664, "learning_rate": 2.4903008306869202e-05, "loss": 0.0067, "step": 141800 }, { "epoch": 1.0544306148987554, "grad_norm": 0.028977002948522568, "learning_rate": 2.4883453530028317e-05, "loss": 0.0052, "step": 141900 }, { "epoch": 1.0551736949656325, "grad_norm": 0.04314657673239708, "learning_rate": 2.4863898753187428e-05, "loss": 0.006, "step": 142000 }, { "epoch": 1.0559167750325098, "grad_norm": 0.11418034881353378, "learning_rate": 2.4844343976346543e-05, "loss": 0.0056, "step": 142100 }, { "epoch": 1.0566598550993869, "grad_norm": 0.03251711651682854, "learning_rate": 2.4824789199505658e-05, "loss": 0.0056, "step": 142200 }, { "epoch": 1.0574029351662642, "grad_norm": 0.03474794700741768, "learning_rate": 2.480523442266477e-05, "loss": 0.0057, "step": 142300 }, { "epoch": 1.0581460152331414, "grad_norm": 0.03282970190048218, "learning_rate": 2.4785679645823884e-05, "loss": 0.0053, "step": 142400 }, { "epoch": 1.0588890953000185, "grad_norm": 0.0425829254090786, "learning_rate": 2.4766124868982995e-05, "loss": 0.0057, "step": 142500 }, { "epoch": 1.0596321753668958, "grad_norm": 0.07478894293308258, "learning_rate": 2.474657009214211e-05, "loss": 0.0054, "step": 142600 }, { "epoch": 1.060375255433773, "grad_norm": 0.1273335963487625, "learning_rate": 2.472701531530122e-05, "loss": 0.0055, "step": 142700 }, { "epoch": 1.0611183355006502, "grad_norm": 0.1713322550058365, "learning_rate": 2.4707460538460336e-05, "loss": 0.0057, "step": 142800 }, { "epoch": 1.0618614155675274, "grad_norm": 0.1671566367149353, "learning_rate": 2.468790576161945e-05, "loss": 0.0062, "step": 142900 }, { "epoch": 1.0626044956344045, "grad_norm": 0.05468142032623291, "learning_rate": 2.4668350984778562e-05, "loss": 0.0059, "step": 143000 }, { "epoch": 1.0633475757012818, "grad_norm": 0.0826503336429596, "learning_rate": 2.4648796207937677e-05, "loss": 0.0051, "step": 143100 }, { "epoch": 1.064090655768159, "grad_norm": 0.02444489859044552, "learning_rate": 2.462924143109679e-05, "loss": 0.0054, "step": 143200 }, { "epoch": 1.0648337358350362, "grad_norm": 0.07199528068304062, "learning_rate": 2.4609686654255903e-05, "loss": 0.0054, "step": 143300 }, { "epoch": 1.0655768159019134, "grad_norm": 0.06646997481584549, "learning_rate": 2.4590131877415015e-05, "loss": 0.0065, "step": 143400 }, { "epoch": 1.0663198959687907, "grad_norm": 0.04792474955320358, "learning_rate": 2.457057710057413e-05, "loss": 0.0056, "step": 143500 }, { "epoch": 1.0670629760356678, "grad_norm": 0.03683505579829216, "learning_rate": 2.455102232373324e-05, "loss": 0.0062, "step": 143600 }, { "epoch": 1.067806056102545, "grad_norm": 0.021543417125940323, "learning_rate": 2.4531467546892355e-05, "loss": 0.0051, "step": 143700 }, { "epoch": 1.0685491361694222, "grad_norm": 0.0271848626434803, "learning_rate": 2.451191277005147e-05, "loss": 0.0063, "step": 143800 }, { "epoch": 1.0692922162362994, "grad_norm": 0.03197645768523216, "learning_rate": 2.449235799321058e-05, "loss": 0.006, "step": 143900 }, { "epoch": 1.0700352963031767, "grad_norm": 0.15035386383533478, "learning_rate": 2.4472803216369696e-05, "loss": 0.0061, "step": 144000 }, { "epoch": 1.0707783763700538, "grad_norm": 0.05711387097835541, "learning_rate": 2.4453248439528808e-05, "loss": 0.0058, "step": 144100 }, { "epoch": 1.071521456436931, "grad_norm": 0.03732112795114517, "learning_rate": 2.4433693662687923e-05, "loss": 0.0061, "step": 144200 }, { "epoch": 1.0722645365038084, "grad_norm": 0.03284495696425438, "learning_rate": 2.4414138885847034e-05, "loss": 0.0056, "step": 144300 }, { "epoch": 1.0730076165706854, "grad_norm": 0.05515632405877113, "learning_rate": 2.439458410900615e-05, "loss": 0.0054, "step": 144400 }, { "epoch": 1.0737506966375627, "grad_norm": 0.06923047453165054, "learning_rate": 2.437502933216526e-05, "loss": 0.0064, "step": 144500 }, { "epoch": 1.0744937767044398, "grad_norm": 0.01954464614391327, "learning_rate": 2.4355474555324375e-05, "loss": 0.0052, "step": 144600 }, { "epoch": 1.075236856771317, "grad_norm": 0.04955330118536949, "learning_rate": 2.433591977848349e-05, "loss": 0.0068, "step": 144700 }, { "epoch": 1.0759799368381944, "grad_norm": 0.02202390506863594, "learning_rate": 2.43163650016426e-05, "loss": 0.0053, "step": 144800 }, { "epoch": 1.0767230169050714, "grad_norm": 0.02945655770599842, "learning_rate": 2.4296810224801716e-05, "loss": 0.0063, "step": 144900 }, { "epoch": 1.0774660969719487, "grad_norm": 0.05445816367864609, "learning_rate": 2.4277255447960827e-05, "loss": 0.0059, "step": 145000 }, { "epoch": 1.078209177038826, "grad_norm": 0.02709222212433815, "learning_rate": 2.4257700671119942e-05, "loss": 0.0053, "step": 145100 }, { "epoch": 1.078952257105703, "grad_norm": 0.034668318927288055, "learning_rate": 2.4238145894279053e-05, "loss": 0.0056, "step": 145200 }, { "epoch": 1.0796953371725804, "grad_norm": 0.02768143266439438, "learning_rate": 2.4218591117438168e-05, "loss": 0.0048, "step": 145300 }, { "epoch": 1.0804384172394577, "grad_norm": 0.086945079267025, "learning_rate": 2.4199036340597283e-05, "loss": 0.0061, "step": 145400 }, { "epoch": 1.0811814973063347, "grad_norm": 0.045763492584228516, "learning_rate": 2.4179481563756394e-05, "loss": 0.0057, "step": 145500 }, { "epoch": 1.081924577373212, "grad_norm": 0.04183658957481384, "learning_rate": 2.415992678691551e-05, "loss": 0.0047, "step": 145600 }, { "epoch": 1.082667657440089, "grad_norm": 0.0742167979478836, "learning_rate": 2.414037201007462e-05, "loss": 0.005, "step": 145700 }, { "epoch": 1.0834107375069664, "grad_norm": 0.08800797164440155, "learning_rate": 2.4120817233233735e-05, "loss": 0.0052, "step": 145800 }, { "epoch": 1.0841538175738437, "grad_norm": 0.06233389675617218, "learning_rate": 2.4101262456392846e-05, "loss": 0.0049, "step": 145900 }, { "epoch": 1.0848968976407207, "grad_norm": 0.04423372447490692, "learning_rate": 2.408170767955196e-05, "loss": 0.0051, "step": 146000 }, { "epoch": 1.085639977707598, "grad_norm": 0.1667689085006714, "learning_rate": 2.4062152902711076e-05, "loss": 0.0065, "step": 146100 }, { "epoch": 1.0863830577744753, "grad_norm": 0.027060603722929955, "learning_rate": 2.4042598125870187e-05, "loss": 0.0052, "step": 146200 }, { "epoch": 1.0871261378413524, "grad_norm": 0.04268006235361099, "learning_rate": 2.4023043349029302e-05, "loss": 0.0048, "step": 146300 }, { "epoch": 1.0878692179082297, "grad_norm": 0.06504328548908234, "learning_rate": 2.4003488572188413e-05, "loss": 0.0063, "step": 146400 }, { "epoch": 1.0886122979751067, "grad_norm": 0.032052189111709595, "learning_rate": 2.3983933795347528e-05, "loss": 0.006, "step": 146500 }, { "epoch": 1.089355378041984, "grad_norm": 0.04761442914605141, "learning_rate": 2.3964379018506643e-05, "loss": 0.006, "step": 146600 }, { "epoch": 1.0900984581088613, "grad_norm": 0.07099795341491699, "learning_rate": 2.3944824241665754e-05, "loss": 0.0056, "step": 146700 }, { "epoch": 1.0908415381757384, "grad_norm": 0.028937682509422302, "learning_rate": 2.392526946482487e-05, "loss": 0.0055, "step": 146800 }, { "epoch": 1.0915846182426157, "grad_norm": 0.14576905965805054, "learning_rate": 2.390571468798398e-05, "loss": 0.0055, "step": 146900 }, { "epoch": 1.092327698309493, "grad_norm": 0.03411734104156494, "learning_rate": 2.3886159911143095e-05, "loss": 0.0048, "step": 147000 }, { "epoch": 1.09307077837637, "grad_norm": 0.052977073937654495, "learning_rate": 2.386660513430221e-05, "loss": 0.0051, "step": 147100 }, { "epoch": 1.0938138584432473, "grad_norm": 0.1216905266046524, "learning_rate": 2.384705035746132e-05, "loss": 0.0058, "step": 147200 }, { "epoch": 1.0945569385101244, "grad_norm": 0.07220131158828735, "learning_rate": 2.3827495580620436e-05, "loss": 0.0054, "step": 147300 }, { "epoch": 1.0953000185770017, "grad_norm": 0.03220698982477188, "learning_rate": 2.380794080377955e-05, "loss": 0.0051, "step": 147400 }, { "epoch": 1.096043098643879, "grad_norm": 0.05680060386657715, "learning_rate": 2.3788386026938662e-05, "loss": 0.0049, "step": 147500 }, { "epoch": 1.096786178710756, "grad_norm": 0.024355633184313774, "learning_rate": 2.3768831250097777e-05, "loss": 0.0055, "step": 147600 }, { "epoch": 1.0975292587776333, "grad_norm": 0.0355987548828125, "learning_rate": 2.374927647325689e-05, "loss": 0.0056, "step": 147700 }, { "epoch": 1.0982723388445106, "grad_norm": 0.048444170504808426, "learning_rate": 2.3729721696416003e-05, "loss": 0.0056, "step": 147800 }, { "epoch": 1.0990154189113877, "grad_norm": 0.022481897845864296, "learning_rate": 2.3710166919575115e-05, "loss": 0.0053, "step": 147900 }, { "epoch": 1.099758498978265, "grad_norm": 0.04564160853624344, "learning_rate": 2.369061214273423e-05, "loss": 0.0056, "step": 148000 }, { "epoch": 1.100501579045142, "grad_norm": 0.032010409981012344, "learning_rate": 2.3671057365893344e-05, "loss": 0.006, "step": 148100 }, { "epoch": 1.1012446591120193, "grad_norm": 0.3300672769546509, "learning_rate": 2.3651502589052455e-05, "loss": 0.0062, "step": 148200 }, { "epoch": 1.1019877391788966, "grad_norm": 0.23415091633796692, "learning_rate": 2.363194781221157e-05, "loss": 0.0056, "step": 148300 }, { "epoch": 1.1027308192457737, "grad_norm": 0.06009884551167488, "learning_rate": 2.361239303537068e-05, "loss": 0.0049, "step": 148400 }, { "epoch": 1.103473899312651, "grad_norm": 0.12067053467035294, "learning_rate": 2.3592838258529796e-05, "loss": 0.0051, "step": 148500 }, { "epoch": 1.1042169793795282, "grad_norm": 0.04308878257870674, "learning_rate": 2.3573283481688908e-05, "loss": 0.0052, "step": 148600 }, { "epoch": 1.1049600594464053, "grad_norm": 0.057723164558410645, "learning_rate": 2.3553728704848023e-05, "loss": 0.0058, "step": 148700 }, { "epoch": 1.1057031395132826, "grad_norm": 0.03455199673771858, "learning_rate": 2.3534173928007137e-05, "loss": 0.0067, "step": 148800 }, { "epoch": 1.1064462195801599, "grad_norm": 0.04712114855647087, "learning_rate": 2.351461915116625e-05, "loss": 0.0061, "step": 148900 }, { "epoch": 1.107189299647037, "grad_norm": 0.10310561209917068, "learning_rate": 2.3495064374325363e-05, "loss": 0.0055, "step": 149000 }, { "epoch": 1.1079323797139142, "grad_norm": 0.0375208854675293, "learning_rate": 2.3475509597484475e-05, "loss": 0.0064, "step": 149100 }, { "epoch": 1.1086754597807913, "grad_norm": 0.04986836388707161, "learning_rate": 2.345595482064359e-05, "loss": 0.0056, "step": 149200 }, { "epoch": 1.1094185398476686, "grad_norm": 0.1312844306230545, "learning_rate": 2.34364000438027e-05, "loss": 0.0047, "step": 149300 }, { "epoch": 1.1101616199145459, "grad_norm": 0.051481690257787704, "learning_rate": 2.3416845266961816e-05, "loss": 0.0057, "step": 149400 }, { "epoch": 1.110904699981423, "grad_norm": 0.14828671514987946, "learning_rate": 2.3397290490120927e-05, "loss": 0.0059, "step": 149500 }, { "epoch": 1.1116477800483002, "grad_norm": 0.03930046781897545, "learning_rate": 2.3377735713280042e-05, "loss": 0.007, "step": 149600 }, { "epoch": 1.1123908601151773, "grad_norm": 0.05732659250497818, "learning_rate": 2.3358180936439157e-05, "loss": 0.0053, "step": 149700 }, { "epoch": 1.1131339401820546, "grad_norm": 0.04219864681363106, "learning_rate": 2.3338626159598268e-05, "loss": 0.0053, "step": 149800 }, { "epoch": 1.1138770202489319, "grad_norm": 0.03151001036167145, "learning_rate": 2.3319071382757383e-05, "loss": 0.0044, "step": 149900 }, { "epoch": 1.114620100315809, "grad_norm": 0.02196461334824562, "learning_rate": 2.3299516605916494e-05, "loss": 0.005, "step": 150000 }, { "epoch": 1.1153631803826862, "grad_norm": 0.0373857356607914, "learning_rate": 2.327996182907561e-05, "loss": 0.0063, "step": 150100 }, { "epoch": 1.1161062604495635, "grad_norm": 0.03340983763337135, "learning_rate": 2.326040705223472e-05, "loss": 0.0047, "step": 150200 }, { "epoch": 1.1168493405164406, "grad_norm": 0.05585547536611557, "learning_rate": 2.3240852275393835e-05, "loss": 0.0059, "step": 150300 }, { "epoch": 1.1175924205833179, "grad_norm": 0.1363915205001831, "learning_rate": 2.3221297498552946e-05, "loss": 0.0063, "step": 150400 }, { "epoch": 1.1183355006501952, "grad_norm": 0.045780081301927567, "learning_rate": 2.320174272171206e-05, "loss": 0.0057, "step": 150500 }, { "epoch": 1.1190785807170722, "grad_norm": 0.07230181246995926, "learning_rate": 2.3182187944871176e-05, "loss": 0.0051, "step": 150600 }, { "epoch": 1.1198216607839495, "grad_norm": 0.10848736763000488, "learning_rate": 2.3162633168030287e-05, "loss": 0.0053, "step": 150700 }, { "epoch": 1.1205647408508266, "grad_norm": 0.17539748549461365, "learning_rate": 2.3143078391189402e-05, "loss": 0.0057, "step": 150800 }, { "epoch": 1.1213078209177039, "grad_norm": 0.02805647812783718, "learning_rate": 2.3123523614348513e-05, "loss": 0.005, "step": 150900 }, { "epoch": 1.1220509009845812, "grad_norm": 0.03508644178509712, "learning_rate": 2.3103968837507628e-05, "loss": 0.0055, "step": 151000 }, { "epoch": 1.1227939810514582, "grad_norm": 0.05408349633216858, "learning_rate": 2.308441406066674e-05, "loss": 0.0057, "step": 151100 }, { "epoch": 1.1235370611183355, "grad_norm": 0.0319347009062767, "learning_rate": 2.3064859283825854e-05, "loss": 0.0056, "step": 151200 }, { "epoch": 1.1242801411852128, "grad_norm": 0.02581121399998665, "learning_rate": 2.3045304506984966e-05, "loss": 0.0056, "step": 151300 }, { "epoch": 1.1250232212520899, "grad_norm": 0.03672366961836815, "learning_rate": 2.302574973014408e-05, "loss": 0.0063, "step": 151400 }, { "epoch": 1.1257663013189672, "grad_norm": 0.09901324659585953, "learning_rate": 2.3006194953303195e-05, "loss": 0.0061, "step": 151500 }, { "epoch": 1.1265093813858442, "grad_norm": 0.07193513214588165, "learning_rate": 2.2986640176462307e-05, "loss": 0.0065, "step": 151600 }, { "epoch": 1.1272524614527215, "grad_norm": 0.24044308066368103, "learning_rate": 2.296708539962142e-05, "loss": 0.0052, "step": 151700 }, { "epoch": 1.1279955415195988, "grad_norm": 0.10689430683851242, "learning_rate": 2.2947530622780533e-05, "loss": 0.0049, "step": 151800 }, { "epoch": 1.1287386215864759, "grad_norm": 0.029800768941640854, "learning_rate": 2.2927975845939648e-05, "loss": 0.006, "step": 151900 }, { "epoch": 1.1294817016533532, "grad_norm": 0.1468217968940735, "learning_rate": 2.290842106909876e-05, "loss": 0.0049, "step": 152000 }, { "epoch": 1.1302247817202304, "grad_norm": 0.29141831398010254, "learning_rate": 2.2888866292257874e-05, "loss": 0.0064, "step": 152100 }, { "epoch": 1.1309678617871075, "grad_norm": 0.33774715662002563, "learning_rate": 2.286931151541699e-05, "loss": 0.0052, "step": 152200 }, { "epoch": 1.1317109418539848, "grad_norm": 0.05860868841409683, "learning_rate": 2.28497567385761e-05, "loss": 0.0059, "step": 152300 }, { "epoch": 1.132454021920862, "grad_norm": 0.0814049169421196, "learning_rate": 2.2830201961735215e-05, "loss": 0.0045, "step": 152400 }, { "epoch": 1.1331971019877392, "grad_norm": 0.03996295481920242, "learning_rate": 2.2810647184894326e-05, "loss": 0.0052, "step": 152500 }, { "epoch": 1.1339401820546164, "grad_norm": 0.049139514565467834, "learning_rate": 2.279109240805344e-05, "loss": 0.0047, "step": 152600 }, { "epoch": 1.1346832621214935, "grad_norm": 0.021718652918934822, "learning_rate": 2.2771537631212552e-05, "loss": 0.0054, "step": 152700 }, { "epoch": 1.1354263421883708, "grad_norm": 0.15595364570617676, "learning_rate": 2.2751982854371667e-05, "loss": 0.005, "step": 152800 }, { "epoch": 1.136169422255248, "grad_norm": 0.04704904183745384, "learning_rate": 2.2732428077530778e-05, "loss": 0.0053, "step": 152900 }, { "epoch": 1.1369125023221252, "grad_norm": 0.027958255261182785, "learning_rate": 2.2712873300689893e-05, "loss": 0.0055, "step": 153000 }, { "epoch": 1.1376555823890024, "grad_norm": 0.1115698590874672, "learning_rate": 2.2693318523849008e-05, "loss": 0.0054, "step": 153100 }, { "epoch": 1.1383986624558795, "grad_norm": 0.05795670300722122, "learning_rate": 2.267376374700812e-05, "loss": 0.0056, "step": 153200 }, { "epoch": 1.1391417425227568, "grad_norm": 0.02866525389254093, "learning_rate": 2.2654208970167234e-05, "loss": 0.0048, "step": 153300 }, { "epoch": 1.139884822589634, "grad_norm": 0.15879274904727936, "learning_rate": 2.2634654193326345e-05, "loss": 0.0058, "step": 153400 }, { "epoch": 1.1406279026565112, "grad_norm": 0.10003737360239029, "learning_rate": 2.261509941648546e-05, "loss": 0.0053, "step": 153500 }, { "epoch": 1.1413709827233884, "grad_norm": 0.05135945603251457, "learning_rate": 2.259554463964457e-05, "loss": 0.007, "step": 153600 }, { "epoch": 1.1421140627902657, "grad_norm": 0.05891573801636696, "learning_rate": 2.2575989862803686e-05, "loss": 0.006, "step": 153700 }, { "epoch": 1.1428571428571428, "grad_norm": 0.03221351280808449, "learning_rate": 2.2556435085962798e-05, "loss": 0.0048, "step": 153800 }, { "epoch": 1.14360022292402, "grad_norm": 0.09542157500982285, "learning_rate": 2.2536880309121912e-05, "loss": 0.0057, "step": 153900 }, { "epoch": 1.1443433029908974, "grad_norm": 0.022061988711357117, "learning_rate": 2.2517325532281027e-05, "loss": 0.0053, "step": 154000 }, { "epoch": 1.1450863830577744, "grad_norm": 0.015003146603703499, "learning_rate": 2.249777075544014e-05, "loss": 0.0065, "step": 154100 }, { "epoch": 1.1458294631246517, "grad_norm": 0.04081400856375694, "learning_rate": 2.2478215978599253e-05, "loss": 0.0051, "step": 154200 }, { "epoch": 1.1465725431915288, "grad_norm": 0.12565812468528748, "learning_rate": 2.2458661201758365e-05, "loss": 0.0049, "step": 154300 }, { "epoch": 1.147315623258406, "grad_norm": 0.055342771112918854, "learning_rate": 2.243910642491748e-05, "loss": 0.0052, "step": 154400 }, { "epoch": 1.1480587033252834, "grad_norm": 0.04786688834428787, "learning_rate": 2.241955164807659e-05, "loss": 0.0052, "step": 154500 }, { "epoch": 1.1488017833921604, "grad_norm": 0.02246098220348358, "learning_rate": 2.2399996871235706e-05, "loss": 0.0056, "step": 154600 }, { "epoch": 1.1495448634590377, "grad_norm": 0.2554017901420593, "learning_rate": 2.238044209439482e-05, "loss": 0.0057, "step": 154700 }, { "epoch": 1.150287943525915, "grad_norm": 0.03989749401807785, "learning_rate": 2.2360887317553932e-05, "loss": 0.0053, "step": 154800 }, { "epoch": 1.151031023592792, "grad_norm": 0.026981180533766747, "learning_rate": 2.2341332540713046e-05, "loss": 0.0052, "step": 154900 }, { "epoch": 1.1517741036596694, "grad_norm": 0.04428456723690033, "learning_rate": 2.2321777763872158e-05, "loss": 0.0061, "step": 155000 }, { "epoch": 1.1525171837265464, "grad_norm": 0.05217871814966202, "learning_rate": 2.2302222987031273e-05, "loss": 0.0059, "step": 155100 }, { "epoch": 1.1532602637934237, "grad_norm": 0.021733073517680168, "learning_rate": 2.2282668210190384e-05, "loss": 0.005, "step": 155200 }, { "epoch": 1.154003343860301, "grad_norm": 0.03587393835186958, "learning_rate": 2.22631134333495e-05, "loss": 0.0055, "step": 155300 }, { "epoch": 1.154746423927178, "grad_norm": 0.012671799398958683, "learning_rate": 2.224355865650861e-05, "loss": 0.0065, "step": 155400 }, { "epoch": 1.1554895039940554, "grad_norm": 0.20622533559799194, "learning_rate": 2.2224003879667725e-05, "loss": 0.0053, "step": 155500 }, { "epoch": 1.1562325840609327, "grad_norm": 0.05680568143725395, "learning_rate": 2.220444910282684e-05, "loss": 0.0052, "step": 155600 }, { "epoch": 1.1569756641278097, "grad_norm": 0.015117454342544079, "learning_rate": 2.218489432598595e-05, "loss": 0.0064, "step": 155700 }, { "epoch": 1.157718744194687, "grad_norm": 0.06436136364936829, "learning_rate": 2.2165339549145066e-05, "loss": 0.0065, "step": 155800 }, { "epoch": 1.1584618242615643, "grad_norm": 0.11756323277950287, "learning_rate": 2.214578477230418e-05, "loss": 0.0057, "step": 155900 }, { "epoch": 1.1592049043284414, "grad_norm": 0.061628226190805435, "learning_rate": 2.2126229995463292e-05, "loss": 0.0067, "step": 156000 }, { "epoch": 1.1599479843953187, "grad_norm": 0.03505253791809082, "learning_rate": 2.2106675218622407e-05, "loss": 0.0053, "step": 156100 }, { "epoch": 1.1606910644621957, "grad_norm": 0.05141303688287735, "learning_rate": 2.2087120441781518e-05, "loss": 0.0052, "step": 156200 }, { "epoch": 1.161434144529073, "grad_norm": 0.018776968121528625, "learning_rate": 2.2067565664940633e-05, "loss": 0.0051, "step": 156300 }, { "epoch": 1.1621772245959503, "grad_norm": 0.023740410804748535, "learning_rate": 2.2048010888099748e-05, "loss": 0.0055, "step": 156400 }, { "epoch": 1.1629203046628274, "grad_norm": 0.04644650220870972, "learning_rate": 2.202845611125886e-05, "loss": 0.0052, "step": 156500 }, { "epoch": 1.1636633847297047, "grad_norm": 0.08129490166902542, "learning_rate": 2.2008901334417974e-05, "loss": 0.0061, "step": 156600 }, { "epoch": 1.1644064647965817, "grad_norm": 0.1336362212896347, "learning_rate": 2.1989346557577085e-05, "loss": 0.0065, "step": 156700 }, { "epoch": 1.165149544863459, "grad_norm": 0.0480029433965683, "learning_rate": 2.19697917807362e-05, "loss": 0.0054, "step": 156800 }, { "epoch": 1.1658926249303363, "grad_norm": 0.03348072990775108, "learning_rate": 2.1950237003895315e-05, "loss": 0.006, "step": 156900 }, { "epoch": 1.1666357049972134, "grad_norm": 0.04710906744003296, "learning_rate": 2.1930682227054426e-05, "loss": 0.0054, "step": 157000 }, { "epoch": 1.1673787850640907, "grad_norm": 0.046862952411174774, "learning_rate": 2.191112745021354e-05, "loss": 0.0047, "step": 157100 }, { "epoch": 1.168121865130968, "grad_norm": 0.08444464206695557, "learning_rate": 2.1891572673372652e-05, "loss": 0.0058, "step": 157200 }, { "epoch": 1.168864945197845, "grad_norm": 0.022795110940933228, "learning_rate": 2.1872017896531767e-05, "loss": 0.0055, "step": 157300 }, { "epoch": 1.1696080252647223, "grad_norm": 0.11179835349321365, "learning_rate": 2.1852463119690882e-05, "loss": 0.0054, "step": 157400 }, { "epoch": 1.1703511053315996, "grad_norm": 0.07551669329404831, "learning_rate": 2.1832908342849993e-05, "loss": 0.0056, "step": 157500 }, { "epoch": 1.1710941853984767, "grad_norm": 0.1867041140794754, "learning_rate": 2.1813353566009108e-05, "loss": 0.0051, "step": 157600 }, { "epoch": 1.171837265465354, "grad_norm": 0.07545205950737, "learning_rate": 2.179379878916822e-05, "loss": 0.0058, "step": 157700 }, { "epoch": 1.172580345532231, "grad_norm": 0.01717420481145382, "learning_rate": 2.1774244012327334e-05, "loss": 0.0054, "step": 157800 }, { "epoch": 1.1733234255991083, "grad_norm": 0.03103042021393776, "learning_rate": 2.1754689235486445e-05, "loss": 0.006, "step": 157900 }, { "epoch": 1.1740665056659856, "grad_norm": 0.027669697999954224, "learning_rate": 2.173513445864556e-05, "loss": 0.005, "step": 158000 }, { "epoch": 1.1748095857328626, "grad_norm": 0.05792463943362236, "learning_rate": 2.1715579681804675e-05, "loss": 0.0063, "step": 158100 }, { "epoch": 1.17555266579974, "grad_norm": 0.06614989787340164, "learning_rate": 2.1696024904963786e-05, "loss": 0.0058, "step": 158200 }, { "epoch": 1.176295745866617, "grad_norm": 0.0775604397058487, "learning_rate": 2.16764701281229e-05, "loss": 0.0057, "step": 158300 }, { "epoch": 1.1770388259334943, "grad_norm": 0.09093412011861801, "learning_rate": 2.1656915351282012e-05, "loss": 0.005, "step": 158400 }, { "epoch": 1.1777819060003716, "grad_norm": 0.5806289911270142, "learning_rate": 2.1637360574441127e-05, "loss": 0.0057, "step": 158500 }, { "epoch": 1.1785249860672486, "grad_norm": 0.0288386307656765, "learning_rate": 2.161780579760024e-05, "loss": 0.006, "step": 158600 }, { "epoch": 1.179268066134126, "grad_norm": 0.14808669686317444, "learning_rate": 2.1598251020759353e-05, "loss": 0.0057, "step": 158700 }, { "epoch": 1.1800111462010032, "grad_norm": 0.03975832834839821, "learning_rate": 2.1578696243918465e-05, "loss": 0.0061, "step": 158800 }, { "epoch": 1.1807542262678803, "grad_norm": 0.030098674818873405, "learning_rate": 2.155914146707758e-05, "loss": 0.0052, "step": 158900 }, { "epoch": 1.1814973063347576, "grad_norm": 0.03533417731523514, "learning_rate": 2.1539586690236694e-05, "loss": 0.0052, "step": 159000 }, { "epoch": 1.1822403864016349, "grad_norm": 0.04134329408407211, "learning_rate": 2.1520031913395806e-05, "loss": 0.0059, "step": 159100 }, { "epoch": 1.182983466468512, "grad_norm": 0.04542015492916107, "learning_rate": 2.150047713655492e-05, "loss": 0.0054, "step": 159200 }, { "epoch": 1.1837265465353892, "grad_norm": 0.049096621572971344, "learning_rate": 2.1480922359714032e-05, "loss": 0.0051, "step": 159300 }, { "epoch": 1.1844696266022665, "grad_norm": 0.09021307528018951, "learning_rate": 2.1461367582873147e-05, "loss": 0.0059, "step": 159400 }, { "epoch": 1.1852127066691436, "grad_norm": 0.04629063606262207, "learning_rate": 2.1441812806032258e-05, "loss": 0.0057, "step": 159500 }, { "epoch": 1.1859557867360209, "grad_norm": 0.11932844668626785, "learning_rate": 2.1422258029191373e-05, "loss": 0.0054, "step": 159600 }, { "epoch": 1.186698866802898, "grad_norm": 0.03797685354948044, "learning_rate": 2.1402703252350484e-05, "loss": 0.0054, "step": 159700 }, { "epoch": 1.1874419468697752, "grad_norm": 0.1738366335630417, "learning_rate": 2.13831484755096e-05, "loss": 0.005, "step": 159800 }, { "epoch": 1.1881850269366525, "grad_norm": 0.04826760292053223, "learning_rate": 2.1363593698668714e-05, "loss": 0.0057, "step": 159900 }, { "epoch": 1.1889281070035296, "grad_norm": 0.029212472960352898, "learning_rate": 2.1344038921827825e-05, "loss": 0.0056, "step": 160000 }, { "epoch": 1.1896711870704069, "grad_norm": 0.03704888001084328, "learning_rate": 2.132448414498694e-05, "loss": 0.005, "step": 160100 }, { "epoch": 1.190414267137284, "grad_norm": 0.08682583272457123, "learning_rate": 2.130492936814605e-05, "loss": 0.0051, "step": 160200 }, { "epoch": 1.1911573472041612, "grad_norm": 0.15551891922950745, "learning_rate": 2.1285374591305166e-05, "loss": 0.0052, "step": 160300 }, { "epoch": 1.1919004272710385, "grad_norm": 0.09310979396104813, "learning_rate": 2.1265819814464277e-05, "loss": 0.0058, "step": 160400 }, { "epoch": 1.1926435073379156, "grad_norm": 0.08121733367443085, "learning_rate": 2.1246265037623392e-05, "loss": 0.0049, "step": 160500 }, { "epoch": 1.1933865874047929, "grad_norm": 0.11129051446914673, "learning_rate": 2.1226710260782507e-05, "loss": 0.0052, "step": 160600 }, { "epoch": 1.1941296674716702, "grad_norm": 0.06511110067367554, "learning_rate": 2.1207155483941618e-05, "loss": 0.0057, "step": 160700 }, { "epoch": 1.1948727475385472, "grad_norm": 0.03029809519648552, "learning_rate": 2.1187600707100733e-05, "loss": 0.0046, "step": 160800 }, { "epoch": 1.1956158276054245, "grad_norm": 0.10168028622865677, "learning_rate": 2.1168045930259844e-05, "loss": 0.0049, "step": 160900 }, { "epoch": 1.1963589076723018, "grad_norm": 0.178556889295578, "learning_rate": 2.114849115341896e-05, "loss": 0.0057, "step": 161000 }, { "epoch": 1.1971019877391789, "grad_norm": 0.31765252351760864, "learning_rate": 2.112893637657807e-05, "loss": 0.0058, "step": 161100 }, { "epoch": 1.1978450678060562, "grad_norm": 0.19874022901058197, "learning_rate": 2.1109381599737185e-05, "loss": 0.006, "step": 161200 }, { "epoch": 1.1985881478729332, "grad_norm": 0.11468703299760818, "learning_rate": 2.1089826822896297e-05, "loss": 0.0054, "step": 161300 }, { "epoch": 1.1993312279398105, "grad_norm": 0.19663496315479279, "learning_rate": 2.107027204605541e-05, "loss": 0.0051, "step": 161400 }, { "epoch": 1.2000743080066878, "grad_norm": 0.157107412815094, "learning_rate": 2.1050717269214526e-05, "loss": 0.0056, "step": 161500 }, { "epoch": 1.2008173880735649, "grad_norm": 0.034948479384183884, "learning_rate": 2.1031162492373637e-05, "loss": 0.0055, "step": 161600 }, { "epoch": 1.2015604681404422, "grad_norm": 0.0480792410671711, "learning_rate": 2.1011607715532752e-05, "loss": 0.0052, "step": 161700 }, { "epoch": 1.2023035482073192, "grad_norm": 0.07523351162672043, "learning_rate": 2.0992052938691864e-05, "loss": 0.0052, "step": 161800 }, { "epoch": 1.2030466282741965, "grad_norm": 0.14565862715244293, "learning_rate": 2.097249816185098e-05, "loss": 0.0057, "step": 161900 }, { "epoch": 1.2037897083410738, "grad_norm": 0.1609962433576584, "learning_rate": 2.095294338501009e-05, "loss": 0.0047, "step": 162000 }, { "epoch": 1.2045327884079509, "grad_norm": 0.02900378592312336, "learning_rate": 2.0933388608169205e-05, "loss": 0.0058, "step": 162100 }, { "epoch": 1.2052758684748281, "grad_norm": 0.05841328576207161, "learning_rate": 2.0913833831328316e-05, "loss": 0.0053, "step": 162200 }, { "epoch": 1.2060189485417054, "grad_norm": 0.04685492068529129, "learning_rate": 2.089427905448743e-05, "loss": 0.0057, "step": 162300 }, { "epoch": 1.2067620286085825, "grad_norm": 0.04519097879528999, "learning_rate": 2.0874724277646545e-05, "loss": 0.0061, "step": 162400 }, { "epoch": 1.2075051086754598, "grad_norm": 0.04171903803944588, "learning_rate": 2.0855169500805657e-05, "loss": 0.005, "step": 162500 }, { "epoch": 1.208248188742337, "grad_norm": 0.12160617858171463, "learning_rate": 2.083561472396477e-05, "loss": 0.0059, "step": 162600 }, { "epoch": 1.2089912688092141, "grad_norm": 0.025517700240015984, "learning_rate": 2.0816059947123883e-05, "loss": 0.007, "step": 162700 }, { "epoch": 1.2097343488760914, "grad_norm": 0.036021728068590164, "learning_rate": 2.0796505170282998e-05, "loss": 0.0058, "step": 162800 }, { "epoch": 1.2104774289429687, "grad_norm": 0.02864682301878929, "learning_rate": 2.077695039344211e-05, "loss": 0.0057, "step": 162900 }, { "epoch": 1.2112205090098458, "grad_norm": 0.1625436693429947, "learning_rate": 2.0757395616601224e-05, "loss": 0.0055, "step": 163000 }, { "epoch": 1.211963589076723, "grad_norm": 0.08981680870056152, "learning_rate": 2.0737840839760335e-05, "loss": 0.0054, "step": 163100 }, { "epoch": 1.2127066691436001, "grad_norm": 0.0683797225356102, "learning_rate": 2.071828606291945e-05, "loss": 0.0057, "step": 163200 }, { "epoch": 1.2134497492104774, "grad_norm": 0.0582747645676136, "learning_rate": 2.0698731286078565e-05, "loss": 0.0049, "step": 163300 }, { "epoch": 1.2141928292773547, "grad_norm": 0.03703107684850693, "learning_rate": 2.0679176509237676e-05, "loss": 0.0054, "step": 163400 }, { "epoch": 1.2149359093442318, "grad_norm": 0.21564917266368866, "learning_rate": 2.065962173239679e-05, "loss": 0.0052, "step": 163500 }, { "epoch": 1.215678989411109, "grad_norm": 0.029874056577682495, "learning_rate": 2.0640066955555902e-05, "loss": 0.0047, "step": 163600 }, { "epoch": 1.2164220694779861, "grad_norm": 0.11503498256206512, "learning_rate": 2.0620512178715017e-05, "loss": 0.0062, "step": 163700 }, { "epoch": 1.2171651495448634, "grad_norm": 0.019750218838453293, "learning_rate": 2.060095740187413e-05, "loss": 0.0056, "step": 163800 }, { "epoch": 1.2179082296117407, "grad_norm": 0.07559823244810104, "learning_rate": 2.0581402625033243e-05, "loss": 0.006, "step": 163900 }, { "epoch": 1.2186513096786178, "grad_norm": 0.022603429853916168, "learning_rate": 2.0561847848192358e-05, "loss": 0.0061, "step": 164000 }, { "epoch": 1.219394389745495, "grad_norm": 0.051996760070323944, "learning_rate": 2.054229307135147e-05, "loss": 0.0058, "step": 164100 }, { "epoch": 1.2201374698123724, "grad_norm": 0.01054561510682106, "learning_rate": 2.0522738294510584e-05, "loss": 0.0058, "step": 164200 }, { "epoch": 1.2208805498792494, "grad_norm": 0.03456280380487442, "learning_rate": 2.0503183517669695e-05, "loss": 0.0054, "step": 164300 }, { "epoch": 1.2216236299461267, "grad_norm": 0.11052970588207245, "learning_rate": 2.048362874082881e-05, "loss": 0.0065, "step": 164400 }, { "epoch": 1.222366710013004, "grad_norm": 0.010518178343772888, "learning_rate": 2.046407396398792e-05, "loss": 0.0062, "step": 164500 }, { "epoch": 1.223109790079881, "grad_norm": 0.053279418498277664, "learning_rate": 2.0444519187147036e-05, "loss": 0.0061, "step": 164600 }, { "epoch": 1.2238528701467584, "grad_norm": 0.045316215604543686, "learning_rate": 2.0424964410306148e-05, "loss": 0.0055, "step": 164700 }, { "epoch": 1.2245959502136354, "grad_norm": 0.11642754077911377, "learning_rate": 2.0405409633465263e-05, "loss": 0.0054, "step": 164800 }, { "epoch": 1.2253390302805127, "grad_norm": 0.03779132664203644, "learning_rate": 2.0385854856624377e-05, "loss": 0.0051, "step": 164900 }, { "epoch": 1.22608211034739, "grad_norm": 0.017442749813199043, "learning_rate": 2.036630007978349e-05, "loss": 0.0053, "step": 165000 }, { "epoch": 1.226825190414267, "grad_norm": 0.04245893657207489, "learning_rate": 2.0346745302942603e-05, "loss": 0.0048, "step": 165100 }, { "epoch": 1.2275682704811444, "grad_norm": 0.10441511124372482, "learning_rate": 2.0327190526101715e-05, "loss": 0.0054, "step": 165200 }, { "epoch": 1.2283113505480214, "grad_norm": 0.054512869566679, "learning_rate": 2.030763574926083e-05, "loss": 0.0059, "step": 165300 }, { "epoch": 1.2290544306148987, "grad_norm": 0.11411943286657333, "learning_rate": 2.0288080972419944e-05, "loss": 0.0064, "step": 165400 }, { "epoch": 1.229797510681776, "grad_norm": 0.04184435307979584, "learning_rate": 2.0268526195579056e-05, "loss": 0.0064, "step": 165500 }, { "epoch": 1.230540590748653, "grad_norm": 0.03550296276807785, "learning_rate": 2.024897141873817e-05, "loss": 0.0041, "step": 165600 }, { "epoch": 1.2312836708155304, "grad_norm": 0.07943154126405716, "learning_rate": 2.0229416641897285e-05, "loss": 0.0059, "step": 165700 }, { "epoch": 1.2320267508824077, "grad_norm": 0.06805075705051422, "learning_rate": 2.0209861865056397e-05, "loss": 0.0054, "step": 165800 }, { "epoch": 1.2327698309492847, "grad_norm": 0.054235901683568954, "learning_rate": 2.019030708821551e-05, "loss": 0.0059, "step": 165900 }, { "epoch": 1.233512911016162, "grad_norm": 0.037821024656295776, "learning_rate": 2.0170752311374623e-05, "loss": 0.0059, "step": 166000 }, { "epoch": 1.2342559910830393, "grad_norm": 0.06173605099320412, "learning_rate": 2.0151197534533738e-05, "loss": 0.0048, "step": 166100 }, { "epoch": 1.2349990711499164, "grad_norm": 0.08779418468475342, "learning_rate": 2.0131642757692852e-05, "loss": 0.0056, "step": 166200 }, { "epoch": 1.2357421512167936, "grad_norm": 0.136694073677063, "learning_rate": 2.0112087980851964e-05, "loss": 0.0067, "step": 166300 }, { "epoch": 1.236485231283671, "grad_norm": 0.017585447058081627, "learning_rate": 2.009253320401108e-05, "loss": 0.0045, "step": 166400 }, { "epoch": 1.237228311350548, "grad_norm": 0.03520519286394119, "learning_rate": 2.007297842717019e-05, "loss": 0.0051, "step": 166500 }, { "epoch": 1.2379713914174253, "grad_norm": 0.10063666850328445, "learning_rate": 2.0053423650329305e-05, "loss": 0.0046, "step": 166600 }, { "epoch": 1.2387144714843024, "grad_norm": 0.06583085656166077, "learning_rate": 2.003386887348842e-05, "loss": 0.0049, "step": 166700 }, { "epoch": 1.2394575515511796, "grad_norm": 0.12166975438594818, "learning_rate": 2.001431409664753e-05, "loss": 0.0055, "step": 166800 }, { "epoch": 1.240200631618057, "grad_norm": 0.10570292919874191, "learning_rate": 1.9994759319806645e-05, "loss": 0.0052, "step": 166900 }, { "epoch": 1.240943711684934, "grad_norm": 0.2888520658016205, "learning_rate": 1.9975204542965757e-05, "loss": 0.0058, "step": 167000 }, { "epoch": 1.2416867917518113, "grad_norm": 0.07429385185241699, "learning_rate": 1.995564976612487e-05, "loss": 0.0052, "step": 167100 }, { "epoch": 1.2424298718186884, "grad_norm": 0.08394579589366913, "learning_rate": 1.9936094989283983e-05, "loss": 0.0053, "step": 167200 }, { "epoch": 1.2431729518855656, "grad_norm": 0.026269758120179176, "learning_rate": 1.9916540212443098e-05, "loss": 0.0045, "step": 167300 }, { "epoch": 1.243916031952443, "grad_norm": 0.02738526649773121, "learning_rate": 1.9896985435602213e-05, "loss": 0.0048, "step": 167400 }, { "epoch": 1.24465911201932, "grad_norm": 0.06147921830415726, "learning_rate": 1.9877430658761324e-05, "loss": 0.0059, "step": 167500 }, { "epoch": 1.2454021920861973, "grad_norm": 0.1722283959388733, "learning_rate": 1.985787588192044e-05, "loss": 0.0054, "step": 167600 }, { "epoch": 1.2461452721530746, "grad_norm": 0.034059323370456696, "learning_rate": 1.983832110507955e-05, "loss": 0.0056, "step": 167700 }, { "epoch": 1.2468883522199516, "grad_norm": 0.05264913663268089, "learning_rate": 1.9818766328238665e-05, "loss": 0.0046, "step": 167800 }, { "epoch": 1.247631432286829, "grad_norm": 0.14849810302257538, "learning_rate": 1.9799211551397776e-05, "loss": 0.0047, "step": 167900 }, { "epoch": 1.2483745123537062, "grad_norm": 0.050658129155635834, "learning_rate": 1.977965677455689e-05, "loss": 0.005, "step": 168000 }, { "epoch": 1.2491175924205833, "grad_norm": 0.1454433649778366, "learning_rate": 1.9760101997716002e-05, "loss": 0.0051, "step": 168100 }, { "epoch": 1.2498606724874606, "grad_norm": 0.14605167508125305, "learning_rate": 1.9740547220875117e-05, "loss": 0.0055, "step": 168200 }, { "epoch": 1.2506037525543379, "grad_norm": 0.03066105581820011, "learning_rate": 1.9720992444034232e-05, "loss": 0.0057, "step": 168300 }, { "epoch": 1.251346832621215, "grad_norm": 0.04106878861784935, "learning_rate": 1.9701437667193343e-05, "loss": 0.0051, "step": 168400 }, { "epoch": 1.2520899126880922, "grad_norm": 0.18177688121795654, "learning_rate": 1.9681882890352458e-05, "loss": 0.0052, "step": 168500 }, { "epoch": 1.2528329927549693, "grad_norm": 0.031028835102915764, "learning_rate": 1.966232811351157e-05, "loss": 0.0064, "step": 168600 }, { "epoch": 1.2535760728218466, "grad_norm": 0.0861407220363617, "learning_rate": 1.9642773336670684e-05, "loss": 0.0053, "step": 168700 }, { "epoch": 1.2543191528887236, "grad_norm": 0.017270410433411598, "learning_rate": 1.9623218559829796e-05, "loss": 0.0056, "step": 168800 }, { "epoch": 1.255062232955601, "grad_norm": 0.12118333578109741, "learning_rate": 1.960366378298891e-05, "loss": 0.0061, "step": 168900 }, { "epoch": 1.2558053130224782, "grad_norm": 0.2574746608734131, "learning_rate": 1.958410900614802e-05, "loss": 0.0056, "step": 169000 }, { "epoch": 1.2565483930893553, "grad_norm": 0.02362808957695961, "learning_rate": 1.9564554229307136e-05, "loss": 0.0054, "step": 169100 }, { "epoch": 1.2572914731562326, "grad_norm": 0.03834668546915054, "learning_rate": 1.954499945246625e-05, "loss": 0.0051, "step": 169200 }, { "epoch": 1.2580345532231099, "grad_norm": 0.034772373735904694, "learning_rate": 1.9525444675625363e-05, "loss": 0.0056, "step": 169300 }, { "epoch": 1.258777633289987, "grad_norm": 0.1967865377664566, "learning_rate": 1.9505889898784477e-05, "loss": 0.0063, "step": 169400 }, { "epoch": 1.2595207133568642, "grad_norm": 0.03655865415930748, "learning_rate": 1.948633512194359e-05, "loss": 0.0065, "step": 169500 }, { "epoch": 1.2602637934237415, "grad_norm": 0.04575251042842865, "learning_rate": 1.9466780345102703e-05, "loss": 0.0056, "step": 169600 }, { "epoch": 1.2610068734906186, "grad_norm": 0.03010171465575695, "learning_rate": 1.9447225568261815e-05, "loss": 0.0053, "step": 169700 }, { "epoch": 1.2617499535574959, "grad_norm": 0.021234197542071342, "learning_rate": 1.942767079142093e-05, "loss": 0.0045, "step": 169800 }, { "epoch": 1.2624930336243732, "grad_norm": 0.05387526750564575, "learning_rate": 1.9408116014580044e-05, "loss": 0.0046, "step": 169900 }, { "epoch": 1.2632361136912502, "grad_norm": 0.09197808802127838, "learning_rate": 1.9388561237739156e-05, "loss": 0.0066, "step": 170000 }, { "epoch": 1.2639791937581275, "grad_norm": 0.0833139717578888, "learning_rate": 1.936900646089827e-05, "loss": 0.0057, "step": 170100 }, { "epoch": 1.2647222738250046, "grad_norm": 0.21150481700897217, "learning_rate": 1.9349451684057382e-05, "loss": 0.0054, "step": 170200 }, { "epoch": 1.2654653538918819, "grad_norm": 0.052112385630607605, "learning_rate": 1.9329896907216497e-05, "loss": 0.0045, "step": 170300 }, { "epoch": 1.266208433958759, "grad_norm": 0.04085984453558922, "learning_rate": 1.9310342130375608e-05, "loss": 0.0052, "step": 170400 }, { "epoch": 1.2669515140256362, "grad_norm": 0.027670495212078094, "learning_rate": 1.9290787353534723e-05, "loss": 0.0053, "step": 170500 }, { "epoch": 1.2676945940925135, "grad_norm": 0.04701368510723114, "learning_rate": 1.9271232576693834e-05, "loss": 0.0051, "step": 170600 }, { "epoch": 1.2684376741593906, "grad_norm": 0.06840857863426208, "learning_rate": 1.925167779985295e-05, "loss": 0.0049, "step": 170700 }, { "epoch": 1.2691807542262679, "grad_norm": 0.15290194749832153, "learning_rate": 1.9232123023012064e-05, "loss": 0.0061, "step": 170800 }, { "epoch": 1.2699238342931451, "grad_norm": 0.022237757220864296, "learning_rate": 1.9212568246171175e-05, "loss": 0.0064, "step": 170900 }, { "epoch": 1.2706669143600222, "grad_norm": 0.0493515282869339, "learning_rate": 1.919301346933029e-05, "loss": 0.0048, "step": 171000 }, { "epoch": 1.2714099944268995, "grad_norm": 0.13774226605892181, "learning_rate": 1.91734586924894e-05, "loss": 0.0055, "step": 171100 }, { "epoch": 1.2721530744937768, "grad_norm": 0.025479158386588097, "learning_rate": 1.9153903915648516e-05, "loss": 0.0058, "step": 171200 }, { "epoch": 1.2728961545606539, "grad_norm": 0.39848169684410095, "learning_rate": 1.9134349138807627e-05, "loss": 0.0053, "step": 171300 }, { "epoch": 1.2736392346275311, "grad_norm": 0.02005809172987938, "learning_rate": 1.9114794361966742e-05, "loss": 0.0052, "step": 171400 }, { "epoch": 1.2743823146944084, "grad_norm": 0.055112939327955246, "learning_rate": 1.9095239585125853e-05, "loss": 0.0066, "step": 171500 }, { "epoch": 1.2751253947612855, "grad_norm": 0.08790340274572372, "learning_rate": 1.9075684808284968e-05, "loss": 0.0048, "step": 171600 }, { "epoch": 1.2758684748281628, "grad_norm": 0.028732988983392715, "learning_rate": 1.9056130031444083e-05, "loss": 0.0054, "step": 171700 }, { "epoch": 1.27661155489504, "grad_norm": 0.03356609866023064, "learning_rate": 1.9036575254603194e-05, "loss": 0.0061, "step": 171800 }, { "epoch": 1.2773546349619171, "grad_norm": 0.014699109829962254, "learning_rate": 1.901702047776231e-05, "loss": 0.005, "step": 171900 }, { "epoch": 1.2780977150287944, "grad_norm": 0.11146711558103561, "learning_rate": 1.899746570092142e-05, "loss": 0.0055, "step": 172000 }, { "epoch": 1.2788407950956715, "grad_norm": 0.06345479190349579, "learning_rate": 1.8977910924080535e-05, "loss": 0.0054, "step": 172100 }, { "epoch": 1.2795838751625488, "grad_norm": 0.03859024867415428, "learning_rate": 1.8958356147239647e-05, "loss": 0.0061, "step": 172200 }, { "epoch": 1.2803269552294259, "grad_norm": 0.28340962529182434, "learning_rate": 1.893880137039876e-05, "loss": 0.0062, "step": 172300 }, { "epoch": 1.2810700352963031, "grad_norm": 0.03966061770915985, "learning_rate": 1.8919246593557873e-05, "loss": 0.0054, "step": 172400 }, { "epoch": 1.2818131153631804, "grad_norm": 0.031234780326485634, "learning_rate": 1.8899691816716988e-05, "loss": 0.0058, "step": 172500 }, { "epoch": 1.2825561954300575, "grad_norm": 0.04265785962343216, "learning_rate": 1.8880137039876102e-05, "loss": 0.0055, "step": 172600 }, { "epoch": 1.2832992754969348, "grad_norm": 0.03143495321273804, "learning_rate": 1.8860582263035214e-05, "loss": 0.0061, "step": 172700 }, { "epoch": 1.284042355563812, "grad_norm": 0.038690049201250076, "learning_rate": 1.884102748619433e-05, "loss": 0.0052, "step": 172800 }, { "epoch": 1.2847854356306891, "grad_norm": 0.15045146644115448, "learning_rate": 1.882147270935344e-05, "loss": 0.0061, "step": 172900 }, { "epoch": 1.2855285156975664, "grad_norm": 0.0434846356511116, "learning_rate": 1.8801917932512555e-05, "loss": 0.006, "step": 173000 }, { "epoch": 1.2862715957644437, "grad_norm": 0.22515283524990082, "learning_rate": 1.8782363155671666e-05, "loss": 0.0057, "step": 173100 }, { "epoch": 1.2870146758313208, "grad_norm": 0.03203754499554634, "learning_rate": 1.876280837883078e-05, "loss": 0.0062, "step": 173200 }, { "epoch": 1.287757755898198, "grad_norm": 0.06853476911783218, "learning_rate": 1.8743253601989896e-05, "loss": 0.0052, "step": 173300 }, { "epoch": 1.2885008359650754, "grad_norm": 0.24159570038318634, "learning_rate": 1.8723698825149007e-05, "loss": 0.0052, "step": 173400 }, { "epoch": 1.2892439160319524, "grad_norm": 0.044867802411317825, "learning_rate": 1.870414404830812e-05, "loss": 0.0057, "step": 173500 }, { "epoch": 1.2899869960988297, "grad_norm": 0.1359708458185196, "learning_rate": 1.8684589271467233e-05, "loss": 0.0061, "step": 173600 }, { "epoch": 1.2907300761657068, "grad_norm": 0.03239482641220093, "learning_rate": 1.8665034494626348e-05, "loss": 0.0055, "step": 173700 }, { "epoch": 1.291473156232584, "grad_norm": 0.019784174859523773, "learning_rate": 1.864547971778546e-05, "loss": 0.0045, "step": 173800 }, { "epoch": 1.2922162362994611, "grad_norm": 0.07641046494245529, "learning_rate": 1.8625924940944574e-05, "loss": 0.0057, "step": 173900 }, { "epoch": 1.2929593163663384, "grad_norm": 0.039163738489151, "learning_rate": 1.8606370164103685e-05, "loss": 0.0049, "step": 174000 }, { "epoch": 1.2937023964332157, "grad_norm": 0.3009902834892273, "learning_rate": 1.85868153872628e-05, "loss": 0.0056, "step": 174100 }, { "epoch": 1.2944454765000928, "grad_norm": 0.05981777235865593, "learning_rate": 1.8567260610421915e-05, "loss": 0.0057, "step": 174200 }, { "epoch": 1.29518855656697, "grad_norm": 0.017076073214411736, "learning_rate": 1.8547705833581026e-05, "loss": 0.0053, "step": 174300 }, { "epoch": 1.2959316366338474, "grad_norm": 0.13244876265525818, "learning_rate": 1.852815105674014e-05, "loss": 0.0059, "step": 174400 }, { "epoch": 1.2966747167007244, "grad_norm": 0.2116813063621521, "learning_rate": 1.8508596279899252e-05, "loss": 0.0066, "step": 174500 }, { "epoch": 1.2974177967676017, "grad_norm": 0.024352658540010452, "learning_rate": 1.8489041503058367e-05, "loss": 0.0061, "step": 174600 }, { "epoch": 1.298160876834479, "grad_norm": 0.03283802047371864, "learning_rate": 1.8469486726217482e-05, "loss": 0.0058, "step": 174700 }, { "epoch": 1.298903956901356, "grad_norm": 0.05378739535808563, "learning_rate": 1.8449931949376593e-05, "loss": 0.005, "step": 174800 }, { "epoch": 1.2996470369682334, "grad_norm": 0.02390611730515957, "learning_rate": 1.8430377172535708e-05, "loss": 0.0046, "step": 174900 }, { "epoch": 1.3003901170351106, "grad_norm": 0.3319222629070282, "learning_rate": 1.8410822395694823e-05, "loss": 0.0048, "step": 175000 }, { "epoch": 1.3011331971019877, "grad_norm": 0.014999190345406532, "learning_rate": 1.8391267618853934e-05, "loss": 0.0055, "step": 175100 }, { "epoch": 1.301876277168865, "grad_norm": 0.043924055993556976, "learning_rate": 1.837171284201305e-05, "loss": 0.0058, "step": 175200 }, { "epoch": 1.302619357235742, "grad_norm": 0.16861611604690552, "learning_rate": 1.835215806517216e-05, "loss": 0.0053, "step": 175300 }, { "epoch": 1.3033624373026194, "grad_norm": 0.04724852740764618, "learning_rate": 1.8332603288331275e-05, "loss": 0.0058, "step": 175400 }, { "epoch": 1.3041055173694964, "grad_norm": 0.054190609604120255, "learning_rate": 1.831304851149039e-05, "loss": 0.0067, "step": 175500 }, { "epoch": 1.3048485974363737, "grad_norm": 0.060979440808296204, "learning_rate": 1.82934937346495e-05, "loss": 0.0051, "step": 175600 }, { "epoch": 1.305591677503251, "grad_norm": 0.14218081533908844, "learning_rate": 1.8273938957808616e-05, "loss": 0.0061, "step": 175700 }, { "epoch": 1.306334757570128, "grad_norm": 0.10301190614700317, "learning_rate": 1.8254384180967727e-05, "loss": 0.0051, "step": 175800 }, { "epoch": 1.3070778376370054, "grad_norm": 0.04832525551319122, "learning_rate": 1.8234829404126842e-05, "loss": 0.0048, "step": 175900 }, { "epoch": 1.3078209177038826, "grad_norm": 0.03788604959845543, "learning_rate": 1.8215274627285957e-05, "loss": 0.006, "step": 176000 }, { "epoch": 1.3085639977707597, "grad_norm": 0.06793135404586792, "learning_rate": 1.8195719850445068e-05, "loss": 0.0054, "step": 176100 }, { "epoch": 1.309307077837637, "grad_norm": 0.054524168372154236, "learning_rate": 1.8176165073604183e-05, "loss": 0.0057, "step": 176200 }, { "epoch": 1.3100501579045143, "grad_norm": 0.17667660117149353, "learning_rate": 1.8156610296763294e-05, "loss": 0.0049, "step": 176300 }, { "epoch": 1.3107932379713914, "grad_norm": 0.0437762551009655, "learning_rate": 1.813705551992241e-05, "loss": 0.0054, "step": 176400 }, { "epoch": 1.3115363180382686, "grad_norm": 0.2396930605173111, "learning_rate": 1.811750074308152e-05, "loss": 0.0061, "step": 176500 }, { "epoch": 1.312279398105146, "grad_norm": 0.10242284089326859, "learning_rate": 1.8097945966240635e-05, "loss": 0.0062, "step": 176600 }, { "epoch": 1.313022478172023, "grad_norm": 0.04702738672494888, "learning_rate": 1.807839118939975e-05, "loss": 0.0062, "step": 176700 }, { "epoch": 1.3137655582389003, "grad_norm": 0.04141435772180557, "learning_rate": 1.805883641255886e-05, "loss": 0.0059, "step": 176800 }, { "epoch": 1.3145086383057776, "grad_norm": 0.02870710752904415, "learning_rate": 1.8039281635717976e-05, "loss": 0.0057, "step": 176900 }, { "epoch": 1.3152517183726546, "grad_norm": 0.0654875710606575, "learning_rate": 1.8019726858877088e-05, "loss": 0.0056, "step": 177000 }, { "epoch": 1.315994798439532, "grad_norm": 0.020132476463913918, "learning_rate": 1.8000172082036202e-05, "loss": 0.0077, "step": 177100 }, { "epoch": 1.316737878506409, "grad_norm": 0.07209264487028122, "learning_rate": 1.7980617305195314e-05, "loss": 0.0053, "step": 177200 }, { "epoch": 1.3174809585732863, "grad_norm": 0.05827513337135315, "learning_rate": 1.796106252835443e-05, "loss": 0.0056, "step": 177300 }, { "epoch": 1.3182240386401634, "grad_norm": 0.03393007069826126, "learning_rate": 1.794150775151354e-05, "loss": 0.0045, "step": 177400 }, { "epoch": 1.3189671187070406, "grad_norm": 0.01895316131412983, "learning_rate": 1.7921952974672655e-05, "loss": 0.0053, "step": 177500 }, { "epoch": 1.319710198773918, "grad_norm": 0.042288392782211304, "learning_rate": 1.790239819783177e-05, "loss": 0.0064, "step": 177600 }, { "epoch": 1.320453278840795, "grad_norm": 0.038910526782274246, "learning_rate": 1.788284342099088e-05, "loss": 0.0065, "step": 177700 }, { "epoch": 1.3211963589076723, "grad_norm": 0.0772082731127739, "learning_rate": 1.7863288644149996e-05, "loss": 0.0051, "step": 177800 }, { "epoch": 1.3219394389745496, "grad_norm": 0.04483630135655403, "learning_rate": 1.7843733867309107e-05, "loss": 0.0059, "step": 177900 }, { "epoch": 1.3226825190414266, "grad_norm": 0.5898640155792236, "learning_rate": 1.7824179090468222e-05, "loss": 0.0053, "step": 178000 }, { "epoch": 1.323425599108304, "grad_norm": 0.0532136932015419, "learning_rate": 1.7804624313627333e-05, "loss": 0.0053, "step": 178100 }, { "epoch": 1.3241686791751812, "grad_norm": 0.03683685138821602, "learning_rate": 1.7785069536786448e-05, "loss": 0.0051, "step": 178200 }, { "epoch": 1.3249117592420583, "grad_norm": 0.13168781995773315, "learning_rate": 1.776551475994556e-05, "loss": 0.0047, "step": 178300 }, { "epoch": 1.3256548393089356, "grad_norm": 0.027689984068274498, "learning_rate": 1.7745959983104674e-05, "loss": 0.0052, "step": 178400 }, { "epoch": 1.3263979193758129, "grad_norm": 0.04325810447335243, "learning_rate": 1.772640520626379e-05, "loss": 0.0048, "step": 178500 }, { "epoch": 1.32714099944269, "grad_norm": 0.19855816662311554, "learning_rate": 1.77068504294229e-05, "loss": 0.0055, "step": 178600 }, { "epoch": 1.3278840795095672, "grad_norm": 0.032868001610040665, "learning_rate": 1.7687295652582015e-05, "loss": 0.0067, "step": 178700 }, { "epoch": 1.3286271595764443, "grad_norm": 0.027391165494918823, "learning_rate": 1.7667740875741126e-05, "loss": 0.0056, "step": 178800 }, { "epoch": 1.3293702396433216, "grad_norm": 0.03709883242845535, "learning_rate": 1.764818609890024e-05, "loss": 0.0052, "step": 178900 }, { "epoch": 1.3301133197101986, "grad_norm": 0.1348564773797989, "learning_rate": 1.7628631322059352e-05, "loss": 0.0069, "step": 179000 }, { "epoch": 1.330856399777076, "grad_norm": 0.07099783420562744, "learning_rate": 1.7609076545218467e-05, "loss": 0.0048, "step": 179100 }, { "epoch": 1.3315994798439532, "grad_norm": 0.11181604117155075, "learning_rate": 1.7589521768377582e-05, "loss": 0.0045, "step": 179200 }, { "epoch": 1.3323425599108303, "grad_norm": 0.025230687111616135, "learning_rate": 1.7569966991536693e-05, "loss": 0.0041, "step": 179300 }, { "epoch": 1.3330856399777076, "grad_norm": 0.10862463712692261, "learning_rate": 1.7550412214695808e-05, "loss": 0.0062, "step": 179400 }, { "epoch": 1.3338287200445849, "grad_norm": 0.09826289862394333, "learning_rate": 1.753085743785492e-05, "loss": 0.0056, "step": 179500 }, { "epoch": 1.334571800111462, "grad_norm": 0.053419604897499084, "learning_rate": 1.7511302661014034e-05, "loss": 0.0065, "step": 179600 }, { "epoch": 1.3353148801783392, "grad_norm": 0.04688083007931709, "learning_rate": 1.7491747884173146e-05, "loss": 0.0054, "step": 179700 }, { "epoch": 1.3360579602452165, "grad_norm": 0.03229918330907822, "learning_rate": 1.747219310733226e-05, "loss": 0.0048, "step": 179800 }, { "epoch": 1.3368010403120936, "grad_norm": 0.12093156576156616, "learning_rate": 1.7452638330491372e-05, "loss": 0.0055, "step": 179900 }, { "epoch": 1.3375441203789709, "grad_norm": 0.04033501073718071, "learning_rate": 1.7433083553650487e-05, "loss": 0.0049, "step": 180000 }, { "epoch": 1.3382872004458481, "grad_norm": 0.025012167170643806, "learning_rate": 1.74135287768096e-05, "loss": 0.0053, "step": 180100 }, { "epoch": 1.3390302805127252, "grad_norm": 0.03204115852713585, "learning_rate": 1.7393973999968713e-05, "loss": 0.0051, "step": 180200 }, { "epoch": 1.3397733605796025, "grad_norm": 0.05455002188682556, "learning_rate": 1.7374419223127827e-05, "loss": 0.0054, "step": 180300 }, { "epoch": 1.3405164406464798, "grad_norm": 0.10321421921253204, "learning_rate": 1.735486444628694e-05, "loss": 0.0047, "step": 180400 }, { "epoch": 1.3412595207133569, "grad_norm": 0.10282864421606064, "learning_rate": 1.7335309669446054e-05, "loss": 0.0048, "step": 180500 }, { "epoch": 1.3420026007802341, "grad_norm": 0.11601240932941437, "learning_rate": 1.7315754892605165e-05, "loss": 0.0059, "step": 180600 }, { "epoch": 1.3427456808471112, "grad_norm": 0.05097212269902229, "learning_rate": 1.729620011576428e-05, "loss": 0.0053, "step": 180700 }, { "epoch": 1.3434887609139885, "grad_norm": 0.04412403330206871, "learning_rate": 1.727664533892339e-05, "loss": 0.0055, "step": 180800 }, { "epoch": 1.3442318409808656, "grad_norm": 0.03051503747701645, "learning_rate": 1.7257090562082506e-05, "loss": 0.0057, "step": 180900 }, { "epoch": 1.3449749210477429, "grad_norm": 0.02553318627178669, "learning_rate": 1.723753578524162e-05, "loss": 0.0056, "step": 181000 }, { "epoch": 1.3457180011146201, "grad_norm": 0.026664968580007553, "learning_rate": 1.7217981008400732e-05, "loss": 0.0052, "step": 181100 }, { "epoch": 1.3464610811814972, "grad_norm": 0.10600394755601883, "learning_rate": 1.7198426231559847e-05, "loss": 0.0057, "step": 181200 }, { "epoch": 1.3472041612483745, "grad_norm": 0.03399946913123131, "learning_rate": 1.7178871454718958e-05, "loss": 0.0052, "step": 181300 }, { "epoch": 1.3479472413152518, "grad_norm": 0.025723356753587723, "learning_rate": 1.7159316677878073e-05, "loss": 0.0052, "step": 181400 }, { "epoch": 1.3486903213821289, "grad_norm": 0.33823010325431824, "learning_rate": 1.7139761901037184e-05, "loss": 0.0045, "step": 181500 }, { "epoch": 1.3494334014490061, "grad_norm": 0.042064741253852844, "learning_rate": 1.71202071241963e-05, "loss": 0.0058, "step": 181600 }, { "epoch": 1.3501764815158834, "grad_norm": 0.22197896242141724, "learning_rate": 1.710065234735541e-05, "loss": 0.0055, "step": 181700 }, { "epoch": 1.3509195615827605, "grad_norm": 0.2986047863960266, "learning_rate": 1.7081097570514525e-05, "loss": 0.0065, "step": 181800 }, { "epoch": 1.3516626416496378, "grad_norm": 0.18359826505184174, "learning_rate": 1.706154279367364e-05, "loss": 0.0049, "step": 181900 }, { "epoch": 1.352405721716515, "grad_norm": 0.05756809189915657, "learning_rate": 1.704198801683275e-05, "loss": 0.0049, "step": 182000 }, { "epoch": 1.3531488017833921, "grad_norm": 0.04589132219552994, "learning_rate": 1.7022433239991866e-05, "loss": 0.0046, "step": 182100 }, { "epoch": 1.3538918818502694, "grad_norm": 0.04157756268978119, "learning_rate": 1.7002878463150977e-05, "loss": 0.0056, "step": 182200 }, { "epoch": 1.3546349619171465, "grad_norm": 0.03877347335219383, "learning_rate": 1.6983323686310092e-05, "loss": 0.0059, "step": 182300 }, { "epoch": 1.3553780419840238, "grad_norm": 0.03444492071866989, "learning_rate": 1.6963768909469204e-05, "loss": 0.0051, "step": 182400 }, { "epoch": 1.3561211220509009, "grad_norm": 0.07215800136327744, "learning_rate": 1.694421413262832e-05, "loss": 0.0054, "step": 182500 }, { "epoch": 1.3568642021177781, "grad_norm": 0.034580983221530914, "learning_rate": 1.6924659355787433e-05, "loss": 0.0058, "step": 182600 }, { "epoch": 1.3576072821846554, "grad_norm": 0.03187589347362518, "learning_rate": 1.6905104578946545e-05, "loss": 0.0048, "step": 182700 }, { "epoch": 1.3583503622515325, "grad_norm": 0.12429722398519516, "learning_rate": 1.688554980210566e-05, "loss": 0.0053, "step": 182800 }, { "epoch": 1.3590934423184098, "grad_norm": 0.015850408002734184, "learning_rate": 1.686599502526477e-05, "loss": 0.0049, "step": 182900 }, { "epoch": 1.359836522385287, "grad_norm": 0.03321019187569618, "learning_rate": 1.6846440248423885e-05, "loss": 0.005, "step": 183000 }, { "epoch": 1.3605796024521641, "grad_norm": 0.047321733087301254, "learning_rate": 1.6826885471582997e-05, "loss": 0.0049, "step": 183100 }, { "epoch": 1.3613226825190414, "grad_norm": 0.0761391669511795, "learning_rate": 1.680733069474211e-05, "loss": 0.0048, "step": 183200 }, { "epoch": 1.3620657625859187, "grad_norm": 0.03366865590214729, "learning_rate": 1.6787775917901223e-05, "loss": 0.0053, "step": 183300 }, { "epoch": 1.3628088426527958, "grad_norm": 0.11512160301208496, "learning_rate": 1.6768221141060338e-05, "loss": 0.0055, "step": 183400 }, { "epoch": 1.363551922719673, "grad_norm": 0.036442674696445465, "learning_rate": 1.6748666364219452e-05, "loss": 0.0049, "step": 183500 }, { "epoch": 1.3642950027865504, "grad_norm": 0.0541292168200016, "learning_rate": 1.6729111587378564e-05, "loss": 0.0047, "step": 183600 }, { "epoch": 1.3650380828534274, "grad_norm": 0.07448815554380417, "learning_rate": 1.670955681053768e-05, "loss": 0.0065, "step": 183700 }, { "epoch": 1.3657811629203047, "grad_norm": 0.06441698223352432, "learning_rate": 1.669000203369679e-05, "loss": 0.006, "step": 183800 }, { "epoch": 1.366524242987182, "grad_norm": 0.02721351385116577, "learning_rate": 1.6670447256855905e-05, "loss": 0.0052, "step": 183900 }, { "epoch": 1.367267323054059, "grad_norm": 0.14046454429626465, "learning_rate": 1.665089248001502e-05, "loss": 0.0063, "step": 184000 }, { "epoch": 1.3680104031209364, "grad_norm": 0.09397460520267487, "learning_rate": 1.663133770317413e-05, "loss": 0.0049, "step": 184100 }, { "epoch": 1.3687534831878134, "grad_norm": 0.020227842032909393, "learning_rate": 1.6611782926333246e-05, "loss": 0.0062, "step": 184200 }, { "epoch": 1.3694965632546907, "grad_norm": 0.023826437070965767, "learning_rate": 1.6592228149492357e-05, "loss": 0.005, "step": 184300 }, { "epoch": 1.3702396433215678, "grad_norm": 0.08031010627746582, "learning_rate": 1.6572673372651472e-05, "loss": 0.0045, "step": 184400 }, { "epoch": 1.370982723388445, "grad_norm": 0.10941245406866074, "learning_rate": 1.6553118595810587e-05, "loss": 0.0054, "step": 184500 }, { "epoch": 1.3717258034553224, "grad_norm": 0.06339183449745178, "learning_rate": 1.6533563818969698e-05, "loss": 0.0053, "step": 184600 }, { "epoch": 1.3724688835221994, "grad_norm": 0.04405754432082176, "learning_rate": 1.6514009042128813e-05, "loss": 0.0055, "step": 184700 }, { "epoch": 1.3732119635890767, "grad_norm": 0.1407489776611328, "learning_rate": 1.6494454265287927e-05, "loss": 0.0056, "step": 184800 }, { "epoch": 1.373955043655954, "grad_norm": 0.034202899783849716, "learning_rate": 1.647489948844704e-05, "loss": 0.0053, "step": 184900 }, { "epoch": 1.374698123722831, "grad_norm": 0.039474084973335266, "learning_rate": 1.6455344711606154e-05, "loss": 0.0057, "step": 185000 }, { "epoch": 1.3754412037897084, "grad_norm": 0.04266037419438362, "learning_rate": 1.6435789934765265e-05, "loss": 0.0047, "step": 185100 }, { "epoch": 1.3761842838565856, "grad_norm": 0.03860767185688019, "learning_rate": 1.641623515792438e-05, "loss": 0.0045, "step": 185200 }, { "epoch": 1.3769273639234627, "grad_norm": 0.04644284024834633, "learning_rate": 1.6396680381083495e-05, "loss": 0.005, "step": 185300 }, { "epoch": 1.37767044399034, "grad_norm": 0.06308018416166306, "learning_rate": 1.6377125604242606e-05, "loss": 0.0046, "step": 185400 }, { "epoch": 1.3784135240572173, "grad_norm": 0.02121039107441902, "learning_rate": 1.635757082740172e-05, "loss": 0.0052, "step": 185500 }, { "epoch": 1.3791566041240944, "grad_norm": 0.06041303649544716, "learning_rate": 1.6338016050560832e-05, "loss": 0.0049, "step": 185600 }, { "epoch": 1.3798996841909716, "grad_norm": 0.05722424387931824, "learning_rate": 1.6318461273719947e-05, "loss": 0.0054, "step": 185700 }, { "epoch": 1.3806427642578487, "grad_norm": 0.018614914268255234, "learning_rate": 1.6298906496879058e-05, "loss": 0.0046, "step": 185800 }, { "epoch": 1.381385844324726, "grad_norm": 0.04849564656615257, "learning_rate": 1.6279351720038173e-05, "loss": 0.0056, "step": 185900 }, { "epoch": 1.382128924391603, "grad_norm": 0.057962242513895035, "learning_rate": 1.6259796943197288e-05, "loss": 0.0059, "step": 186000 }, { "epoch": 1.3828720044584804, "grad_norm": 0.0218083206564188, "learning_rate": 1.62402421663564e-05, "loss": 0.0061, "step": 186100 }, { "epoch": 1.3836150845253576, "grad_norm": 0.08304239809513092, "learning_rate": 1.6220687389515514e-05, "loss": 0.0049, "step": 186200 }, { "epoch": 1.3843581645922347, "grad_norm": 0.023319285362958908, "learning_rate": 1.6201132612674625e-05, "loss": 0.0068, "step": 186300 }, { "epoch": 1.385101244659112, "grad_norm": 0.04920317232608795, "learning_rate": 1.618157783583374e-05, "loss": 0.0058, "step": 186400 }, { "epoch": 1.3858443247259893, "grad_norm": 0.058423787355422974, "learning_rate": 1.616202305899285e-05, "loss": 0.0047, "step": 186500 }, { "epoch": 1.3865874047928664, "grad_norm": 0.039618443697690964, "learning_rate": 1.6142468282151966e-05, "loss": 0.0054, "step": 186600 }, { "epoch": 1.3873304848597436, "grad_norm": 0.03229229897260666, "learning_rate": 1.6122913505311078e-05, "loss": 0.0054, "step": 186700 }, { "epoch": 1.388073564926621, "grad_norm": 0.04728470370173454, "learning_rate": 1.6103358728470192e-05, "loss": 0.0049, "step": 186800 }, { "epoch": 1.388816644993498, "grad_norm": 0.1927575021982193, "learning_rate": 1.6083803951629307e-05, "loss": 0.0056, "step": 186900 }, { "epoch": 1.3895597250603753, "grad_norm": 0.04589666426181793, "learning_rate": 1.606424917478842e-05, "loss": 0.0056, "step": 187000 }, { "epoch": 1.3903028051272526, "grad_norm": 0.048549044877290726, "learning_rate": 1.6044694397947533e-05, "loss": 0.0051, "step": 187100 }, { "epoch": 1.3910458851941296, "grad_norm": 0.12337183207273483, "learning_rate": 1.6025139621106645e-05, "loss": 0.0061, "step": 187200 }, { "epoch": 1.391788965261007, "grad_norm": 0.03598347678780556, "learning_rate": 1.600558484426576e-05, "loss": 0.0057, "step": 187300 }, { "epoch": 1.3925320453278842, "grad_norm": 0.017037920653820038, "learning_rate": 1.598603006742487e-05, "loss": 0.0055, "step": 187400 }, { "epoch": 1.3932751253947613, "grad_norm": 0.03536133095622063, "learning_rate": 1.5966475290583985e-05, "loss": 0.0054, "step": 187500 }, { "epoch": 1.3940182054616386, "grad_norm": 0.03762604668736458, "learning_rate": 1.5946920513743097e-05, "loss": 0.0059, "step": 187600 }, { "epoch": 1.3947612855285156, "grad_norm": 0.04574126750230789, "learning_rate": 1.592736573690221e-05, "loss": 0.0063, "step": 187700 }, { "epoch": 1.395504365595393, "grad_norm": 0.033178798854351044, "learning_rate": 1.5907810960061326e-05, "loss": 0.0055, "step": 187800 }, { "epoch": 1.39624744566227, "grad_norm": 0.1409396529197693, "learning_rate": 1.5888256183220438e-05, "loss": 0.005, "step": 187900 }, { "epoch": 1.3969905257291473, "grad_norm": 0.10325949639081955, "learning_rate": 1.5868701406379553e-05, "loss": 0.0056, "step": 188000 }, { "epoch": 1.3977336057960246, "grad_norm": 0.025328030809760094, "learning_rate": 1.5849146629538664e-05, "loss": 0.0048, "step": 188100 }, { "epoch": 1.3984766858629016, "grad_norm": 0.02672235108911991, "learning_rate": 1.582959185269778e-05, "loss": 0.0049, "step": 188200 }, { "epoch": 1.399219765929779, "grad_norm": 0.01455426774919033, "learning_rate": 1.581003707585689e-05, "loss": 0.0051, "step": 188300 }, { "epoch": 1.3999628459966562, "grad_norm": 0.14910699427127838, "learning_rate": 1.5790482299016005e-05, "loss": 0.0051, "step": 188400 }, { "epoch": 1.4007059260635333, "grad_norm": 0.10585946589708328, "learning_rate": 1.577092752217512e-05, "loss": 0.0056, "step": 188500 }, { "epoch": 1.4014490061304106, "grad_norm": 0.1684427261352539, "learning_rate": 1.575137274533423e-05, "loss": 0.0051, "step": 188600 }, { "epoch": 1.4021920861972879, "grad_norm": 0.04026757925748825, "learning_rate": 1.5731817968493346e-05, "loss": 0.0057, "step": 188700 }, { "epoch": 1.402935166264165, "grad_norm": 0.0680205225944519, "learning_rate": 1.5712263191652457e-05, "loss": 0.0057, "step": 188800 }, { "epoch": 1.4036782463310422, "grad_norm": 0.021099738776683807, "learning_rate": 1.5692708414811572e-05, "loss": 0.0057, "step": 188900 }, { "epoch": 1.4044213263979195, "grad_norm": 0.04393347352743149, "learning_rate": 1.5673153637970683e-05, "loss": 0.0057, "step": 189000 }, { "epoch": 1.4051644064647966, "grad_norm": 0.026928726583719254, "learning_rate": 1.5653598861129798e-05, "loss": 0.0042, "step": 189100 }, { "epoch": 1.4059074865316739, "grad_norm": 0.029390672221779823, "learning_rate": 1.563404408428891e-05, "loss": 0.0051, "step": 189200 }, { "epoch": 1.406650566598551, "grad_norm": 0.01649823784828186, "learning_rate": 1.5614489307448024e-05, "loss": 0.0065, "step": 189300 }, { "epoch": 1.4073936466654282, "grad_norm": 0.02398720756173134, "learning_rate": 1.559493453060714e-05, "loss": 0.0053, "step": 189400 }, { "epoch": 1.4081367267323053, "grad_norm": 0.17312639951705933, "learning_rate": 1.557537975376625e-05, "loss": 0.0055, "step": 189500 }, { "epoch": 1.4088798067991826, "grad_norm": 0.12432188540697098, "learning_rate": 1.5555824976925365e-05, "loss": 0.0051, "step": 189600 }, { "epoch": 1.4096228868660599, "grad_norm": 0.038381610065698624, "learning_rate": 1.5536270200084476e-05, "loss": 0.005, "step": 189700 }, { "epoch": 1.410365966932937, "grad_norm": 0.08032047748565674, "learning_rate": 1.551671542324359e-05, "loss": 0.0052, "step": 189800 }, { "epoch": 1.4111090469998142, "grad_norm": 0.017590852454304695, "learning_rate": 1.5497160646402703e-05, "loss": 0.0051, "step": 189900 }, { "epoch": 1.4118521270666915, "grad_norm": 0.03147004544734955, "learning_rate": 1.5477605869561817e-05, "loss": 0.0056, "step": 190000 }, { "epoch": 1.4125952071335686, "grad_norm": 0.04671696200966835, "learning_rate": 1.545805109272093e-05, "loss": 0.0047, "step": 190100 }, { "epoch": 1.4133382872004459, "grad_norm": 0.037360500544309616, "learning_rate": 1.5438496315880043e-05, "loss": 0.0047, "step": 190200 }, { "epoch": 1.4140813672673231, "grad_norm": 0.06750281155109406, "learning_rate": 1.5418941539039158e-05, "loss": 0.0055, "step": 190300 }, { "epoch": 1.4148244473342002, "grad_norm": 0.03737124800682068, "learning_rate": 1.539938676219827e-05, "loss": 0.006, "step": 190400 }, { "epoch": 1.4155675274010775, "grad_norm": 0.022957438603043556, "learning_rate": 1.5379831985357384e-05, "loss": 0.005, "step": 190500 }, { "epoch": 1.4163106074679548, "grad_norm": 0.13438661396503448, "learning_rate": 1.5360277208516496e-05, "loss": 0.0061, "step": 190600 }, { "epoch": 1.4170536875348319, "grad_norm": 0.07773229479789734, "learning_rate": 1.534072243167561e-05, "loss": 0.0046, "step": 190700 }, { "epoch": 1.4177967676017091, "grad_norm": 0.0562002956867218, "learning_rate": 1.5321167654834722e-05, "loss": 0.0048, "step": 190800 }, { "epoch": 1.4185398476685864, "grad_norm": 0.04332733526825905, "learning_rate": 1.5301612877993837e-05, "loss": 0.0063, "step": 190900 }, { "epoch": 1.4192829277354635, "grad_norm": 0.03429027646780014, "learning_rate": 1.5282058101152948e-05, "loss": 0.005, "step": 191000 }, { "epoch": 1.4200260078023408, "grad_norm": 0.04714904725551605, "learning_rate": 1.5262503324312063e-05, "loss": 0.0053, "step": 191100 }, { "epoch": 1.4207690878692179, "grad_norm": 0.03353253751993179, "learning_rate": 1.5242948547471178e-05, "loss": 0.0049, "step": 191200 }, { "epoch": 1.4215121679360951, "grad_norm": 0.22919848561286926, "learning_rate": 1.522339377063029e-05, "loss": 0.0049, "step": 191300 }, { "epoch": 1.4222552480029722, "grad_norm": 0.06168941780924797, "learning_rate": 1.5203838993789404e-05, "loss": 0.0054, "step": 191400 }, { "epoch": 1.4229983280698495, "grad_norm": 0.14343473315238953, "learning_rate": 1.5184284216948517e-05, "loss": 0.005, "step": 191500 }, { "epoch": 1.4237414081367268, "grad_norm": 0.04179225116968155, "learning_rate": 1.516472944010763e-05, "loss": 0.0049, "step": 191600 }, { "epoch": 1.4244844882036038, "grad_norm": 0.02051694318652153, "learning_rate": 1.5145174663266743e-05, "loss": 0.0053, "step": 191700 }, { "epoch": 1.4252275682704811, "grad_norm": 0.0060800667852163315, "learning_rate": 1.5125619886425858e-05, "loss": 0.0047, "step": 191800 }, { "epoch": 1.4259706483373584, "grad_norm": 0.046514201909303665, "learning_rate": 1.510606510958497e-05, "loss": 0.0047, "step": 191900 }, { "epoch": 1.4267137284042355, "grad_norm": 0.02513851411640644, "learning_rate": 1.5086510332744084e-05, "loss": 0.0048, "step": 192000 }, { "epoch": 1.4274568084711128, "grad_norm": 0.3142237663269043, "learning_rate": 1.5066955555903199e-05, "loss": 0.0057, "step": 192100 }, { "epoch": 1.42819988853799, "grad_norm": 0.08423580229282379, "learning_rate": 1.504740077906231e-05, "loss": 0.0054, "step": 192200 }, { "epoch": 1.4289429686048671, "grad_norm": 0.15629760921001434, "learning_rate": 1.5027846002221425e-05, "loss": 0.0057, "step": 192300 }, { "epoch": 1.4296860486717444, "grad_norm": 0.034243393689394, "learning_rate": 1.5008291225380536e-05, "loss": 0.0056, "step": 192400 }, { "epoch": 1.4304291287386217, "grad_norm": 0.026637054979801178, "learning_rate": 1.4988736448539651e-05, "loss": 0.0054, "step": 192500 }, { "epoch": 1.4311722088054988, "grad_norm": 0.02556227333843708, "learning_rate": 1.4969181671698762e-05, "loss": 0.0056, "step": 192600 }, { "epoch": 1.431915288872376, "grad_norm": 0.27332547307014465, "learning_rate": 1.4949626894857877e-05, "loss": 0.0053, "step": 192700 }, { "epoch": 1.4326583689392531, "grad_norm": 0.08697855472564697, "learning_rate": 1.4930072118016992e-05, "loss": 0.0052, "step": 192800 }, { "epoch": 1.4334014490061304, "grad_norm": 0.41861650347709656, "learning_rate": 1.4910517341176103e-05, "loss": 0.0056, "step": 192900 }, { "epoch": 1.4341445290730075, "grad_norm": 0.053752705454826355, "learning_rate": 1.4890962564335218e-05, "loss": 0.0055, "step": 193000 }, { "epoch": 1.4348876091398848, "grad_norm": 0.03551322594285011, "learning_rate": 1.487140778749433e-05, "loss": 0.0053, "step": 193100 }, { "epoch": 1.435630689206762, "grad_norm": 0.11298243701457977, "learning_rate": 1.4851853010653444e-05, "loss": 0.0063, "step": 193200 }, { "epoch": 1.4363737692736391, "grad_norm": 0.036219272762537, "learning_rate": 1.4832298233812555e-05, "loss": 0.0049, "step": 193300 }, { "epoch": 1.4371168493405164, "grad_norm": 0.0773225873708725, "learning_rate": 1.481274345697167e-05, "loss": 0.0053, "step": 193400 }, { "epoch": 1.4378599294073937, "grad_norm": 0.04517872631549835, "learning_rate": 1.4793188680130782e-05, "loss": 0.0053, "step": 193500 }, { "epoch": 1.4386030094742708, "grad_norm": 0.04297841712832451, "learning_rate": 1.4773633903289896e-05, "loss": 0.0047, "step": 193600 }, { "epoch": 1.439346089541148, "grad_norm": 0.08863457292318344, "learning_rate": 1.4754079126449011e-05, "loss": 0.0046, "step": 193700 }, { "epoch": 1.4400891696080254, "grad_norm": 0.03258669376373291, "learning_rate": 1.4734524349608122e-05, "loss": 0.0054, "step": 193800 }, { "epoch": 1.4408322496749024, "grad_norm": 0.09925554692745209, "learning_rate": 1.4714969572767237e-05, "loss": 0.0048, "step": 193900 }, { "epoch": 1.4415753297417797, "grad_norm": 0.07913857698440552, "learning_rate": 1.4695414795926349e-05, "loss": 0.0051, "step": 194000 }, { "epoch": 1.442318409808657, "grad_norm": 0.05557543411850929, "learning_rate": 1.4675860019085463e-05, "loss": 0.0055, "step": 194100 }, { "epoch": 1.443061489875534, "grad_norm": 0.03559400513768196, "learning_rate": 1.4656305242244575e-05, "loss": 0.0051, "step": 194200 }, { "epoch": 1.4438045699424114, "grad_norm": 0.044786978513002396, "learning_rate": 1.463675046540369e-05, "loss": 0.0063, "step": 194300 }, { "epoch": 1.4445476500092884, "grad_norm": 0.02578897587954998, "learning_rate": 1.4617195688562804e-05, "loss": 0.0052, "step": 194400 }, { "epoch": 1.4452907300761657, "grad_norm": 0.04599909856915474, "learning_rate": 1.4597640911721916e-05, "loss": 0.0052, "step": 194500 }, { "epoch": 1.4460338101430428, "grad_norm": 0.030064262449741364, "learning_rate": 1.457808613488103e-05, "loss": 0.0053, "step": 194600 }, { "epoch": 1.44677689020992, "grad_norm": 0.07608848065137863, "learning_rate": 1.4558531358040142e-05, "loss": 0.0047, "step": 194700 }, { "epoch": 1.4475199702767974, "grad_norm": 0.054041747003793716, "learning_rate": 1.4538976581199257e-05, "loss": 0.0049, "step": 194800 }, { "epoch": 1.4482630503436744, "grad_norm": 0.07664705067873001, "learning_rate": 1.4519421804358368e-05, "loss": 0.0054, "step": 194900 }, { "epoch": 1.4490061304105517, "grad_norm": 0.047844257205724716, "learning_rate": 1.4499867027517483e-05, "loss": 0.0049, "step": 195000 }, { "epoch": 1.449749210477429, "grad_norm": 0.0322202630341053, "learning_rate": 1.4480312250676594e-05, "loss": 0.005, "step": 195100 }, { "epoch": 1.450492290544306, "grad_norm": 0.04123595356941223, "learning_rate": 1.4460757473835709e-05, "loss": 0.0049, "step": 195200 }, { "epoch": 1.4512353706111834, "grad_norm": 0.1135578528046608, "learning_rate": 1.4441202696994824e-05, "loss": 0.0051, "step": 195300 }, { "epoch": 1.4519784506780606, "grad_norm": 0.053413499146699905, "learning_rate": 1.4421647920153935e-05, "loss": 0.0049, "step": 195400 }, { "epoch": 1.4527215307449377, "grad_norm": 0.0325782485306263, "learning_rate": 1.440209314331305e-05, "loss": 0.0058, "step": 195500 }, { "epoch": 1.453464610811815, "grad_norm": 0.04515119269490242, "learning_rate": 1.4382538366472161e-05, "loss": 0.0053, "step": 195600 }, { "epoch": 1.4542076908786923, "grad_norm": 0.05020516738295555, "learning_rate": 1.4362983589631276e-05, "loss": 0.0058, "step": 195700 }, { "epoch": 1.4549507709455693, "grad_norm": 0.06951471418142319, "learning_rate": 1.4343428812790389e-05, "loss": 0.0053, "step": 195800 }, { "epoch": 1.4556938510124466, "grad_norm": 0.052129115909338, "learning_rate": 1.4323874035949502e-05, "loss": 0.0049, "step": 195900 }, { "epoch": 1.456436931079324, "grad_norm": 0.056350305676460266, "learning_rate": 1.4304319259108615e-05, "loss": 0.0053, "step": 196000 }, { "epoch": 1.457180011146201, "grad_norm": 0.05284970998764038, "learning_rate": 1.4284764482267728e-05, "loss": 0.0053, "step": 196100 }, { "epoch": 1.4579230912130783, "grad_norm": 0.04091787710785866, "learning_rate": 1.4265209705426843e-05, "loss": 0.0047, "step": 196200 }, { "epoch": 1.4586661712799553, "grad_norm": 0.21151842176914215, "learning_rate": 1.4245654928585956e-05, "loss": 0.0046, "step": 196300 }, { "epoch": 1.4594092513468326, "grad_norm": 0.04598143696784973, "learning_rate": 1.4226100151745069e-05, "loss": 0.0049, "step": 196400 }, { "epoch": 1.4601523314137097, "grad_norm": 0.12699535489082336, "learning_rate": 1.4206545374904182e-05, "loss": 0.0043, "step": 196500 }, { "epoch": 1.460895411480587, "grad_norm": 0.0752573013305664, "learning_rate": 1.4186990598063297e-05, "loss": 0.0053, "step": 196600 }, { "epoch": 1.4616384915474643, "grad_norm": 0.06762368232011795, "learning_rate": 1.4167435821222408e-05, "loss": 0.0051, "step": 196700 }, { "epoch": 1.4623815716143413, "grad_norm": 0.04167350381612778, "learning_rate": 1.4147881044381523e-05, "loss": 0.0066, "step": 196800 }, { "epoch": 1.4631246516812186, "grad_norm": 0.1161014586687088, "learning_rate": 1.4128326267540634e-05, "loss": 0.0065, "step": 196900 }, { "epoch": 1.463867731748096, "grad_norm": 0.13612593710422516, "learning_rate": 1.410877149069975e-05, "loss": 0.0051, "step": 197000 }, { "epoch": 1.464610811814973, "grad_norm": 0.03966749832034111, "learning_rate": 1.4089216713858864e-05, "loss": 0.0062, "step": 197100 }, { "epoch": 1.4653538918818503, "grad_norm": 0.03584647178649902, "learning_rate": 1.4069661937017975e-05, "loss": 0.0048, "step": 197200 }, { "epoch": 1.4660969719487276, "grad_norm": 0.02297358773648739, "learning_rate": 1.405010716017709e-05, "loss": 0.0055, "step": 197300 }, { "epoch": 1.4668400520156046, "grad_norm": 0.15932773053646088, "learning_rate": 1.4030552383336201e-05, "loss": 0.0048, "step": 197400 }, { "epoch": 1.467583132082482, "grad_norm": 0.134142205119133, "learning_rate": 1.4010997606495316e-05, "loss": 0.0065, "step": 197500 }, { "epoch": 1.4683262121493592, "grad_norm": 0.2510368824005127, "learning_rate": 1.3991442829654428e-05, "loss": 0.0059, "step": 197600 }, { "epoch": 1.4690692922162363, "grad_norm": 0.02118113823235035, "learning_rate": 1.3971888052813542e-05, "loss": 0.0049, "step": 197700 }, { "epoch": 1.4698123722831136, "grad_norm": 0.021748289465904236, "learning_rate": 1.3952333275972657e-05, "loss": 0.0053, "step": 197800 }, { "epoch": 1.4705554523499906, "grad_norm": 0.038835760205984116, "learning_rate": 1.3932778499131769e-05, "loss": 0.005, "step": 197900 }, { "epoch": 1.471298532416868, "grad_norm": 0.04774394631385803, "learning_rate": 1.3913223722290883e-05, "loss": 0.0057, "step": 198000 }, { "epoch": 1.472041612483745, "grad_norm": 0.028986819088459015, "learning_rate": 1.3893668945449995e-05, "loss": 0.0065, "step": 198100 }, { "epoch": 1.4727846925506223, "grad_norm": 0.016819661483168602, "learning_rate": 1.387411416860911e-05, "loss": 0.0047, "step": 198200 }, { "epoch": 1.4735277726174996, "grad_norm": 0.022282276302576065, "learning_rate": 1.385455939176822e-05, "loss": 0.0058, "step": 198300 }, { "epoch": 1.4742708526843766, "grad_norm": 0.04887654259800911, "learning_rate": 1.3835004614927336e-05, "loss": 0.0051, "step": 198400 }, { "epoch": 1.475013932751254, "grad_norm": 0.04414292797446251, "learning_rate": 1.3815449838086447e-05, "loss": 0.0049, "step": 198500 }, { "epoch": 1.4757570128181312, "grad_norm": 0.02765895053744316, "learning_rate": 1.3795895061245562e-05, "loss": 0.0051, "step": 198600 }, { "epoch": 1.4765000928850083, "grad_norm": 0.0196316409856081, "learning_rate": 1.3776340284404676e-05, "loss": 0.0051, "step": 198700 }, { "epoch": 1.4772431729518856, "grad_norm": 0.09023531526327133, "learning_rate": 1.3756785507563788e-05, "loss": 0.0046, "step": 198800 }, { "epoch": 1.4779862530187629, "grad_norm": 0.043948885053396225, "learning_rate": 1.3737230730722903e-05, "loss": 0.0054, "step": 198900 }, { "epoch": 1.47872933308564, "grad_norm": 0.13145171105861664, "learning_rate": 1.3717675953882014e-05, "loss": 0.0065, "step": 199000 }, { "epoch": 1.4794724131525172, "grad_norm": 0.07296920567750931, "learning_rate": 1.3698121177041129e-05, "loss": 0.0048, "step": 199100 }, { "epoch": 1.4802154932193945, "grad_norm": 0.03242073580622673, "learning_rate": 1.367856640020024e-05, "loss": 0.0049, "step": 199200 }, { "epoch": 1.4809585732862716, "grad_norm": 0.01754320226609707, "learning_rate": 1.3659011623359355e-05, "loss": 0.0042, "step": 199300 }, { "epoch": 1.4817016533531489, "grad_norm": 0.029812950640916824, "learning_rate": 1.3639456846518466e-05, "loss": 0.0046, "step": 199400 }, { "epoch": 1.4824447334200261, "grad_norm": 0.04219537973403931, "learning_rate": 1.3619902069677581e-05, "loss": 0.0051, "step": 199500 }, { "epoch": 1.4831878134869032, "grad_norm": 0.08043123036623001, "learning_rate": 1.3600347292836696e-05, "loss": 0.0048, "step": 199600 }, { "epoch": 1.4839308935537805, "grad_norm": 0.06671053916215897, "learning_rate": 1.3580792515995807e-05, "loss": 0.006, "step": 199700 }, { "epoch": 1.4846739736206576, "grad_norm": 0.02985260635614395, "learning_rate": 1.3561237739154922e-05, "loss": 0.0048, "step": 199800 }, { "epoch": 1.4854170536875348, "grad_norm": 0.03610043600201607, "learning_rate": 1.3541682962314033e-05, "loss": 0.0044, "step": 199900 }, { "epoch": 1.486160133754412, "grad_norm": 0.032812174409627914, "learning_rate": 1.3522128185473148e-05, "loss": 0.0052, "step": 200000 }, { "epoch": 1.4869032138212892, "grad_norm": 0.04284307360649109, "learning_rate": 1.350257340863226e-05, "loss": 0.006, "step": 200100 }, { "epoch": 1.4876462938881665, "grad_norm": 0.038830965757369995, "learning_rate": 1.3483018631791374e-05, "loss": 0.0062, "step": 200200 }, { "epoch": 1.4883893739550436, "grad_norm": 0.08964473754167557, "learning_rate": 1.3463463854950487e-05, "loss": 0.0041, "step": 200300 }, { "epoch": 1.4891324540219208, "grad_norm": 0.057817209511995316, "learning_rate": 1.34439090781096e-05, "loss": 0.0057, "step": 200400 }, { "epoch": 1.4898755340887981, "grad_norm": 0.01587858982384205, "learning_rate": 1.3424354301268715e-05, "loss": 0.0043, "step": 200500 }, { "epoch": 1.4906186141556752, "grad_norm": 0.11389410495758057, "learning_rate": 1.3404799524427828e-05, "loss": 0.005, "step": 200600 }, { "epoch": 1.4913616942225525, "grad_norm": 0.04956215247511864, "learning_rate": 1.3385244747586941e-05, "loss": 0.0045, "step": 200700 }, { "epoch": 1.4921047742894298, "grad_norm": 0.05701274797320366, "learning_rate": 1.3365689970746054e-05, "loss": 0.0049, "step": 200800 }, { "epoch": 1.4928478543563068, "grad_norm": 0.019399596378207207, "learning_rate": 1.3346135193905167e-05, "loss": 0.0049, "step": 200900 }, { "epoch": 1.4935909344231841, "grad_norm": 0.03972724452614784, "learning_rate": 1.332658041706428e-05, "loss": 0.0055, "step": 201000 }, { "epoch": 1.4943340144900614, "grad_norm": 0.024248911067843437, "learning_rate": 1.3307025640223395e-05, "loss": 0.0055, "step": 201100 }, { "epoch": 1.4950770945569385, "grad_norm": 0.03850367292761803, "learning_rate": 1.3287470863382508e-05, "loss": 0.0047, "step": 201200 }, { "epoch": 1.4958201746238158, "grad_norm": 0.027499202638864517, "learning_rate": 1.3267916086541621e-05, "loss": 0.0052, "step": 201300 }, { "epoch": 1.4965632546906928, "grad_norm": 0.10976848006248474, "learning_rate": 1.3248361309700734e-05, "loss": 0.0045, "step": 201400 }, { "epoch": 1.4973063347575701, "grad_norm": 0.015280229039490223, "learning_rate": 1.3228806532859848e-05, "loss": 0.0058, "step": 201500 }, { "epoch": 1.4980494148244472, "grad_norm": 0.03293237090110779, "learning_rate": 1.3209251756018962e-05, "loss": 0.0051, "step": 201600 }, { "epoch": 1.4987924948913245, "grad_norm": 0.09754110127687454, "learning_rate": 1.3189696979178074e-05, "loss": 0.0061, "step": 201700 }, { "epoch": 1.4995355749582018, "grad_norm": 0.0665021762251854, "learning_rate": 1.3170142202337188e-05, "loss": 0.0052, "step": 201800 }, { "epoch": 1.5002786550250788, "grad_norm": 0.07443239539861679, "learning_rate": 1.31505874254963e-05, "loss": 0.0055, "step": 201900 }, { "epoch": 1.5010217350919561, "grad_norm": 0.038124386221170425, "learning_rate": 1.3131032648655415e-05, "loss": 0.0061, "step": 202000 }, { "epoch": 1.5017648151588334, "grad_norm": 0.05114785209298134, "learning_rate": 1.311147787181453e-05, "loss": 0.005, "step": 202100 }, { "epoch": 1.5025078952257105, "grad_norm": 0.03301633149385452, "learning_rate": 1.309192309497364e-05, "loss": 0.0055, "step": 202200 }, { "epoch": 1.5032509752925878, "grad_norm": 0.12435542047023773, "learning_rate": 1.3072368318132756e-05, "loss": 0.0053, "step": 202300 }, { "epoch": 1.503994055359465, "grad_norm": 0.0545712485909462, "learning_rate": 1.3052813541291867e-05, "loss": 0.0059, "step": 202400 }, { "epoch": 1.5047371354263421, "grad_norm": 0.046526167541742325, "learning_rate": 1.3033258764450982e-05, "loss": 0.0051, "step": 202500 }, { "epoch": 1.5054802154932194, "grad_norm": 0.12242411822080612, "learning_rate": 1.3013703987610093e-05, "loss": 0.0051, "step": 202600 }, { "epoch": 1.5062232955600967, "grad_norm": 0.050340987741947174, "learning_rate": 1.2994149210769208e-05, "loss": 0.0056, "step": 202700 }, { "epoch": 1.5069663756269738, "grad_norm": 0.07723532617092133, "learning_rate": 1.297459443392832e-05, "loss": 0.0046, "step": 202800 }, { "epoch": 1.507709455693851, "grad_norm": 0.051652420312166214, "learning_rate": 1.2955039657087434e-05, "loss": 0.005, "step": 202900 }, { "epoch": 1.5084525357607284, "grad_norm": 0.04383116215467453, "learning_rate": 1.2935484880246549e-05, "loss": 0.0055, "step": 203000 }, { "epoch": 1.5091956158276054, "grad_norm": 0.04332162067294121, "learning_rate": 1.291593010340566e-05, "loss": 0.0049, "step": 203100 }, { "epoch": 1.5099386958944825, "grad_norm": 0.10582516342401505, "learning_rate": 1.2896375326564775e-05, "loss": 0.0054, "step": 203200 }, { "epoch": 1.51068177596136, "grad_norm": 0.15289223194122314, "learning_rate": 1.2876820549723886e-05, "loss": 0.0057, "step": 203300 }, { "epoch": 1.511424856028237, "grad_norm": 0.1039615347981453, "learning_rate": 1.2857265772883001e-05, "loss": 0.0059, "step": 203400 }, { "epoch": 1.5121679360951141, "grad_norm": 0.022959847003221512, "learning_rate": 1.2837710996042112e-05, "loss": 0.0052, "step": 203500 }, { "epoch": 1.5129110161619914, "grad_norm": 0.028412576764822006, "learning_rate": 1.2818156219201227e-05, "loss": 0.0046, "step": 203600 }, { "epoch": 1.5136540962288687, "grad_norm": 0.15056899189949036, "learning_rate": 1.2798601442360342e-05, "loss": 0.0046, "step": 203700 }, { "epoch": 1.5143971762957458, "grad_norm": 0.03408554196357727, "learning_rate": 1.2779046665519453e-05, "loss": 0.0042, "step": 203800 }, { "epoch": 1.515140256362623, "grad_norm": 0.1577172726392746, "learning_rate": 1.2759491888678568e-05, "loss": 0.0049, "step": 203900 }, { "epoch": 1.5158833364295003, "grad_norm": 0.05934355780482292, "learning_rate": 1.273993711183768e-05, "loss": 0.006, "step": 204000 }, { "epoch": 1.5166264164963774, "grad_norm": 0.5736753344535828, "learning_rate": 1.2720382334996794e-05, "loss": 0.0048, "step": 204100 }, { "epoch": 1.5173694965632547, "grad_norm": 0.02531179040670395, "learning_rate": 1.2700827558155906e-05, "loss": 0.0063, "step": 204200 }, { "epoch": 1.518112576630132, "grad_norm": 0.13243204355239868, "learning_rate": 1.268127278131502e-05, "loss": 0.006, "step": 204300 }, { "epoch": 1.518855656697009, "grad_norm": 0.029423518106341362, "learning_rate": 1.2661718004474132e-05, "loss": 0.0051, "step": 204400 }, { "epoch": 1.5195987367638863, "grad_norm": 0.0465555265545845, "learning_rate": 1.2642163227633246e-05, "loss": 0.0056, "step": 204500 }, { "epoch": 1.5203418168307636, "grad_norm": 0.05445116385817528, "learning_rate": 1.2622608450792361e-05, "loss": 0.005, "step": 204600 }, { "epoch": 1.5210848968976407, "grad_norm": 0.029600728303194046, "learning_rate": 1.2603053673951473e-05, "loss": 0.0051, "step": 204700 }, { "epoch": 1.5218279769645178, "grad_norm": 0.04652674123644829, "learning_rate": 1.2583498897110587e-05, "loss": 0.0053, "step": 204800 }, { "epoch": 1.5225710570313953, "grad_norm": 0.02788420580327511, "learning_rate": 1.2563944120269699e-05, "loss": 0.0064, "step": 204900 }, { "epoch": 1.5233141370982723, "grad_norm": 0.15675701200962067, "learning_rate": 1.2544389343428814e-05, "loss": 0.0059, "step": 205000 }, { "epoch": 1.5240572171651494, "grad_norm": 0.03167521208524704, "learning_rate": 1.2524834566587927e-05, "loss": 0.0063, "step": 205100 }, { "epoch": 1.5248002972320267, "grad_norm": 0.01885204203426838, "learning_rate": 1.250527978974704e-05, "loss": 0.0047, "step": 205200 }, { "epoch": 1.525543377298904, "grad_norm": 0.033755023032426834, "learning_rate": 1.2485725012906153e-05, "loss": 0.0054, "step": 205300 }, { "epoch": 1.526286457365781, "grad_norm": 0.05125948414206505, "learning_rate": 1.2466170236065266e-05, "loss": 0.0055, "step": 205400 }, { "epoch": 1.5270295374326583, "grad_norm": 0.0537232868373394, "learning_rate": 1.244661545922438e-05, "loss": 0.0057, "step": 205500 }, { "epoch": 1.5277726174995356, "grad_norm": 0.13967996835708618, "learning_rate": 1.2427060682383494e-05, "loss": 0.0063, "step": 205600 }, { "epoch": 1.5285156975664127, "grad_norm": 0.02616485394537449, "learning_rate": 1.2407505905542607e-05, "loss": 0.0062, "step": 205700 }, { "epoch": 1.52925877763329, "grad_norm": 0.1289157122373581, "learning_rate": 1.238795112870172e-05, "loss": 0.0062, "step": 205800 }, { "epoch": 1.5300018577001673, "grad_norm": 0.025702519342303276, "learning_rate": 1.2368396351860833e-05, "loss": 0.0051, "step": 205900 }, { "epoch": 1.5307449377670443, "grad_norm": 0.018757013604044914, "learning_rate": 1.2348841575019948e-05, "loss": 0.005, "step": 206000 }, { "epoch": 1.5314880178339216, "grad_norm": 0.015169249847531319, "learning_rate": 1.232928679817906e-05, "loss": 0.0043, "step": 206100 }, { "epoch": 1.532231097900799, "grad_norm": 0.044216688722372055, "learning_rate": 1.2309732021338174e-05, "loss": 0.0069, "step": 206200 }, { "epoch": 1.532974177967676, "grad_norm": 0.024957142770290375, "learning_rate": 1.2290177244497287e-05, "loss": 0.0052, "step": 206300 }, { "epoch": 1.5337172580345533, "grad_norm": 0.09946002811193466, "learning_rate": 1.22706224676564e-05, "loss": 0.0052, "step": 206400 }, { "epoch": 1.5344603381014306, "grad_norm": 0.2029615193605423, "learning_rate": 1.2251067690815513e-05, "loss": 0.0045, "step": 206500 }, { "epoch": 1.5352034181683076, "grad_norm": 0.02831197902560234, "learning_rate": 1.2231512913974626e-05, "loss": 0.0049, "step": 206600 }, { "epoch": 1.5359464982351847, "grad_norm": 0.03911466896533966, "learning_rate": 1.221195813713374e-05, "loss": 0.0061, "step": 206700 }, { "epoch": 1.5366895783020622, "grad_norm": 0.051360636949539185, "learning_rate": 1.2192403360292854e-05, "loss": 0.0057, "step": 206800 }, { "epoch": 1.5374326583689393, "grad_norm": 0.05902998894453049, "learning_rate": 1.2172848583451967e-05, "loss": 0.0042, "step": 206900 }, { "epoch": 1.5381757384358163, "grad_norm": 0.05807112529873848, "learning_rate": 1.215329380661108e-05, "loss": 0.0055, "step": 207000 }, { "epoch": 1.5389188185026936, "grad_norm": 0.017810475081205368, "learning_rate": 1.2133739029770193e-05, "loss": 0.006, "step": 207100 }, { "epoch": 1.539661898569571, "grad_norm": 0.05142570659518242, "learning_rate": 1.2114184252929306e-05, "loss": 0.005, "step": 207200 }, { "epoch": 1.540404978636448, "grad_norm": 0.05944421514868736, "learning_rate": 1.209462947608842e-05, "loss": 0.005, "step": 207300 }, { "epoch": 1.5411480587033253, "grad_norm": 0.05149064213037491, "learning_rate": 1.2075074699247532e-05, "loss": 0.006, "step": 207400 }, { "epoch": 1.5418911387702026, "grad_norm": 0.13631939888000488, "learning_rate": 1.2055519922406647e-05, "loss": 0.0051, "step": 207500 }, { "epoch": 1.5426342188370796, "grad_norm": 0.1117667704820633, "learning_rate": 1.203596514556576e-05, "loss": 0.0052, "step": 207600 }, { "epoch": 1.543377298903957, "grad_norm": 0.07761365920305252, "learning_rate": 1.2016410368724873e-05, "loss": 0.0068, "step": 207700 }, { "epoch": 1.5441203789708342, "grad_norm": 0.02163824811577797, "learning_rate": 1.1996855591883986e-05, "loss": 0.0055, "step": 207800 }, { "epoch": 1.5448634590377113, "grad_norm": 0.16861219704151154, "learning_rate": 1.19773008150431e-05, "loss": 0.0056, "step": 207900 }, { "epoch": 1.5456065391045886, "grad_norm": 0.2045339196920395, "learning_rate": 1.1957746038202212e-05, "loss": 0.0051, "step": 208000 }, { "epoch": 1.5463496191714659, "grad_norm": 0.08610349893569946, "learning_rate": 1.1938191261361325e-05, "loss": 0.0047, "step": 208100 }, { "epoch": 1.547092699238343, "grad_norm": 0.025025537237524986, "learning_rate": 1.1918636484520439e-05, "loss": 0.0052, "step": 208200 }, { "epoch": 1.54783577930522, "grad_norm": 0.031051406636834145, "learning_rate": 1.1899081707679552e-05, "loss": 0.0049, "step": 208300 }, { "epoch": 1.5485788593720975, "grad_norm": 0.053068481385707855, "learning_rate": 1.1879526930838666e-05, "loss": 0.0049, "step": 208400 }, { "epoch": 1.5493219394389746, "grad_norm": 0.018607715144753456, "learning_rate": 1.185997215399778e-05, "loss": 0.0051, "step": 208500 }, { "epoch": 1.5500650195058516, "grad_norm": 0.015145703218877316, "learning_rate": 1.1840417377156893e-05, "loss": 0.0045, "step": 208600 }, { "epoch": 1.550808099572729, "grad_norm": 0.04435611143708229, "learning_rate": 1.1820862600316006e-05, "loss": 0.0056, "step": 208700 }, { "epoch": 1.5515511796396062, "grad_norm": 0.21179670095443726, "learning_rate": 1.1801307823475119e-05, "loss": 0.0054, "step": 208800 }, { "epoch": 1.5522942597064833, "grad_norm": 0.2162257879972458, "learning_rate": 1.1781753046634232e-05, "loss": 0.0056, "step": 208900 }, { "epoch": 1.5530373397733606, "grad_norm": 0.05598733574151993, "learning_rate": 1.1762198269793345e-05, "loss": 0.0052, "step": 209000 }, { "epoch": 1.5537804198402378, "grad_norm": 0.050181787461042404, "learning_rate": 1.1742643492952458e-05, "loss": 0.0042, "step": 209100 }, { "epoch": 1.554523499907115, "grad_norm": 0.185306116938591, "learning_rate": 1.1723088716111573e-05, "loss": 0.0047, "step": 209200 }, { "epoch": 1.5552665799739922, "grad_norm": 0.02198611944913864, "learning_rate": 1.1703533939270686e-05, "loss": 0.0056, "step": 209300 }, { "epoch": 1.5560096600408695, "grad_norm": 0.019903462380170822, "learning_rate": 1.1683979162429799e-05, "loss": 0.0057, "step": 209400 }, { "epoch": 1.5567527401077466, "grad_norm": 0.04688869044184685, "learning_rate": 1.1664424385588912e-05, "loss": 0.0061, "step": 209500 }, { "epoch": 1.5574958201746238, "grad_norm": 0.048686590045690536, "learning_rate": 1.1644869608748025e-05, "loss": 0.0052, "step": 209600 }, { "epoch": 1.5582389002415011, "grad_norm": 0.2239379733800888, "learning_rate": 1.1625314831907138e-05, "loss": 0.0055, "step": 209700 }, { "epoch": 1.5589819803083782, "grad_norm": 0.03906741365790367, "learning_rate": 1.1605760055066251e-05, "loss": 0.0052, "step": 209800 }, { "epoch": 1.5597250603752553, "grad_norm": 0.01773134618997574, "learning_rate": 1.1586205278225364e-05, "loss": 0.0047, "step": 209900 }, { "epoch": 1.5604681404421328, "grad_norm": 0.03815056011080742, "learning_rate": 1.1566650501384479e-05, "loss": 0.0046, "step": 210000 }, { "epoch": 1.5612112205090098, "grad_norm": 0.03085993602871895, "learning_rate": 1.1547095724543592e-05, "loss": 0.0062, "step": 210100 }, { "epoch": 1.561954300575887, "grad_norm": 0.03337539732456207, "learning_rate": 1.1527540947702705e-05, "loss": 0.0053, "step": 210200 }, { "epoch": 1.5626973806427644, "grad_norm": 0.016016045585274696, "learning_rate": 1.1507986170861818e-05, "loss": 0.0055, "step": 210300 }, { "epoch": 1.5634404607096415, "grad_norm": 0.0869809165596962, "learning_rate": 1.1488431394020933e-05, "loss": 0.005, "step": 210400 }, { "epoch": 1.5641835407765186, "grad_norm": 0.04441719129681587, "learning_rate": 1.1468876617180046e-05, "loss": 0.0062, "step": 210500 }, { "epoch": 1.5649266208433958, "grad_norm": 0.14742492139339447, "learning_rate": 1.1449321840339159e-05, "loss": 0.006, "step": 210600 }, { "epoch": 1.5656697009102731, "grad_norm": 0.056334350258111954, "learning_rate": 1.1429767063498272e-05, "loss": 0.0047, "step": 210700 }, { "epoch": 1.5664127809771502, "grad_norm": 0.08409079164266586, "learning_rate": 1.1410212286657385e-05, "loss": 0.0055, "step": 210800 }, { "epoch": 1.5671558610440275, "grad_norm": 0.058762695640325546, "learning_rate": 1.13906575098165e-05, "loss": 0.0056, "step": 210900 }, { "epoch": 1.5678989411109048, "grad_norm": 0.05135015770792961, "learning_rate": 1.1371102732975613e-05, "loss": 0.0058, "step": 211000 }, { "epoch": 1.5686420211777818, "grad_norm": 0.031705133616924286, "learning_rate": 1.1351547956134726e-05, "loss": 0.0058, "step": 211100 }, { "epoch": 1.5693851012446591, "grad_norm": 0.029941117390990257, "learning_rate": 1.1331993179293839e-05, "loss": 0.005, "step": 211200 }, { "epoch": 1.5701281813115364, "grad_norm": 0.0801532119512558, "learning_rate": 1.1312438402452952e-05, "loss": 0.0054, "step": 211300 }, { "epoch": 1.5708712613784135, "grad_norm": 0.05635219067335129, "learning_rate": 1.1292883625612065e-05, "loss": 0.0046, "step": 211400 }, { "epoch": 1.5716143414452908, "grad_norm": 0.052912916988134384, "learning_rate": 1.1273328848771178e-05, "loss": 0.0064, "step": 211500 }, { "epoch": 1.572357421512168, "grad_norm": 0.073097825050354, "learning_rate": 1.1253774071930291e-05, "loss": 0.0046, "step": 211600 }, { "epoch": 1.5731005015790451, "grad_norm": 0.059651412069797516, "learning_rate": 1.1234219295089404e-05, "loss": 0.0057, "step": 211700 }, { "epoch": 1.5738435816459222, "grad_norm": 0.05621477589011192, "learning_rate": 1.121466451824852e-05, "loss": 0.0051, "step": 211800 }, { "epoch": 1.5745866617127997, "grad_norm": 0.047097839415073395, "learning_rate": 1.1195109741407632e-05, "loss": 0.0051, "step": 211900 }, { "epoch": 1.5753297417796768, "grad_norm": 0.033562708646059036, "learning_rate": 1.1175554964566745e-05, "loss": 0.0058, "step": 212000 }, { "epoch": 1.5760728218465538, "grad_norm": 0.042373932898044586, "learning_rate": 1.1156000187725858e-05, "loss": 0.005, "step": 212100 }, { "epoch": 1.5768159019134311, "grad_norm": 0.022201484069228172, "learning_rate": 1.1136445410884972e-05, "loss": 0.0056, "step": 212200 }, { "epoch": 1.5775589819803084, "grad_norm": 0.033126350492239, "learning_rate": 1.1116890634044085e-05, "loss": 0.0061, "step": 212300 }, { "epoch": 1.5783020620471855, "grad_norm": 0.03288864716887474, "learning_rate": 1.1097335857203198e-05, "loss": 0.0043, "step": 212400 }, { "epoch": 1.5790451421140628, "grad_norm": 0.23429463803768158, "learning_rate": 1.107778108036231e-05, "loss": 0.0058, "step": 212500 }, { "epoch": 1.57978822218094, "grad_norm": 0.04487950727343559, "learning_rate": 1.1058226303521426e-05, "loss": 0.0051, "step": 212600 }, { "epoch": 1.5805313022478171, "grad_norm": 0.05184001103043556, "learning_rate": 1.1038671526680539e-05, "loss": 0.0057, "step": 212700 }, { "epoch": 1.5812743823146944, "grad_norm": 0.03178735449910164, "learning_rate": 1.1019116749839652e-05, "loss": 0.0046, "step": 212800 }, { "epoch": 1.5820174623815717, "grad_norm": 0.07508482784032822, "learning_rate": 1.0999561972998765e-05, "loss": 0.0048, "step": 212900 }, { "epoch": 1.5827605424484488, "grad_norm": 0.10821576416492462, "learning_rate": 1.0980007196157878e-05, "loss": 0.0048, "step": 213000 }, { "epoch": 1.583503622515326, "grad_norm": 0.01676221378147602, "learning_rate": 1.0960452419316991e-05, "loss": 0.0057, "step": 213100 }, { "epoch": 1.5842467025822033, "grad_norm": 0.08562224358320236, "learning_rate": 1.0940897642476104e-05, "loss": 0.0051, "step": 213200 }, { "epoch": 1.5849897826490804, "grad_norm": 0.07478880882263184, "learning_rate": 1.0921342865635217e-05, "loss": 0.0051, "step": 213300 }, { "epoch": 1.5857328627159575, "grad_norm": 0.05371400713920593, "learning_rate": 1.0901788088794332e-05, "loss": 0.0049, "step": 213400 }, { "epoch": 1.586475942782835, "grad_norm": 0.0485776849091053, "learning_rate": 1.0882233311953445e-05, "loss": 0.0057, "step": 213500 }, { "epoch": 1.587219022849712, "grad_norm": 0.04849202185869217, "learning_rate": 1.0862678535112558e-05, "loss": 0.0045, "step": 213600 }, { "epoch": 1.5879621029165891, "grad_norm": 0.07598593831062317, "learning_rate": 1.0843123758271671e-05, "loss": 0.0054, "step": 213700 }, { "epoch": 1.5887051829834666, "grad_norm": 0.24355071783065796, "learning_rate": 1.0823568981430784e-05, "loss": 0.0054, "step": 213800 }, { "epoch": 1.5894482630503437, "grad_norm": 0.06331522017717361, "learning_rate": 1.0804014204589897e-05, "loss": 0.005, "step": 213900 }, { "epoch": 1.5901913431172208, "grad_norm": 0.025684267282485962, "learning_rate": 1.078445942774901e-05, "loss": 0.0057, "step": 214000 }, { "epoch": 1.590934423184098, "grad_norm": 0.06554869562387466, "learning_rate": 1.0764904650908123e-05, "loss": 0.0044, "step": 214100 }, { "epoch": 1.5916775032509753, "grad_norm": 0.2995028793811798, "learning_rate": 1.0745349874067236e-05, "loss": 0.0049, "step": 214200 }, { "epoch": 1.5924205833178524, "grad_norm": 0.05166199430823326, "learning_rate": 1.0725795097226351e-05, "loss": 0.0052, "step": 214300 }, { "epoch": 1.5931636633847297, "grad_norm": 0.03865324333310127, "learning_rate": 1.0706240320385464e-05, "loss": 0.0055, "step": 214400 }, { "epoch": 1.593906743451607, "grad_norm": 1.1684839725494385, "learning_rate": 1.0686685543544577e-05, "loss": 0.0043, "step": 214500 }, { "epoch": 1.594649823518484, "grad_norm": 0.04564885050058365, "learning_rate": 1.066713076670369e-05, "loss": 0.0056, "step": 214600 }, { "epoch": 1.5953929035853613, "grad_norm": 0.047709569334983826, "learning_rate": 1.0647575989862803e-05, "loss": 0.0053, "step": 214700 }, { "epoch": 1.5961359836522386, "grad_norm": 0.036967985332012177, "learning_rate": 1.0628021213021916e-05, "loss": 0.005, "step": 214800 }, { "epoch": 1.5968790637191157, "grad_norm": 0.13877320289611816, "learning_rate": 1.0608466436181031e-05, "loss": 0.0047, "step": 214900 }, { "epoch": 1.597622143785993, "grad_norm": 0.08383018523454666, "learning_rate": 1.0588911659340144e-05, "loss": 0.0058, "step": 215000 }, { "epoch": 1.5983652238528703, "grad_norm": 0.0169433131814003, "learning_rate": 1.0569356882499257e-05, "loss": 0.0046, "step": 215100 }, { "epoch": 1.5991083039197473, "grad_norm": 0.0556168258190155, "learning_rate": 1.054980210565837e-05, "loss": 0.0043, "step": 215200 }, { "epoch": 1.5998513839866244, "grad_norm": 0.07416871190071106, "learning_rate": 1.0530247328817485e-05, "loss": 0.0056, "step": 215300 }, { "epoch": 1.600594464053502, "grad_norm": 0.023786362260580063, "learning_rate": 1.0510692551976598e-05, "loss": 0.0052, "step": 215400 }, { "epoch": 1.601337544120379, "grad_norm": 0.04411672055721283, "learning_rate": 1.0491137775135711e-05, "loss": 0.0039, "step": 215500 }, { "epoch": 1.602080624187256, "grad_norm": 0.0694483146071434, "learning_rate": 1.0471582998294824e-05, "loss": 0.0056, "step": 215600 }, { "epoch": 1.6028237042541333, "grad_norm": 0.04468277469277382, "learning_rate": 1.0452028221453937e-05, "loss": 0.0049, "step": 215700 }, { "epoch": 1.6035667843210106, "grad_norm": 0.22823786735534668, "learning_rate": 1.043247344461305e-05, "loss": 0.0055, "step": 215800 }, { "epoch": 1.6043098643878877, "grad_norm": 0.03841749206185341, "learning_rate": 1.0412918667772164e-05, "loss": 0.0049, "step": 215900 }, { "epoch": 1.605052944454765, "grad_norm": 0.12662723660469055, "learning_rate": 1.0393363890931278e-05, "loss": 0.0054, "step": 216000 }, { "epoch": 1.6057960245216423, "grad_norm": 0.03479428216814995, "learning_rate": 1.0373809114090391e-05, "loss": 0.0056, "step": 216100 }, { "epoch": 1.6065391045885193, "grad_norm": 0.031005600467324257, "learning_rate": 1.0354254337249505e-05, "loss": 0.0057, "step": 216200 }, { "epoch": 1.6072821846553966, "grad_norm": 0.07756181061267853, "learning_rate": 1.0334699560408618e-05, "loss": 0.0046, "step": 216300 }, { "epoch": 1.608025264722274, "grad_norm": 0.05916139855980873, "learning_rate": 1.031514478356773e-05, "loss": 0.0047, "step": 216400 }, { "epoch": 1.608768344789151, "grad_norm": 0.03835935890674591, "learning_rate": 1.0295590006726844e-05, "loss": 0.0051, "step": 216500 }, { "epoch": 1.6095114248560283, "grad_norm": 0.031540125608444214, "learning_rate": 1.0276035229885957e-05, "loss": 0.0051, "step": 216600 }, { "epoch": 1.6102545049229056, "grad_norm": 0.03902171179652214, "learning_rate": 1.025648045304507e-05, "loss": 0.0057, "step": 216700 }, { "epoch": 1.6109975849897826, "grad_norm": 0.09328322857618332, "learning_rate": 1.0236925676204185e-05, "loss": 0.0051, "step": 216800 }, { "epoch": 1.6117406650566597, "grad_norm": 0.027132118120789528, "learning_rate": 1.0217370899363298e-05, "loss": 0.005, "step": 216900 }, { "epoch": 1.6124837451235372, "grad_norm": 0.022483808919787407, "learning_rate": 1.019781612252241e-05, "loss": 0.0043, "step": 217000 }, { "epoch": 1.6132268251904143, "grad_norm": 0.034018538892269135, "learning_rate": 1.0178261345681524e-05, "loss": 0.0058, "step": 217100 }, { "epoch": 1.6139699052572913, "grad_norm": 0.05334585905075073, "learning_rate": 1.0158706568840637e-05, "loss": 0.0058, "step": 217200 }, { "epoch": 1.6147129853241686, "grad_norm": 0.031111592426896095, "learning_rate": 1.013915179199975e-05, "loss": 0.005, "step": 217300 }, { "epoch": 1.615456065391046, "grad_norm": 0.14246146380901337, "learning_rate": 1.0119597015158863e-05, "loss": 0.0048, "step": 217400 }, { "epoch": 1.616199145457923, "grad_norm": 0.056439075618982315, "learning_rate": 1.0100042238317976e-05, "loss": 0.0059, "step": 217500 }, { "epoch": 1.6169422255248003, "grad_norm": 0.061631832271814346, "learning_rate": 1.008048746147709e-05, "loss": 0.0047, "step": 217600 }, { "epoch": 1.6176853055916776, "grad_norm": 0.019097784534096718, "learning_rate": 1.0060932684636204e-05, "loss": 0.0048, "step": 217700 }, { "epoch": 1.6184283856585546, "grad_norm": 0.052652183920145035, "learning_rate": 1.0041377907795317e-05, "loss": 0.0051, "step": 217800 }, { "epoch": 1.619171465725432, "grad_norm": 0.04515929892659187, "learning_rate": 1.002182313095443e-05, "loss": 0.0057, "step": 217900 }, { "epoch": 1.6199145457923092, "grad_norm": 0.05994727835059166, "learning_rate": 1.0002268354113543e-05, "loss": 0.0054, "step": 218000 }, { "epoch": 1.6206576258591863, "grad_norm": 0.06598459184169769, "learning_rate": 9.982713577272656e-06, "loss": 0.0057, "step": 218100 }, { "epoch": 1.6214007059260636, "grad_norm": 0.14966866374015808, "learning_rate": 9.96315880043177e-06, "loss": 0.0057, "step": 218200 }, { "epoch": 1.6221437859929408, "grad_norm": 0.08620477467775345, "learning_rate": 9.943604023590882e-06, "loss": 0.0042, "step": 218300 }, { "epoch": 1.622886866059818, "grad_norm": 0.02149772085249424, "learning_rate": 9.924049246749995e-06, "loss": 0.0049, "step": 218400 }, { "epoch": 1.6236299461266952, "grad_norm": 0.1919519603252411, "learning_rate": 9.90449446990911e-06, "loss": 0.0061, "step": 218500 }, { "epoch": 1.6243730261935725, "grad_norm": 0.03158992528915405, "learning_rate": 9.884939693068223e-06, "loss": 0.0056, "step": 218600 }, { "epoch": 1.6251161062604496, "grad_norm": 0.02121286280453205, "learning_rate": 9.865384916227336e-06, "loss": 0.0064, "step": 218700 }, { "epoch": 1.6258591863273266, "grad_norm": 0.03186742588877678, "learning_rate": 9.84583013938645e-06, "loss": 0.0058, "step": 218800 }, { "epoch": 1.6266022663942041, "grad_norm": 0.04128284752368927, "learning_rate": 9.826275362545563e-06, "loss": 0.0053, "step": 218900 }, { "epoch": 1.6273453464610812, "grad_norm": 0.051785312592983246, "learning_rate": 9.806720585704676e-06, "loss": 0.0054, "step": 219000 }, { "epoch": 1.6280884265279583, "grad_norm": 0.009080703370273113, "learning_rate": 9.787165808863789e-06, "loss": 0.0055, "step": 219100 }, { "epoch": 1.6288315065948356, "grad_norm": 0.03506125509738922, "learning_rate": 9.767611032022902e-06, "loss": 0.0051, "step": 219200 }, { "epoch": 1.6295745866617128, "grad_norm": 0.04657045751810074, "learning_rate": 9.748056255182015e-06, "loss": 0.0055, "step": 219300 }, { "epoch": 1.63031766672859, "grad_norm": 0.015722431242465973, "learning_rate": 9.72850147834113e-06, "loss": 0.0046, "step": 219400 }, { "epoch": 1.6310607467954672, "grad_norm": 0.03357243165373802, "learning_rate": 9.708946701500243e-06, "loss": 0.0056, "step": 219500 }, { "epoch": 1.6318038268623445, "grad_norm": 0.13302043080329895, "learning_rate": 9.689391924659356e-06, "loss": 0.0053, "step": 219600 }, { "epoch": 1.6325469069292216, "grad_norm": 0.043520018458366394, "learning_rate": 9.669837147818469e-06, "loss": 0.0052, "step": 219700 }, { "epoch": 1.6332899869960988, "grad_norm": 0.02295575477182865, "learning_rate": 9.650282370977584e-06, "loss": 0.0063, "step": 219800 }, { "epoch": 1.6340330670629761, "grad_norm": 0.054748840630054474, "learning_rate": 9.630727594136697e-06, "loss": 0.0054, "step": 219900 }, { "epoch": 1.6347761471298532, "grad_norm": 0.06290123611688614, "learning_rate": 9.61117281729581e-06, "loss": 0.0067, "step": 220000 }, { "epoch": 1.6355192271967305, "grad_norm": 0.031301774084568024, "learning_rate": 9.591618040454923e-06, "loss": 0.0066, "step": 220100 }, { "epoch": 1.6362623072636078, "grad_norm": 0.24826496839523315, "learning_rate": 9.572063263614038e-06, "loss": 0.0051, "step": 220200 }, { "epoch": 1.6370053873304848, "grad_norm": 0.14271537959575653, "learning_rate": 9.55250848677315e-06, "loss": 0.0046, "step": 220300 }, { "epoch": 1.637748467397362, "grad_norm": 0.06891150772571564, "learning_rate": 9.532953709932264e-06, "loss": 0.0048, "step": 220400 }, { "epoch": 1.6384915474642394, "grad_norm": 0.04668249562382698, "learning_rate": 9.513398933091377e-06, "loss": 0.0049, "step": 220500 }, { "epoch": 1.6392346275311165, "grad_norm": 0.03628038242459297, "learning_rate": 9.49384415625049e-06, "loss": 0.0049, "step": 220600 }, { "epoch": 1.6399777075979936, "grad_norm": 0.03522608056664467, "learning_rate": 9.474289379409603e-06, "loss": 0.005, "step": 220700 }, { "epoch": 1.6407207876648708, "grad_norm": 0.08461201190948486, "learning_rate": 9.454734602568716e-06, "loss": 0.0057, "step": 220800 }, { "epoch": 1.6414638677317481, "grad_norm": 0.021558521315455437, "learning_rate": 9.435179825727829e-06, "loss": 0.0045, "step": 220900 }, { "epoch": 1.6422069477986252, "grad_norm": 0.026459259912371635, "learning_rate": 9.415625048886944e-06, "loss": 0.0048, "step": 221000 }, { "epoch": 1.6429500278655025, "grad_norm": 0.17684800922870636, "learning_rate": 9.396070272046057e-06, "loss": 0.0052, "step": 221100 }, { "epoch": 1.6436931079323798, "grad_norm": 0.1254163533449173, "learning_rate": 9.37651549520517e-06, "loss": 0.0047, "step": 221200 }, { "epoch": 1.6444361879992568, "grad_norm": 0.0752619132399559, "learning_rate": 9.356960718364283e-06, "loss": 0.0054, "step": 221300 }, { "epoch": 1.6451792680661341, "grad_norm": 0.045857831835746765, "learning_rate": 9.337405941523396e-06, "loss": 0.0053, "step": 221400 }, { "epoch": 1.6459223481330114, "grad_norm": 0.04946961998939514, "learning_rate": 9.317851164682509e-06, "loss": 0.0058, "step": 221500 }, { "epoch": 1.6466654281998885, "grad_norm": 0.03132476657629013, "learning_rate": 9.298296387841622e-06, "loss": 0.0055, "step": 221600 }, { "epoch": 1.6474085082667658, "grad_norm": 0.010024027898907661, "learning_rate": 9.278741611000735e-06, "loss": 0.0049, "step": 221700 }, { "epoch": 1.648151588333643, "grad_norm": 0.06337239593267441, "learning_rate": 9.259186834159848e-06, "loss": 0.0049, "step": 221800 }, { "epoch": 1.6488946684005201, "grad_norm": 0.20246298611164093, "learning_rate": 9.239632057318963e-06, "loss": 0.0061, "step": 221900 }, { "epoch": 1.6496377484673974, "grad_norm": 0.16850683093070984, "learning_rate": 9.220077280478076e-06, "loss": 0.0054, "step": 222000 }, { "epoch": 1.6503808285342747, "grad_norm": 0.14984537661075592, "learning_rate": 9.20052250363719e-06, "loss": 0.0042, "step": 222100 }, { "epoch": 1.6511239086011518, "grad_norm": 0.04950297251343727, "learning_rate": 9.180967726796302e-06, "loss": 0.0052, "step": 222200 }, { "epoch": 1.6518669886680288, "grad_norm": 0.03353048861026764, "learning_rate": 9.161412949955415e-06, "loss": 0.0056, "step": 222300 }, { "epoch": 1.6526100687349063, "grad_norm": 0.029077833518385887, "learning_rate": 9.141858173114528e-06, "loss": 0.0051, "step": 222400 }, { "epoch": 1.6533531488017834, "grad_norm": 0.023820197209715843, "learning_rate": 9.122303396273642e-06, "loss": 0.0055, "step": 222500 }, { "epoch": 1.6540962288686605, "grad_norm": 0.03843036666512489, "learning_rate": 9.102748619432755e-06, "loss": 0.0044, "step": 222600 }, { "epoch": 1.6548393089355378, "grad_norm": 0.036830879747867584, "learning_rate": 9.08319384259187e-06, "loss": 0.004, "step": 222700 }, { "epoch": 1.655582389002415, "grad_norm": 0.04081841558218002, "learning_rate": 9.063639065750982e-06, "loss": 0.0066, "step": 222800 }, { "epoch": 1.6563254690692921, "grad_norm": 0.03358948975801468, "learning_rate": 9.044084288910096e-06, "loss": 0.0055, "step": 222900 }, { "epoch": 1.6570685491361694, "grad_norm": 0.020938681438565254, "learning_rate": 9.024529512069209e-06, "loss": 0.0059, "step": 223000 }, { "epoch": 1.6578116292030467, "grad_norm": 0.05733855813741684, "learning_rate": 9.004974735228322e-06, "loss": 0.0046, "step": 223100 }, { "epoch": 1.6585547092699238, "grad_norm": 0.11875060200691223, "learning_rate": 8.985419958387435e-06, "loss": 0.0057, "step": 223200 }, { "epoch": 1.659297789336801, "grad_norm": 0.05742805451154709, "learning_rate": 8.965865181546548e-06, "loss": 0.0053, "step": 223300 }, { "epoch": 1.6600408694036783, "grad_norm": 0.08136138319969177, "learning_rate": 8.946310404705661e-06, "loss": 0.0048, "step": 223400 }, { "epoch": 1.6607839494705554, "grad_norm": 0.010603212751448154, "learning_rate": 8.926755627864774e-06, "loss": 0.0045, "step": 223500 }, { "epoch": 1.6615270295374327, "grad_norm": 0.06397196650505066, "learning_rate": 8.907200851023889e-06, "loss": 0.0048, "step": 223600 }, { "epoch": 1.66227010960431, "grad_norm": 0.04967955872416496, "learning_rate": 8.887646074183002e-06, "loss": 0.0045, "step": 223700 }, { "epoch": 1.663013189671187, "grad_norm": 0.052691444754600525, "learning_rate": 8.868091297342115e-06, "loss": 0.0049, "step": 223800 }, { "epoch": 1.6637562697380641, "grad_norm": 0.05278610810637474, "learning_rate": 8.848536520501228e-06, "loss": 0.0058, "step": 223900 }, { "epoch": 1.6644993498049416, "grad_norm": 0.046934161335229874, "learning_rate": 8.828981743660341e-06, "loss": 0.0045, "step": 224000 }, { "epoch": 1.6652424298718187, "grad_norm": 0.04396077245473862, "learning_rate": 8.809426966819454e-06, "loss": 0.0052, "step": 224100 }, { "epoch": 1.6659855099386958, "grad_norm": 0.021269502118229866, "learning_rate": 8.789872189978569e-06, "loss": 0.0044, "step": 224200 }, { "epoch": 1.666728590005573, "grad_norm": 0.07213716953992844, "learning_rate": 8.770317413137682e-06, "loss": 0.0044, "step": 224300 }, { "epoch": 1.6674716700724503, "grad_norm": 0.034024130553007126, "learning_rate": 8.750762636296795e-06, "loss": 0.0057, "step": 224400 }, { "epoch": 1.6682147501393274, "grad_norm": 0.014567280188202858, "learning_rate": 8.731207859455908e-06, "loss": 0.0051, "step": 224500 }, { "epoch": 1.6689578302062047, "grad_norm": 0.03690924495458603, "learning_rate": 8.711653082615021e-06, "loss": 0.0038, "step": 224600 }, { "epoch": 1.669700910273082, "grad_norm": 0.043557681143283844, "learning_rate": 8.692098305774136e-06, "loss": 0.0049, "step": 224700 }, { "epoch": 1.670443990339959, "grad_norm": 0.03354122117161751, "learning_rate": 8.672543528933249e-06, "loss": 0.0046, "step": 224800 }, { "epoch": 1.6711870704068363, "grad_norm": 0.026770036667585373, "learning_rate": 8.652988752092362e-06, "loss": 0.0048, "step": 224900 }, { "epoch": 1.6719301504737136, "grad_norm": 0.035550884902477264, "learning_rate": 8.633433975251475e-06, "loss": 0.0054, "step": 225000 }, { "epoch": 1.6726732305405907, "grad_norm": 0.02436172217130661, "learning_rate": 8.613879198410588e-06, "loss": 0.0049, "step": 225100 }, { "epoch": 1.673416310607468, "grad_norm": 0.04377181455492973, "learning_rate": 8.594324421569701e-06, "loss": 0.0051, "step": 225200 }, { "epoch": 1.6741593906743453, "grad_norm": 0.02966817282140255, "learning_rate": 8.574769644728816e-06, "loss": 0.0051, "step": 225300 }, { "epoch": 1.6749024707412223, "grad_norm": 0.030720354989171028, "learning_rate": 8.555214867887929e-06, "loss": 0.004, "step": 225400 }, { "epoch": 1.6756455508080996, "grad_norm": 0.023681873455643654, "learning_rate": 8.535660091047042e-06, "loss": 0.004, "step": 225500 }, { "epoch": 1.676388630874977, "grad_norm": 0.044477224349975586, "learning_rate": 8.516105314206155e-06, "loss": 0.0048, "step": 225600 }, { "epoch": 1.677131710941854, "grad_norm": 0.22005289793014526, "learning_rate": 8.496550537365268e-06, "loss": 0.0058, "step": 225700 }, { "epoch": 1.677874791008731, "grad_norm": 0.04272909834980965, "learning_rate": 8.476995760524381e-06, "loss": 0.0042, "step": 225800 }, { "epoch": 1.6786178710756086, "grad_norm": 0.07483986020088196, "learning_rate": 8.457440983683494e-06, "loss": 0.0056, "step": 225900 }, { "epoch": 1.6793609511424856, "grad_norm": 0.20078082382678986, "learning_rate": 8.437886206842607e-06, "loss": 0.0041, "step": 226000 }, { "epoch": 1.6801040312093627, "grad_norm": 0.05233059078454971, "learning_rate": 8.418331430001722e-06, "loss": 0.0051, "step": 226100 }, { "epoch": 1.68084711127624, "grad_norm": 0.7786210775375366, "learning_rate": 8.398776653160835e-06, "loss": 0.0049, "step": 226200 }, { "epoch": 1.6815901913431173, "grad_norm": 0.14322350919246674, "learning_rate": 8.379221876319948e-06, "loss": 0.0051, "step": 226300 }, { "epoch": 1.6823332714099943, "grad_norm": 0.05533061549067497, "learning_rate": 8.359667099479061e-06, "loss": 0.0055, "step": 226400 }, { "epoch": 1.6830763514768716, "grad_norm": 0.02276042103767395, "learning_rate": 8.340112322638175e-06, "loss": 0.0056, "step": 226500 }, { "epoch": 1.683819431543749, "grad_norm": 0.04156886786222458, "learning_rate": 8.320557545797288e-06, "loss": 0.0051, "step": 226600 }, { "epoch": 1.684562511610626, "grad_norm": 0.027079258114099503, "learning_rate": 8.3010027689564e-06, "loss": 0.006, "step": 226700 }, { "epoch": 1.6853055916775033, "grad_norm": 0.16454586386680603, "learning_rate": 8.281447992115514e-06, "loss": 0.0058, "step": 226800 }, { "epoch": 1.6860486717443806, "grad_norm": 0.16317066550254822, "learning_rate": 8.261893215274627e-06, "loss": 0.0057, "step": 226900 }, { "epoch": 1.6867917518112576, "grad_norm": 0.04602869227528572, "learning_rate": 8.242338438433742e-06, "loss": 0.0049, "step": 227000 }, { "epoch": 1.687534831878135, "grad_norm": 0.0491253100335598, "learning_rate": 8.222783661592855e-06, "loss": 0.0057, "step": 227100 }, { "epoch": 1.6882779119450122, "grad_norm": 0.04212072491645813, "learning_rate": 8.203228884751968e-06, "loss": 0.0047, "step": 227200 }, { "epoch": 1.6890209920118893, "grad_norm": 0.0240557212382555, "learning_rate": 8.18367410791108e-06, "loss": 0.0045, "step": 227300 }, { "epoch": 1.6897640720787663, "grad_norm": 0.04941301420331001, "learning_rate": 8.164119331070194e-06, "loss": 0.0053, "step": 227400 }, { "epoch": 1.6905071521456438, "grad_norm": 0.1418868750333786, "learning_rate": 8.144564554229307e-06, "loss": 0.0052, "step": 227500 }, { "epoch": 1.691250232212521, "grad_norm": 0.12963849306106567, "learning_rate": 8.12500977738842e-06, "loss": 0.0055, "step": 227600 }, { "epoch": 1.691993312279398, "grad_norm": 0.04369397833943367, "learning_rate": 8.105455000547533e-06, "loss": 0.0053, "step": 227700 }, { "epoch": 1.6927363923462753, "grad_norm": 0.05911760404706001, "learning_rate": 8.085900223706648e-06, "loss": 0.0055, "step": 227800 }, { "epoch": 1.6934794724131526, "grad_norm": 0.01323460228741169, "learning_rate": 8.066345446865761e-06, "loss": 0.0051, "step": 227900 }, { "epoch": 1.6942225524800296, "grad_norm": 0.0923224687576294, "learning_rate": 8.046790670024874e-06, "loss": 0.0053, "step": 228000 }, { "epoch": 1.694965632546907, "grad_norm": 0.21094828844070435, "learning_rate": 8.027235893183987e-06, "loss": 0.0053, "step": 228100 }, { "epoch": 1.6957087126137842, "grad_norm": 0.0325453020632267, "learning_rate": 8.0076811163431e-06, "loss": 0.0054, "step": 228200 }, { "epoch": 1.6964517926806613, "grad_norm": 0.031888436526060104, "learning_rate": 7.988126339502213e-06, "loss": 0.0053, "step": 228300 }, { "epoch": 1.6971948727475386, "grad_norm": 0.025240691378712654, "learning_rate": 7.968571562661326e-06, "loss": 0.0046, "step": 228400 }, { "epoch": 1.6979379528144158, "grad_norm": 0.17278355360031128, "learning_rate": 7.94901678582044e-06, "loss": 0.0056, "step": 228500 }, { "epoch": 1.698681032881293, "grad_norm": 0.02541627176105976, "learning_rate": 7.929462008979552e-06, "loss": 0.005, "step": 228600 }, { "epoch": 1.6994241129481702, "grad_norm": 0.048963841050863266, "learning_rate": 7.909907232138667e-06, "loss": 0.005, "step": 228700 }, { "epoch": 1.7001671930150475, "grad_norm": 0.06952855736017227, "learning_rate": 7.89035245529778e-06, "loss": 0.0049, "step": 228800 }, { "epoch": 1.7009102730819246, "grad_norm": 0.04739735648036003, "learning_rate": 7.870797678456893e-06, "loss": 0.0052, "step": 228900 }, { "epoch": 1.7016533531488016, "grad_norm": 0.02605341374874115, "learning_rate": 7.851242901616006e-06, "loss": 0.0045, "step": 229000 }, { "epoch": 1.7023964332156791, "grad_norm": 0.0934947207570076, "learning_rate": 7.831688124775121e-06, "loss": 0.0054, "step": 229100 }, { "epoch": 1.7031395132825562, "grad_norm": 0.08638216555118561, "learning_rate": 7.812133347934234e-06, "loss": 0.0048, "step": 229200 }, { "epoch": 1.7038825933494333, "grad_norm": 0.0218040868639946, "learning_rate": 7.792578571093347e-06, "loss": 0.0057, "step": 229300 }, { "epoch": 1.7046256734163108, "grad_norm": 0.05169334635138512, "learning_rate": 7.77302379425246e-06, "loss": 0.0046, "step": 229400 }, { "epoch": 1.7053687534831878, "grad_norm": 0.031662795692682266, "learning_rate": 7.753469017411573e-06, "loss": 0.0051, "step": 229500 }, { "epoch": 1.706111833550065, "grad_norm": 0.13141417503356934, "learning_rate": 7.733914240570688e-06, "loss": 0.0048, "step": 229600 }, { "epoch": 1.7068549136169422, "grad_norm": 0.08911601454019547, "learning_rate": 7.714359463729801e-06, "loss": 0.0052, "step": 229700 }, { "epoch": 1.7075979936838195, "grad_norm": 0.04889464005827904, "learning_rate": 7.694804686888914e-06, "loss": 0.0057, "step": 229800 }, { "epoch": 1.7083410737506965, "grad_norm": 0.029642444103956223, "learning_rate": 7.675249910048027e-06, "loss": 0.0055, "step": 229900 }, { "epoch": 1.7090841538175738, "grad_norm": 0.04428066313266754, "learning_rate": 7.65569513320714e-06, "loss": 0.0055, "step": 230000 }, { "epoch": 1.7098272338844511, "grad_norm": 0.04574123024940491, "learning_rate": 7.636140356366254e-06, "loss": 0.0049, "step": 230100 }, { "epoch": 1.7105703139513282, "grad_norm": 0.048797763884067535, "learning_rate": 7.616585579525367e-06, "loss": 0.0053, "step": 230200 }, { "epoch": 1.7113133940182055, "grad_norm": 0.04924425110220909, "learning_rate": 7.5970308026844805e-06, "loss": 0.0051, "step": 230300 }, { "epoch": 1.7120564740850828, "grad_norm": 0.05206593871116638, "learning_rate": 7.577476025843594e-06, "loss": 0.006, "step": 230400 }, { "epoch": 1.7127995541519598, "grad_norm": 0.04372726008296013, "learning_rate": 7.557921249002707e-06, "loss": 0.0045, "step": 230500 }, { "epoch": 1.7135426342188371, "grad_norm": 0.04597911611199379, "learning_rate": 7.538366472161821e-06, "loss": 0.0061, "step": 230600 }, { "epoch": 1.7142857142857144, "grad_norm": 0.6127720475196838, "learning_rate": 7.518811695320934e-06, "loss": 0.0068, "step": 230700 }, { "epoch": 1.7150287943525915, "grad_norm": 0.08404143899679184, "learning_rate": 7.499256918480047e-06, "loss": 0.0048, "step": 230800 }, { "epoch": 1.7157718744194685, "grad_norm": 0.0474470853805542, "learning_rate": 7.47970214163916e-06, "loss": 0.0041, "step": 230900 }, { "epoch": 1.716514954486346, "grad_norm": 0.22970625758171082, "learning_rate": 7.460147364798273e-06, "loss": 0.0054, "step": 231000 }, { "epoch": 1.7172580345532231, "grad_norm": 0.018522148951888084, "learning_rate": 7.440592587957386e-06, "loss": 0.0048, "step": 231100 }, { "epoch": 1.7180011146201002, "grad_norm": 0.14452296495437622, "learning_rate": 7.421037811116501e-06, "loss": 0.0053, "step": 231200 }, { "epoch": 1.7187441946869775, "grad_norm": 0.081121064722538, "learning_rate": 7.401483034275614e-06, "loss": 0.0045, "step": 231300 }, { "epoch": 1.7194872747538548, "grad_norm": 0.10330303758382797, "learning_rate": 7.381928257434727e-06, "loss": 0.0049, "step": 231400 }, { "epoch": 1.7202303548207318, "grad_norm": 0.1636040061712265, "learning_rate": 7.36237348059384e-06, "loss": 0.007, "step": 231500 }, { "epoch": 1.7209734348876091, "grad_norm": 0.025932984426617622, "learning_rate": 7.342818703752953e-06, "loss": 0.0058, "step": 231600 }, { "epoch": 1.7217165149544864, "grad_norm": 0.03211952745914459, "learning_rate": 7.323263926912066e-06, "loss": 0.0056, "step": 231700 }, { "epoch": 1.7224595950213635, "grad_norm": 0.06997215747833252, "learning_rate": 7.303709150071179e-06, "loss": 0.0058, "step": 231800 }, { "epoch": 1.7232026750882408, "grad_norm": 0.24075594544410706, "learning_rate": 7.284154373230292e-06, "loss": 0.0045, "step": 231900 }, { "epoch": 1.723945755155118, "grad_norm": 0.0370822511613369, "learning_rate": 7.264599596389407e-06, "loss": 0.0063, "step": 232000 }, { "epoch": 1.7246888352219951, "grad_norm": 0.12147554010152817, "learning_rate": 7.24504481954852e-06, "loss": 0.0048, "step": 232100 }, { "epoch": 1.7254319152888724, "grad_norm": 0.18551839888095856, "learning_rate": 7.225490042707633e-06, "loss": 0.0058, "step": 232200 }, { "epoch": 1.7261749953557497, "grad_norm": 0.020847799256443977, "learning_rate": 7.205935265866746e-06, "loss": 0.0049, "step": 232300 }, { "epoch": 1.7269180754226268, "grad_norm": 0.024434233084321022, "learning_rate": 7.186380489025859e-06, "loss": 0.0042, "step": 232400 }, { "epoch": 1.7276611554895038, "grad_norm": 0.09065721929073334, "learning_rate": 7.166825712184972e-06, "loss": 0.0045, "step": 232500 }, { "epoch": 1.7284042355563813, "grad_norm": 0.0461307018995285, "learning_rate": 7.147270935344086e-06, "loss": 0.0056, "step": 232600 }, { "epoch": 1.7291473156232584, "grad_norm": 0.11477819830179214, "learning_rate": 7.127716158503199e-06, "loss": 0.005, "step": 232700 }, { "epoch": 1.7298903956901355, "grad_norm": 0.027914393693208694, "learning_rate": 7.108161381662312e-06, "loss": 0.0055, "step": 232800 }, { "epoch": 1.730633475757013, "grad_norm": 0.04149572178721428, "learning_rate": 7.088606604821426e-06, "loss": 0.004, "step": 232900 }, { "epoch": 1.73137655582389, "grad_norm": 0.04971208795905113, "learning_rate": 7.06905182798054e-06, "loss": 0.005, "step": 233000 }, { "epoch": 1.7321196358907671, "grad_norm": 0.021336160600185394, "learning_rate": 7.049497051139653e-06, "loss": 0.0039, "step": 233100 }, { "epoch": 1.7328627159576444, "grad_norm": 0.02815554104745388, "learning_rate": 7.029942274298766e-06, "loss": 0.0046, "step": 233200 }, { "epoch": 1.7336057960245217, "grad_norm": 0.04150295630097389, "learning_rate": 7.0103874974578794e-06, "loss": 0.0053, "step": 233300 }, { "epoch": 1.7343488760913988, "grad_norm": 0.04658789932727814, "learning_rate": 6.9908327206169925e-06, "loss": 0.0042, "step": 233400 }, { "epoch": 1.735091956158276, "grad_norm": 0.02029784955084324, "learning_rate": 6.971277943776106e-06, "loss": 0.0051, "step": 233500 }, { "epoch": 1.7358350362251533, "grad_norm": 0.03277648612856865, "learning_rate": 6.951723166935219e-06, "loss": 0.0053, "step": 233600 }, { "epoch": 1.7365781162920304, "grad_norm": 0.11073578894138336, "learning_rate": 6.932168390094333e-06, "loss": 0.0053, "step": 233700 }, { "epoch": 1.7373211963589077, "grad_norm": 0.18297845125198364, "learning_rate": 6.9126136132534465e-06, "loss": 0.0058, "step": 233800 }, { "epoch": 1.738064276425785, "grad_norm": 0.03367098420858383, "learning_rate": 6.8930588364125596e-06, "loss": 0.005, "step": 233900 }, { "epoch": 1.738807356492662, "grad_norm": 0.24794617295265198, "learning_rate": 6.873504059571673e-06, "loss": 0.0051, "step": 234000 }, { "epoch": 1.7395504365595393, "grad_norm": 0.05270561948418617, "learning_rate": 6.853949282730786e-06, "loss": 0.0048, "step": 234100 }, { "epoch": 1.7402935166264166, "grad_norm": 0.017965659499168396, "learning_rate": 6.834394505889899e-06, "loss": 0.0053, "step": 234200 }, { "epoch": 1.7410365966932937, "grad_norm": 0.20487599074840546, "learning_rate": 6.814839729049012e-06, "loss": 0.0047, "step": 234300 }, { "epoch": 1.7417796767601708, "grad_norm": 0.06398738920688629, "learning_rate": 6.795284952208125e-06, "loss": 0.005, "step": 234400 }, { "epoch": 1.7425227568270483, "grad_norm": 0.0466417595744133, "learning_rate": 6.775730175367238e-06, "loss": 0.0051, "step": 234500 }, { "epoch": 1.7432658368939253, "grad_norm": 0.13171352446079254, "learning_rate": 6.756175398526353e-06, "loss": 0.0047, "step": 234600 }, { "epoch": 1.7440089169608024, "grad_norm": 0.06774920225143433, "learning_rate": 6.736620621685466e-06, "loss": 0.0051, "step": 234700 }, { "epoch": 1.7447519970276797, "grad_norm": 0.1605709046125412, "learning_rate": 6.717065844844579e-06, "loss": 0.0058, "step": 234800 }, { "epoch": 1.745495077094557, "grad_norm": 0.038702718913555145, "learning_rate": 6.697511068003692e-06, "loss": 0.0039, "step": 234900 }, { "epoch": 1.746238157161434, "grad_norm": 0.06873560696840286, "learning_rate": 6.677956291162806e-06, "loss": 0.0052, "step": 235000 }, { "epoch": 1.7469812372283113, "grad_norm": 0.0755314826965332, "learning_rate": 6.658401514321919e-06, "loss": 0.0055, "step": 235100 }, { "epoch": 1.7477243172951886, "grad_norm": 0.04939044639468193, "learning_rate": 6.638846737481032e-06, "loss": 0.0041, "step": 235200 }, { "epoch": 1.7484673973620657, "grad_norm": 0.1185731440782547, "learning_rate": 6.619291960640145e-06, "loss": 0.0048, "step": 235300 }, { "epoch": 1.749210477428943, "grad_norm": 0.04097692295908928, "learning_rate": 6.599737183799259e-06, "loss": 0.0055, "step": 235400 }, { "epoch": 1.7499535574958203, "grad_norm": 0.025320908054709435, "learning_rate": 6.580182406958373e-06, "loss": 0.0055, "step": 235500 }, { "epoch": 1.7506966375626973, "grad_norm": 0.04844525828957558, "learning_rate": 6.560627630117486e-06, "loss": 0.0045, "step": 235600 }, { "epoch": 1.7514397176295746, "grad_norm": 0.0313161164522171, "learning_rate": 6.541072853276599e-06, "loss": 0.0055, "step": 235700 }, { "epoch": 1.752182797696452, "grad_norm": 0.033600348979234695, "learning_rate": 6.521518076435712e-06, "loss": 0.0048, "step": 235800 }, { "epoch": 1.752925877763329, "grad_norm": 0.1261982023715973, "learning_rate": 6.501963299594825e-06, "loss": 0.0048, "step": 235900 }, { "epoch": 1.753668957830206, "grad_norm": 0.04154984652996063, "learning_rate": 6.482408522753938e-06, "loss": 0.0052, "step": 236000 }, { "epoch": 1.7544120378970836, "grad_norm": 0.030159613117575645, "learning_rate": 6.462853745913051e-06, "loss": 0.0038, "step": 236100 }, { "epoch": 1.7551551179639606, "grad_norm": 0.04945112392306328, "learning_rate": 6.443298969072164e-06, "loss": 0.0052, "step": 236200 }, { "epoch": 1.7558981980308377, "grad_norm": 0.04720941185951233, "learning_rate": 6.423744192231279e-06, "loss": 0.0046, "step": 236300 }, { "epoch": 1.756641278097715, "grad_norm": 0.0515705943107605, "learning_rate": 6.404189415390392e-06, "loss": 0.0049, "step": 236400 }, { "epoch": 1.7573843581645923, "grad_norm": 0.026389721781015396, "learning_rate": 6.384634638549505e-06, "loss": 0.0054, "step": 236500 }, { "epoch": 1.7581274382314693, "grad_norm": 0.05687859654426575, "learning_rate": 6.365079861708618e-06, "loss": 0.0052, "step": 236600 }, { "epoch": 1.7588705182983466, "grad_norm": 0.0775335431098938, "learning_rate": 6.3455250848677315e-06, "loss": 0.0041, "step": 236700 }, { "epoch": 1.759613598365224, "grad_norm": 0.027409590780735016, "learning_rate": 6.3259703080268445e-06, "loss": 0.0048, "step": 236800 }, { "epoch": 1.760356678432101, "grad_norm": 0.022704722359776497, "learning_rate": 6.306415531185958e-06, "loss": 0.0056, "step": 236900 }, { "epoch": 1.7610997584989783, "grad_norm": 0.15620560944080353, "learning_rate": 6.286860754345071e-06, "loss": 0.0055, "step": 237000 }, { "epoch": 1.7618428385658556, "grad_norm": 0.06694124639034271, "learning_rate": 6.2673059775041854e-06, "loss": 0.0055, "step": 237100 }, { "epoch": 1.7625859186327326, "grad_norm": 0.03594265505671501, "learning_rate": 6.247751200663298e-06, "loss": 0.0054, "step": 237200 }, { "epoch": 1.76332899869961, "grad_norm": 0.1489478200674057, "learning_rate": 6.228196423822412e-06, "loss": 0.0054, "step": 237300 }, { "epoch": 1.7640720787664872, "grad_norm": 0.04664010554552078, "learning_rate": 6.208641646981525e-06, "loss": 0.0056, "step": 237400 }, { "epoch": 1.7648151588333643, "grad_norm": 0.03877691552042961, "learning_rate": 6.1890868701406386e-06, "loss": 0.0058, "step": 237500 }, { "epoch": 1.7655582389002415, "grad_norm": 0.06904025375843048, "learning_rate": 6.169532093299752e-06, "loss": 0.0053, "step": 237600 }, { "epoch": 1.7663013189671188, "grad_norm": 0.02214609645307064, "learning_rate": 6.1499773164588656e-06, "loss": 0.0051, "step": 237700 }, { "epoch": 1.767044399033996, "grad_norm": 0.10123707354068756, "learning_rate": 6.130422539617979e-06, "loss": 0.0047, "step": 237800 }, { "epoch": 1.767787479100873, "grad_norm": 0.09619080275297165, "learning_rate": 6.110867762777092e-06, "loss": 0.005, "step": 237900 }, { "epoch": 1.7685305591677505, "grad_norm": 0.04669838771224022, "learning_rate": 6.091312985936205e-06, "loss": 0.0057, "step": 238000 }, { "epoch": 1.7692736392346275, "grad_norm": 0.1163201779127121, "learning_rate": 6.071758209095318e-06, "loss": 0.0046, "step": 238100 }, { "epoch": 1.7700167193015046, "grad_norm": 0.042518991976976395, "learning_rate": 6.052203432254432e-06, "loss": 0.004, "step": 238200 }, { "epoch": 1.770759799368382, "grad_norm": 0.025644797831773758, "learning_rate": 6.032648655413545e-06, "loss": 0.0049, "step": 238300 }, { "epoch": 1.7715028794352592, "grad_norm": 0.009769764728844166, "learning_rate": 6.013093878572658e-06, "loss": 0.005, "step": 238400 }, { "epoch": 1.7722459595021363, "grad_norm": 0.018596921116113663, "learning_rate": 5.993539101731771e-06, "loss": 0.0048, "step": 238500 }, { "epoch": 1.7729890395690135, "grad_norm": 0.028875894844532013, "learning_rate": 5.973984324890885e-06, "loss": 0.0044, "step": 238600 }, { "epoch": 1.7737321196358908, "grad_norm": 0.01555460412055254, "learning_rate": 5.954429548049998e-06, "loss": 0.0055, "step": 238700 }, { "epoch": 1.774475199702768, "grad_norm": 0.17104195058345795, "learning_rate": 5.934874771209111e-06, "loss": 0.0048, "step": 238800 }, { "epoch": 1.7752182797696452, "grad_norm": 0.03551209345459938, "learning_rate": 5.915319994368224e-06, "loss": 0.0049, "step": 238900 }, { "epoch": 1.7759613598365225, "grad_norm": 0.048794254660606384, "learning_rate": 5.895765217527338e-06, "loss": 0.0055, "step": 239000 }, { "epoch": 1.7767044399033995, "grad_norm": 0.0484178401529789, "learning_rate": 5.876210440686451e-06, "loss": 0.0043, "step": 239100 }, { "epoch": 1.7774475199702768, "grad_norm": 0.04048439860343933, "learning_rate": 5.856655663845564e-06, "loss": 0.0055, "step": 239200 }, { "epoch": 1.7781906000371541, "grad_norm": 0.06872911006212234, "learning_rate": 5.837100887004677e-06, "loss": 0.0045, "step": 239300 }, { "epoch": 1.7789336801040312, "grad_norm": 0.023430217057466507, "learning_rate": 5.817546110163791e-06, "loss": 0.0052, "step": 239400 }, { "epoch": 1.7796767601709083, "grad_norm": 0.11198686808347702, "learning_rate": 5.797991333322904e-06, "loss": 0.0053, "step": 239500 }, { "epoch": 1.7804198402377858, "grad_norm": 0.10408621281385422, "learning_rate": 5.778436556482017e-06, "loss": 0.0051, "step": 239600 }, { "epoch": 1.7811629203046628, "grad_norm": 0.06419600546360016, "learning_rate": 5.758881779641131e-06, "loss": 0.0052, "step": 239700 }, { "epoch": 1.78190600037154, "grad_norm": 0.06915321946144104, "learning_rate": 5.739327002800244e-06, "loss": 0.005, "step": 239800 }, { "epoch": 1.7826490804384172, "grad_norm": 0.06673683226108551, "learning_rate": 5.719772225959358e-06, "loss": 0.0051, "step": 239900 }, { "epoch": 1.7833921605052945, "grad_norm": 0.04792710766196251, "learning_rate": 5.700217449118471e-06, "loss": 0.0045, "step": 240000 }, { "epoch": 1.7841352405721715, "grad_norm": 0.09374753385782242, "learning_rate": 5.680662672277584e-06, "loss": 0.005, "step": 240100 }, { "epoch": 1.7848783206390488, "grad_norm": 0.02111654356122017, "learning_rate": 5.661107895436697e-06, "loss": 0.0058, "step": 240200 }, { "epoch": 1.7856214007059261, "grad_norm": 0.0354173369705677, "learning_rate": 5.641553118595811e-06, "loss": 0.0043, "step": 240300 }, { "epoch": 1.7863644807728032, "grad_norm": 0.033833883702754974, "learning_rate": 5.621998341754924e-06, "loss": 0.0048, "step": 240400 }, { "epoch": 1.7871075608396805, "grad_norm": 0.12816233932971954, "learning_rate": 5.6024435649140375e-06, "loss": 0.0045, "step": 240500 }, { "epoch": 1.7878506409065578, "grad_norm": 0.02490169368684292, "learning_rate": 5.5828887880731505e-06, "loss": 0.0043, "step": 240600 }, { "epoch": 1.7885937209734348, "grad_norm": 0.01630701869726181, "learning_rate": 5.5633340112322645e-06, "loss": 0.0044, "step": 240700 }, { "epoch": 1.7893368010403121, "grad_norm": 0.04624038189649582, "learning_rate": 5.5437792343913775e-06, "loss": 0.0046, "step": 240800 }, { "epoch": 1.7900798811071894, "grad_norm": 0.04983685538172722, "learning_rate": 5.524224457550491e-06, "loss": 0.0046, "step": 240900 }, { "epoch": 1.7908229611740665, "grad_norm": 0.08381535112857819, "learning_rate": 5.504669680709604e-06, "loss": 0.0054, "step": 241000 }, { "epoch": 1.7915660412409438, "grad_norm": 0.02518174611032009, "learning_rate": 5.485114903868718e-06, "loss": 0.0051, "step": 241100 }, { "epoch": 1.792309121307821, "grad_norm": 0.13042321801185608, "learning_rate": 5.465560127027831e-06, "loss": 0.0063, "step": 241200 }, { "epoch": 1.7930522013746981, "grad_norm": 0.06890133023262024, "learning_rate": 5.446005350186944e-06, "loss": 0.0038, "step": 241300 }, { "epoch": 1.7937952814415752, "grad_norm": 0.09061165153980255, "learning_rate": 5.426450573346057e-06, "loss": 0.0056, "step": 241400 }, { "epoch": 1.7945383615084527, "grad_norm": 0.02572914958000183, "learning_rate": 5.406895796505171e-06, "loss": 0.0052, "step": 241500 }, { "epoch": 1.7952814415753298, "grad_norm": 0.04056168347597122, "learning_rate": 5.387341019664284e-06, "loss": 0.0043, "step": 241600 }, { "epoch": 1.7960245216422068, "grad_norm": 0.10382635146379471, "learning_rate": 5.367786242823397e-06, "loss": 0.0057, "step": 241700 }, { "epoch": 1.7967676017090841, "grad_norm": 0.07566390931606293, "learning_rate": 5.34823146598251e-06, "loss": 0.0053, "step": 241800 }, { "epoch": 1.7975106817759614, "grad_norm": 0.24152418971061707, "learning_rate": 5.328676689141624e-06, "loss": 0.0054, "step": 241900 }, { "epoch": 1.7982537618428385, "grad_norm": 0.08508574217557907, "learning_rate": 5.309121912300737e-06, "loss": 0.004, "step": 242000 }, { "epoch": 1.7989968419097158, "grad_norm": 0.19049149751663208, "learning_rate": 5.28956713545985e-06, "loss": 0.0057, "step": 242100 }, { "epoch": 1.799739921976593, "grad_norm": 0.04635800048708916, "learning_rate": 5.270012358618964e-06, "loss": 0.0056, "step": 242200 }, { "epoch": 1.8004830020434701, "grad_norm": 0.053433556109666824, "learning_rate": 5.250457581778077e-06, "loss": 0.0052, "step": 242300 }, { "epoch": 1.8012260821103474, "grad_norm": 0.061652008444070816, "learning_rate": 5.230902804937191e-06, "loss": 0.0053, "step": 242400 }, { "epoch": 1.8019691621772247, "grad_norm": 0.04306279122829437, "learning_rate": 5.211348028096304e-06, "loss": 0.0054, "step": 242500 }, { "epoch": 1.8027122422441018, "grad_norm": 0.041412487626075745, "learning_rate": 5.191793251255417e-06, "loss": 0.0051, "step": 242600 }, { "epoch": 1.803455322310979, "grad_norm": 0.13583093881607056, "learning_rate": 5.17223847441453e-06, "loss": 0.005, "step": 242700 }, { "epoch": 1.8041984023778563, "grad_norm": 0.248240664601326, "learning_rate": 5.152683697573644e-06, "loss": 0.0049, "step": 242800 }, { "epoch": 1.8049414824447334, "grad_norm": 0.0316799059510231, "learning_rate": 5.133128920732757e-06, "loss": 0.0048, "step": 242900 }, { "epoch": 1.8056845625116105, "grad_norm": 0.032101090997457504, "learning_rate": 5.11357414389187e-06, "loss": 0.004, "step": 243000 }, { "epoch": 1.806427642578488, "grad_norm": 0.03877284377813339, "learning_rate": 5.094019367050983e-06, "loss": 0.0041, "step": 243100 }, { "epoch": 1.807170722645365, "grad_norm": 0.04548173397779465, "learning_rate": 5.074464590210097e-06, "loss": 0.0047, "step": 243200 }, { "epoch": 1.8079138027122421, "grad_norm": 0.024352988228201866, "learning_rate": 5.05490981336921e-06, "loss": 0.0042, "step": 243300 }, { "epoch": 1.8086568827791194, "grad_norm": 0.035113293677568436, "learning_rate": 5.035355036528323e-06, "loss": 0.0045, "step": 243400 }, { "epoch": 1.8093999628459967, "grad_norm": 0.21953994035720825, "learning_rate": 5.015800259687436e-06, "loss": 0.0047, "step": 243500 }, { "epoch": 1.8101430429128738, "grad_norm": 0.045406751334667206, "learning_rate": 4.99624548284655e-06, "loss": 0.0048, "step": 243600 }, { "epoch": 1.810886122979751, "grad_norm": 0.0787637010216713, "learning_rate": 4.976690706005663e-06, "loss": 0.0058, "step": 243700 }, { "epoch": 1.8116292030466283, "grad_norm": 0.04999193921685219, "learning_rate": 4.9571359291647764e-06, "loss": 0.0046, "step": 243800 }, { "epoch": 1.8123722831135054, "grad_norm": 0.4374842643737793, "learning_rate": 4.9375811523238895e-06, "loss": 0.005, "step": 243900 }, { "epoch": 1.8131153631803827, "grad_norm": 0.06652720272541046, "learning_rate": 4.9180263754830026e-06, "loss": 0.0051, "step": 244000 }, { "epoch": 1.81385844324726, "grad_norm": 0.01788109727203846, "learning_rate": 4.8984715986421165e-06, "loss": 0.0047, "step": 244100 }, { "epoch": 1.814601523314137, "grad_norm": 0.13742923736572266, "learning_rate": 4.8789168218012296e-06, "loss": 0.0056, "step": 244200 }, { "epoch": 1.8153446033810143, "grad_norm": 0.3929288387298584, "learning_rate": 4.859362044960343e-06, "loss": 0.0048, "step": 244300 }, { "epoch": 1.8160876834478916, "grad_norm": 0.06493549048900604, "learning_rate": 4.8398072681194565e-06, "loss": 0.005, "step": 244400 }, { "epoch": 1.8168307635147687, "grad_norm": 0.044479187577962875, "learning_rate": 4.82025249127857e-06, "loss": 0.005, "step": 244500 }, { "epoch": 1.817573843581646, "grad_norm": 0.027194419875741005, "learning_rate": 4.8006977144376835e-06, "loss": 0.0046, "step": 244600 }, { "epoch": 1.8183169236485233, "grad_norm": 0.10379000753164291, "learning_rate": 4.781142937596797e-06, "loss": 0.0047, "step": 244700 }, { "epoch": 1.8190600037154003, "grad_norm": 0.08378485590219498, "learning_rate": 4.76158816075591e-06, "loss": 0.0046, "step": 244800 }, { "epoch": 1.8198030837822774, "grad_norm": 0.024432741105556488, "learning_rate": 4.742033383915024e-06, "loss": 0.0046, "step": 244900 }, { "epoch": 1.820546163849155, "grad_norm": 0.12859566509723663, "learning_rate": 4.722478607074137e-06, "loss": 0.0045, "step": 245000 }, { "epoch": 1.821289243916032, "grad_norm": 0.02252841554582119, "learning_rate": 4.70292383023325e-06, "loss": 0.0059, "step": 245100 }, { "epoch": 1.822032323982909, "grad_norm": 0.0320403091609478, "learning_rate": 4.683369053392363e-06, "loss": 0.0054, "step": 245200 }, { "epoch": 1.8227754040497863, "grad_norm": 0.03936221823096275, "learning_rate": 4.663814276551477e-06, "loss": 0.0052, "step": 245300 }, { "epoch": 1.8235184841166636, "grad_norm": 0.04148055613040924, "learning_rate": 4.64425949971059e-06, "loss": 0.0047, "step": 245400 }, { "epoch": 1.8242615641835407, "grad_norm": 0.06604262441396713, "learning_rate": 4.624704722869703e-06, "loss": 0.005, "step": 245500 }, { "epoch": 1.825004644250418, "grad_norm": 0.058710772544145584, "learning_rate": 4.605149946028816e-06, "loss": 0.0051, "step": 245600 }, { "epoch": 1.8257477243172953, "grad_norm": 0.06030699238181114, "learning_rate": 4.585595169187929e-06, "loss": 0.0044, "step": 245700 }, { "epoch": 1.8264908043841723, "grad_norm": 0.21942508220672607, "learning_rate": 4.566040392347043e-06, "loss": 0.0047, "step": 245800 }, { "epoch": 1.8272338844510496, "grad_norm": 0.02845887281000614, "learning_rate": 4.546485615506156e-06, "loss": 0.0049, "step": 245900 }, { "epoch": 1.827976964517927, "grad_norm": 0.046648405492305756, "learning_rate": 4.526930838665269e-06, "loss": 0.0051, "step": 246000 }, { "epoch": 1.828720044584804, "grad_norm": 0.02161862887442112, "learning_rate": 4.507376061824382e-06, "loss": 0.0046, "step": 246100 }, { "epoch": 1.8294631246516813, "grad_norm": 0.11451492458581924, "learning_rate": 4.487821284983496e-06, "loss": 0.0056, "step": 246200 }, { "epoch": 1.8302062047185585, "grad_norm": 0.041294295340776443, "learning_rate": 4.468266508142609e-06, "loss": 0.0041, "step": 246300 }, { "epoch": 1.8309492847854356, "grad_norm": 0.050578564405441284, "learning_rate": 4.448711731301722e-06, "loss": 0.0047, "step": 246400 }, { "epoch": 1.8316923648523127, "grad_norm": 0.04059571772813797, "learning_rate": 4.429156954460835e-06, "loss": 0.0049, "step": 246500 }, { "epoch": 1.8324354449191902, "grad_norm": 0.025419902056455612, "learning_rate": 4.409602177619949e-06, "loss": 0.0055, "step": 246600 }, { "epoch": 1.8331785249860673, "grad_norm": 0.034540656954050064, "learning_rate": 4.390047400779062e-06, "loss": 0.005, "step": 246700 }, { "epoch": 1.8339216050529443, "grad_norm": 0.05915455520153046, "learning_rate": 4.370492623938176e-06, "loss": 0.0055, "step": 246800 }, { "epoch": 1.8346646851198216, "grad_norm": 0.0917709618806839, "learning_rate": 4.350937847097289e-06, "loss": 0.0049, "step": 246900 }, { "epoch": 1.835407765186699, "grad_norm": 0.05310334637761116, "learning_rate": 4.331383070256403e-06, "loss": 0.004, "step": 247000 }, { "epoch": 1.836150845253576, "grad_norm": 0.046267442405223846, "learning_rate": 4.311828293415516e-06, "loss": 0.006, "step": 247100 }, { "epoch": 1.8368939253204533, "grad_norm": 0.02276589348912239, "learning_rate": 4.292273516574629e-06, "loss": 0.004, "step": 247200 }, { "epoch": 1.8376370053873305, "grad_norm": 0.04449063166975975, "learning_rate": 4.272718739733742e-06, "loss": 0.0051, "step": 247300 }, { "epoch": 1.8383800854542076, "grad_norm": 0.044869549572467804, "learning_rate": 4.253163962892856e-06, "loss": 0.0056, "step": 247400 }, { "epoch": 1.839123165521085, "grad_norm": 0.07758212089538574, "learning_rate": 4.233609186051969e-06, "loss": 0.0042, "step": 247500 }, { "epoch": 1.8398662455879622, "grad_norm": 0.026690185070037842, "learning_rate": 4.2140544092110824e-06, "loss": 0.0047, "step": 247600 }, { "epoch": 1.8406093256548393, "grad_norm": 0.030758585780858994, "learning_rate": 4.1944996323701955e-06, "loss": 0.005, "step": 247700 }, { "epoch": 1.8413524057217165, "grad_norm": 0.05363529548048973, "learning_rate": 4.1749448555293086e-06, "loss": 0.0052, "step": 247800 }, { "epoch": 1.8420954857885938, "grad_norm": 0.030742377042770386, "learning_rate": 4.1553900786884225e-06, "loss": 0.0043, "step": 247900 }, { "epoch": 1.842838565855471, "grad_norm": 0.35409632325172424, "learning_rate": 4.1358353018475356e-06, "loss": 0.0054, "step": 248000 }, { "epoch": 1.843581645922348, "grad_norm": 0.03932475671172142, "learning_rate": 4.116280525006649e-06, "loss": 0.0047, "step": 248100 }, { "epoch": 1.8443247259892255, "grad_norm": 0.2364361584186554, "learning_rate": 4.096725748165762e-06, "loss": 0.0057, "step": 248200 }, { "epoch": 1.8450678060561025, "grad_norm": 0.2554910182952881, "learning_rate": 4.077170971324876e-06, "loss": 0.0049, "step": 248300 }, { "epoch": 1.8458108861229796, "grad_norm": 0.06088513508439064, "learning_rate": 4.057616194483989e-06, "loss": 0.0046, "step": 248400 }, { "epoch": 1.8465539661898571, "grad_norm": 0.03709198161959648, "learning_rate": 4.038061417643102e-06, "loss": 0.0064, "step": 248500 }, { "epoch": 1.8472970462567342, "grad_norm": 0.03819173201918602, "learning_rate": 4.018506640802215e-06, "loss": 0.0053, "step": 248600 }, { "epoch": 1.8480401263236113, "grad_norm": 0.0538676455616951, "learning_rate": 3.998951863961329e-06, "loss": 0.0044, "step": 248700 }, { "epoch": 1.8487832063904885, "grad_norm": 0.1460949033498764, "learning_rate": 3.979397087120442e-06, "loss": 0.0049, "step": 248800 }, { "epoch": 1.8495262864573658, "grad_norm": 0.08562401682138443, "learning_rate": 3.959842310279555e-06, "loss": 0.0054, "step": 248900 }, { "epoch": 1.850269366524243, "grad_norm": 0.06690574437379837, "learning_rate": 3.940287533438668e-06, "loss": 0.0052, "step": 249000 }, { "epoch": 1.8510124465911202, "grad_norm": 0.019072456285357475, "learning_rate": 3.920732756597782e-06, "loss": 0.005, "step": 249100 }, { "epoch": 1.8517555266579975, "grad_norm": 0.032566945999860764, "learning_rate": 3.901177979756895e-06, "loss": 0.0046, "step": 249200 }, { "epoch": 1.8524986067248745, "grad_norm": 0.03856411948800087, "learning_rate": 3.881623202916009e-06, "loss": 0.0047, "step": 249300 }, { "epoch": 1.8532416867917518, "grad_norm": 0.11599314212799072, "learning_rate": 3.862068426075122e-06, "loss": 0.0057, "step": 249400 }, { "epoch": 1.8539847668586291, "grad_norm": 0.025156550109386444, "learning_rate": 3.842513649234235e-06, "loss": 0.0046, "step": 249500 }, { "epoch": 1.8547278469255062, "grad_norm": 0.03131775185465813, "learning_rate": 3.822958872393349e-06, "loss": 0.0048, "step": 249600 }, { "epoch": 1.8554709269923835, "grad_norm": 0.12264790385961533, "learning_rate": 3.8034040955524616e-06, "loss": 0.005, "step": 249700 }, { "epoch": 1.8562140070592608, "grad_norm": 0.035065941512584686, "learning_rate": 3.783849318711575e-06, "loss": 0.0044, "step": 249800 }, { "epoch": 1.8569570871261378, "grad_norm": 0.028623079881072044, "learning_rate": 3.764294541870688e-06, "loss": 0.0049, "step": 249900 }, { "epoch": 1.857700167193015, "grad_norm": 0.03929667919874191, "learning_rate": 3.744739765029802e-06, "loss": 0.0045, "step": 250000 }, { "epoch": 1.8584432472598924, "grad_norm": 0.09318127483129501, "learning_rate": 3.725184988188915e-06, "loss": 0.0052, "step": 250100 }, { "epoch": 1.8591863273267695, "grad_norm": 0.22448211908340454, "learning_rate": 3.705630211348028e-06, "loss": 0.0049, "step": 250200 }, { "epoch": 1.8599294073936465, "grad_norm": 0.11018132418394089, "learning_rate": 3.6860754345071413e-06, "loss": 0.0049, "step": 250300 }, { "epoch": 1.8606724874605238, "grad_norm": 0.030690088868141174, "learning_rate": 3.666520657666255e-06, "loss": 0.0045, "step": 250400 }, { "epoch": 1.8614155675274011, "grad_norm": 0.05016700550913811, "learning_rate": 3.6469658808253683e-06, "loss": 0.0043, "step": 250500 }, { "epoch": 1.8621586475942782, "grad_norm": 0.08158694952726364, "learning_rate": 3.6274111039844813e-06, "loss": 0.0046, "step": 250600 }, { "epoch": 1.8629017276611555, "grad_norm": 0.037295904010534286, "learning_rate": 3.6078563271435944e-06, "loss": 0.005, "step": 250700 }, { "epoch": 1.8636448077280328, "grad_norm": 0.03230149671435356, "learning_rate": 3.5883015503027083e-06, "loss": 0.0047, "step": 250800 }, { "epoch": 1.8643878877949098, "grad_norm": 0.09846612811088562, "learning_rate": 3.5687467734618214e-06, "loss": 0.0063, "step": 250900 }, { "epoch": 1.8651309678617871, "grad_norm": 0.02527785860002041, "learning_rate": 3.549191996620935e-06, "loss": 0.0053, "step": 251000 }, { "epoch": 1.8658740479286644, "grad_norm": 0.02510722726583481, "learning_rate": 3.529637219780048e-06, "loss": 0.0043, "step": 251100 }, { "epoch": 1.8666171279955415, "grad_norm": 0.03260761871933937, "learning_rate": 3.510082442939162e-06, "loss": 0.0041, "step": 251200 }, { "epoch": 1.8673602080624188, "grad_norm": 0.05461159721016884, "learning_rate": 3.490527666098275e-06, "loss": 0.004, "step": 251300 }, { "epoch": 1.868103288129296, "grad_norm": 0.05410800501704216, "learning_rate": 3.470972889257388e-06, "loss": 0.0047, "step": 251400 }, { "epoch": 1.8688463681961731, "grad_norm": 0.018790561705827713, "learning_rate": 3.451418112416501e-06, "loss": 0.0053, "step": 251500 }, { "epoch": 1.8695894482630502, "grad_norm": 0.04171178117394447, "learning_rate": 3.431863335575614e-06, "loss": 0.0048, "step": 251600 }, { "epoch": 1.8703325283299277, "grad_norm": 0.00990862026810646, "learning_rate": 3.412308558734728e-06, "loss": 0.0047, "step": 251700 }, { "epoch": 1.8710756083968048, "grad_norm": 0.018480712547898293, "learning_rate": 3.392753781893841e-06, "loss": 0.005, "step": 251800 }, { "epoch": 1.8718186884636818, "grad_norm": 0.06449975818395615, "learning_rate": 3.3731990050529542e-06, "loss": 0.0053, "step": 251900 }, { "epoch": 1.8725617685305593, "grad_norm": 0.01489555835723877, "learning_rate": 3.3536442282120677e-06, "loss": 0.0046, "step": 252000 }, { "epoch": 1.8733048485974364, "grad_norm": 0.03405756503343582, "learning_rate": 3.334089451371181e-06, "loss": 0.0045, "step": 252100 }, { "epoch": 1.8740479286643135, "grad_norm": 0.12631063163280487, "learning_rate": 3.3145346745302947e-06, "loss": 0.0043, "step": 252200 }, { "epoch": 1.8747910087311908, "grad_norm": 0.05103333294391632, "learning_rate": 3.2949798976894078e-06, "loss": 0.0055, "step": 252300 }, { "epoch": 1.875534088798068, "grad_norm": 0.13906651735305786, "learning_rate": 3.275425120848521e-06, "loss": 0.0052, "step": 252400 }, { "epoch": 1.876277168864945, "grad_norm": 0.03953680396080017, "learning_rate": 3.2558703440076348e-06, "loss": 0.0052, "step": 252500 }, { "epoch": 1.8770202489318224, "grad_norm": 0.11994390934705734, "learning_rate": 3.236315567166748e-06, "loss": 0.0062, "step": 252600 }, { "epoch": 1.8777633289986997, "grad_norm": 0.031179921701550484, "learning_rate": 3.216760790325861e-06, "loss": 0.0067, "step": 252700 }, { "epoch": 1.8785064090655768, "grad_norm": 0.03166612237691879, "learning_rate": 3.197206013484974e-06, "loss": 0.0056, "step": 252800 }, { "epoch": 1.879249489132454, "grad_norm": 0.07886076718568802, "learning_rate": 3.177651236644088e-06, "loss": 0.0057, "step": 252900 }, { "epoch": 1.8799925691993313, "grad_norm": 0.025280674919486046, "learning_rate": 3.158096459803201e-06, "loss": 0.0049, "step": 253000 }, { "epoch": 1.8807356492662084, "grad_norm": 0.16850554943084717, "learning_rate": 3.138541682962314e-06, "loss": 0.005, "step": 253100 }, { "epoch": 1.8814787293330857, "grad_norm": 0.15773867070674896, "learning_rate": 3.1189869061214275e-06, "loss": 0.0053, "step": 253200 }, { "epoch": 1.882221809399963, "grad_norm": 0.041664015501737595, "learning_rate": 3.099432129280541e-06, "loss": 0.0052, "step": 253300 }, { "epoch": 1.88296488946684, "grad_norm": 0.17249035835266113, "learning_rate": 3.079877352439654e-06, "loss": 0.0047, "step": 253400 }, { "epoch": 1.883707969533717, "grad_norm": 0.031894855201244354, "learning_rate": 3.0603225755987676e-06, "loss": 0.005, "step": 253500 }, { "epoch": 1.8844510496005946, "grad_norm": 0.048921748995780945, "learning_rate": 3.0407677987578807e-06, "loss": 0.0058, "step": 253600 }, { "epoch": 1.8851941296674717, "grad_norm": 0.15981683135032654, "learning_rate": 3.021213021916994e-06, "loss": 0.0042, "step": 253700 }, { "epoch": 1.8859372097343488, "grad_norm": 0.03780883923172951, "learning_rate": 3.0016582450761072e-06, "loss": 0.0046, "step": 253800 }, { "epoch": 1.886680289801226, "grad_norm": 0.10279625654220581, "learning_rate": 2.9821034682352207e-06, "loss": 0.0052, "step": 253900 }, { "epoch": 1.8874233698681033, "grad_norm": 0.05411013960838318, "learning_rate": 2.9625486913943338e-06, "loss": 0.0059, "step": 254000 }, { "epoch": 1.8881664499349804, "grad_norm": 0.03860749676823616, "learning_rate": 2.9429939145534473e-06, "loss": 0.005, "step": 254100 }, { "epoch": 1.8889095300018577, "grad_norm": 0.04575478285551071, "learning_rate": 2.9234391377125603e-06, "loss": 0.0042, "step": 254200 }, { "epoch": 1.889652610068735, "grad_norm": 0.02910211868584156, "learning_rate": 2.903884360871674e-06, "loss": 0.0046, "step": 254300 }, { "epoch": 1.890395690135612, "grad_norm": 0.08134551346302032, "learning_rate": 2.884329584030787e-06, "loss": 0.0055, "step": 254400 }, { "epoch": 1.8911387702024893, "grad_norm": 0.04603990539908409, "learning_rate": 2.8647748071899004e-06, "loss": 0.0052, "step": 254500 }, { "epoch": 1.8918818502693666, "grad_norm": 0.08343394100666046, "learning_rate": 2.845220030349014e-06, "loss": 0.0049, "step": 254600 }, { "epoch": 1.8926249303362437, "grad_norm": 0.09697334468364716, "learning_rate": 2.8256652535081274e-06, "loss": 0.0051, "step": 254700 }, { "epoch": 1.893368010403121, "grad_norm": 0.013575372286140919, "learning_rate": 2.8061104766672405e-06, "loss": 0.0054, "step": 254800 }, { "epoch": 1.8941110904699983, "grad_norm": 0.06875712424516678, "learning_rate": 2.786555699826354e-06, "loss": 0.0049, "step": 254900 }, { "epoch": 1.8948541705368753, "grad_norm": 0.03456658497452736, "learning_rate": 2.767000922985467e-06, "loss": 0.005, "step": 255000 }, { "epoch": 1.8955972506037524, "grad_norm": 0.0398942232131958, "learning_rate": 2.74744614614458e-06, "loss": 0.0047, "step": 255100 }, { "epoch": 1.89634033067063, "grad_norm": 0.0382978729903698, "learning_rate": 2.7278913693036936e-06, "loss": 0.0053, "step": 255200 }, { "epoch": 1.897083410737507, "grad_norm": 0.03559809550642967, "learning_rate": 2.7083365924628067e-06, "loss": 0.0046, "step": 255300 }, { "epoch": 1.897826490804384, "grad_norm": 0.03954083472490311, "learning_rate": 2.68878181562192e-06, "loss": 0.0043, "step": 255400 }, { "epoch": 1.8985695708712613, "grad_norm": 0.024809397757053375, "learning_rate": 2.6692270387810332e-06, "loss": 0.0045, "step": 255500 }, { "epoch": 1.8993126509381386, "grad_norm": 0.024904068559408188, "learning_rate": 2.6496722619401467e-06, "loss": 0.005, "step": 255600 }, { "epoch": 1.9000557310050157, "grad_norm": 0.03946581110358238, "learning_rate": 2.6301174850992602e-06, "loss": 0.0055, "step": 255700 }, { "epoch": 1.900798811071893, "grad_norm": 0.021371472626924515, "learning_rate": 2.6105627082583737e-06, "loss": 0.0049, "step": 255800 }, { "epoch": 1.9015418911387703, "grad_norm": 0.061101481318473816, "learning_rate": 2.5910079314174868e-06, "loss": 0.0049, "step": 255900 }, { "epoch": 1.9022849712056473, "grad_norm": 0.04766387864947319, "learning_rate": 2.5714531545766003e-06, "loss": 0.005, "step": 256000 }, { "epoch": 1.9030280512725246, "grad_norm": 0.03629893437027931, "learning_rate": 2.5518983777357134e-06, "loss": 0.0057, "step": 256100 }, { "epoch": 1.903771131339402, "grad_norm": 0.060663290321826935, "learning_rate": 2.532343600894827e-06, "loss": 0.0056, "step": 256200 }, { "epoch": 1.904514211406279, "grad_norm": 0.03465303033590317, "learning_rate": 2.51278882405394e-06, "loss": 0.0062, "step": 256300 }, { "epoch": 1.9052572914731563, "grad_norm": 0.030340708792209625, "learning_rate": 2.4932340472130534e-06, "loss": 0.0051, "step": 256400 }, { "epoch": 1.9060003715400335, "grad_norm": 0.03125110641121864, "learning_rate": 2.4736792703721665e-06, "loss": 0.0047, "step": 256500 }, { "epoch": 1.9067434516069106, "grad_norm": 0.032233402132987976, "learning_rate": 2.45412449353128e-06, "loss": 0.0045, "step": 256600 }, { "epoch": 1.907486531673788, "grad_norm": 0.019639622420072556, "learning_rate": 2.434569716690393e-06, "loss": 0.0051, "step": 256700 }, { "epoch": 1.9082296117406652, "grad_norm": 0.019357016310095787, "learning_rate": 2.4150149398495065e-06, "loss": 0.0048, "step": 256800 }, { "epoch": 1.9089726918075423, "grad_norm": 0.019752876833081245, "learning_rate": 2.39546016300862e-06, "loss": 0.0054, "step": 256900 }, { "epoch": 1.9097157718744193, "grad_norm": 0.013882539235055447, "learning_rate": 2.375905386167733e-06, "loss": 0.0052, "step": 257000 }, { "epoch": 1.9104588519412968, "grad_norm": 0.04412632808089256, "learning_rate": 2.3563506093268466e-06, "loss": 0.005, "step": 257100 }, { "epoch": 1.911201932008174, "grad_norm": 0.02750055119395256, "learning_rate": 2.3367958324859597e-06, "loss": 0.0043, "step": 257200 }, { "epoch": 1.911945012075051, "grad_norm": 0.040861718356609344, "learning_rate": 2.317241055645073e-06, "loss": 0.0054, "step": 257300 }, { "epoch": 1.9126880921419283, "grad_norm": 0.018672935664653778, "learning_rate": 2.2976862788041862e-06, "loss": 0.004, "step": 257400 }, { "epoch": 1.9134311722088055, "grad_norm": 0.020473668351769447, "learning_rate": 2.2781315019632997e-06, "loss": 0.0043, "step": 257500 }, { "epoch": 1.9141742522756826, "grad_norm": 0.1400226652622223, "learning_rate": 2.258576725122413e-06, "loss": 0.0054, "step": 257600 }, { "epoch": 1.91491733234256, "grad_norm": 0.14126868546009064, "learning_rate": 2.2390219482815263e-06, "loss": 0.0042, "step": 257700 }, { "epoch": 1.9156604124094372, "grad_norm": 0.03261305019259453, "learning_rate": 2.2194671714406394e-06, "loss": 0.0047, "step": 257800 }, { "epoch": 1.9164034924763143, "grad_norm": 0.08601921051740646, "learning_rate": 2.199912394599753e-06, "loss": 0.0042, "step": 257900 }, { "epoch": 1.9171465725431915, "grad_norm": 0.03684937208890915, "learning_rate": 2.1803576177588664e-06, "loss": 0.0055, "step": 258000 }, { "epoch": 1.9178896526100688, "grad_norm": 0.026318319141864777, "learning_rate": 2.16080284091798e-06, "loss": 0.0049, "step": 258100 }, { "epoch": 1.918632732676946, "grad_norm": 0.0350843146443367, "learning_rate": 2.141248064077093e-06, "loss": 0.0045, "step": 258200 }, { "epoch": 1.9193758127438232, "grad_norm": 0.018141312524676323, "learning_rate": 2.1216932872362064e-06, "loss": 0.0059, "step": 258300 }, { "epoch": 1.9201188928107005, "grad_norm": 0.03579108789563179, "learning_rate": 2.1021385103953195e-06, "loss": 0.0046, "step": 258400 }, { "epoch": 1.9208619728775775, "grad_norm": 0.019462890923023224, "learning_rate": 2.082583733554433e-06, "loss": 0.005, "step": 258500 }, { "epoch": 1.9216050529444546, "grad_norm": 0.07231540977954865, "learning_rate": 2.063028956713546e-06, "loss": 0.0049, "step": 258600 }, { "epoch": 1.9223481330113321, "grad_norm": 0.04632379487156868, "learning_rate": 2.0434741798726595e-06, "loss": 0.0053, "step": 258700 }, { "epoch": 1.9230912130782092, "grad_norm": 0.07228953391313553, "learning_rate": 2.0239194030317726e-06, "loss": 0.005, "step": 258800 }, { "epoch": 1.9238342931450862, "grad_norm": 0.026706060394644737, "learning_rate": 2.0043646261908857e-06, "loss": 0.0049, "step": 258900 }, { "epoch": 1.9245773732119635, "grad_norm": 0.06982797384262085, "learning_rate": 1.984809849349999e-06, "loss": 0.0054, "step": 259000 }, { "epoch": 1.9253204532788408, "grad_norm": 0.0431886650621891, "learning_rate": 1.9652550725091127e-06, "loss": 0.0055, "step": 259100 }, { "epoch": 1.926063533345718, "grad_norm": 0.020970037207007408, "learning_rate": 1.945700295668226e-06, "loss": 0.005, "step": 259200 }, { "epoch": 1.9268066134125952, "grad_norm": 0.031358376145362854, "learning_rate": 1.9261455188273392e-06, "loss": 0.0052, "step": 259300 }, { "epoch": 1.9275496934794725, "grad_norm": 0.05365696921944618, "learning_rate": 1.9065907419864525e-06, "loss": 0.0039, "step": 259400 }, { "epoch": 1.9282927735463495, "grad_norm": 0.15042555332183838, "learning_rate": 1.8870359651455658e-06, "loss": 0.0046, "step": 259500 }, { "epoch": 1.9290358536132268, "grad_norm": 0.045509397983551025, "learning_rate": 1.8674811883046793e-06, "loss": 0.0056, "step": 259600 }, { "epoch": 1.9297789336801041, "grad_norm": 0.04992702230811119, "learning_rate": 1.8479264114637924e-06, "loss": 0.0044, "step": 259700 }, { "epoch": 1.9305220137469812, "grad_norm": 0.02511606365442276, "learning_rate": 1.8283716346229059e-06, "loss": 0.0054, "step": 259800 }, { "epoch": 1.9312650938138585, "grad_norm": 0.05812444910407066, "learning_rate": 1.808816857782019e-06, "loss": 0.0053, "step": 259900 }, { "epoch": 1.9320081738807358, "grad_norm": 0.03972572833299637, "learning_rate": 1.7892620809411324e-06, "loss": 0.0047, "step": 260000 } ], "logging_steps": 100, "max_steps": 269150, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.169389622186442e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }