{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9987642455032266,
"eval_steps": 500,
"global_step": 2730,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010984484415762735,
"grad_norm": 0.13173329830169678,
"learning_rate": 1.0989010989010988e-06,
"loss": 0.8751,
"step": 1
},
{
"epoch": 0.002196896883152547,
"grad_norm": 0.19401921331882477,
"learning_rate": 2.1978021978021976e-06,
"loss": 1.3488,
"step": 2
},
{
"epoch": 0.0032953453247288205,
"grad_norm": 0.142131969332695,
"learning_rate": 3.2967032967032968e-06,
"loss": 0.8371,
"step": 3
},
{
"epoch": 0.004393793766305094,
"grad_norm": 0.1124999076128006,
"learning_rate": 4.395604395604395e-06,
"loss": 1.0039,
"step": 4
},
{
"epoch": 0.005492242207881368,
"grad_norm": 0.20683947205543518,
"learning_rate": 5.494505494505494e-06,
"loss": 1.4423,
"step": 5
},
{
"epoch": 0.006590690649457641,
"grad_norm": 0.2007640153169632,
"learning_rate": 6.5934065934065935e-06,
"loss": 0.9797,
"step": 6
},
{
"epoch": 0.007689139091033915,
"grad_norm": 0.1362670361995697,
"learning_rate": 7.692307692307692e-06,
"loss": 1.0443,
"step": 7
},
{
"epoch": 0.008787587532610188,
"grad_norm": 0.21512511372566223,
"learning_rate": 8.79120879120879e-06,
"loss": 1.2888,
"step": 8
},
{
"epoch": 0.009886035974186462,
"grad_norm": 0.13403186202049255,
"learning_rate": 9.89010989010989e-06,
"loss": 0.9637,
"step": 9
},
{
"epoch": 0.010984484415762736,
"grad_norm": 0.16911157965660095,
"learning_rate": 1.0989010989010989e-05,
"loss": 0.8824,
"step": 10
},
{
"epoch": 0.012082932857339008,
"grad_norm": 0.19280359148979187,
"learning_rate": 1.2087912087912087e-05,
"loss": 0.9843,
"step": 11
},
{
"epoch": 0.013181381298915282,
"grad_norm": 0.15720519423484802,
"learning_rate": 1.3186813186813187e-05,
"loss": 0.9769,
"step": 12
},
{
"epoch": 0.014279829740491556,
"grad_norm": 0.18622402846813202,
"learning_rate": 1.4285714285714284e-05,
"loss": 0.903,
"step": 13
},
{
"epoch": 0.01537827818206783,
"grad_norm": 0.1491895169019699,
"learning_rate": 1.5384615384615384e-05,
"loss": 1.065,
"step": 14
},
{
"epoch": 0.016476726623644102,
"grad_norm": 0.16883142292499542,
"learning_rate": 1.6483516483516482e-05,
"loss": 0.9916,
"step": 15
},
{
"epoch": 0.017575175065220376,
"grad_norm": 0.155453160405159,
"learning_rate": 1.758241758241758e-05,
"loss": 1.1048,
"step": 16
},
{
"epoch": 0.01867362350679665,
"grad_norm": 0.12869666516780853,
"learning_rate": 1.868131868131868e-05,
"loss": 0.9355,
"step": 17
},
{
"epoch": 0.019772071948372924,
"grad_norm": 0.18860433995723724,
"learning_rate": 1.978021978021978e-05,
"loss": 1.1779,
"step": 18
},
{
"epoch": 0.020870520389949198,
"grad_norm": 0.30738529562950134,
"learning_rate": 2.087912087912088e-05,
"loss": 0.905,
"step": 19
},
{
"epoch": 0.021968968831525472,
"grad_norm": 0.30248674750328064,
"learning_rate": 2.1978021978021977e-05,
"loss": 1.0749,
"step": 20
},
{
"epoch": 0.023067417273101742,
"grad_norm": 0.17005079984664917,
"learning_rate": 2.3076923076923076e-05,
"loss": 1.0141,
"step": 21
},
{
"epoch": 0.024165865714678016,
"grad_norm": 0.5497377514839172,
"learning_rate": 2.4175824175824174e-05,
"loss": 0.804,
"step": 22
},
{
"epoch": 0.02526431415625429,
"grad_norm": 0.23464925587177277,
"learning_rate": 2.5274725274725276e-05,
"loss": 1.0592,
"step": 23
},
{
"epoch": 0.026362762597830564,
"grad_norm": 0.2906591594219208,
"learning_rate": 2.6373626373626374e-05,
"loss": 1.4096,
"step": 24
},
{
"epoch": 0.027461211039406838,
"grad_norm": 0.14552968740463257,
"learning_rate": 2.747252747252747e-05,
"loss": 0.8827,
"step": 25
},
{
"epoch": 0.028559659480983112,
"grad_norm": 0.26139914989471436,
"learning_rate": 2.8571428571428567e-05,
"loss": 1.1081,
"step": 26
},
{
"epoch": 0.029658107922559386,
"grad_norm": 0.16122505068778992,
"learning_rate": 2.9670329670329666e-05,
"loss": 0.8967,
"step": 27
},
{
"epoch": 0.03075655636413566,
"grad_norm": 0.19174647331237793,
"learning_rate": 3.076923076923077e-05,
"loss": 0.7527,
"step": 28
},
{
"epoch": 0.031855004805711934,
"grad_norm": 0.24506032466888428,
"learning_rate": 3.1868131868131866e-05,
"loss": 1.0981,
"step": 29
},
{
"epoch": 0.032953453247288204,
"grad_norm": 0.18928349018096924,
"learning_rate": 3.2967032967032964e-05,
"loss": 1.2955,
"step": 30
},
{
"epoch": 0.03405190168886448,
"grad_norm": 0.20482106506824493,
"learning_rate": 3.406593406593406e-05,
"loss": 0.886,
"step": 31
},
{
"epoch": 0.03515035013044075,
"grad_norm": 0.17304010689258575,
"learning_rate": 3.516483516483516e-05,
"loss": 1.0062,
"step": 32
},
{
"epoch": 0.03624879857201702,
"grad_norm": 0.17006444931030273,
"learning_rate": 3.626373626373626e-05,
"loss": 0.76,
"step": 33
},
{
"epoch": 0.0373472470135933,
"grad_norm": 0.16570955514907837,
"learning_rate": 3.736263736263736e-05,
"loss": 0.7512,
"step": 34
},
{
"epoch": 0.03844569545516957,
"grad_norm": 0.4470347464084625,
"learning_rate": 3.8461538461538456e-05,
"loss": 1.051,
"step": 35
},
{
"epoch": 0.03954414389674585,
"grad_norm": 0.3013080060482025,
"learning_rate": 3.956043956043956e-05,
"loss": 1.1269,
"step": 36
},
{
"epoch": 0.04064259233832212,
"grad_norm": 0.33114469051361084,
"learning_rate": 4.065934065934065e-05,
"loss": 1.046,
"step": 37
},
{
"epoch": 0.041741040779898396,
"grad_norm": 0.3496829867362976,
"learning_rate": 4.175824175824176e-05,
"loss": 0.9139,
"step": 38
},
{
"epoch": 0.042839489221474666,
"grad_norm": 0.36173877120018005,
"learning_rate": 4.285714285714285e-05,
"loss": 1.16,
"step": 39
},
{
"epoch": 0.043937937663050944,
"grad_norm": 0.23047995567321777,
"learning_rate": 4.3956043956043955e-05,
"loss": 0.8623,
"step": 40
},
{
"epoch": 0.045036386104627214,
"grad_norm": 0.33733946084976196,
"learning_rate": 4.5054945054945046e-05,
"loss": 0.873,
"step": 41
},
{
"epoch": 0.046134834546203485,
"grad_norm": 0.43975624442100525,
"learning_rate": 4.615384615384615e-05,
"loss": 0.9374,
"step": 42
},
{
"epoch": 0.04723328298777976,
"grad_norm": 0.5429202318191528,
"learning_rate": 4.725274725274725e-05,
"loss": 1.0699,
"step": 43
},
{
"epoch": 0.04833173142935603,
"grad_norm": 0.39317595958709717,
"learning_rate": 4.835164835164835e-05,
"loss": 0.7719,
"step": 44
},
{
"epoch": 0.04943017987093231,
"grad_norm": 0.41328710317611694,
"learning_rate": 4.9450549450549446e-05,
"loss": 1.112,
"step": 45
},
{
"epoch": 0.05052862831250858,
"grad_norm": 0.5977774858474731,
"learning_rate": 5.054945054945055e-05,
"loss": 0.9408,
"step": 46
},
{
"epoch": 0.05162707675408486,
"grad_norm": 0.6984797716140747,
"learning_rate": 5.164835164835164e-05,
"loss": 0.9766,
"step": 47
},
{
"epoch": 0.05272552519566113,
"grad_norm": 0.5161548256874084,
"learning_rate": 5.274725274725275e-05,
"loss": 1.3705,
"step": 48
},
{
"epoch": 0.0538239736372374,
"grad_norm": 0.5750108361244202,
"learning_rate": 5.384615384615384e-05,
"loss": 0.9492,
"step": 49
},
{
"epoch": 0.054922422078813676,
"grad_norm": 0.7861920595169067,
"learning_rate": 5.494505494505494e-05,
"loss": 1.1495,
"step": 50
},
{
"epoch": 0.05602087052038995,
"grad_norm": 0.5992287993431091,
"learning_rate": 5.6043956043956037e-05,
"loss": 1.2818,
"step": 51
},
{
"epoch": 0.057119318961966224,
"grad_norm": 0.5470016598701477,
"learning_rate": 5.7142857142857135e-05,
"loss": 1.0385,
"step": 52
},
{
"epoch": 0.058217767403542495,
"grad_norm": 0.7035269141197205,
"learning_rate": 5.824175824175824e-05,
"loss": 0.785,
"step": 53
},
{
"epoch": 0.05931621584511877,
"grad_norm": 0.5253639817237854,
"learning_rate": 5.934065934065933e-05,
"loss": 0.6092,
"step": 54
},
{
"epoch": 0.06041466428669504,
"grad_norm": 0.5233064293861389,
"learning_rate": 6.043956043956044e-05,
"loss": 0.7853,
"step": 55
},
{
"epoch": 0.06151311272827132,
"grad_norm": 0.4508589804172516,
"learning_rate": 6.153846153846154e-05,
"loss": 0.5737,
"step": 56
},
{
"epoch": 0.06261156116984759,
"grad_norm": 1.0521594285964966,
"learning_rate": 6.263736263736263e-05,
"loss": 1.0132,
"step": 57
},
{
"epoch": 0.06371000961142387,
"grad_norm": 0.3572557866573334,
"learning_rate": 6.373626373626373e-05,
"loss": 0.655,
"step": 58
},
{
"epoch": 0.06480845805300013,
"grad_norm": 0.600371241569519,
"learning_rate": 6.483516483516483e-05,
"loss": 0.8897,
"step": 59
},
{
"epoch": 0.06590690649457641,
"grad_norm": 0.6430579423904419,
"learning_rate": 6.593406593406593e-05,
"loss": 0.8058,
"step": 60
},
{
"epoch": 0.06700535493615269,
"grad_norm": 0.5309410095214844,
"learning_rate": 6.703296703296703e-05,
"loss": 0.7312,
"step": 61
},
{
"epoch": 0.06810380337772896,
"grad_norm": 0.46225860714912415,
"learning_rate": 6.813186813186813e-05,
"loss": 0.8607,
"step": 62
},
{
"epoch": 0.06920225181930523,
"grad_norm": 0.8889493346214294,
"learning_rate": 6.923076923076922e-05,
"loss": 0.7791,
"step": 63
},
{
"epoch": 0.0703007002608815,
"grad_norm": 0.5721575617790222,
"learning_rate": 7.032967032967032e-05,
"loss": 0.9426,
"step": 64
},
{
"epoch": 0.07139914870245778,
"grad_norm": 0.8355056047439575,
"learning_rate": 7.142857142857142e-05,
"loss": 0.621,
"step": 65
},
{
"epoch": 0.07249759714403405,
"grad_norm": 1.3048707246780396,
"learning_rate": 7.252747252747252e-05,
"loss": 0.8869,
"step": 66
},
{
"epoch": 0.07359604558561032,
"grad_norm": 0.5817797183990479,
"learning_rate": 7.362637362637362e-05,
"loss": 0.8385,
"step": 67
},
{
"epoch": 0.0746944940271866,
"grad_norm": 1.2051454782485962,
"learning_rate": 7.472527472527472e-05,
"loss": 0.7566,
"step": 68
},
{
"epoch": 0.07579294246876288,
"grad_norm": 0.8565987944602966,
"learning_rate": 7.582417582417581e-05,
"loss": 0.8374,
"step": 69
},
{
"epoch": 0.07689139091033914,
"grad_norm": 0.7503894567489624,
"learning_rate": 7.692307692307691e-05,
"loss": 0.6749,
"step": 70
},
{
"epoch": 0.07798983935191542,
"grad_norm": 0.6298589706420898,
"learning_rate": 7.802197802197802e-05,
"loss": 0.9096,
"step": 71
},
{
"epoch": 0.0790882877934917,
"grad_norm": 0.8327789306640625,
"learning_rate": 7.912087912087912e-05,
"loss": 0.9836,
"step": 72
},
{
"epoch": 0.08018673623506796,
"grad_norm": 1.0001461505889893,
"learning_rate": 8.021978021978021e-05,
"loss": 0.6917,
"step": 73
},
{
"epoch": 0.08128518467664424,
"grad_norm": 0.8373435735702515,
"learning_rate": 8.13186813186813e-05,
"loss": 0.7703,
"step": 74
},
{
"epoch": 0.08238363311822051,
"grad_norm": 0.9785758256912231,
"learning_rate": 8.241758241758242e-05,
"loss": 0.8004,
"step": 75
},
{
"epoch": 0.08348208155979679,
"grad_norm": 0.8900540471076965,
"learning_rate": 8.351648351648352e-05,
"loss": 0.8238,
"step": 76
},
{
"epoch": 0.08458053000137306,
"grad_norm": 0.7411159873008728,
"learning_rate": 8.46153846153846e-05,
"loss": 1.0364,
"step": 77
},
{
"epoch": 0.08567897844294933,
"grad_norm": 0.4975040555000305,
"learning_rate": 8.57142857142857e-05,
"loss": 0.4814,
"step": 78
},
{
"epoch": 0.08677742688452561,
"grad_norm": 0.6698398590087891,
"learning_rate": 8.681318681318681e-05,
"loss": 0.6828,
"step": 79
},
{
"epoch": 0.08787587532610189,
"grad_norm": 0.5883696675300598,
"learning_rate": 8.791208791208791e-05,
"loss": 0.92,
"step": 80
},
{
"epoch": 0.08897432376767815,
"grad_norm": 0.9050906896591187,
"learning_rate": 8.901098901098901e-05,
"loss": 0.7229,
"step": 81
},
{
"epoch": 0.09007277220925443,
"grad_norm": 0.5996706485748291,
"learning_rate": 9.010989010989009e-05,
"loss": 0.699,
"step": 82
},
{
"epoch": 0.0911712206508307,
"grad_norm": 2.0782630443573,
"learning_rate": 9.120879120879119e-05,
"loss": 1.2118,
"step": 83
},
{
"epoch": 0.09226966909240697,
"grad_norm": 0.759730875492096,
"learning_rate": 9.23076923076923e-05,
"loss": 0.6397,
"step": 84
},
{
"epoch": 0.09336811753398325,
"grad_norm": 1.1138097047805786,
"learning_rate": 9.34065934065934e-05,
"loss": 0.8973,
"step": 85
},
{
"epoch": 0.09446656597555952,
"grad_norm": 0.9852680563926697,
"learning_rate": 9.45054945054945e-05,
"loss": 1.0733,
"step": 86
},
{
"epoch": 0.0955650144171358,
"grad_norm": 0.8435002565383911,
"learning_rate": 9.560439560439558e-05,
"loss": 0.8977,
"step": 87
},
{
"epoch": 0.09666346285871207,
"grad_norm": 1.3031998872756958,
"learning_rate": 9.67032967032967e-05,
"loss": 0.9852,
"step": 88
},
{
"epoch": 0.09776191130028834,
"grad_norm": 0.6343463063240051,
"learning_rate": 9.78021978021978e-05,
"loss": 0.6147,
"step": 89
},
{
"epoch": 0.09886035974186462,
"grad_norm": 0.7061794996261597,
"learning_rate": 9.890109890109889e-05,
"loss": 0.7437,
"step": 90
},
{
"epoch": 0.09995880818344088,
"grad_norm": 1.2231422662734985,
"learning_rate": 9.999999999999999e-05,
"loss": 0.7944,
"step": 91
},
{
"epoch": 0.10105725662501716,
"grad_norm": 0.7199704647064209,
"learning_rate": 0.0001010989010989011,
"loss": 0.7355,
"step": 92
},
{
"epoch": 0.10215570506659344,
"grad_norm": 1.2740516662597656,
"learning_rate": 0.00010219780219780219,
"loss": 0.7622,
"step": 93
},
{
"epoch": 0.10325415350816972,
"grad_norm": 0.7762659788131714,
"learning_rate": 0.00010329670329670329,
"loss": 0.7074,
"step": 94
},
{
"epoch": 0.10435260194974598,
"grad_norm": 0.6618936061859131,
"learning_rate": 0.00010439560439560438,
"loss": 0.7667,
"step": 95
},
{
"epoch": 0.10545105039132226,
"grad_norm": 0.7244533896446228,
"learning_rate": 0.0001054945054945055,
"loss": 0.6451,
"step": 96
},
{
"epoch": 0.10654949883289853,
"grad_norm": 0.6391953229904175,
"learning_rate": 0.0001065934065934066,
"loss": 0.5637,
"step": 97
},
{
"epoch": 0.1076479472744748,
"grad_norm": 0.6992442607879639,
"learning_rate": 0.00010769230769230768,
"loss": 0.7112,
"step": 98
},
{
"epoch": 0.10874639571605108,
"grad_norm": 1.0820791721343994,
"learning_rate": 0.00010879120879120878,
"loss": 0.9199,
"step": 99
},
{
"epoch": 0.10984484415762735,
"grad_norm": 0.6012185215950012,
"learning_rate": 0.00010989010989010988,
"loss": 0.5574,
"step": 100
},
{
"epoch": 0.11094329259920363,
"grad_norm": 0.822455644607544,
"learning_rate": 0.00011098901098901099,
"loss": 0.5185,
"step": 101
},
{
"epoch": 0.1120417410407799,
"grad_norm": 0.9417555332183838,
"learning_rate": 0.00011208791208791207,
"loss": 0.6883,
"step": 102
},
{
"epoch": 0.11314018948235617,
"grad_norm": 1.0258208513259888,
"learning_rate": 0.00011318681318681317,
"loss": 0.7588,
"step": 103
},
{
"epoch": 0.11423863792393245,
"grad_norm": 1.904179573059082,
"learning_rate": 0.00011428571428571427,
"loss": 0.7425,
"step": 104
},
{
"epoch": 0.11533708636550873,
"grad_norm": 1.5453238487243652,
"learning_rate": 0.00011538461538461538,
"loss": 0.658,
"step": 105
},
{
"epoch": 0.11643553480708499,
"grad_norm": 0.8801619410514832,
"learning_rate": 0.00011648351648351648,
"loss": 0.8432,
"step": 106
},
{
"epoch": 0.11753398324866127,
"grad_norm": 0.8567579388618469,
"learning_rate": 0.00011758241758241756,
"loss": 0.5904,
"step": 107
},
{
"epoch": 0.11863243169023754,
"grad_norm": 0.9351131319999695,
"learning_rate": 0.00011868131868131866,
"loss": 0.7228,
"step": 108
},
{
"epoch": 0.11973088013181381,
"grad_norm": 0.8817545175552368,
"learning_rate": 0.00011978021978021978,
"loss": 0.7853,
"step": 109
},
{
"epoch": 0.12082932857339009,
"grad_norm": 1.0484094619750977,
"learning_rate": 0.00012087912087912087,
"loss": 0.7049,
"step": 110
},
{
"epoch": 0.12192777701496636,
"grad_norm": 1.80658757686615,
"learning_rate": 0.00012197802197802197,
"loss": 0.669,
"step": 111
},
{
"epoch": 0.12302622545654264,
"grad_norm": 1.5311473608016968,
"learning_rate": 0.00012307692307692307,
"loss": 0.8342,
"step": 112
},
{
"epoch": 0.1241246738981189,
"grad_norm": 0.8968105912208557,
"learning_rate": 0.00012417582417582416,
"loss": 0.7199,
"step": 113
},
{
"epoch": 0.12522312233969518,
"grad_norm": 0.6149659156799316,
"learning_rate": 0.00012527472527472527,
"loss": 0.4961,
"step": 114
},
{
"epoch": 0.12632157078127146,
"grad_norm": 8.04592227935791,
"learning_rate": 0.00012637362637362635,
"loss": 0.7515,
"step": 115
},
{
"epoch": 0.12742001922284774,
"grad_norm": 0.7797659039497375,
"learning_rate": 0.00012747252747252746,
"loss": 0.7281,
"step": 116
},
{
"epoch": 0.128518467664424,
"grad_norm": 0.6414046883583069,
"learning_rate": 0.00012857142857142855,
"loss": 0.6655,
"step": 117
},
{
"epoch": 0.12961691610600026,
"grad_norm": 4.678529262542725,
"learning_rate": 0.00012967032967032966,
"loss": 0.9165,
"step": 118
},
{
"epoch": 0.13071536454757654,
"grad_norm": 0.8540724515914917,
"learning_rate": 0.00013076923076923077,
"loss": 0.7064,
"step": 119
},
{
"epoch": 0.13181381298915282,
"grad_norm": 1.057844638824463,
"learning_rate": 0.00013186813186813186,
"loss": 0.6617,
"step": 120
},
{
"epoch": 0.1329122614307291,
"grad_norm": 0.8429140448570251,
"learning_rate": 0.00013296703296703294,
"loss": 0.8156,
"step": 121
},
{
"epoch": 0.13401070987230537,
"grad_norm": 0.9944230914115906,
"learning_rate": 0.00013406593406593405,
"loss": 0.5851,
"step": 122
},
{
"epoch": 0.13510915831388165,
"grad_norm": 0.6582810878753662,
"learning_rate": 0.00013516483516483517,
"loss": 0.5819,
"step": 123
},
{
"epoch": 0.13620760675545793,
"grad_norm": 1.3106951713562012,
"learning_rate": 0.00013626373626373625,
"loss": 0.7598,
"step": 124
},
{
"epoch": 0.13730605519703418,
"grad_norm": 1.0464080572128296,
"learning_rate": 0.00013736263736263734,
"loss": 0.7241,
"step": 125
},
{
"epoch": 0.13840450363861045,
"grad_norm": 0.8519262075424194,
"learning_rate": 0.00013846153846153845,
"loss": 0.7001,
"step": 126
},
{
"epoch": 0.13950295208018673,
"grad_norm": 1.2764228582382202,
"learning_rate": 0.00013956043956043956,
"loss": 0.7152,
"step": 127
},
{
"epoch": 0.140601400521763,
"grad_norm": 1.157472014427185,
"learning_rate": 0.00014065934065934064,
"loss": 0.697,
"step": 128
},
{
"epoch": 0.1416998489633393,
"grad_norm": 0.7153847813606262,
"learning_rate": 0.00014175824175824173,
"loss": 0.6897,
"step": 129
},
{
"epoch": 0.14279829740491556,
"grad_norm": 0.7254152297973633,
"learning_rate": 0.00014285714285714284,
"loss": 0.5263,
"step": 130
},
{
"epoch": 0.14389674584649184,
"grad_norm": 1.3370522260665894,
"learning_rate": 0.00014395604395604395,
"loss": 0.7587,
"step": 131
},
{
"epoch": 0.1449951942880681,
"grad_norm": 1.092029333114624,
"learning_rate": 0.00014505494505494504,
"loss": 0.8674,
"step": 132
},
{
"epoch": 0.14609364272964437,
"grad_norm": 0.6123655438423157,
"learning_rate": 0.00014615384615384615,
"loss": 0.7163,
"step": 133
},
{
"epoch": 0.14719209117122065,
"grad_norm": 0.8476639986038208,
"learning_rate": 0.00014725274725274723,
"loss": 0.7241,
"step": 134
},
{
"epoch": 0.14829053961279692,
"grad_norm": 0.9986979961395264,
"learning_rate": 0.00014835164835164835,
"loss": 0.6229,
"step": 135
},
{
"epoch": 0.1493889880543732,
"grad_norm": 0.8208728432655334,
"learning_rate": 0.00014945054945054943,
"loss": 0.5441,
"step": 136
},
{
"epoch": 0.15048743649594948,
"grad_norm": 0.742091953754425,
"learning_rate": 0.00015054945054945054,
"loss": 0.6047,
"step": 137
},
{
"epoch": 0.15158588493752576,
"grad_norm": 1.6566306352615356,
"learning_rate": 0.00015164835164835163,
"loss": 0.6381,
"step": 138
},
{
"epoch": 0.152684333379102,
"grad_norm": 0.7735741138458252,
"learning_rate": 0.0001527472527472527,
"loss": 0.5842,
"step": 139
},
{
"epoch": 0.15378278182067828,
"grad_norm": 0.7116795778274536,
"learning_rate": 0.00015384615384615382,
"loss": 0.7117,
"step": 140
},
{
"epoch": 0.15488123026225456,
"grad_norm": 0.6912885904312134,
"learning_rate": 0.00015494505494505494,
"loss": 0.763,
"step": 141
},
{
"epoch": 0.15597967870383084,
"grad_norm": 1.0789505243301392,
"learning_rate": 0.00015604395604395605,
"loss": 0.5534,
"step": 142
},
{
"epoch": 0.15707812714540711,
"grad_norm": 1.0304033756256104,
"learning_rate": 0.00015714285714285713,
"loss": 0.4961,
"step": 143
},
{
"epoch": 0.1581765755869834,
"grad_norm": 1.0216940641403198,
"learning_rate": 0.00015824175824175824,
"loss": 0.8167,
"step": 144
},
{
"epoch": 0.15927502402855967,
"grad_norm": 0.7767283916473389,
"learning_rate": 0.00015934065934065933,
"loss": 0.649,
"step": 145
},
{
"epoch": 0.16037347247013592,
"grad_norm": 0.6125204563140869,
"learning_rate": 0.00016043956043956041,
"loss": 0.6596,
"step": 146
},
{
"epoch": 0.1614719209117122,
"grad_norm": 2.113314390182495,
"learning_rate": 0.00016153846153846153,
"loss": 0.6825,
"step": 147
},
{
"epoch": 0.16257036935328847,
"grad_norm": 1.3892889022827148,
"learning_rate": 0.0001626373626373626,
"loss": 0.5162,
"step": 148
},
{
"epoch": 0.16366881779486475,
"grad_norm": 1.2544710636138916,
"learning_rate": 0.0001637362637362637,
"loss": 0.5992,
"step": 149
},
{
"epoch": 0.16476726623644103,
"grad_norm": 1.2952786684036255,
"learning_rate": 0.00016483516483516484,
"loss": 0.5968,
"step": 150
},
{
"epoch": 0.1658657146780173,
"grad_norm": 0.9910382628440857,
"learning_rate": 0.00016593406593406592,
"loss": 0.6138,
"step": 151
},
{
"epoch": 0.16696416311959358,
"grad_norm": 0.7291635870933533,
"learning_rate": 0.00016703296703296703,
"loss": 0.8957,
"step": 152
},
{
"epoch": 0.16806261156116986,
"grad_norm": 0.7290105819702148,
"learning_rate": 0.00016813186813186812,
"loss": 0.4864,
"step": 153
},
{
"epoch": 0.1691610600027461,
"grad_norm": 1.1888444423675537,
"learning_rate": 0.0001692307692307692,
"loss": 0.913,
"step": 154
},
{
"epoch": 0.1702595084443224,
"grad_norm": 0.8183659315109253,
"learning_rate": 0.0001703296703296703,
"loss": 0.6405,
"step": 155
},
{
"epoch": 0.17135795688589867,
"grad_norm": 0.8549530506134033,
"learning_rate": 0.0001714285714285714,
"loss": 0.7019,
"step": 156
},
{
"epoch": 0.17245640532747494,
"grad_norm": 0.5960697531700134,
"learning_rate": 0.0001725274725274725,
"loss": 0.6728,
"step": 157
},
{
"epoch": 0.17355485376905122,
"grad_norm": 0.6802973747253418,
"learning_rate": 0.00017362637362637362,
"loss": 0.6462,
"step": 158
},
{
"epoch": 0.1746533022106275,
"grad_norm": 0.5056049823760986,
"learning_rate": 0.00017472527472527473,
"loss": 0.5155,
"step": 159
},
{
"epoch": 0.17575175065220378,
"grad_norm": 0.8181887865066528,
"learning_rate": 0.00017582417582417582,
"loss": 0.6631,
"step": 160
},
{
"epoch": 0.17685019909378003,
"grad_norm": 0.5748574137687683,
"learning_rate": 0.0001769230769230769,
"loss": 0.5807,
"step": 161
},
{
"epoch": 0.1779486475353563,
"grad_norm": 0.8585043549537659,
"learning_rate": 0.00017802197802197802,
"loss": 0.5412,
"step": 162
},
{
"epoch": 0.17904709597693258,
"grad_norm": 0.8763203620910645,
"learning_rate": 0.0001791208791208791,
"loss": 1.0859,
"step": 163
},
{
"epoch": 0.18014554441850886,
"grad_norm": 0.7327267527580261,
"learning_rate": 0.00018021978021978018,
"loss": 0.8034,
"step": 164
},
{
"epoch": 0.18124399286008513,
"grad_norm": 0.6813991665840149,
"learning_rate": 0.0001813186813186813,
"loss": 0.9236,
"step": 165
},
{
"epoch": 0.1823424413016614,
"grad_norm": 2.9234185218811035,
"learning_rate": 0.00018241758241758238,
"loss": 0.9148,
"step": 166
},
{
"epoch": 0.1834408897432377,
"grad_norm": 0.8117207884788513,
"learning_rate": 0.00018351648351648352,
"loss": 1.0514,
"step": 167
},
{
"epoch": 0.18453933818481394,
"grad_norm": 0.6485300064086914,
"learning_rate": 0.0001846153846153846,
"loss": 0.4764,
"step": 168
},
{
"epoch": 0.18563778662639022,
"grad_norm": 0.43059054017066956,
"learning_rate": 0.00018571428571428572,
"loss": 0.6289,
"step": 169
},
{
"epoch": 0.1867362350679665,
"grad_norm": 1.007095456123352,
"learning_rate": 0.0001868131868131868,
"loss": 0.5889,
"step": 170
},
{
"epoch": 0.18783468350954277,
"grad_norm": 1.6733218431472778,
"learning_rate": 0.0001879120879120879,
"loss": 0.8036,
"step": 171
},
{
"epoch": 0.18893313195111905,
"grad_norm": 0.7533760666847229,
"learning_rate": 0.000189010989010989,
"loss": 0.7282,
"step": 172
},
{
"epoch": 0.19003158039269533,
"grad_norm": 0.45892444252967834,
"learning_rate": 0.00019010989010989008,
"loss": 0.6273,
"step": 173
},
{
"epoch": 0.1911300288342716,
"grad_norm": 0.54690021276474,
"learning_rate": 0.00019120879120879117,
"loss": 0.669,
"step": 174
},
{
"epoch": 0.19222847727584785,
"grad_norm": 0.7361836433410645,
"learning_rate": 0.0001923076923076923,
"loss": 0.8945,
"step": 175
},
{
"epoch": 0.19332692571742413,
"grad_norm": 0.5876324772834778,
"learning_rate": 0.0001934065934065934,
"loss": 0.7557,
"step": 176
},
{
"epoch": 0.1944253741590004,
"grad_norm": 0.7753897309303284,
"learning_rate": 0.0001945054945054945,
"loss": 0.7904,
"step": 177
},
{
"epoch": 0.19552382260057669,
"grad_norm": 0.6244968771934509,
"learning_rate": 0.0001956043956043956,
"loss": 0.7617,
"step": 178
},
{
"epoch": 0.19662227104215296,
"grad_norm": 0.6300948262214661,
"learning_rate": 0.00019670329670329667,
"loss": 0.5884,
"step": 179
},
{
"epoch": 0.19772071948372924,
"grad_norm": 0.5845354795455933,
"learning_rate": 0.00019780219780219779,
"loss": 0.8034,
"step": 180
},
{
"epoch": 0.19881916792530552,
"grad_norm": 0.5231277942657471,
"learning_rate": 0.00019890109890109887,
"loss": 0.5302,
"step": 181
},
{
"epoch": 0.19991761636688177,
"grad_norm": 0.8393481969833374,
"learning_rate": 0.00019999999999999998,
"loss": 0.6376,
"step": 182
},
{
"epoch": 0.20101606480845804,
"grad_norm": 0.5777038335800171,
"learning_rate": 0.00020109890109890107,
"loss": 0.5777,
"step": 183
},
{
"epoch": 0.20211451325003432,
"grad_norm": 0.7751956582069397,
"learning_rate": 0.0002021978021978022,
"loss": 0.8368,
"step": 184
},
{
"epoch": 0.2032129616916106,
"grad_norm": 1.5582187175750732,
"learning_rate": 0.0002032967032967033,
"loss": 0.5087,
"step": 185
},
{
"epoch": 0.20431141013318688,
"grad_norm": 0.8304231762886047,
"learning_rate": 0.00020439560439560438,
"loss": 0.5512,
"step": 186
},
{
"epoch": 0.20540985857476315,
"grad_norm": 0.8545000553131104,
"learning_rate": 0.0002054945054945055,
"loss": 1.2533,
"step": 187
},
{
"epoch": 0.20650830701633943,
"grad_norm": 0.4891647696495056,
"learning_rate": 0.00020659340659340657,
"loss": 0.5738,
"step": 188
},
{
"epoch": 0.20760675545791568,
"grad_norm": 0.7159665822982788,
"learning_rate": 0.00020769230769230766,
"loss": 0.9266,
"step": 189
},
{
"epoch": 0.20870520389949196,
"grad_norm": 0.5053237080574036,
"learning_rate": 0.00020879120879120877,
"loss": 0.4574,
"step": 190
},
{
"epoch": 0.20980365234106824,
"grad_norm": 0.728336751461029,
"learning_rate": 0.00020989010989010985,
"loss": 0.6871,
"step": 191
},
{
"epoch": 0.2109021007826445,
"grad_norm": 0.8593311309814453,
"learning_rate": 0.000210989010989011,
"loss": 0.6788,
"step": 192
},
{
"epoch": 0.2120005492242208,
"grad_norm": 1.247111201286316,
"learning_rate": 0.00021208791208791208,
"loss": 0.5428,
"step": 193
},
{
"epoch": 0.21309899766579707,
"grad_norm": 0.6636946201324463,
"learning_rate": 0.0002131868131868132,
"loss": 0.7935,
"step": 194
},
{
"epoch": 0.21419744610737335,
"grad_norm": 0.5811622738838196,
"learning_rate": 0.00021428571428571427,
"loss": 0.4322,
"step": 195
},
{
"epoch": 0.2152958945489496,
"grad_norm": 0.5329126715660095,
"learning_rate": 0.00021538461538461536,
"loss": 0.7037,
"step": 196
},
{
"epoch": 0.21639434299052587,
"grad_norm": 1.730969786643982,
"learning_rate": 0.00021648351648351647,
"loss": 1.0315,
"step": 197
},
{
"epoch": 0.21749279143210215,
"grad_norm": 0.5242175459861755,
"learning_rate": 0.00021758241758241756,
"loss": 0.9285,
"step": 198
},
{
"epoch": 0.21859123987367843,
"grad_norm": 0.4745596945285797,
"learning_rate": 0.00021868131868131864,
"loss": 0.5414,
"step": 199
},
{
"epoch": 0.2196896883152547,
"grad_norm": 0.8693228363990784,
"learning_rate": 0.00021978021978021975,
"loss": 0.4576,
"step": 200
},
{
"epoch": 0.22078813675683098,
"grad_norm": 0.7073357105255127,
"learning_rate": 0.00022087912087912086,
"loss": 0.778,
"step": 201
},
{
"epoch": 0.22188658519840726,
"grad_norm": 0.535009503364563,
"learning_rate": 0.00022197802197802198,
"loss": 0.7734,
"step": 202
},
{
"epoch": 0.2229850336399835,
"grad_norm": 0.5862578749656677,
"learning_rate": 0.00022307692307692306,
"loss": 0.8612,
"step": 203
},
{
"epoch": 0.2240834820815598,
"grad_norm": 0.5167233943939209,
"learning_rate": 0.00022417582417582415,
"loss": 0.6122,
"step": 204
},
{
"epoch": 0.22518193052313606,
"grad_norm": 0.8982027769088745,
"learning_rate": 0.00022527472527472526,
"loss": 0.8905,
"step": 205
},
{
"epoch": 0.22628037896471234,
"grad_norm": 0.7311340570449829,
"learning_rate": 0.00022637362637362634,
"loss": 1.0151,
"step": 206
},
{
"epoch": 0.22737882740628862,
"grad_norm": 0.45674124360084534,
"learning_rate": 0.00022747252747252745,
"loss": 0.7056,
"step": 207
},
{
"epoch": 0.2284772758478649,
"grad_norm": 0.6916844844818115,
"learning_rate": 0.00022857142857142854,
"loss": 0.5977,
"step": 208
},
{
"epoch": 0.22957572428944117,
"grad_norm": 0.6632958650588989,
"learning_rate": 0.00022967032967032962,
"loss": 0.8228,
"step": 209
},
{
"epoch": 0.23067417273101745,
"grad_norm": 0.3243491053581238,
"learning_rate": 0.00023076923076923076,
"loss": 0.4823,
"step": 210
},
{
"epoch": 0.2317726211725937,
"grad_norm": 0.45630499720573425,
"learning_rate": 0.00023186813186813185,
"loss": 0.7206,
"step": 211
},
{
"epoch": 0.23287106961416998,
"grad_norm": 0.6726184487342834,
"learning_rate": 0.00023296703296703296,
"loss": 0.8211,
"step": 212
},
{
"epoch": 0.23396951805574626,
"grad_norm": 0.45092982053756714,
"learning_rate": 0.00023406593406593405,
"loss": 0.6812,
"step": 213
},
{
"epoch": 0.23506796649732253,
"grad_norm": 0.5624651312828064,
"learning_rate": 0.00023516483516483513,
"loss": 0.726,
"step": 214
},
{
"epoch": 0.2361664149388988,
"grad_norm": 1.1685765981674194,
"learning_rate": 0.00023626373626373624,
"loss": 0.7906,
"step": 215
},
{
"epoch": 0.2372648633804751,
"grad_norm": 0.581599771976471,
"learning_rate": 0.00023736263736263733,
"loss": 0.7049,
"step": 216
},
{
"epoch": 0.23836331182205137,
"grad_norm": 0.7660847902297974,
"learning_rate": 0.00023846153846153844,
"loss": 0.6105,
"step": 217
},
{
"epoch": 0.23946176026362762,
"grad_norm": 0.5126472115516663,
"learning_rate": 0.00023956043956043955,
"loss": 0.7134,
"step": 218
},
{
"epoch": 0.2405602087052039,
"grad_norm": 0.48460498452186584,
"learning_rate": 0.00024065934065934066,
"loss": 0.5578,
"step": 219
},
{
"epoch": 0.24165865714678017,
"grad_norm": 0.41463029384613037,
"learning_rate": 0.00024175824175824175,
"loss": 0.5589,
"step": 220
},
{
"epoch": 0.24275710558835645,
"grad_norm": 2.0703623294830322,
"learning_rate": 0.00024285714285714283,
"loss": 0.7128,
"step": 221
},
{
"epoch": 0.24385555402993273,
"grad_norm": 1.5641820430755615,
"learning_rate": 0.00024395604395604394,
"loss": 0.4439,
"step": 222
},
{
"epoch": 0.244954002471509,
"grad_norm": 0.34634652733802795,
"learning_rate": 0.00024505494505494503,
"loss": 0.5389,
"step": 223
},
{
"epoch": 0.24605245091308528,
"grad_norm": 0.5669183135032654,
"learning_rate": 0.00024615384615384614,
"loss": 0.5699,
"step": 224
},
{
"epoch": 0.24715089935466153,
"grad_norm": 0.6459633111953735,
"learning_rate": 0.0002472527472527472,
"loss": 0.7904,
"step": 225
},
{
"epoch": 0.2482493477962378,
"grad_norm": 0.9719502925872803,
"learning_rate": 0.0002483516483516483,
"loss": 0.7354,
"step": 226
},
{
"epoch": 0.24934779623781408,
"grad_norm": 0.7433357834815979,
"learning_rate": 0.0002494505494505494,
"loss": 0.5772,
"step": 227
},
{
"epoch": 0.25044624467939036,
"grad_norm": 0.42272481322288513,
"learning_rate": 0.00025054945054945053,
"loss": 0.5609,
"step": 228
},
{
"epoch": 0.2515446931209666,
"grad_norm": 1.2868828773498535,
"learning_rate": 0.00025164835164835165,
"loss": 0.5775,
"step": 229
},
{
"epoch": 0.2526431415625429,
"grad_norm": 0.40398430824279785,
"learning_rate": 0.0002527472527472527,
"loss": 0.742,
"step": 230
},
{
"epoch": 0.25374159000411917,
"grad_norm": 0.46501678228378296,
"learning_rate": 0.0002538461538461538,
"loss": 0.69,
"step": 231
},
{
"epoch": 0.25484003844569547,
"grad_norm": 0.46631869673728943,
"learning_rate": 0.00025494505494505493,
"loss": 0.7712,
"step": 232
},
{
"epoch": 0.2559384868872717,
"grad_norm": 0.6761367321014404,
"learning_rate": 0.000256043956043956,
"loss": 0.64,
"step": 233
},
{
"epoch": 0.257036935328848,
"grad_norm": 0.6253519654273987,
"learning_rate": 0.0002571428571428571,
"loss": 0.5499,
"step": 234
},
{
"epoch": 0.2581353837704243,
"grad_norm": 1.0556268692016602,
"learning_rate": 0.0002582417582417582,
"loss": 0.869,
"step": 235
},
{
"epoch": 0.2592338322120005,
"grad_norm": 0.4816044867038727,
"learning_rate": 0.0002593406593406593,
"loss": 0.6061,
"step": 236
},
{
"epoch": 0.26033228065357683,
"grad_norm": 1.1049383878707886,
"learning_rate": 0.00026043956043956043,
"loss": 0.7695,
"step": 237
},
{
"epoch": 0.2614307290951531,
"grad_norm": 0.44643181562423706,
"learning_rate": 0.00026153846153846154,
"loss": 0.7849,
"step": 238
},
{
"epoch": 0.2625291775367294,
"grad_norm": 0.5231640338897705,
"learning_rate": 0.0002626373626373626,
"loss": 0.8033,
"step": 239
},
{
"epoch": 0.26362762597830564,
"grad_norm": 0.5537316799163818,
"learning_rate": 0.0002637362637362637,
"loss": 0.7317,
"step": 240
},
{
"epoch": 0.26472607441988194,
"grad_norm": 0.42069998383522034,
"learning_rate": 0.0002648351648351648,
"loss": 0.6325,
"step": 241
},
{
"epoch": 0.2658245228614582,
"grad_norm": 0.8009732365608215,
"learning_rate": 0.0002659340659340659,
"loss": 0.6589,
"step": 242
},
{
"epoch": 0.26692297130303444,
"grad_norm": 1.2626444101333618,
"learning_rate": 0.000267032967032967,
"loss": 0.5845,
"step": 243
},
{
"epoch": 0.26802141974461074,
"grad_norm": 0.4783913195133209,
"learning_rate": 0.0002681318681318681,
"loss": 0.8844,
"step": 244
},
{
"epoch": 0.269119868186187,
"grad_norm": 1.098160982131958,
"learning_rate": 0.0002692307692307692,
"loss": 0.6134,
"step": 245
},
{
"epoch": 0.2702183166277633,
"grad_norm": 1.0397273302078247,
"learning_rate": 0.00027032967032967033,
"loss": 0.7861,
"step": 246
},
{
"epoch": 0.27131676506933955,
"grad_norm": 0.9729229807853699,
"learning_rate": 0.0002714285714285714,
"loss": 0.7691,
"step": 247
},
{
"epoch": 0.27241521351091585,
"grad_norm": 0.44837963581085205,
"learning_rate": 0.0002725274725274725,
"loss": 0.9414,
"step": 248
},
{
"epoch": 0.2735136619524921,
"grad_norm": 1.4863499402999878,
"learning_rate": 0.0002736263736263736,
"loss": 0.5825,
"step": 249
},
{
"epoch": 0.27461211039406835,
"grad_norm": 0.5948237180709839,
"learning_rate": 0.00027472527472527467,
"loss": 0.4934,
"step": 250
},
{
"epoch": 0.27571055883564466,
"grad_norm": 0.5448721051216125,
"learning_rate": 0.0002758241758241758,
"loss": 0.6295,
"step": 251
},
{
"epoch": 0.2768090072772209,
"grad_norm": 0.4309394657611847,
"learning_rate": 0.0002769230769230769,
"loss": 0.6561,
"step": 252
},
{
"epoch": 0.2779074557187972,
"grad_norm": 0.7659335136413574,
"learning_rate": 0.000278021978021978,
"loss": 0.7588,
"step": 253
},
{
"epoch": 0.27900590416037346,
"grad_norm": 0.45655715465545654,
"learning_rate": 0.0002791208791208791,
"loss": 0.5257,
"step": 254
},
{
"epoch": 0.28010435260194977,
"grad_norm": 0.5390630960464478,
"learning_rate": 0.0002802197802197802,
"loss": 0.7051,
"step": 255
},
{
"epoch": 0.281202801043526,
"grad_norm": 0.39703306555747986,
"learning_rate": 0.0002813186813186813,
"loss": 0.6137,
"step": 256
},
{
"epoch": 0.28230124948510227,
"grad_norm": 0.4662924110889435,
"learning_rate": 0.0002824175824175824,
"loss": 0.4897,
"step": 257
},
{
"epoch": 0.2833996979266786,
"grad_norm": 0.39399877190589905,
"learning_rate": 0.00028351648351648346,
"loss": 0.6235,
"step": 258
},
{
"epoch": 0.2844981463682548,
"grad_norm": 0.497549444437027,
"learning_rate": 0.00028461538461538457,
"loss": 0.5134,
"step": 259
},
{
"epoch": 0.28559659480983113,
"grad_norm": 0.6597803235054016,
"learning_rate": 0.0002857142857142857,
"loss": 0.7955,
"step": 260
},
{
"epoch": 0.2866950432514074,
"grad_norm": 0.5545711517333984,
"learning_rate": 0.0002868131868131868,
"loss": 0.833,
"step": 261
},
{
"epoch": 0.2877934916929837,
"grad_norm": 1.0227786302566528,
"learning_rate": 0.0002879120879120879,
"loss": 0.5249,
"step": 262
},
{
"epoch": 0.28889194013455993,
"grad_norm": 0.5727143883705139,
"learning_rate": 0.000289010989010989,
"loss": 0.6319,
"step": 263
},
{
"epoch": 0.2899903885761362,
"grad_norm": 0.39322397112846375,
"learning_rate": 0.0002901098901098901,
"loss": 0.7003,
"step": 264
},
{
"epoch": 0.2910888370177125,
"grad_norm": 0.5657737851142883,
"learning_rate": 0.0002912087912087912,
"loss": 0.7085,
"step": 265
},
{
"epoch": 0.29218728545928874,
"grad_norm": 0.4305976927280426,
"learning_rate": 0.0002923076923076923,
"loss": 0.5931,
"step": 266
},
{
"epoch": 0.29328573390086504,
"grad_norm": 0.5300284624099731,
"learning_rate": 0.00029340659340659336,
"loss": 0.7881,
"step": 267
},
{
"epoch": 0.2943841823424413,
"grad_norm": 0.5922349095344543,
"learning_rate": 0.00029450549450549447,
"loss": 0.8688,
"step": 268
},
{
"epoch": 0.2954826307840176,
"grad_norm": 0.5700828433036804,
"learning_rate": 0.0002956043956043956,
"loss": 1.1328,
"step": 269
},
{
"epoch": 0.29658107922559385,
"grad_norm": 0.6773694753646851,
"learning_rate": 0.0002967032967032967,
"loss": 0.7821,
"step": 270
},
{
"epoch": 0.2976795276671701,
"grad_norm": 0.5200739502906799,
"learning_rate": 0.0002978021978021978,
"loss": 0.8775,
"step": 271
},
{
"epoch": 0.2987779761087464,
"grad_norm": 0.9860020875930786,
"learning_rate": 0.00029890109890109886,
"loss": 0.9141,
"step": 272
},
{
"epoch": 0.29987642455032265,
"grad_norm": 0.7012956142425537,
"learning_rate": 0.0003,
"loss": 0.7672,
"step": 273
},
{
"epoch": 0.30097487299189896,
"grad_norm": 0.4128098785877228,
"learning_rate": 0.0002998778998778999,
"loss": 0.3969,
"step": 274
},
{
"epoch": 0.3020733214334752,
"grad_norm": 0.366597980260849,
"learning_rate": 0.00029975579975579974,
"loss": 0.639,
"step": 275
},
{
"epoch": 0.3031717698750515,
"grad_norm": 0.5208033919334412,
"learning_rate": 0.0002996336996336996,
"loss": 0.664,
"step": 276
},
{
"epoch": 0.30427021831662776,
"grad_norm": 0.45519202947616577,
"learning_rate": 0.0002995115995115995,
"loss": 0.8495,
"step": 277
},
{
"epoch": 0.305368666758204,
"grad_norm": 0.6617010831832886,
"learning_rate": 0.0002993894993894994,
"loss": 1.0204,
"step": 278
},
{
"epoch": 0.3064671151997803,
"grad_norm": 1.4151723384857178,
"learning_rate": 0.00029926739926739923,
"loss": 0.8289,
"step": 279
},
{
"epoch": 0.30756556364135657,
"grad_norm": 0.6531035900115967,
"learning_rate": 0.00029914529914529915,
"loss": 0.7571,
"step": 280
},
{
"epoch": 0.30866401208293287,
"grad_norm": 0.8595600724220276,
"learning_rate": 0.000299023199023199,
"loss": 0.9668,
"step": 281
},
{
"epoch": 0.3097624605245091,
"grad_norm": 0.50210040807724,
"learning_rate": 0.00029890109890109886,
"loss": 0.6662,
"step": 282
},
{
"epoch": 0.3108609089660854,
"grad_norm": 0.6004669666290283,
"learning_rate": 0.0002987789987789988,
"loss": 0.7127,
"step": 283
},
{
"epoch": 0.3119593574076617,
"grad_norm": 0.8085057139396667,
"learning_rate": 0.00029865689865689863,
"loss": 0.9266,
"step": 284
},
{
"epoch": 0.3130578058492379,
"grad_norm": 0.44965627789497375,
"learning_rate": 0.0002985347985347985,
"loss": 0.7118,
"step": 285
},
{
"epoch": 0.31415625429081423,
"grad_norm": 0.5758265852928162,
"learning_rate": 0.00029841269841269835,
"loss": 0.6915,
"step": 286
},
{
"epoch": 0.3152547027323905,
"grad_norm": 0.5623393058776855,
"learning_rate": 0.00029829059829059826,
"loss": 0.6962,
"step": 287
},
{
"epoch": 0.3163531511739668,
"grad_norm": 0.857796311378479,
"learning_rate": 0.0002981684981684982,
"loss": 0.676,
"step": 288
},
{
"epoch": 0.31745159961554303,
"grad_norm": 0.36431241035461426,
"learning_rate": 0.000298046398046398,
"loss": 0.5475,
"step": 289
},
{
"epoch": 0.31855004805711934,
"grad_norm": 0.4778802692890167,
"learning_rate": 0.0002979242979242979,
"loss": 0.7198,
"step": 290
},
{
"epoch": 0.3196484964986956,
"grad_norm": 0.4887610673904419,
"learning_rate": 0.0002978021978021978,
"loss": 0.5559,
"step": 291
},
{
"epoch": 0.32074694494027184,
"grad_norm": 0.745379626750946,
"learning_rate": 0.00029768009768009766,
"loss": 1.0509,
"step": 292
},
{
"epoch": 0.32184539338184814,
"grad_norm": 0.40081167221069336,
"learning_rate": 0.0002975579975579975,
"loss": 0.6564,
"step": 293
},
{
"epoch": 0.3229438418234244,
"grad_norm": 0.5133034586906433,
"learning_rate": 0.00029743589743589743,
"loss": 0.6765,
"step": 294
},
{
"epoch": 0.3240422902650007,
"grad_norm": 0.5123881697654724,
"learning_rate": 0.0002973137973137973,
"loss": 0.8001,
"step": 295
},
{
"epoch": 0.32514073870657695,
"grad_norm": 0.3771597743034363,
"learning_rate": 0.00029719169719169715,
"loss": 0.785,
"step": 296
},
{
"epoch": 0.32623918714815325,
"grad_norm": 0.38929086923599243,
"learning_rate": 0.00029706959706959706,
"loss": 0.7273,
"step": 297
},
{
"epoch": 0.3273376355897295,
"grad_norm": 0.47761446237564087,
"learning_rate": 0.0002969474969474969,
"loss": 0.6997,
"step": 298
},
{
"epoch": 0.3284360840313058,
"grad_norm": 0.4798452854156494,
"learning_rate": 0.0002968253968253968,
"loss": 0.7171,
"step": 299
},
{
"epoch": 0.32953453247288206,
"grad_norm": 0.5864073038101196,
"learning_rate": 0.0002967032967032967,
"loss": 0.7075,
"step": 300
},
{
"epoch": 0.3306329809144583,
"grad_norm": 0.6298258900642395,
"learning_rate": 0.00029658119658119655,
"loss": 0.8659,
"step": 301
},
{
"epoch": 0.3317314293560346,
"grad_norm": 0.9764651656150818,
"learning_rate": 0.0002964590964590964,
"loss": 0.7451,
"step": 302
},
{
"epoch": 0.33282987779761086,
"grad_norm": 0.7084535360336304,
"learning_rate": 0.0002963369963369963,
"loss": 0.7896,
"step": 303
},
{
"epoch": 0.33392832623918717,
"grad_norm": 0.3226016163825989,
"learning_rate": 0.0002962148962148962,
"loss": 0.5614,
"step": 304
},
{
"epoch": 0.3350267746807634,
"grad_norm": 0.5515668988227844,
"learning_rate": 0.0002960927960927961,
"loss": 0.6981,
"step": 305
},
{
"epoch": 0.3361252231223397,
"grad_norm": 0.42776307463645935,
"learning_rate": 0.00029597069597069595,
"loss": 0.5911,
"step": 306
},
{
"epoch": 0.33722367156391597,
"grad_norm": 0.36645814776420593,
"learning_rate": 0.0002958485958485958,
"loss": 0.5584,
"step": 307
},
{
"epoch": 0.3383221200054922,
"grad_norm": 0.4089672565460205,
"learning_rate": 0.0002957264957264957,
"loss": 0.6814,
"step": 308
},
{
"epoch": 0.3394205684470685,
"grad_norm": 0.4406324326992035,
"learning_rate": 0.0002956043956043956,
"loss": 0.5426,
"step": 309
},
{
"epoch": 0.3405190168886448,
"grad_norm": 0.4138193726539612,
"learning_rate": 0.00029548229548229544,
"loss": 0.7554,
"step": 310
},
{
"epoch": 0.3416174653302211,
"grad_norm": 0.45647338032722473,
"learning_rate": 0.00029536019536019535,
"loss": 0.4871,
"step": 311
},
{
"epoch": 0.34271591377179733,
"grad_norm": 0.44362974166870117,
"learning_rate": 0.0002952380952380952,
"loss": 0.7254,
"step": 312
},
{
"epoch": 0.34381436221337364,
"grad_norm": 0.5832559466362,
"learning_rate": 0.00029511599511599507,
"loss": 0.64,
"step": 313
},
{
"epoch": 0.3449128106549499,
"grad_norm": 0.6754651665687561,
"learning_rate": 0.000294993894993895,
"loss": 0.7046,
"step": 314
},
{
"epoch": 0.34601125909652614,
"grad_norm": 0.6487123370170593,
"learning_rate": 0.00029487179487179484,
"loss": 0.5934,
"step": 315
},
{
"epoch": 0.34710970753810244,
"grad_norm": 0.24118930101394653,
"learning_rate": 0.0002947496947496947,
"loss": 0.5241,
"step": 316
},
{
"epoch": 0.3482081559796787,
"grad_norm": 0.4580494165420532,
"learning_rate": 0.0002946275946275946,
"loss": 0.6733,
"step": 317
},
{
"epoch": 0.349306604421255,
"grad_norm": 0.4770609736442566,
"learning_rate": 0.00029450549450549447,
"loss": 0.5758,
"step": 318
},
{
"epoch": 0.35040505286283125,
"grad_norm": 0.40334221720695496,
"learning_rate": 0.0002943833943833944,
"loss": 0.5365,
"step": 319
},
{
"epoch": 0.35150350130440755,
"grad_norm": 0.5605480074882507,
"learning_rate": 0.00029426129426129424,
"loss": 0.5967,
"step": 320
},
{
"epoch": 0.3526019497459838,
"grad_norm": 0.6031836271286011,
"learning_rate": 0.0002941391941391941,
"loss": 0.6397,
"step": 321
},
{
"epoch": 0.35370039818756005,
"grad_norm": 0.5602075457572937,
"learning_rate": 0.000294017094017094,
"loss": 0.7253,
"step": 322
},
{
"epoch": 0.35479884662913636,
"grad_norm": 1.5055879354476929,
"learning_rate": 0.00029389499389499387,
"loss": 0.6066,
"step": 323
},
{
"epoch": 0.3558972950707126,
"grad_norm": 1.969072699546814,
"learning_rate": 0.0002937728937728937,
"loss": 0.9263,
"step": 324
},
{
"epoch": 0.3569957435122889,
"grad_norm": 0.43139147758483887,
"learning_rate": 0.00029365079365079364,
"loss": 0.6462,
"step": 325
},
{
"epoch": 0.35809419195386516,
"grad_norm": 0.40423595905303955,
"learning_rate": 0.0002935286935286935,
"loss": 0.4278,
"step": 326
},
{
"epoch": 0.35919264039544146,
"grad_norm": 0.41983166337013245,
"learning_rate": 0.00029340659340659336,
"loss": 0.7527,
"step": 327
},
{
"epoch": 0.3602910888370177,
"grad_norm": 0.6624807715415955,
"learning_rate": 0.00029328449328449327,
"loss": 0.7381,
"step": 328
},
{
"epoch": 0.36138953727859396,
"grad_norm": 0.6173990964889526,
"learning_rate": 0.00029316239316239313,
"loss": 0.6838,
"step": 329
},
{
"epoch": 0.36248798572017027,
"grad_norm": 1.1278433799743652,
"learning_rate": 0.000293040293040293,
"loss": 0.8439,
"step": 330
},
{
"epoch": 0.3635864341617465,
"grad_norm": 0.3453993797302246,
"learning_rate": 0.0002929181929181929,
"loss": 0.5324,
"step": 331
},
{
"epoch": 0.3646848826033228,
"grad_norm": 0.4151187241077423,
"learning_rate": 0.0002927960927960928,
"loss": 0.7019,
"step": 332
},
{
"epoch": 0.3657833310448991,
"grad_norm": 0.4247313439846039,
"learning_rate": 0.0002926739926739926,
"loss": 0.6362,
"step": 333
},
{
"epoch": 0.3668817794864754,
"grad_norm": 1.5250136852264404,
"learning_rate": 0.00029255189255189253,
"loss": 0.5885,
"step": 334
},
{
"epoch": 0.36798022792805163,
"grad_norm": 0.43669968843460083,
"learning_rate": 0.00029242979242979244,
"loss": 0.9191,
"step": 335
},
{
"epoch": 0.3690786763696279,
"grad_norm": 0.8063925504684448,
"learning_rate": 0.0002923076923076923,
"loss": 0.6813,
"step": 336
},
{
"epoch": 0.3701771248112042,
"grad_norm": 0.6002399325370789,
"learning_rate": 0.00029218559218559216,
"loss": 0.5859,
"step": 337
},
{
"epoch": 0.37127557325278043,
"grad_norm": 0.9405462145805359,
"learning_rate": 0.000292063492063492,
"loss": 0.7476,
"step": 338
},
{
"epoch": 0.37237402169435674,
"grad_norm": 0.5050615072250366,
"learning_rate": 0.00029194139194139193,
"loss": 0.5172,
"step": 339
},
{
"epoch": 0.373472470135933,
"grad_norm": 0.4593801200389862,
"learning_rate": 0.0002918192918192918,
"loss": 0.5405,
"step": 340
},
{
"epoch": 0.3745709185775093,
"grad_norm": 0.5275060534477234,
"learning_rate": 0.00029169719169719164,
"loss": 0.4537,
"step": 341
},
{
"epoch": 0.37566936701908554,
"grad_norm": 0.8907522559165955,
"learning_rate": 0.00029157509157509156,
"loss": 0.6826,
"step": 342
},
{
"epoch": 0.3767678154606618,
"grad_norm": 0.7229670882225037,
"learning_rate": 0.0002914529914529914,
"loss": 0.6072,
"step": 343
},
{
"epoch": 0.3778662639022381,
"grad_norm": 1.7154827117919922,
"learning_rate": 0.0002913308913308913,
"loss": 0.6956,
"step": 344
},
{
"epoch": 0.37896471234381435,
"grad_norm": 1.012902021408081,
"learning_rate": 0.0002912087912087912,
"loss": 0.5337,
"step": 345
},
{
"epoch": 0.38006316078539065,
"grad_norm": 0.6467313170433044,
"learning_rate": 0.00029108669108669105,
"loss": 0.7652,
"step": 346
},
{
"epoch": 0.3811616092269669,
"grad_norm": 0.5594947338104248,
"learning_rate": 0.0002909645909645909,
"loss": 0.578,
"step": 347
},
{
"epoch": 0.3822600576685432,
"grad_norm": 0.5808854699134827,
"learning_rate": 0.0002908424908424908,
"loss": 0.6142,
"step": 348
},
{
"epoch": 0.38335850611011946,
"grad_norm": 0.6067795157432556,
"learning_rate": 0.00029072039072039073,
"loss": 0.7682,
"step": 349
},
{
"epoch": 0.3844569545516957,
"grad_norm": 0.392993301153183,
"learning_rate": 0.0002905982905982906,
"loss": 0.6599,
"step": 350
},
{
"epoch": 0.385555402993272,
"grad_norm": 0.3963404893875122,
"learning_rate": 0.00029047619047619045,
"loss": 0.7079,
"step": 351
},
{
"epoch": 0.38665385143484826,
"grad_norm": 0.3471222221851349,
"learning_rate": 0.00029035409035409036,
"loss": 0.463,
"step": 352
},
{
"epoch": 0.38775229987642457,
"grad_norm": 0.5496531128883362,
"learning_rate": 0.0002902319902319902,
"loss": 0.7639,
"step": 353
},
{
"epoch": 0.3888507483180008,
"grad_norm": 0.5482885241508484,
"learning_rate": 0.0002901098901098901,
"loss": 0.4198,
"step": 354
},
{
"epoch": 0.3899491967595771,
"grad_norm": 0.7329181432723999,
"learning_rate": 0.00028998778998779,
"loss": 0.6057,
"step": 355
},
{
"epoch": 0.39104764520115337,
"grad_norm": 0.41850918531417847,
"learning_rate": 0.00028986568986568985,
"loss": 0.605,
"step": 356
},
{
"epoch": 0.3921460936427296,
"grad_norm": 0.4463609457015991,
"learning_rate": 0.0002897435897435897,
"loss": 0.7381,
"step": 357
},
{
"epoch": 0.3932445420843059,
"grad_norm": 0.7207491397857666,
"learning_rate": 0.0002896214896214896,
"loss": 0.6892,
"step": 358
},
{
"epoch": 0.3943429905258822,
"grad_norm": 0.3715958595275879,
"learning_rate": 0.0002894993894993895,
"loss": 0.5426,
"step": 359
},
{
"epoch": 0.3954414389674585,
"grad_norm": 0.7077822685241699,
"learning_rate": 0.00028937728937728933,
"loss": 0.5923,
"step": 360
},
{
"epoch": 0.39653988740903473,
"grad_norm": 0.5109585523605347,
"learning_rate": 0.00028925518925518925,
"loss": 0.5939,
"step": 361
},
{
"epoch": 0.39763833585061104,
"grad_norm": 0.6105355024337769,
"learning_rate": 0.0002891330891330891,
"loss": 1.0345,
"step": 362
},
{
"epoch": 0.3987367842921873,
"grad_norm": 0.479732871055603,
"learning_rate": 0.000289010989010989,
"loss": 0.71,
"step": 363
},
{
"epoch": 0.39983523273376353,
"grad_norm": 0.8600007891654968,
"learning_rate": 0.0002888888888888888,
"loss": 0.7406,
"step": 364
},
{
"epoch": 0.40093368117533984,
"grad_norm": 0.6584550738334656,
"learning_rate": 0.00028876678876678873,
"loss": 0.6658,
"step": 365
},
{
"epoch": 0.4020321296169161,
"grad_norm": 0.7251041531562805,
"learning_rate": 0.00028864468864468865,
"loss": 0.8425,
"step": 366
},
{
"epoch": 0.4031305780584924,
"grad_norm": 0.5729238390922546,
"learning_rate": 0.0002885225885225885,
"loss": 0.9054,
"step": 367
},
{
"epoch": 0.40422902650006864,
"grad_norm": 1.1829932928085327,
"learning_rate": 0.00028840048840048836,
"loss": 0.9232,
"step": 368
},
{
"epoch": 0.40532747494164495,
"grad_norm": 0.37746721506118774,
"learning_rate": 0.0002882783882783883,
"loss": 0.9619,
"step": 369
},
{
"epoch": 0.4064259233832212,
"grad_norm": 0.5653749108314514,
"learning_rate": 0.00028815628815628813,
"loss": 0.7182,
"step": 370
},
{
"epoch": 0.40752437182479745,
"grad_norm": 0.6024563312530518,
"learning_rate": 0.000288034188034188,
"loss": 0.6881,
"step": 371
},
{
"epoch": 0.40862282026637375,
"grad_norm": 0.485350102186203,
"learning_rate": 0.0002879120879120879,
"loss": 0.6451,
"step": 372
},
{
"epoch": 0.40972126870795,
"grad_norm": 0.5762611627578735,
"learning_rate": 0.00028778998778998776,
"loss": 0.7818,
"step": 373
},
{
"epoch": 0.4108197171495263,
"grad_norm": 0.7961844801902771,
"learning_rate": 0.0002876678876678876,
"loss": 0.6682,
"step": 374
},
{
"epoch": 0.41191816559110256,
"grad_norm": 0.4630587697029114,
"learning_rate": 0.00028754578754578753,
"loss": 0.9015,
"step": 375
},
{
"epoch": 0.41301661403267886,
"grad_norm": 0.6592808961868286,
"learning_rate": 0.0002874236874236874,
"loss": 0.5738,
"step": 376
},
{
"epoch": 0.4141150624742551,
"grad_norm": 0.4788278639316559,
"learning_rate": 0.00028730158730158725,
"loss": 0.7022,
"step": 377
},
{
"epoch": 0.41521351091583136,
"grad_norm": 0.5041861534118652,
"learning_rate": 0.00028717948717948716,
"loss": 0.6137,
"step": 378
},
{
"epoch": 0.41631195935740767,
"grad_norm": 0.5436013340950012,
"learning_rate": 0.000287057387057387,
"loss": 0.6621,
"step": 379
},
{
"epoch": 0.4174104077989839,
"grad_norm": 0.5102400183677673,
"learning_rate": 0.00028693528693528694,
"loss": 0.6627,
"step": 380
},
{
"epoch": 0.4185088562405602,
"grad_norm": 0.43655040860176086,
"learning_rate": 0.0002868131868131868,
"loss": 0.6475,
"step": 381
},
{
"epoch": 0.4196073046821365,
"grad_norm": 0.3989826738834381,
"learning_rate": 0.00028669108669108665,
"loss": 0.5483,
"step": 382
},
{
"epoch": 0.4207057531237128,
"grad_norm": 0.7781158685684204,
"learning_rate": 0.00028656898656898656,
"loss": 0.6475,
"step": 383
},
{
"epoch": 0.421804201565289,
"grad_norm": 0.8119930624961853,
"learning_rate": 0.0002864468864468864,
"loss": 0.8122,
"step": 384
},
{
"epoch": 0.4229026500068653,
"grad_norm": 0.7233585119247437,
"learning_rate": 0.0002863247863247863,
"loss": 0.7837,
"step": 385
},
{
"epoch": 0.4240010984484416,
"grad_norm": 0.41249507665634155,
"learning_rate": 0.0002862026862026862,
"loss": 0.6916,
"step": 386
},
{
"epoch": 0.42509954689001783,
"grad_norm": 0.4865298867225647,
"learning_rate": 0.00028608058608058605,
"loss": 0.595,
"step": 387
},
{
"epoch": 0.42619799533159414,
"grad_norm": 0.6057963371276855,
"learning_rate": 0.0002859584859584859,
"loss": 0.7214,
"step": 388
},
{
"epoch": 0.4272964437731704,
"grad_norm": 0.5390968918800354,
"learning_rate": 0.0002858363858363858,
"loss": 0.805,
"step": 389
},
{
"epoch": 0.4283948922147467,
"grad_norm": 0.5944109559059143,
"learning_rate": 0.0002857142857142857,
"loss": 0.9953,
"step": 390
},
{
"epoch": 0.42949334065632294,
"grad_norm": 0.5480278134346008,
"learning_rate": 0.00028559218559218554,
"loss": 0.8406,
"step": 391
},
{
"epoch": 0.4305917890978992,
"grad_norm": 0.5168552994728088,
"learning_rate": 0.00028547008547008545,
"loss": 0.9715,
"step": 392
},
{
"epoch": 0.4316902375394755,
"grad_norm": 0.4859452247619629,
"learning_rate": 0.0002853479853479853,
"loss": 0.7368,
"step": 393
},
{
"epoch": 0.43278868598105175,
"grad_norm": 0.4697234034538269,
"learning_rate": 0.0002852258852258852,
"loss": 0.4801,
"step": 394
},
{
"epoch": 0.43388713442262805,
"grad_norm": 0.6198891401290894,
"learning_rate": 0.0002851037851037851,
"loss": 0.5184,
"step": 395
},
{
"epoch": 0.4349855828642043,
"grad_norm": 0.531563401222229,
"learning_rate": 0.00028498168498168494,
"loss": 0.8047,
"step": 396
},
{
"epoch": 0.4360840313057806,
"grad_norm": 0.4610724449157715,
"learning_rate": 0.00028485958485958485,
"loss": 0.4583,
"step": 397
},
{
"epoch": 0.43718247974735686,
"grad_norm": 0.5609697699546814,
"learning_rate": 0.0002847374847374847,
"loss": 0.7362,
"step": 398
},
{
"epoch": 0.4382809281889331,
"grad_norm": 0.5257968306541443,
"learning_rate": 0.00028461538461538457,
"loss": 0.8173,
"step": 399
},
{
"epoch": 0.4393793766305094,
"grad_norm": 0.8307009339332581,
"learning_rate": 0.0002844932844932845,
"loss": 0.5507,
"step": 400
},
{
"epoch": 0.44047782507208566,
"grad_norm": 0.36615508794784546,
"learning_rate": 0.00028437118437118434,
"loss": 0.6605,
"step": 401
},
{
"epoch": 0.44157627351366197,
"grad_norm": 0.35138362646102905,
"learning_rate": 0.0002842490842490842,
"loss": 0.6614,
"step": 402
},
{
"epoch": 0.4426747219552382,
"grad_norm": 0.5054494738578796,
"learning_rate": 0.0002841269841269841,
"loss": 0.799,
"step": 403
},
{
"epoch": 0.4437731703968145,
"grad_norm": 0.4711816608905792,
"learning_rate": 0.00028400488400488397,
"loss": 0.8892,
"step": 404
},
{
"epoch": 0.44487161883839077,
"grad_norm": 0.5073884725570679,
"learning_rate": 0.00028388278388278383,
"loss": 0.8156,
"step": 405
},
{
"epoch": 0.445970067279967,
"grad_norm": 0.29938632249832153,
"learning_rate": 0.00028376068376068374,
"loss": 0.7598,
"step": 406
},
{
"epoch": 0.4470685157215433,
"grad_norm": 1.745937466621399,
"learning_rate": 0.00028363858363858365,
"loss": 0.7829,
"step": 407
},
{
"epoch": 0.4481669641631196,
"grad_norm": 0.46887943148612976,
"learning_rate": 0.00028351648351648346,
"loss": 0.7798,
"step": 408
},
{
"epoch": 0.4492654126046959,
"grad_norm": 0.4274987280368805,
"learning_rate": 0.00028339438339438337,
"loss": 0.8407,
"step": 409
},
{
"epoch": 0.45036386104627213,
"grad_norm": 0.4445902109146118,
"learning_rate": 0.0002832722832722833,
"loss": 0.7394,
"step": 410
},
{
"epoch": 0.45146230948784843,
"grad_norm": 0.3842466175556183,
"learning_rate": 0.00028315018315018314,
"loss": 0.7781,
"step": 411
},
{
"epoch": 0.4525607579294247,
"grad_norm": 0.5660600066184998,
"learning_rate": 0.000283028083028083,
"loss": 0.8058,
"step": 412
},
{
"epoch": 0.45365920637100093,
"grad_norm": 0.442911297082901,
"learning_rate": 0.0002829059829059829,
"loss": 0.808,
"step": 413
},
{
"epoch": 0.45475765481257724,
"grad_norm": 0.9051260352134705,
"learning_rate": 0.00028278388278388277,
"loss": 0.9427,
"step": 414
},
{
"epoch": 0.4558561032541535,
"grad_norm": 0.8027593493461609,
"learning_rate": 0.00028266178266178263,
"loss": 0.531,
"step": 415
},
{
"epoch": 0.4569545516957298,
"grad_norm": 0.36242446303367615,
"learning_rate": 0.0002825396825396825,
"loss": 0.5609,
"step": 416
},
{
"epoch": 0.45805300013730604,
"grad_norm": 0.6095871925354004,
"learning_rate": 0.0002824175824175824,
"loss": 0.7424,
"step": 417
},
{
"epoch": 0.45915144857888235,
"grad_norm": 0.5102814435958862,
"learning_rate": 0.00028229548229548226,
"loss": 0.8861,
"step": 418
},
{
"epoch": 0.4602498970204586,
"grad_norm": 0.375265896320343,
"learning_rate": 0.0002821733821733821,
"loss": 0.6235,
"step": 419
},
{
"epoch": 0.4613483454620349,
"grad_norm": 0.4506315588951111,
"learning_rate": 0.00028205128205128203,
"loss": 0.6059,
"step": 420
},
{
"epoch": 0.46244679390361115,
"grad_norm": 0.8119642734527588,
"learning_rate": 0.0002819291819291819,
"loss": 0.7821,
"step": 421
},
{
"epoch": 0.4635452423451874,
"grad_norm": 0.42945513129234314,
"learning_rate": 0.00028180708180708175,
"loss": 0.9503,
"step": 422
},
{
"epoch": 0.4646436907867637,
"grad_norm": 0.35567665100097656,
"learning_rate": 0.00028168498168498166,
"loss": 0.5243,
"step": 423
},
{
"epoch": 0.46574213922833996,
"grad_norm": 0.5160343647003174,
"learning_rate": 0.00028156288156288157,
"loss": 0.5767,
"step": 424
},
{
"epoch": 0.46684058766991626,
"grad_norm": 0.37530624866485596,
"learning_rate": 0.00028144078144078143,
"loss": 1.2016,
"step": 425
},
{
"epoch": 0.4679390361114925,
"grad_norm": 0.5283146500587463,
"learning_rate": 0.0002813186813186813,
"loss": 0.5958,
"step": 426
},
{
"epoch": 0.4690374845530688,
"grad_norm": 0.5217192769050598,
"learning_rate": 0.0002811965811965812,
"loss": 0.715,
"step": 427
},
{
"epoch": 0.47013593299464507,
"grad_norm": 0.5092077851295471,
"learning_rate": 0.00028107448107448106,
"loss": 0.6942,
"step": 428
},
{
"epoch": 0.4712343814362213,
"grad_norm": 0.7683324813842773,
"learning_rate": 0.0002809523809523809,
"loss": 1.0185,
"step": 429
},
{
"epoch": 0.4723328298777976,
"grad_norm": 0.3117397725582123,
"learning_rate": 0.00028083028083028083,
"loss": 0.6949,
"step": 430
},
{
"epoch": 0.47343127831937387,
"grad_norm": 0.3218965232372284,
"learning_rate": 0.0002807081807081807,
"loss": 0.6872,
"step": 431
},
{
"epoch": 0.4745297267609502,
"grad_norm": 1.104121446609497,
"learning_rate": 0.00028058608058608055,
"loss": 0.6628,
"step": 432
},
{
"epoch": 0.4756281752025264,
"grad_norm": 0.3224816620349884,
"learning_rate": 0.00028046398046398046,
"loss": 0.5974,
"step": 433
},
{
"epoch": 0.47672662364410273,
"grad_norm": 0.5742220878601074,
"learning_rate": 0.0002803418803418803,
"loss": 0.7248,
"step": 434
},
{
"epoch": 0.477825072085679,
"grad_norm": 0.5449275374412537,
"learning_rate": 0.0002802197802197802,
"loss": 0.8552,
"step": 435
},
{
"epoch": 0.47892352052725523,
"grad_norm": 0.44660067558288574,
"learning_rate": 0.0002800976800976801,
"loss": 0.6968,
"step": 436
},
{
"epoch": 0.48002196896883154,
"grad_norm": 0.4287508428096771,
"learning_rate": 0.00027997557997557995,
"loss": 0.8101,
"step": 437
},
{
"epoch": 0.4811204174104078,
"grad_norm": 0.4142225384712219,
"learning_rate": 0.00027985347985347986,
"loss": 0.5379,
"step": 438
},
{
"epoch": 0.4822188658519841,
"grad_norm": 1.246833324432373,
"learning_rate": 0.0002797313797313797,
"loss": 0.7116,
"step": 439
},
{
"epoch": 0.48331731429356034,
"grad_norm": 0.3845030963420868,
"learning_rate": 0.0002796092796092796,
"loss": 0.8088,
"step": 440
},
{
"epoch": 0.48441576273513665,
"grad_norm": 1.4492995738983154,
"learning_rate": 0.0002794871794871795,
"loss": 0.7358,
"step": 441
},
{
"epoch": 0.4855142111767129,
"grad_norm": 0.40994521975517273,
"learning_rate": 0.00027936507936507935,
"loss": 0.6228,
"step": 442
},
{
"epoch": 0.48661265961828915,
"grad_norm": 0.4782777428627014,
"learning_rate": 0.0002792429792429792,
"loss": 0.4944,
"step": 443
},
{
"epoch": 0.48771110805986545,
"grad_norm": 0.47269922494888306,
"learning_rate": 0.0002791208791208791,
"loss": 0.7023,
"step": 444
},
{
"epoch": 0.4888095565014417,
"grad_norm": 0.5529118776321411,
"learning_rate": 0.000278998778998779,
"loss": 0.7717,
"step": 445
},
{
"epoch": 0.489908004943018,
"grad_norm": 0.4244072139263153,
"learning_rate": 0.00027887667887667884,
"loss": 0.7902,
"step": 446
},
{
"epoch": 0.49100645338459425,
"grad_norm": 1.4737539291381836,
"learning_rate": 0.00027875457875457875,
"loss": 0.5784,
"step": 447
},
{
"epoch": 0.49210490182617056,
"grad_norm": 0.40120208263397217,
"learning_rate": 0.0002786324786324786,
"loss": 0.7974,
"step": 448
},
{
"epoch": 0.4932033502677468,
"grad_norm": 0.5481031537055969,
"learning_rate": 0.00027851037851037846,
"loss": 0.7867,
"step": 449
},
{
"epoch": 0.49430179870932306,
"grad_norm": 0.36719343066215515,
"learning_rate": 0.0002783882783882784,
"loss": 0.6543,
"step": 450
},
{
"epoch": 0.49540024715089936,
"grad_norm": 0.3980066776275635,
"learning_rate": 0.00027826617826617824,
"loss": 0.5395,
"step": 451
},
{
"epoch": 0.4964986955924756,
"grad_norm": 0.45570313930511475,
"learning_rate": 0.0002781440781440781,
"loss": 0.7908,
"step": 452
},
{
"epoch": 0.4975971440340519,
"grad_norm": 0.41858601570129395,
"learning_rate": 0.000278021978021978,
"loss": 0.5248,
"step": 453
},
{
"epoch": 0.49869559247562817,
"grad_norm": 0.5019702315330505,
"learning_rate": 0.00027789987789987786,
"loss": 0.8006,
"step": 454
},
{
"epoch": 0.4997940409172045,
"grad_norm": 0.4589880108833313,
"learning_rate": 0.0002777777777777778,
"loss": 0.7294,
"step": 455
},
{
"epoch": 0.5008924893587807,
"grad_norm": 0.5679266452789307,
"learning_rate": 0.00027765567765567764,
"loss": 0.651,
"step": 456
},
{
"epoch": 0.501990937800357,
"grad_norm": 0.4854479134082794,
"learning_rate": 0.0002775335775335775,
"loss": 0.9908,
"step": 457
},
{
"epoch": 0.5030893862419332,
"grad_norm": 0.4964112341403961,
"learning_rate": 0.0002774114774114774,
"loss": 0.8084,
"step": 458
},
{
"epoch": 0.5041878346835096,
"grad_norm": 0.5130513906478882,
"learning_rate": 0.00027728937728937727,
"loss": 0.8389,
"step": 459
},
{
"epoch": 0.5052862831250858,
"grad_norm": 0.4784137010574341,
"learning_rate": 0.0002771672771672771,
"loss": 0.5497,
"step": 460
},
{
"epoch": 0.5063847315666621,
"grad_norm": 0.28685998916625977,
"learning_rate": 0.00027704517704517704,
"loss": 0.491,
"step": 461
},
{
"epoch": 0.5074831800082383,
"grad_norm": 0.5337100625038147,
"learning_rate": 0.0002769230769230769,
"loss": 0.8315,
"step": 462
},
{
"epoch": 0.5085816284498146,
"grad_norm": 0.5431344509124756,
"learning_rate": 0.00027680097680097675,
"loss": 0.5996,
"step": 463
},
{
"epoch": 0.5096800768913909,
"grad_norm": 0.4546130299568176,
"learning_rate": 0.00027667887667887667,
"loss": 0.5647,
"step": 464
},
{
"epoch": 0.5107785253329672,
"grad_norm": 0.6298655271530151,
"learning_rate": 0.0002765567765567765,
"loss": 0.7684,
"step": 465
},
{
"epoch": 0.5118769737745434,
"grad_norm": 0.44330841302871704,
"learning_rate": 0.0002764346764346764,
"loss": 0.4906,
"step": 466
},
{
"epoch": 0.5129754222161197,
"grad_norm": 0.3824306130409241,
"learning_rate": 0.0002763125763125763,
"loss": 0.6123,
"step": 467
},
{
"epoch": 0.514073870657696,
"grad_norm": 0.3225514590740204,
"learning_rate": 0.00027619047619047615,
"loss": 0.7535,
"step": 468
},
{
"epoch": 0.5151723190992723,
"grad_norm": 0.701239824295044,
"learning_rate": 0.00027606837606837607,
"loss": 0.9643,
"step": 469
},
{
"epoch": 0.5162707675408486,
"grad_norm": 0.37800920009613037,
"learning_rate": 0.0002759462759462759,
"loss": 0.543,
"step": 470
},
{
"epoch": 0.5173692159824248,
"grad_norm": 0.3521328568458557,
"learning_rate": 0.0002758241758241758,
"loss": 0.7157,
"step": 471
},
{
"epoch": 0.518467664424001,
"grad_norm": 0.2659924626350403,
"learning_rate": 0.0002757020757020757,
"loss": 0.7334,
"step": 472
},
{
"epoch": 0.5195661128655774,
"grad_norm": 0.42815065383911133,
"learning_rate": 0.00027557997557997555,
"loss": 1.2015,
"step": 473
},
{
"epoch": 0.5206645613071537,
"grad_norm": 0.7758998870849609,
"learning_rate": 0.0002754578754578754,
"loss": 0.9493,
"step": 474
},
{
"epoch": 0.5217630097487299,
"grad_norm": 0.46281251311302185,
"learning_rate": 0.0002753357753357753,
"loss": 0.9159,
"step": 475
},
{
"epoch": 0.5228614581903062,
"grad_norm": 0.3668971061706543,
"learning_rate": 0.0002752136752136752,
"loss": 0.4869,
"step": 476
},
{
"epoch": 0.5239599066318824,
"grad_norm": 0.462534099817276,
"learning_rate": 0.00027509157509157504,
"loss": 0.6439,
"step": 477
},
{
"epoch": 0.5250583550734588,
"grad_norm": 0.6341688632965088,
"learning_rate": 0.00027496947496947495,
"loss": 0.6948,
"step": 478
},
{
"epoch": 0.526156803515035,
"grad_norm": 0.5469139814376831,
"learning_rate": 0.0002748473748473748,
"loss": 1.016,
"step": 479
},
{
"epoch": 0.5272552519566113,
"grad_norm": 0.438204288482666,
"learning_rate": 0.00027472527472527467,
"loss": 0.6941,
"step": 480
},
{
"epoch": 0.5283537003981875,
"grad_norm": 0.586700975894928,
"learning_rate": 0.0002746031746031746,
"loss": 0.6649,
"step": 481
},
{
"epoch": 0.5294521488397639,
"grad_norm": 0.4077949523925781,
"learning_rate": 0.0002744810744810745,
"loss": 0.5948,
"step": 482
},
{
"epoch": 0.5305505972813401,
"grad_norm": 0.3756411373615265,
"learning_rate": 0.0002743589743589743,
"loss": 0.4915,
"step": 483
},
{
"epoch": 0.5316490457229164,
"grad_norm": 1.2067008018493652,
"learning_rate": 0.0002742368742368742,
"loss": 0.8795,
"step": 484
},
{
"epoch": 0.5327474941644926,
"grad_norm": 0.3097778260707855,
"learning_rate": 0.0002741147741147741,
"loss": 0.5478,
"step": 485
},
{
"epoch": 0.5338459426060689,
"grad_norm": 0.5536866188049316,
"learning_rate": 0.000273992673992674,
"loss": 0.7042,
"step": 486
},
{
"epoch": 0.5349443910476452,
"grad_norm": 0.5930231809616089,
"learning_rate": 0.00027387057387057384,
"loss": 0.7108,
"step": 487
},
{
"epoch": 0.5360428394892215,
"grad_norm": 0.39304253458976746,
"learning_rate": 0.00027374847374847375,
"loss": 0.788,
"step": 488
},
{
"epoch": 0.5371412879307977,
"grad_norm": 0.5238274335861206,
"learning_rate": 0.0002736263736263736,
"loss": 0.9887,
"step": 489
},
{
"epoch": 0.538239736372374,
"grad_norm": 0.5993770956993103,
"learning_rate": 0.00027350427350427347,
"loss": 0.7819,
"step": 490
},
{
"epoch": 0.5393381848139503,
"grad_norm": 0.4601563811302185,
"learning_rate": 0.00027338217338217333,
"loss": 0.4347,
"step": 491
},
{
"epoch": 0.5404366332555266,
"grad_norm": 0.5292415022850037,
"learning_rate": 0.00027326007326007324,
"loss": 0.5248,
"step": 492
},
{
"epoch": 0.5415350816971028,
"grad_norm": 0.37247565388679504,
"learning_rate": 0.0002731379731379731,
"loss": 0.5412,
"step": 493
},
{
"epoch": 0.5426335301386791,
"grad_norm": 0.6865994930267334,
"learning_rate": 0.00027301587301587296,
"loss": 0.8263,
"step": 494
},
{
"epoch": 0.5437319785802553,
"grad_norm": 0.5019715428352356,
"learning_rate": 0.00027289377289377287,
"loss": 0.7084,
"step": 495
},
{
"epoch": 0.5448304270218317,
"grad_norm": 0.8432828783988953,
"learning_rate": 0.00027277167277167273,
"loss": 0.6188,
"step": 496
},
{
"epoch": 0.545928875463408,
"grad_norm": 0.594881534576416,
"learning_rate": 0.0002726495726495726,
"loss": 0.8923,
"step": 497
},
{
"epoch": 0.5470273239049842,
"grad_norm": 0.5573694705963135,
"learning_rate": 0.0002725274725274725,
"loss": 0.6351,
"step": 498
},
{
"epoch": 0.5481257723465605,
"grad_norm": 0.30426710844039917,
"learning_rate": 0.0002724053724053724,
"loss": 0.6359,
"step": 499
},
{
"epoch": 0.5492242207881367,
"grad_norm": 0.759385883808136,
"learning_rate": 0.00027228327228327227,
"loss": 0.6131,
"step": 500
},
{
"epoch": 0.5503226692297131,
"grad_norm": 0.5436901450157166,
"learning_rate": 0.00027216117216117213,
"loss": 0.5232,
"step": 501
},
{
"epoch": 0.5514211176712893,
"grad_norm": 0.5924163460731506,
"learning_rate": 0.00027203907203907204,
"loss": 0.9594,
"step": 502
},
{
"epoch": 0.5525195661128656,
"grad_norm": 0.49177658557891846,
"learning_rate": 0.0002719169719169719,
"loss": 0.842,
"step": 503
},
{
"epoch": 0.5536180145544418,
"grad_norm": 0.4437295198440552,
"learning_rate": 0.00027179487179487176,
"loss": 1.0338,
"step": 504
},
{
"epoch": 0.5547164629960182,
"grad_norm": 0.426213800907135,
"learning_rate": 0.00027167277167277167,
"loss": 0.6375,
"step": 505
},
{
"epoch": 0.5558149114375944,
"grad_norm": 0.4599516689777374,
"learning_rate": 0.00027155067155067153,
"loss": 0.5005,
"step": 506
},
{
"epoch": 0.5569133598791707,
"grad_norm": 0.647957980632782,
"learning_rate": 0.0002714285714285714,
"loss": 0.6292,
"step": 507
},
{
"epoch": 0.5580118083207469,
"grad_norm": 0.7891755104064941,
"learning_rate": 0.0002713064713064713,
"loss": 0.697,
"step": 508
},
{
"epoch": 0.5591102567623232,
"grad_norm": 0.5290817618370056,
"learning_rate": 0.00027118437118437116,
"loss": 0.4547,
"step": 509
},
{
"epoch": 0.5602087052038995,
"grad_norm": 0.4025941789150238,
"learning_rate": 0.000271062271062271,
"loss": 0.6299,
"step": 510
},
{
"epoch": 0.5613071536454758,
"grad_norm": 0.7768287658691406,
"learning_rate": 0.00027094017094017093,
"loss": 0.6813,
"step": 511
},
{
"epoch": 0.562405602087052,
"grad_norm": 0.6977662444114685,
"learning_rate": 0.0002708180708180708,
"loss": 0.8217,
"step": 512
},
{
"epoch": 0.5635040505286283,
"grad_norm": 0.5238949060440063,
"learning_rate": 0.0002706959706959707,
"loss": 0.7348,
"step": 513
},
{
"epoch": 0.5646024989702045,
"grad_norm": 0.5099830627441406,
"learning_rate": 0.00027057387057387056,
"loss": 0.9894,
"step": 514
},
{
"epoch": 0.5657009474117809,
"grad_norm": 0.6254756450653076,
"learning_rate": 0.0002704517704517704,
"loss": 0.9258,
"step": 515
},
{
"epoch": 0.5667993958533571,
"grad_norm": 0.40313196182250977,
"learning_rate": 0.00027032967032967033,
"loss": 0.8115,
"step": 516
},
{
"epoch": 0.5678978442949334,
"grad_norm": 0.9706575274467468,
"learning_rate": 0.0002702075702075702,
"loss": 0.5204,
"step": 517
},
{
"epoch": 0.5689962927365096,
"grad_norm": 0.36777085065841675,
"learning_rate": 0.00027008547008547005,
"loss": 0.7716,
"step": 518
},
{
"epoch": 0.570094741178086,
"grad_norm": 0.48726886510849,
"learning_rate": 0.00026996336996336996,
"loss": 0.7745,
"step": 519
},
{
"epoch": 0.5711931896196623,
"grad_norm": 0.3590470850467682,
"learning_rate": 0.0002698412698412698,
"loss": 0.7038,
"step": 520
},
{
"epoch": 0.5722916380612385,
"grad_norm": 0.7103118896484375,
"learning_rate": 0.0002697191697191697,
"loss": 0.8368,
"step": 521
},
{
"epoch": 0.5733900865028148,
"grad_norm": 0.5503933429718018,
"learning_rate": 0.0002695970695970696,
"loss": 0.6164,
"step": 522
},
{
"epoch": 0.574488534944391,
"grad_norm": 0.5255150198936462,
"learning_rate": 0.00026947496947496945,
"loss": 0.8886,
"step": 523
},
{
"epoch": 0.5755869833859674,
"grad_norm": 0.4872569739818573,
"learning_rate": 0.0002693528693528693,
"loss": 0.6277,
"step": 524
},
{
"epoch": 0.5766854318275436,
"grad_norm": 0.3748464584350586,
"learning_rate": 0.0002692307692307692,
"loss": 0.6471,
"step": 525
},
{
"epoch": 0.5777838802691199,
"grad_norm": 0.4401276111602783,
"learning_rate": 0.0002691086691086691,
"loss": 0.9846,
"step": 526
},
{
"epoch": 0.5788823287106961,
"grad_norm": 0.9565305709838867,
"learning_rate": 0.00026898656898656894,
"loss": 0.9471,
"step": 527
},
{
"epoch": 0.5799807771522724,
"grad_norm": 0.6307245492935181,
"learning_rate": 0.00026886446886446885,
"loss": 0.9168,
"step": 528
},
{
"epoch": 0.5810792255938487,
"grad_norm": 0.49177634716033936,
"learning_rate": 0.0002687423687423687,
"loss": 0.5464,
"step": 529
},
{
"epoch": 0.582177674035425,
"grad_norm": 0.68553626537323,
"learning_rate": 0.0002686202686202686,
"loss": 0.5874,
"step": 530
},
{
"epoch": 0.5832761224770012,
"grad_norm": 0.3811597228050232,
"learning_rate": 0.0002684981684981685,
"loss": 0.766,
"step": 531
},
{
"epoch": 0.5843745709185775,
"grad_norm": 0.6634503602981567,
"learning_rate": 0.00026837606837606834,
"loss": 0.6438,
"step": 532
},
{
"epoch": 0.5854730193601538,
"grad_norm": 0.6115571856498718,
"learning_rate": 0.00026825396825396825,
"loss": 0.8757,
"step": 533
},
{
"epoch": 0.5865714678017301,
"grad_norm": 0.3011985719203949,
"learning_rate": 0.0002681318681318681,
"loss": 0.6188,
"step": 534
},
{
"epoch": 0.5876699162433063,
"grad_norm": 0.7029386162757874,
"learning_rate": 0.00026800976800976797,
"loss": 0.8681,
"step": 535
},
{
"epoch": 0.5887683646848826,
"grad_norm": 0.4796508550643921,
"learning_rate": 0.0002678876678876679,
"loss": 0.7207,
"step": 536
},
{
"epoch": 0.5898668131264588,
"grad_norm": 0.542948842048645,
"learning_rate": 0.00026776556776556774,
"loss": 0.5587,
"step": 537
},
{
"epoch": 0.5909652615680352,
"grad_norm": 0.7566731572151184,
"learning_rate": 0.0002676434676434676,
"loss": 0.8562,
"step": 538
},
{
"epoch": 0.5920637100096114,
"grad_norm": 0.6411837339401245,
"learning_rate": 0.0002675213675213675,
"loss": 0.4516,
"step": 539
},
{
"epoch": 0.5931621584511877,
"grad_norm": 0.41434159874916077,
"learning_rate": 0.00026739926739926737,
"loss": 0.7069,
"step": 540
},
{
"epoch": 0.5942606068927639,
"grad_norm": 0.29941752552986145,
"learning_rate": 0.0002672771672771672,
"loss": 0.7444,
"step": 541
},
{
"epoch": 0.5953590553343402,
"grad_norm": 1.8168927431106567,
"learning_rate": 0.00026715506715506714,
"loss": 0.4947,
"step": 542
},
{
"epoch": 0.5964575037759166,
"grad_norm": 0.5639868974685669,
"learning_rate": 0.000267032967032967,
"loss": 0.6749,
"step": 543
},
{
"epoch": 0.5975559522174928,
"grad_norm": 0.5054119229316711,
"learning_rate": 0.0002669108669108669,
"loss": 0.8075,
"step": 544
},
{
"epoch": 0.598654400659069,
"grad_norm": 0.3531246483325958,
"learning_rate": 0.00026678876678876677,
"loss": 0.6986,
"step": 545
},
{
"epoch": 0.5997528491006453,
"grad_norm": 0.36428287625312805,
"learning_rate": 0.0002666666666666666,
"loss": 0.6496,
"step": 546
},
{
"epoch": 0.6008512975422217,
"grad_norm": 0.45706960558891296,
"learning_rate": 0.00026654456654456654,
"loss": 0.5646,
"step": 547
},
{
"epoch": 0.6019497459837979,
"grad_norm": 0.39326363801956177,
"learning_rate": 0.0002664224664224664,
"loss": 0.5037,
"step": 548
},
{
"epoch": 0.6030481944253742,
"grad_norm": 0.7158151268959045,
"learning_rate": 0.00026630036630036625,
"loss": 0.5643,
"step": 549
},
{
"epoch": 0.6041466428669504,
"grad_norm": 0.398335337638855,
"learning_rate": 0.00026617826617826617,
"loss": 0.5462,
"step": 550
},
{
"epoch": 0.6052450913085267,
"grad_norm": 0.8625812530517578,
"learning_rate": 0.000266056166056166,
"loss": 0.7898,
"step": 551
},
{
"epoch": 0.606343539750103,
"grad_norm": 0.5558099150657654,
"learning_rate": 0.0002659340659340659,
"loss": 0.7968,
"step": 552
},
{
"epoch": 0.6074419881916793,
"grad_norm": 0.6244741678237915,
"learning_rate": 0.0002658119658119658,
"loss": 0.9085,
"step": 553
},
{
"epoch": 0.6085404366332555,
"grad_norm": 0.4907127916812897,
"learning_rate": 0.00026568986568986565,
"loss": 0.5683,
"step": 554
},
{
"epoch": 0.6096388850748318,
"grad_norm": 0.6140159964561462,
"learning_rate": 0.0002655677655677655,
"loss": 0.5693,
"step": 555
},
{
"epoch": 0.610737333516408,
"grad_norm": 0.41251274943351746,
"learning_rate": 0.0002654456654456654,
"loss": 0.728,
"step": 556
},
{
"epoch": 0.6118357819579844,
"grad_norm": 0.43427684903144836,
"learning_rate": 0.00026532356532356534,
"loss": 0.5692,
"step": 557
},
{
"epoch": 0.6129342303995606,
"grad_norm": 0.41471078991889954,
"learning_rate": 0.00026520146520146514,
"loss": 0.6616,
"step": 558
},
{
"epoch": 0.6140326788411369,
"grad_norm": 0.4406953752040863,
"learning_rate": 0.00026507936507936506,
"loss": 0.4764,
"step": 559
},
{
"epoch": 0.6151311272827131,
"grad_norm": 7.233060359954834,
"learning_rate": 0.00026495726495726497,
"loss": 0.6111,
"step": 560
},
{
"epoch": 0.6162295757242895,
"grad_norm": 0.47008857131004333,
"learning_rate": 0.0002648351648351648,
"loss": 0.8145,
"step": 561
},
{
"epoch": 0.6173280241658657,
"grad_norm": 0.47636717557907104,
"learning_rate": 0.0002647130647130647,
"loss": 0.8036,
"step": 562
},
{
"epoch": 0.618426472607442,
"grad_norm": 0.526971161365509,
"learning_rate": 0.0002645909645909646,
"loss": 0.7559,
"step": 563
},
{
"epoch": 0.6195249210490182,
"grad_norm": 0.5027382373809814,
"learning_rate": 0.00026446886446886446,
"loss": 0.7765,
"step": 564
},
{
"epoch": 0.6206233694905945,
"grad_norm": 0.4222506284713745,
"learning_rate": 0.0002643467643467643,
"loss": 0.6376,
"step": 565
},
{
"epoch": 0.6217218179321709,
"grad_norm": 0.6390372514724731,
"learning_rate": 0.0002642246642246642,
"loss": 0.8224,
"step": 566
},
{
"epoch": 0.6228202663737471,
"grad_norm": 0.44495514035224915,
"learning_rate": 0.0002641025641025641,
"loss": 0.5995,
"step": 567
},
{
"epoch": 0.6239187148153233,
"grad_norm": 0.7005137205123901,
"learning_rate": 0.00026398046398046394,
"loss": 0.4986,
"step": 568
},
{
"epoch": 0.6250171632568996,
"grad_norm": 0.40745365619659424,
"learning_rate": 0.0002638583638583638,
"loss": 0.608,
"step": 569
},
{
"epoch": 0.6261156116984758,
"grad_norm": 0.3449142277240753,
"learning_rate": 0.0002637362637362637,
"loss": 0.6253,
"step": 570
},
{
"epoch": 0.6272140601400522,
"grad_norm": 0.4318457841873169,
"learning_rate": 0.00026361416361416357,
"loss": 0.6376,
"step": 571
},
{
"epoch": 0.6283125085816285,
"grad_norm": 2.2202258110046387,
"learning_rate": 0.00026349206349206343,
"loss": 0.5477,
"step": 572
},
{
"epoch": 0.6294109570232047,
"grad_norm": 0.6759721040725708,
"learning_rate": 0.00026336996336996334,
"loss": 1.1176,
"step": 573
},
{
"epoch": 0.630509405464781,
"grad_norm": 1.7796927690505981,
"learning_rate": 0.00026324786324786326,
"loss": 0.8713,
"step": 574
},
{
"epoch": 0.6316078539063573,
"grad_norm": 0.32952558994293213,
"learning_rate": 0.0002631257631257631,
"loss": 0.4711,
"step": 575
},
{
"epoch": 0.6327063023479336,
"grad_norm": 0.40390628576278687,
"learning_rate": 0.000263003663003663,
"loss": 0.5412,
"step": 576
},
{
"epoch": 0.6338047507895098,
"grad_norm": 0.7439208030700684,
"learning_rate": 0.0002628815628815629,
"loss": 0.7094,
"step": 577
},
{
"epoch": 0.6349031992310861,
"grad_norm": 0.34505775570869446,
"learning_rate": 0.00026275946275946274,
"loss": 0.5939,
"step": 578
},
{
"epoch": 0.6360016476726623,
"grad_norm": 0.9452011585235596,
"learning_rate": 0.0002626373626373626,
"loss": 0.5108,
"step": 579
},
{
"epoch": 0.6371000961142387,
"grad_norm": 0.42789551615715027,
"learning_rate": 0.0002625152625152625,
"loss": 0.5661,
"step": 580
},
{
"epoch": 0.6381985445558149,
"grad_norm": 0.3460575044155121,
"learning_rate": 0.0002623931623931624,
"loss": 0.8333,
"step": 581
},
{
"epoch": 0.6392969929973912,
"grad_norm": 0.8932168483734131,
"learning_rate": 0.00026227106227106223,
"loss": 0.7058,
"step": 582
},
{
"epoch": 0.6403954414389674,
"grad_norm": 0.8588842749595642,
"learning_rate": 0.00026214896214896214,
"loss": 0.6905,
"step": 583
},
{
"epoch": 0.6414938898805437,
"grad_norm": 0.5097251534461975,
"learning_rate": 0.000262026862026862,
"loss": 0.8189,
"step": 584
},
{
"epoch": 0.64259233832212,
"grad_norm": 0.45746755599975586,
"learning_rate": 0.00026190476190476186,
"loss": 0.7212,
"step": 585
},
{
"epoch": 0.6436907867636963,
"grad_norm": 0.9576689600944519,
"learning_rate": 0.0002617826617826618,
"loss": 0.6159,
"step": 586
},
{
"epoch": 0.6447892352052725,
"grad_norm": 0.5721899271011353,
"learning_rate": 0.00026166056166056163,
"loss": 0.6083,
"step": 587
},
{
"epoch": 0.6458876836468488,
"grad_norm": 0.4851115942001343,
"learning_rate": 0.00026153846153846154,
"loss": 0.7678,
"step": 588
},
{
"epoch": 0.6469861320884251,
"grad_norm": 0.6631761193275452,
"learning_rate": 0.0002614163614163614,
"loss": 0.7068,
"step": 589
},
{
"epoch": 0.6480845805300014,
"grad_norm": 0.6862382292747498,
"learning_rate": 0.00026129426129426126,
"loss": 0.5766,
"step": 590
},
{
"epoch": 0.6491830289715776,
"grad_norm": 0.3754968047142029,
"learning_rate": 0.0002611721611721612,
"loss": 0.7254,
"step": 591
},
{
"epoch": 0.6502814774131539,
"grad_norm": 0.5239700078964233,
"learning_rate": 0.00026105006105006103,
"loss": 0.5777,
"step": 592
},
{
"epoch": 0.6513799258547301,
"grad_norm": 0.5103443264961243,
"learning_rate": 0.0002609279609279609,
"loss": 1.0006,
"step": 593
},
{
"epoch": 0.6524783742963065,
"grad_norm": 0.4733884632587433,
"learning_rate": 0.0002608058608058608,
"loss": 0.6851,
"step": 594
},
{
"epoch": 0.6535768227378828,
"grad_norm": 0.5982065796852112,
"learning_rate": 0.00026068376068376066,
"loss": 0.6295,
"step": 595
},
{
"epoch": 0.654675271179459,
"grad_norm": 1.2408190965652466,
"learning_rate": 0.0002605616605616605,
"loss": 0.8806,
"step": 596
},
{
"epoch": 0.6557737196210353,
"grad_norm": 0.6005455851554871,
"learning_rate": 0.00026043956043956043,
"loss": 0.7186,
"step": 597
},
{
"epoch": 0.6568721680626116,
"grad_norm": 0.33777105808258057,
"learning_rate": 0.0002603174603174603,
"loss": 0.4599,
"step": 598
},
{
"epoch": 0.6579706165041879,
"grad_norm": 0.5336529612541199,
"learning_rate": 0.00026019536019536015,
"loss": 0.553,
"step": 599
},
{
"epoch": 0.6590690649457641,
"grad_norm": 0.6930931806564331,
"learning_rate": 0.00026007326007326006,
"loss": 0.5686,
"step": 600
},
{
"epoch": 0.6601675133873404,
"grad_norm": 1.1340439319610596,
"learning_rate": 0.0002599511599511599,
"loss": 0.5886,
"step": 601
},
{
"epoch": 0.6612659618289166,
"grad_norm": 0.9833797812461853,
"learning_rate": 0.0002598290598290598,
"loss": 0.7109,
"step": 602
},
{
"epoch": 0.662364410270493,
"grad_norm": 0.9305315017700195,
"learning_rate": 0.0002597069597069597,
"loss": 0.8341,
"step": 603
},
{
"epoch": 0.6634628587120692,
"grad_norm": 0.9753265380859375,
"learning_rate": 0.00025958485958485955,
"loss": 0.7102,
"step": 604
},
{
"epoch": 0.6645613071536455,
"grad_norm": 2.2342822551727295,
"learning_rate": 0.00025946275946275946,
"loss": 0.6784,
"step": 605
},
{
"epoch": 0.6656597555952217,
"grad_norm": 0.6815157532691956,
"learning_rate": 0.0002593406593406593,
"loss": 0.7689,
"step": 606
},
{
"epoch": 0.666758204036798,
"grad_norm": 0.7792591452598572,
"learning_rate": 0.0002592185592185592,
"loss": 0.9444,
"step": 607
},
{
"epoch": 0.6678566524783743,
"grad_norm": 0.668251097202301,
"learning_rate": 0.0002590964590964591,
"loss": 0.6899,
"step": 608
},
{
"epoch": 0.6689551009199506,
"grad_norm": 0.5041349530220032,
"learning_rate": 0.00025897435897435895,
"loss": 0.652,
"step": 609
},
{
"epoch": 0.6700535493615268,
"grad_norm": 0.35069939494132996,
"learning_rate": 0.0002588522588522588,
"loss": 0.8102,
"step": 610
},
{
"epoch": 0.6711519978031031,
"grad_norm": 3.324793577194214,
"learning_rate": 0.0002587301587301587,
"loss": 0.7936,
"step": 611
},
{
"epoch": 0.6722504462446794,
"grad_norm": 0.6778903007507324,
"learning_rate": 0.0002586080586080586,
"loss": 0.6258,
"step": 612
},
{
"epoch": 0.6733488946862557,
"grad_norm": 3.034745454788208,
"learning_rate": 0.00025848595848595844,
"loss": 0.697,
"step": 613
},
{
"epoch": 0.6744473431278319,
"grad_norm": 2.563870429992676,
"learning_rate": 0.00025836385836385835,
"loss": 0.7596,
"step": 614
},
{
"epoch": 0.6755457915694082,
"grad_norm": 0.45592913031578064,
"learning_rate": 0.0002582417582417582,
"loss": 0.7753,
"step": 615
},
{
"epoch": 0.6766442400109844,
"grad_norm": 0.7209720015525818,
"learning_rate": 0.00025811965811965807,
"loss": 0.6907,
"step": 616
},
{
"epoch": 0.6777426884525608,
"grad_norm": 0.4611949026584625,
"learning_rate": 0.000257997557997558,
"loss": 0.5896,
"step": 617
},
{
"epoch": 0.678841136894137,
"grad_norm": 1.3885395526885986,
"learning_rate": 0.0002578754578754579,
"loss": 0.6344,
"step": 618
},
{
"epoch": 0.6799395853357133,
"grad_norm": 0.544572651386261,
"learning_rate": 0.00025775335775335775,
"loss": 0.586,
"step": 619
},
{
"epoch": 0.6810380337772896,
"grad_norm": 0.5637034177780151,
"learning_rate": 0.0002576312576312576,
"loss": 0.8284,
"step": 620
},
{
"epoch": 0.6821364822188658,
"grad_norm": 1.170779824256897,
"learning_rate": 0.00025750915750915747,
"loss": 0.8818,
"step": 621
},
{
"epoch": 0.6832349306604422,
"grad_norm": 0.4877263605594635,
"learning_rate": 0.0002573870573870574,
"loss": 0.9179,
"step": 622
},
{
"epoch": 0.6843333791020184,
"grad_norm": 0.6684415340423584,
"learning_rate": 0.00025726495726495724,
"loss": 0.7358,
"step": 623
},
{
"epoch": 0.6854318275435947,
"grad_norm": 0.6679075956344604,
"learning_rate": 0.0002571428571428571,
"loss": 0.6342,
"step": 624
},
{
"epoch": 0.6865302759851709,
"grad_norm": 0.65242600440979,
"learning_rate": 0.000257020757020757,
"loss": 0.4762,
"step": 625
},
{
"epoch": 0.6876287244267473,
"grad_norm": 0.806523859500885,
"learning_rate": 0.00025689865689865687,
"loss": 0.7621,
"step": 626
},
{
"epoch": 0.6887271728683235,
"grad_norm": 1.09652578830719,
"learning_rate": 0.0002567765567765567,
"loss": 0.6594,
"step": 627
},
{
"epoch": 0.6898256213098998,
"grad_norm": 0.412505179643631,
"learning_rate": 0.00025665445665445664,
"loss": 0.8026,
"step": 628
},
{
"epoch": 0.690924069751476,
"grad_norm": 0.5801676511764526,
"learning_rate": 0.0002565323565323565,
"loss": 0.7026,
"step": 629
},
{
"epoch": 0.6920225181930523,
"grad_norm": 0.6822883486747742,
"learning_rate": 0.00025641025641025636,
"loss": 0.4372,
"step": 630
},
{
"epoch": 0.6931209666346286,
"grad_norm": 0.3455508351325989,
"learning_rate": 0.00025628815628815627,
"loss": 0.5624,
"step": 631
},
{
"epoch": 0.6942194150762049,
"grad_norm": 0.3533216714859009,
"learning_rate": 0.0002561660561660562,
"loss": 0.7493,
"step": 632
},
{
"epoch": 0.6953178635177811,
"grad_norm": 1.4306656122207642,
"learning_rate": 0.000256043956043956,
"loss": 0.7537,
"step": 633
},
{
"epoch": 0.6964163119593574,
"grad_norm": 0.336393266916275,
"learning_rate": 0.0002559218559218559,
"loss": 0.787,
"step": 634
},
{
"epoch": 0.6975147604009336,
"grad_norm": 0.5303547382354736,
"learning_rate": 0.0002557997557997558,
"loss": 0.5604,
"step": 635
},
{
"epoch": 0.69861320884251,
"grad_norm": 0.5421821475028992,
"learning_rate": 0.00025567765567765567,
"loss": 0.6905,
"step": 636
},
{
"epoch": 0.6997116572840862,
"grad_norm": 0.5445061922073364,
"learning_rate": 0.00025555555555555553,
"loss": 0.6389,
"step": 637
},
{
"epoch": 0.7008101057256625,
"grad_norm": 0.42832881212234497,
"learning_rate": 0.00025543345543345544,
"loss": 0.7825,
"step": 638
},
{
"epoch": 0.7019085541672387,
"grad_norm": 1.4624862670898438,
"learning_rate": 0.0002553113553113553,
"loss": 0.4964,
"step": 639
},
{
"epoch": 0.7030070026088151,
"grad_norm": 0.38657426834106445,
"learning_rate": 0.00025518925518925516,
"loss": 0.5299,
"step": 640
},
{
"epoch": 0.7041054510503914,
"grad_norm": 14.422834396362305,
"learning_rate": 0.00025506715506715507,
"loss": 0.5008,
"step": 641
},
{
"epoch": 0.7052038994919676,
"grad_norm": 0.591106653213501,
"learning_rate": 0.00025494505494505493,
"loss": 0.6732,
"step": 642
},
{
"epoch": 0.7063023479335439,
"grad_norm": 1.6697375774383545,
"learning_rate": 0.0002548229548229548,
"loss": 0.6782,
"step": 643
},
{
"epoch": 0.7074007963751201,
"grad_norm": 1.670777678489685,
"learning_rate": 0.0002547008547008547,
"loss": 0.5275,
"step": 644
},
{
"epoch": 0.7084992448166965,
"grad_norm": 2.3361563682556152,
"learning_rate": 0.00025457875457875456,
"loss": 0.4177,
"step": 645
},
{
"epoch": 0.7095976932582727,
"grad_norm": 1.823844313621521,
"learning_rate": 0.0002544566544566544,
"loss": 0.5438,
"step": 646
},
{
"epoch": 0.710696141699849,
"grad_norm": 0.5374146699905396,
"learning_rate": 0.0002543345543345543,
"loss": 0.6704,
"step": 647
},
{
"epoch": 0.7117945901414252,
"grad_norm": 0.9709361791610718,
"learning_rate": 0.0002542124542124542,
"loss": 0.8896,
"step": 648
},
{
"epoch": 0.7128930385830015,
"grad_norm": 0.7118197083473206,
"learning_rate": 0.0002540903540903541,
"loss": 0.766,
"step": 649
},
{
"epoch": 0.7139914870245778,
"grad_norm": 0.4597225487232208,
"learning_rate": 0.00025396825396825396,
"loss": 0.7498,
"step": 650
},
{
"epoch": 0.7150899354661541,
"grad_norm": 0.9708977937698364,
"learning_rate": 0.0002538461538461538,
"loss": 0.7602,
"step": 651
},
{
"epoch": 0.7161883839077303,
"grad_norm": 0.8156960606575012,
"learning_rate": 0.00025372405372405373,
"loss": 1.1105,
"step": 652
},
{
"epoch": 0.7172868323493066,
"grad_norm": 1.4135644435882568,
"learning_rate": 0.0002536019536019536,
"loss": 0.9203,
"step": 653
},
{
"epoch": 0.7183852807908829,
"grad_norm": 0.5754226446151733,
"learning_rate": 0.00025347985347985344,
"loss": 0.5368,
"step": 654
},
{
"epoch": 0.7194837292324592,
"grad_norm": 1.7644588947296143,
"learning_rate": 0.00025335775335775336,
"loss": 0.6451,
"step": 655
},
{
"epoch": 0.7205821776740354,
"grad_norm": 4.35576868057251,
"learning_rate": 0.0002532356532356532,
"loss": 0.6732,
"step": 656
},
{
"epoch": 0.7216806261156117,
"grad_norm": 1.1072558164596558,
"learning_rate": 0.0002531135531135531,
"loss": 0.7901,
"step": 657
},
{
"epoch": 0.7227790745571879,
"grad_norm": 0.3916113078594208,
"learning_rate": 0.000252991452991453,
"loss": 0.7153,
"step": 658
},
{
"epoch": 0.7238775229987643,
"grad_norm": 1.055137276649475,
"learning_rate": 0.00025286935286935285,
"loss": 0.8664,
"step": 659
},
{
"epoch": 0.7249759714403405,
"grad_norm": 0.5966087579727173,
"learning_rate": 0.0002527472527472527,
"loss": 0.933,
"step": 660
},
{
"epoch": 0.7260744198819168,
"grad_norm": 0.40958529710769653,
"learning_rate": 0.0002526251526251526,
"loss": 0.7196,
"step": 661
},
{
"epoch": 0.727172868323493,
"grad_norm": 0.4636710584163666,
"learning_rate": 0.0002525030525030525,
"loss": 0.7039,
"step": 662
},
{
"epoch": 0.7282713167650693,
"grad_norm": 0.6967337131500244,
"learning_rate": 0.0002523809523809524,
"loss": 0.8981,
"step": 663
},
{
"epoch": 0.7293697652066456,
"grad_norm": 0.49781784415245056,
"learning_rate": 0.00025225885225885225,
"loss": 0.7239,
"step": 664
},
{
"epoch": 0.7304682136482219,
"grad_norm": 0.940851628780365,
"learning_rate": 0.0002521367521367521,
"loss": 0.8199,
"step": 665
},
{
"epoch": 0.7315666620897981,
"grad_norm": 1.0271226167678833,
"learning_rate": 0.000252014652014652,
"loss": 0.6757,
"step": 666
},
{
"epoch": 0.7326651105313744,
"grad_norm": 0.5299912095069885,
"learning_rate": 0.0002518925518925519,
"loss": 0.8464,
"step": 667
},
{
"epoch": 0.7337635589729508,
"grad_norm": 0.7060052156448364,
"learning_rate": 0.00025177045177045173,
"loss": 0.6541,
"step": 668
},
{
"epoch": 0.734862007414527,
"grad_norm": 0.5419691205024719,
"learning_rate": 0.00025164835164835165,
"loss": 0.8741,
"step": 669
},
{
"epoch": 0.7359604558561033,
"grad_norm": 0.6363463401794434,
"learning_rate": 0.0002515262515262515,
"loss": 0.7224,
"step": 670
},
{
"epoch": 0.7370589042976795,
"grad_norm": 0.7622922658920288,
"learning_rate": 0.00025140415140415136,
"loss": 0.9402,
"step": 671
},
{
"epoch": 0.7381573527392558,
"grad_norm": 0.7477490305900574,
"learning_rate": 0.0002512820512820513,
"loss": 0.6036,
"step": 672
},
{
"epoch": 0.7392558011808321,
"grad_norm": 0.4813562333583832,
"learning_rate": 0.00025115995115995113,
"loss": 0.5982,
"step": 673
},
{
"epoch": 0.7403542496224084,
"grad_norm": 3.112766981124878,
"learning_rate": 0.000251037851037851,
"loss": 0.5825,
"step": 674
},
{
"epoch": 0.7414526980639846,
"grad_norm": 0.9523088932037354,
"learning_rate": 0.0002509157509157509,
"loss": 0.5698,
"step": 675
},
{
"epoch": 0.7425511465055609,
"grad_norm": 0.3426001965999603,
"learning_rate": 0.00025079365079365076,
"loss": 0.5516,
"step": 676
},
{
"epoch": 0.7436495949471371,
"grad_norm": 0.4866350591182709,
"learning_rate": 0.0002506715506715506,
"loss": 0.5466,
"step": 677
},
{
"epoch": 0.7447480433887135,
"grad_norm": 0.6590595245361328,
"learning_rate": 0.00025054945054945053,
"loss": 0.7579,
"step": 678
},
{
"epoch": 0.7458464918302897,
"grad_norm": 0.36733704805374146,
"learning_rate": 0.0002504273504273504,
"loss": 0.5114,
"step": 679
},
{
"epoch": 0.746944940271866,
"grad_norm": 0.5890951156616211,
"learning_rate": 0.0002503052503052503,
"loss": 0.7196,
"step": 680
},
{
"epoch": 0.7480433887134422,
"grad_norm": 0.8393438458442688,
"learning_rate": 0.00025018315018315016,
"loss": 0.6291,
"step": 681
},
{
"epoch": 0.7491418371550186,
"grad_norm": 0.9745636582374573,
"learning_rate": 0.00025006105006105,
"loss": 0.8675,
"step": 682
},
{
"epoch": 0.7502402855965948,
"grad_norm": 1.1764310598373413,
"learning_rate": 0.00024993894993894993,
"loss": 0.9384,
"step": 683
},
{
"epoch": 0.7513387340381711,
"grad_norm": 0.6199970245361328,
"learning_rate": 0.0002498168498168498,
"loss": 0.5984,
"step": 684
},
{
"epoch": 0.7524371824797473,
"grad_norm": 2.2708802223205566,
"learning_rate": 0.00024969474969474965,
"loss": 0.7867,
"step": 685
},
{
"epoch": 0.7535356309213236,
"grad_norm": 0.6731462478637695,
"learning_rate": 0.00024957264957264956,
"loss": 0.5377,
"step": 686
},
{
"epoch": 0.7546340793629,
"grad_norm": 0.991669774055481,
"learning_rate": 0.0002494505494505494,
"loss": 0.7015,
"step": 687
},
{
"epoch": 0.7557325278044762,
"grad_norm": 0.5873506665229797,
"learning_rate": 0.0002493284493284493,
"loss": 0.567,
"step": 688
},
{
"epoch": 0.7568309762460524,
"grad_norm": 1.5025473833084106,
"learning_rate": 0.0002492063492063492,
"loss": 0.6264,
"step": 689
},
{
"epoch": 0.7579294246876287,
"grad_norm": 0.4942665696144104,
"learning_rate": 0.00024908424908424905,
"loss": 0.7623,
"step": 690
},
{
"epoch": 0.7590278731292049,
"grad_norm": 0.5522105693817139,
"learning_rate": 0.0002489621489621489,
"loss": 0.6192,
"step": 691
},
{
"epoch": 0.7601263215707813,
"grad_norm": 1.25243079662323,
"learning_rate": 0.0002488400488400488,
"loss": 0.8547,
"step": 692
},
{
"epoch": 0.7612247700123576,
"grad_norm": 0.5228685140609741,
"learning_rate": 0.00024871794871794874,
"loss": 0.7365,
"step": 693
},
{
"epoch": 0.7623232184539338,
"grad_norm": 1.5090827941894531,
"learning_rate": 0.0002485958485958486,
"loss": 0.9226,
"step": 694
},
{
"epoch": 0.76342166689551,
"grad_norm": 3.3617379665374756,
"learning_rate": 0.00024847374847374845,
"loss": 0.7942,
"step": 695
},
{
"epoch": 0.7645201153370864,
"grad_norm": 0.5350137948989868,
"learning_rate": 0.0002483516483516483,
"loss": 0.6254,
"step": 696
},
{
"epoch": 0.7656185637786627,
"grad_norm": 0.8871312141418457,
"learning_rate": 0.0002482295482295482,
"loss": 0.8241,
"step": 697
},
{
"epoch": 0.7667170122202389,
"grad_norm": 0.48593926429748535,
"learning_rate": 0.0002481074481074481,
"loss": 0.5707,
"step": 698
},
{
"epoch": 0.7678154606618152,
"grad_norm": 0.7460000514984131,
"learning_rate": 0.00024798534798534794,
"loss": 0.9521,
"step": 699
},
{
"epoch": 0.7689139091033914,
"grad_norm": 0.7105034589767456,
"learning_rate": 0.00024786324786324785,
"loss": 0.7513,
"step": 700
},
{
"epoch": 0.7700123575449678,
"grad_norm": 0.40251481533050537,
"learning_rate": 0.0002477411477411477,
"loss": 0.6067,
"step": 701
},
{
"epoch": 0.771110805986544,
"grad_norm": 0.452709436416626,
"learning_rate": 0.00024761904761904757,
"loss": 0.671,
"step": 702
},
{
"epoch": 0.7722092544281203,
"grad_norm": 0.581453263759613,
"learning_rate": 0.0002474969474969475,
"loss": 0.5356,
"step": 703
},
{
"epoch": 0.7733077028696965,
"grad_norm": 0.8013669848442078,
"learning_rate": 0.00024737484737484734,
"loss": 0.6889,
"step": 704
},
{
"epoch": 0.7744061513112728,
"grad_norm": 1.1480565071105957,
"learning_rate": 0.0002472527472527472,
"loss": 0.7456,
"step": 705
},
{
"epoch": 0.7755045997528491,
"grad_norm": 0.7568329572677612,
"learning_rate": 0.0002471306471306471,
"loss": 0.7455,
"step": 706
},
{
"epoch": 0.7766030481944254,
"grad_norm": 0.4223226308822632,
"learning_rate": 0.000247008547008547,
"loss": 0.7138,
"step": 707
},
{
"epoch": 0.7777014966360016,
"grad_norm": 0.372872531414032,
"learning_rate": 0.00024688644688644683,
"loss": 0.8037,
"step": 708
},
{
"epoch": 0.7787999450775779,
"grad_norm": 0.968614399433136,
"learning_rate": 0.00024676434676434674,
"loss": 0.5943,
"step": 709
},
{
"epoch": 0.7798983935191542,
"grad_norm": 0.801157534122467,
"learning_rate": 0.00024664224664224665,
"loss": 0.9467,
"step": 710
},
{
"epoch": 0.7809968419607305,
"grad_norm": 0.7115808129310608,
"learning_rate": 0.0002465201465201465,
"loss": 0.7828,
"step": 711
},
{
"epoch": 0.7820952904023067,
"grad_norm": 1.2951349020004272,
"learning_rate": 0.00024639804639804637,
"loss": 0.6221,
"step": 712
},
{
"epoch": 0.783193738843883,
"grad_norm": 0.47706693410873413,
"learning_rate": 0.0002462759462759463,
"loss": 0.3641,
"step": 713
},
{
"epoch": 0.7842921872854592,
"grad_norm": 0.8871097564697266,
"learning_rate": 0.00024615384615384614,
"loss": 0.6177,
"step": 714
},
{
"epoch": 0.7853906357270356,
"grad_norm": 0.7920973896980286,
"learning_rate": 0.000246031746031746,
"loss": 0.5858,
"step": 715
},
{
"epoch": 0.7864890841686119,
"grad_norm": 0.49732694029808044,
"learning_rate": 0.0002459096459096459,
"loss": 0.5176,
"step": 716
},
{
"epoch": 0.7875875326101881,
"grad_norm": 0.34965720772743225,
"learning_rate": 0.00024578754578754577,
"loss": 0.4983,
"step": 717
},
{
"epoch": 0.7886859810517644,
"grad_norm": 0.45963025093078613,
"learning_rate": 0.00024566544566544563,
"loss": 0.7756,
"step": 718
},
{
"epoch": 0.7897844294933407,
"grad_norm": 0.5802373290061951,
"learning_rate": 0.00024554334554334554,
"loss": 0.5773,
"step": 719
},
{
"epoch": 0.790882877934917,
"grad_norm": 1.8482742309570312,
"learning_rate": 0.0002454212454212454,
"loss": 0.7978,
"step": 720
},
{
"epoch": 0.7919813263764932,
"grad_norm": 0.5821959972381592,
"learning_rate": 0.00024529914529914526,
"loss": 0.7483,
"step": 721
},
{
"epoch": 0.7930797748180695,
"grad_norm": 0.9352701306343079,
"learning_rate": 0.0002451770451770451,
"loss": 0.6979,
"step": 722
},
{
"epoch": 0.7941782232596457,
"grad_norm": 0.554032564163208,
"learning_rate": 0.00024505494505494503,
"loss": 0.6773,
"step": 723
},
{
"epoch": 0.7952766717012221,
"grad_norm": 0.6914504766464233,
"learning_rate": 0.00024493284493284494,
"loss": 0.6548,
"step": 724
},
{
"epoch": 0.7963751201427983,
"grad_norm": 0.40804949402809143,
"learning_rate": 0.0002448107448107448,
"loss": 0.4634,
"step": 725
},
{
"epoch": 0.7974735685843746,
"grad_norm": 0.4965716302394867,
"learning_rate": 0.00024468864468864466,
"loss": 0.4879,
"step": 726
},
{
"epoch": 0.7985720170259508,
"grad_norm": 0.48798999190330505,
"learning_rate": 0.00024456654456654457,
"loss": 0.7003,
"step": 727
},
{
"epoch": 0.7996704654675271,
"grad_norm": 0.6946013569831848,
"learning_rate": 0.00024444444444444443,
"loss": 0.7508,
"step": 728
},
{
"epoch": 0.8007689139091034,
"grad_norm": 0.4310678243637085,
"learning_rate": 0.0002443223443223443,
"loss": 0.5765,
"step": 729
},
{
"epoch": 0.8018673623506797,
"grad_norm": 0.5407636761665344,
"learning_rate": 0.0002442002442002442,
"loss": 0.5445,
"step": 730
},
{
"epoch": 0.8029658107922559,
"grad_norm": 0.6281490921974182,
"learning_rate": 0.00024407814407814403,
"loss": 0.9319,
"step": 731
},
{
"epoch": 0.8040642592338322,
"grad_norm": 1.2027008533477783,
"learning_rate": 0.00024395604395604394,
"loss": 0.3957,
"step": 732
},
{
"epoch": 0.8051627076754085,
"grad_norm": 0.543230414390564,
"learning_rate": 0.00024383394383394383,
"loss": 0.7919,
"step": 733
},
{
"epoch": 0.8062611561169848,
"grad_norm": 0.4269828498363495,
"learning_rate": 0.0002437118437118437,
"loss": 0.6081,
"step": 734
},
{
"epoch": 0.807359604558561,
"grad_norm": 1.2857966423034668,
"learning_rate": 0.00024358974358974357,
"loss": 0.8654,
"step": 735
},
{
"epoch": 0.8084580530001373,
"grad_norm": 0.6370485424995422,
"learning_rate": 0.00024346764346764346,
"loss": 0.8053,
"step": 736
},
{
"epoch": 0.8095565014417135,
"grad_norm": 1.1288559436798096,
"learning_rate": 0.00024334554334554332,
"loss": 0.8709,
"step": 737
},
{
"epoch": 0.8106549498832899,
"grad_norm": 0.5601497292518616,
"learning_rate": 0.0002432234432234432,
"loss": 0.7982,
"step": 738
},
{
"epoch": 0.8117533983248661,
"grad_norm": 0.476745069026947,
"learning_rate": 0.0002431013431013431,
"loss": 0.7372,
"step": 739
},
{
"epoch": 0.8128518467664424,
"grad_norm": 0.4287762939929962,
"learning_rate": 0.00024297924297924295,
"loss": 0.5686,
"step": 740
},
{
"epoch": 0.8139502952080186,
"grad_norm": 0.7039306163787842,
"learning_rate": 0.00024285714285714283,
"loss": 0.7976,
"step": 741
},
{
"epoch": 0.8150487436495949,
"grad_norm": 0.47433528304100037,
"learning_rate": 0.00024273504273504272,
"loss": 0.6375,
"step": 742
},
{
"epoch": 0.8161471920911713,
"grad_norm": 0.5443944931030273,
"learning_rate": 0.00024261294261294258,
"loss": 0.6793,
"step": 743
},
{
"epoch": 0.8172456405327475,
"grad_norm": 0.516094982624054,
"learning_rate": 0.00024249084249084246,
"loss": 0.785,
"step": 744
},
{
"epoch": 0.8183440889743238,
"grad_norm": 0.6694304347038269,
"learning_rate": 0.00024236874236874237,
"loss": 0.5431,
"step": 745
},
{
"epoch": 0.8194425374159,
"grad_norm": 0.5309669375419617,
"learning_rate": 0.00024224664224664223,
"loss": 0.5806,
"step": 746
},
{
"epoch": 0.8205409858574764,
"grad_norm": 0.5502971410751343,
"learning_rate": 0.00024212454212454212,
"loss": 0.5053,
"step": 747
},
{
"epoch": 0.8216394342990526,
"grad_norm": 0.5242869853973389,
"learning_rate": 0.00024200244200244198,
"loss": 0.8189,
"step": 748
},
{
"epoch": 0.8227378827406289,
"grad_norm": 0.4131311774253845,
"learning_rate": 0.00024188034188034186,
"loss": 0.7074,
"step": 749
},
{
"epoch": 0.8238363311822051,
"grad_norm": 0.599915087223053,
"learning_rate": 0.00024175824175824175,
"loss": 0.9408,
"step": 750
},
{
"epoch": 0.8249347796237814,
"grad_norm": 0.3683515191078186,
"learning_rate": 0.0002416361416361416,
"loss": 0.6675,
"step": 751
},
{
"epoch": 0.8260332280653577,
"grad_norm": 1.633415699005127,
"learning_rate": 0.0002415140415140415,
"loss": 0.6768,
"step": 752
},
{
"epoch": 0.827131676506934,
"grad_norm": 0.3848377764225006,
"learning_rate": 0.00024139194139194138,
"loss": 0.485,
"step": 753
},
{
"epoch": 0.8282301249485102,
"grad_norm": 0.4116027355194092,
"learning_rate": 0.00024126984126984123,
"loss": 0.8253,
"step": 754
},
{
"epoch": 0.8293285733900865,
"grad_norm": 0.5805407762527466,
"learning_rate": 0.00024114774114774112,
"loss": 0.825,
"step": 755
},
{
"epoch": 0.8304270218316627,
"grad_norm": 1.2401742935180664,
"learning_rate": 0.000241025641025641,
"loss": 0.6394,
"step": 756
},
{
"epoch": 0.8315254702732391,
"grad_norm": 0.42345038056373596,
"learning_rate": 0.00024090354090354086,
"loss": 0.6958,
"step": 757
},
{
"epoch": 0.8326239187148153,
"grad_norm": 1.3758116960525513,
"learning_rate": 0.00024078144078144075,
"loss": 0.6997,
"step": 758
},
{
"epoch": 0.8337223671563916,
"grad_norm": 1.1826672554016113,
"learning_rate": 0.00024065934065934066,
"loss": 0.7908,
"step": 759
},
{
"epoch": 0.8348208155979678,
"grad_norm": 1.0752373933792114,
"learning_rate": 0.0002405372405372405,
"loss": 0.8896,
"step": 760
},
{
"epoch": 0.8359192640395442,
"grad_norm": 0.3347112834453583,
"learning_rate": 0.0002404151404151404,
"loss": 0.8202,
"step": 761
},
{
"epoch": 0.8370177124811204,
"grad_norm": 0.5837082266807556,
"learning_rate": 0.0002402930402930403,
"loss": 0.7502,
"step": 762
},
{
"epoch": 0.8381161609226967,
"grad_norm": 0.5439388751983643,
"learning_rate": 0.00024017094017094015,
"loss": 0.6928,
"step": 763
},
{
"epoch": 0.839214609364273,
"grad_norm": 0.35348060727119446,
"learning_rate": 0.00024004884004884004,
"loss": 0.5495,
"step": 764
},
{
"epoch": 0.8403130578058492,
"grad_norm": 0.4943974018096924,
"learning_rate": 0.00023992673992673992,
"loss": 0.9218,
"step": 765
},
{
"epoch": 0.8414115062474256,
"grad_norm": 0.628667414188385,
"learning_rate": 0.00023980463980463978,
"loss": 0.6266,
"step": 766
},
{
"epoch": 0.8425099546890018,
"grad_norm": 0.822575032711029,
"learning_rate": 0.00023968253968253966,
"loss": 0.791,
"step": 767
},
{
"epoch": 0.843608403130578,
"grad_norm": 0.3044184446334839,
"learning_rate": 0.00023956043956043955,
"loss": 0.6048,
"step": 768
},
{
"epoch": 0.8447068515721543,
"grad_norm": 0.40807369351387024,
"learning_rate": 0.0002394383394383394,
"loss": 0.6286,
"step": 769
},
{
"epoch": 0.8458053000137306,
"grad_norm": 1.2373838424682617,
"learning_rate": 0.0002393162393162393,
"loss": 0.5133,
"step": 770
},
{
"epoch": 0.8469037484553069,
"grad_norm": 0.5104987025260925,
"learning_rate": 0.00023919413919413918,
"loss": 0.591,
"step": 771
},
{
"epoch": 0.8480021968968832,
"grad_norm": 0.6644220352172852,
"learning_rate": 0.00023907203907203904,
"loss": 0.7039,
"step": 772
},
{
"epoch": 0.8491006453384594,
"grad_norm": 0.5887960195541382,
"learning_rate": 0.00023894993894993892,
"loss": 0.7017,
"step": 773
},
{
"epoch": 0.8501990937800357,
"grad_norm": 0.6568577885627747,
"learning_rate": 0.00023882783882783878,
"loss": 0.6131,
"step": 774
},
{
"epoch": 0.851297542221612,
"grad_norm": 0.6594721674919128,
"learning_rate": 0.00023870573870573867,
"loss": 0.6079,
"step": 775
},
{
"epoch": 0.8523959906631883,
"grad_norm": 12.29937744140625,
"learning_rate": 0.00023858363858363858,
"loss": 1.1068,
"step": 776
},
{
"epoch": 0.8534944391047645,
"grad_norm": 1.175355315208435,
"learning_rate": 0.00023846153846153844,
"loss": 0.734,
"step": 777
},
{
"epoch": 0.8545928875463408,
"grad_norm": 1.7128019332885742,
"learning_rate": 0.00023833943833943832,
"loss": 0.6395,
"step": 778
},
{
"epoch": 0.855691335987917,
"grad_norm": 0.6479717493057251,
"learning_rate": 0.0002382173382173382,
"loss": 0.8572,
"step": 779
},
{
"epoch": 0.8567897844294934,
"grad_norm": 0.9646544456481934,
"learning_rate": 0.00023809523809523807,
"loss": 1.1168,
"step": 780
},
{
"epoch": 0.8578882328710696,
"grad_norm": 0.8290930986404419,
"learning_rate": 0.00023797313797313795,
"loss": 0.4413,
"step": 781
},
{
"epoch": 0.8589866813126459,
"grad_norm": 0.6690389513969421,
"learning_rate": 0.00023785103785103784,
"loss": 1.1878,
"step": 782
},
{
"epoch": 0.8600851297542221,
"grad_norm": 0.6602356433868408,
"learning_rate": 0.0002377289377289377,
"loss": 0.5862,
"step": 783
},
{
"epoch": 0.8611835781957984,
"grad_norm": 0.612316370010376,
"learning_rate": 0.00023760683760683758,
"loss": 0.7971,
"step": 784
},
{
"epoch": 0.8622820266373747,
"grad_norm": 0.7429434657096863,
"learning_rate": 0.00023748473748473747,
"loss": 0.6265,
"step": 785
},
{
"epoch": 0.863380475078951,
"grad_norm": 0.40107640624046326,
"learning_rate": 0.00023736263736263733,
"loss": 0.6697,
"step": 786
},
{
"epoch": 0.8644789235205272,
"grad_norm": 0.45808035135269165,
"learning_rate": 0.0002372405372405372,
"loss": 0.7443,
"step": 787
},
{
"epoch": 0.8655773719621035,
"grad_norm": 0.36327049136161804,
"learning_rate": 0.0002371184371184371,
"loss": 0.6518,
"step": 788
},
{
"epoch": 0.8666758204036799,
"grad_norm": 0.45617833733558655,
"learning_rate": 0.00023699633699633696,
"loss": 0.792,
"step": 789
},
{
"epoch": 0.8677742688452561,
"grad_norm": 0.5354835391044617,
"learning_rate": 0.00023687423687423687,
"loss": 0.7788,
"step": 790
},
{
"epoch": 0.8688727172868324,
"grad_norm": 0.9770327210426331,
"learning_rate": 0.00023675213675213675,
"loss": 0.7267,
"step": 791
},
{
"epoch": 0.8699711657284086,
"grad_norm": 0.646757960319519,
"learning_rate": 0.0002366300366300366,
"loss": 0.7234,
"step": 792
},
{
"epoch": 0.8710696141699849,
"grad_norm": 0.4694693982601166,
"learning_rate": 0.0002365079365079365,
"loss": 0.8261,
"step": 793
},
{
"epoch": 0.8721680626115612,
"grad_norm": 0.9923954606056213,
"learning_rate": 0.00023638583638583638,
"loss": 0.703,
"step": 794
},
{
"epoch": 0.8732665110531375,
"grad_norm": 1.6440534591674805,
"learning_rate": 0.00023626373626373624,
"loss": 0.7654,
"step": 795
},
{
"epoch": 0.8743649594947137,
"grad_norm": 0.3947128653526306,
"learning_rate": 0.00023614163614163613,
"loss": 0.637,
"step": 796
},
{
"epoch": 0.87546340793629,
"grad_norm": 3.4264323711395264,
"learning_rate": 0.000236019536019536,
"loss": 0.7325,
"step": 797
},
{
"epoch": 0.8765618563778662,
"grad_norm": 0.5469256043434143,
"learning_rate": 0.00023589743589743587,
"loss": 0.8203,
"step": 798
},
{
"epoch": 0.8776603048194426,
"grad_norm": 0.5184471011161804,
"learning_rate": 0.00023577533577533576,
"loss": 0.7895,
"step": 799
},
{
"epoch": 0.8787587532610188,
"grad_norm": 0.8231347799301147,
"learning_rate": 0.00023565323565323562,
"loss": 0.7888,
"step": 800
},
{
"epoch": 0.8798572017025951,
"grad_norm": 14.826855659484863,
"learning_rate": 0.0002355311355311355,
"loss": 0.7564,
"step": 801
},
{
"epoch": 0.8809556501441713,
"grad_norm": 0.5809927582740784,
"learning_rate": 0.00023540903540903539,
"loss": 0.6702,
"step": 802
},
{
"epoch": 0.8820540985857477,
"grad_norm": 0.7244674563407898,
"learning_rate": 0.00023528693528693524,
"loss": 0.6475,
"step": 803
},
{
"epoch": 0.8831525470273239,
"grad_norm": 0.8071272373199463,
"learning_rate": 0.00023516483516483513,
"loss": 0.7434,
"step": 804
},
{
"epoch": 0.8842509954689002,
"grad_norm": 0.6872429847717285,
"learning_rate": 0.00023504273504273504,
"loss": 0.5968,
"step": 805
},
{
"epoch": 0.8853494439104764,
"grad_norm": 9.353965759277344,
"learning_rate": 0.00023492063492063487,
"loss": 0.4228,
"step": 806
},
{
"epoch": 0.8864478923520527,
"grad_norm": 0.47151222825050354,
"learning_rate": 0.00023479853479853479,
"loss": 0.6832,
"step": 807
},
{
"epoch": 0.887546340793629,
"grad_norm": 1.4599422216415405,
"learning_rate": 0.00023467643467643467,
"loss": 0.6692,
"step": 808
},
{
"epoch": 0.8886447892352053,
"grad_norm": 0.45811519026756287,
"learning_rate": 0.00023455433455433453,
"loss": 0.787,
"step": 809
},
{
"epoch": 0.8897432376767815,
"grad_norm": 1.077709674835205,
"learning_rate": 0.00023443223443223442,
"loss": 0.6695,
"step": 810
},
{
"epoch": 0.8908416861183578,
"grad_norm": 0.5702061057090759,
"learning_rate": 0.0002343101343101343,
"loss": 0.5858,
"step": 811
},
{
"epoch": 0.891940134559934,
"grad_norm": 2.2391059398651123,
"learning_rate": 0.00023418803418803416,
"loss": 0.6688,
"step": 812
},
{
"epoch": 0.8930385830015104,
"grad_norm": 1.6974279880523682,
"learning_rate": 0.00023406593406593405,
"loss": 0.8545,
"step": 813
},
{
"epoch": 0.8941370314430866,
"grad_norm": 0.983435869216919,
"learning_rate": 0.00023394383394383393,
"loss": 0.8128,
"step": 814
},
{
"epoch": 0.8952354798846629,
"grad_norm": 0.44103240966796875,
"learning_rate": 0.0002338217338217338,
"loss": 0.7968,
"step": 815
},
{
"epoch": 0.8963339283262391,
"grad_norm": 1.0707038640975952,
"learning_rate": 0.00023369963369963367,
"loss": 0.6996,
"step": 816
},
{
"epoch": 0.8974323767678155,
"grad_norm": 0.8029122352600098,
"learning_rate": 0.00023357753357753356,
"loss": 0.7911,
"step": 817
},
{
"epoch": 0.8985308252093918,
"grad_norm": 0.46339499950408936,
"learning_rate": 0.00023345543345543342,
"loss": 0.7712,
"step": 818
},
{
"epoch": 0.899629273650968,
"grad_norm": 1.020947813987732,
"learning_rate": 0.0002333333333333333,
"loss": 0.6865,
"step": 819
},
{
"epoch": 0.9007277220925443,
"grad_norm": 0.5332039594650269,
"learning_rate": 0.00023321123321123322,
"loss": 0.8352,
"step": 820
},
{
"epoch": 0.9018261705341205,
"grad_norm": 0.40052923560142517,
"learning_rate": 0.00023308913308913307,
"loss": 0.5435,
"step": 821
},
{
"epoch": 0.9029246189756969,
"grad_norm": 0.6643521189689636,
"learning_rate": 0.00023296703296703296,
"loss": 0.7406,
"step": 822
},
{
"epoch": 0.9040230674172731,
"grad_norm": 0.7514997720718384,
"learning_rate": 0.00023284493284493285,
"loss": 0.7595,
"step": 823
},
{
"epoch": 0.9051215158588494,
"grad_norm": 0.7124571204185486,
"learning_rate": 0.0002327228327228327,
"loss": 0.5736,
"step": 824
},
{
"epoch": 0.9062199643004256,
"grad_norm": 0.6757075786590576,
"learning_rate": 0.0002326007326007326,
"loss": 0.6275,
"step": 825
},
{
"epoch": 0.9073184127420019,
"grad_norm": 0.4200783669948578,
"learning_rate": 0.00023247863247863245,
"loss": 0.6267,
"step": 826
},
{
"epoch": 0.9084168611835782,
"grad_norm": 0.5442836284637451,
"learning_rate": 0.00023235653235653233,
"loss": 0.6814,
"step": 827
},
{
"epoch": 0.9095153096251545,
"grad_norm": 0.4859601557254791,
"learning_rate": 0.00023223443223443222,
"loss": 0.6451,
"step": 828
},
{
"epoch": 0.9106137580667307,
"grad_norm": 0.7353097200393677,
"learning_rate": 0.00023211233211233208,
"loss": 0.6723,
"step": 829
},
{
"epoch": 0.911712206508307,
"grad_norm": 0.6389304995536804,
"learning_rate": 0.00023199023199023196,
"loss": 0.9429,
"step": 830
},
{
"epoch": 0.9128106549498833,
"grad_norm": 0.6813933849334717,
"learning_rate": 0.00023186813186813185,
"loss": 0.5319,
"step": 831
},
{
"epoch": 0.9139091033914596,
"grad_norm": 0.40023690462112427,
"learning_rate": 0.0002317460317460317,
"loss": 0.5808,
"step": 832
},
{
"epoch": 0.9150075518330358,
"grad_norm": 0.5327205657958984,
"learning_rate": 0.0002316239316239316,
"loss": 0.6666,
"step": 833
},
{
"epoch": 0.9161060002746121,
"grad_norm": 1.672450065612793,
"learning_rate": 0.0002315018315018315,
"loss": 0.7758,
"step": 834
},
{
"epoch": 0.9172044487161883,
"grad_norm": 0.5022990703582764,
"learning_rate": 0.00023137973137973134,
"loss": 0.6309,
"step": 835
},
{
"epoch": 0.9183028971577647,
"grad_norm": 0.43023642897605896,
"learning_rate": 0.00023125763125763125,
"loss": 0.5343,
"step": 836
},
{
"epoch": 0.919401345599341,
"grad_norm": 0.6878641843795776,
"learning_rate": 0.00023113553113553113,
"loss": 0.7268,
"step": 837
},
{
"epoch": 0.9204997940409172,
"grad_norm": 0.40551453828811646,
"learning_rate": 0.000231013431013431,
"loss": 0.5784,
"step": 838
},
{
"epoch": 0.9215982424824934,
"grad_norm": 0.412356436252594,
"learning_rate": 0.00023089133089133088,
"loss": 0.7685,
"step": 839
},
{
"epoch": 0.9226966909240698,
"grad_norm": 1.1603305339813232,
"learning_rate": 0.00023076923076923076,
"loss": 0.518,
"step": 840
},
{
"epoch": 0.9237951393656461,
"grad_norm": 0.6733229756355286,
"learning_rate": 0.00023064713064713062,
"loss": 0.5883,
"step": 841
},
{
"epoch": 0.9248935878072223,
"grad_norm": 0.619434654712677,
"learning_rate": 0.0002305250305250305,
"loss": 0.6244,
"step": 842
},
{
"epoch": 0.9259920362487986,
"grad_norm": 0.6989772319793701,
"learning_rate": 0.0002304029304029304,
"loss": 0.5763,
"step": 843
},
{
"epoch": 0.9270904846903748,
"grad_norm": 0.6276418566703796,
"learning_rate": 0.00023028083028083025,
"loss": 0.4762,
"step": 844
},
{
"epoch": 0.9281889331319512,
"grad_norm": 0.5577360987663269,
"learning_rate": 0.00023015873015873014,
"loss": 0.6254,
"step": 845
},
{
"epoch": 0.9292873815735274,
"grad_norm": 0.6185848116874695,
"learning_rate": 0.00023003663003663002,
"loss": 1.0182,
"step": 846
},
{
"epoch": 0.9303858300151037,
"grad_norm": 1.2415262460708618,
"learning_rate": 0.00022991452991452988,
"loss": 0.4677,
"step": 847
},
{
"epoch": 0.9314842784566799,
"grad_norm": 0.4582594335079193,
"learning_rate": 0.00022979242979242977,
"loss": 0.6308,
"step": 848
},
{
"epoch": 0.9325827268982562,
"grad_norm": 0.4749620258808136,
"learning_rate": 0.00022967032967032962,
"loss": 0.6217,
"step": 849
},
{
"epoch": 0.9336811753398325,
"grad_norm": 0.48614588379859924,
"learning_rate": 0.0002295482295482295,
"loss": 0.7469,
"step": 850
},
{
"epoch": 0.9347796237814088,
"grad_norm": 0.7357453107833862,
"learning_rate": 0.00022942612942612942,
"loss": 0.5978,
"step": 851
},
{
"epoch": 0.935878072222985,
"grad_norm": 0.53326815366745,
"learning_rate": 0.00022930402930402928,
"loss": 0.7678,
"step": 852
},
{
"epoch": 0.9369765206645613,
"grad_norm": 0.4853271245956421,
"learning_rate": 0.00022918192918192917,
"loss": 0.4888,
"step": 853
},
{
"epoch": 0.9380749691061376,
"grad_norm": 1.6529743671417236,
"learning_rate": 0.00022905982905982905,
"loss": 0.6103,
"step": 854
},
{
"epoch": 0.9391734175477139,
"grad_norm": 0.8255143165588379,
"learning_rate": 0.0002289377289377289,
"loss": 0.6977,
"step": 855
},
{
"epoch": 0.9402718659892901,
"grad_norm": 0.3999016284942627,
"learning_rate": 0.0002288156288156288,
"loss": 0.5398,
"step": 856
},
{
"epoch": 0.9413703144308664,
"grad_norm": 1.933090329170227,
"learning_rate": 0.00022869352869352868,
"loss": 1.0827,
"step": 857
},
{
"epoch": 0.9424687628724426,
"grad_norm": 0.8884105682373047,
"learning_rate": 0.00022857142857142854,
"loss": 0.702,
"step": 858
},
{
"epoch": 0.943567211314019,
"grad_norm": 0.4555901885032654,
"learning_rate": 0.00022844932844932843,
"loss": 0.8737,
"step": 859
},
{
"epoch": 0.9446656597555952,
"grad_norm": 0.535915732383728,
"learning_rate": 0.0002283272283272283,
"loss": 0.7036,
"step": 860
},
{
"epoch": 0.9457641081971715,
"grad_norm": 0.7607597708702087,
"learning_rate": 0.00022820512820512817,
"loss": 0.8707,
"step": 861
},
{
"epoch": 0.9468625566387477,
"grad_norm": 0.4056457579135895,
"learning_rate": 0.00022808302808302805,
"loss": 0.6658,
"step": 862
},
{
"epoch": 0.947961005080324,
"grad_norm": 0.5472984313964844,
"learning_rate": 0.00022796092796092794,
"loss": 0.5429,
"step": 863
},
{
"epoch": 0.9490594535219004,
"grad_norm": 0.6866592764854431,
"learning_rate": 0.0002278388278388278,
"loss": 0.7343,
"step": 864
},
{
"epoch": 0.9501579019634766,
"grad_norm": 0.5244406461715698,
"learning_rate": 0.0002277167277167277,
"loss": 0.669,
"step": 865
},
{
"epoch": 0.9512563504050529,
"grad_norm": 0.45024383068084717,
"learning_rate": 0.0002275946275946276,
"loss": 0.9062,
"step": 866
},
{
"epoch": 0.9523547988466291,
"grad_norm": 0.4252873659133911,
"learning_rate": 0.00022747252747252745,
"loss": 0.6109,
"step": 867
},
{
"epoch": 0.9534532472882055,
"grad_norm": 0.50081467628479,
"learning_rate": 0.00022735042735042734,
"loss": 0.5266,
"step": 868
},
{
"epoch": 0.9545516957297817,
"grad_norm": 0.9674072861671448,
"learning_rate": 0.00022722832722832723,
"loss": 0.7197,
"step": 869
},
{
"epoch": 0.955650144171358,
"grad_norm": 1.572348952293396,
"learning_rate": 0.00022710622710622708,
"loss": 0.4728,
"step": 870
},
{
"epoch": 0.9567485926129342,
"grad_norm": 0.6033158898353577,
"learning_rate": 0.00022698412698412697,
"loss": 0.6394,
"step": 871
},
{
"epoch": 0.9578470410545105,
"grad_norm": 0.5810523629188538,
"learning_rate": 0.00022686202686202686,
"loss": 0.8813,
"step": 872
},
{
"epoch": 0.9589454894960868,
"grad_norm": 0.46345213055610657,
"learning_rate": 0.00022673992673992671,
"loss": 0.5828,
"step": 873
},
{
"epoch": 0.9600439379376631,
"grad_norm": 0.5414748191833496,
"learning_rate": 0.0002266178266178266,
"loss": 0.6311,
"step": 874
},
{
"epoch": 0.9611423863792393,
"grad_norm": 0.9083818197250366,
"learning_rate": 0.00022649572649572646,
"loss": 0.961,
"step": 875
},
{
"epoch": 0.9622408348208156,
"grad_norm": 0.786993145942688,
"learning_rate": 0.00022637362637362634,
"loss": 0.7825,
"step": 876
},
{
"epoch": 0.9633392832623918,
"grad_norm": 0.7639968991279602,
"learning_rate": 0.00022625152625152623,
"loss": 0.8989,
"step": 877
},
{
"epoch": 0.9644377317039682,
"grad_norm": 0.43360400199890137,
"learning_rate": 0.0002261294261294261,
"loss": 0.6747,
"step": 878
},
{
"epoch": 0.9655361801455444,
"grad_norm": 0.8512898683547974,
"learning_rate": 0.00022600732600732597,
"loss": 0.7152,
"step": 879
},
{
"epoch": 0.9666346285871207,
"grad_norm": 0.46903684735298157,
"learning_rate": 0.00022588522588522589,
"loss": 0.7594,
"step": 880
},
{
"epoch": 0.9677330770286969,
"grad_norm": 1.9560080766677856,
"learning_rate": 0.00022576312576312572,
"loss": 0.598,
"step": 881
},
{
"epoch": 0.9688315254702733,
"grad_norm": 1.1595470905303955,
"learning_rate": 0.00022564102564102563,
"loss": 0.6005,
"step": 882
},
{
"epoch": 0.9699299739118495,
"grad_norm": 0.7318668365478516,
"learning_rate": 0.00022551892551892551,
"loss": 0.7327,
"step": 883
},
{
"epoch": 0.9710284223534258,
"grad_norm": 0.6557647585868835,
"learning_rate": 0.00022539682539682537,
"loss": 0.5858,
"step": 884
},
{
"epoch": 0.972126870795002,
"grad_norm": 0.5645928382873535,
"learning_rate": 0.00022527472527472526,
"loss": 0.5818,
"step": 885
},
{
"epoch": 0.9732253192365783,
"grad_norm": 0.4630253314971924,
"learning_rate": 0.00022515262515262514,
"loss": 0.8363,
"step": 886
},
{
"epoch": 0.9743237676781547,
"grad_norm": 0.6750912666320801,
"learning_rate": 0.000225030525030525,
"loss": 0.8865,
"step": 887
},
{
"epoch": 0.9754222161197309,
"grad_norm": 0.6309487819671631,
"learning_rate": 0.0002249084249084249,
"loss": 0.5596,
"step": 888
},
{
"epoch": 0.9765206645613072,
"grad_norm": 0.9696050882339478,
"learning_rate": 0.00022478632478632477,
"loss": 0.7752,
"step": 889
},
{
"epoch": 0.9776191130028834,
"grad_norm": 0.7614735960960388,
"learning_rate": 0.00022466422466422463,
"loss": 0.7131,
"step": 890
},
{
"epoch": 0.9787175614444596,
"grad_norm": 0.4971006214618683,
"learning_rate": 0.00022454212454212452,
"loss": 0.6218,
"step": 891
},
{
"epoch": 0.979816009886036,
"grad_norm": 0.47809773683547974,
"learning_rate": 0.0002244200244200244,
"loss": 0.5678,
"step": 892
},
{
"epoch": 0.9809144583276123,
"grad_norm": 0.5959337949752808,
"learning_rate": 0.00022429792429792426,
"loss": 1.0002,
"step": 893
},
{
"epoch": 0.9820129067691885,
"grad_norm": 0.45277753472328186,
"learning_rate": 0.00022417582417582415,
"loss": 0.7321,
"step": 894
},
{
"epoch": 0.9831113552107648,
"grad_norm": 1.279405951499939,
"learning_rate": 0.00022405372405372406,
"loss": 0.7912,
"step": 895
},
{
"epoch": 0.9842098036523411,
"grad_norm": 0.49885687232017517,
"learning_rate": 0.00022393162393162392,
"loss": 0.5558,
"step": 896
},
{
"epoch": 0.9853082520939174,
"grad_norm": 0.474979430437088,
"learning_rate": 0.0002238095238095238,
"loss": 0.7095,
"step": 897
},
{
"epoch": 0.9864067005354936,
"grad_norm": 0.3826389014720917,
"learning_rate": 0.0002236874236874237,
"loss": 0.5695,
"step": 898
},
{
"epoch": 0.9875051489770699,
"grad_norm": 0.33514517545700073,
"learning_rate": 0.00022356532356532355,
"loss": 0.6341,
"step": 899
},
{
"epoch": 0.9886035974186461,
"grad_norm": 0.5049251914024353,
"learning_rate": 0.00022344322344322343,
"loss": 0.5577,
"step": 900
},
{
"epoch": 0.9897020458602225,
"grad_norm": 0.5179988145828247,
"learning_rate": 0.0002233211233211233,
"loss": 0.5769,
"step": 901
},
{
"epoch": 0.9908004943017987,
"grad_norm": 0.5194469094276428,
"learning_rate": 0.00022319902319902318,
"loss": 0.5466,
"step": 902
},
{
"epoch": 0.991898942743375,
"grad_norm": 0.46941491961479187,
"learning_rate": 0.00022307692307692306,
"loss": 0.642,
"step": 903
},
{
"epoch": 0.9929973911849512,
"grad_norm": 0.379682719707489,
"learning_rate": 0.00022295482295482292,
"loss": 0.5508,
"step": 904
},
{
"epoch": 0.9940958396265275,
"grad_norm": 1.3844119310379028,
"learning_rate": 0.0002228327228327228,
"loss": 0.8814,
"step": 905
},
{
"epoch": 0.9951942880681038,
"grad_norm": 2.497697114944458,
"learning_rate": 0.0002227106227106227,
"loss": 0.8116,
"step": 906
},
{
"epoch": 0.9962927365096801,
"grad_norm": 0.36689239740371704,
"learning_rate": 0.00022258852258852255,
"loss": 0.5001,
"step": 907
},
{
"epoch": 0.9973911849512563,
"grad_norm": 0.39868447184562683,
"learning_rate": 0.00022246642246642243,
"loss": 0.6913,
"step": 908
},
{
"epoch": 0.9984896333928326,
"grad_norm": 0.5270336270332336,
"learning_rate": 0.00022234432234432235,
"loss": 0.5401,
"step": 909
},
{
"epoch": 0.999588081834409,
"grad_norm": 0.4079851508140564,
"learning_rate": 0.00022222222222222218,
"loss": 0.471,
"step": 910
},
{
"epoch": 1.000686530275985,
"grad_norm": 0.43189048767089844,
"learning_rate": 0.0002221001221001221,
"loss": 0.8237,
"step": 911
},
{
"epoch": 1.0017849787175614,
"grad_norm": 0.52342289686203,
"learning_rate": 0.00022197802197802198,
"loss": 0.6363,
"step": 912
},
{
"epoch": 1.0028834271591378,
"grad_norm": 0.38078904151916504,
"learning_rate": 0.00022185592185592184,
"loss": 0.4411,
"step": 913
},
{
"epoch": 1.003981875600714,
"grad_norm": 0.5302817821502686,
"learning_rate": 0.00022173382173382172,
"loss": 0.858,
"step": 914
},
{
"epoch": 1.0050803240422903,
"grad_norm": 0.3696751892566681,
"learning_rate": 0.0002216117216117216,
"loss": 0.8766,
"step": 915
},
{
"epoch": 1.0061787724838664,
"grad_norm": 0.7566766738891602,
"learning_rate": 0.00022148962148962146,
"loss": 1.067,
"step": 916
},
{
"epoch": 1.0072772209254428,
"grad_norm": 0.7399318218231201,
"learning_rate": 0.00022136752136752135,
"loss": 0.6683,
"step": 917
},
{
"epoch": 1.0083756693670192,
"grad_norm": 0.5435899496078491,
"learning_rate": 0.00022124542124542124,
"loss": 0.6045,
"step": 918
},
{
"epoch": 1.0094741178085953,
"grad_norm": 0.9680571556091309,
"learning_rate": 0.0002211233211233211,
"loss": 0.7546,
"step": 919
},
{
"epoch": 1.0105725662501717,
"grad_norm": 0.6131067872047424,
"learning_rate": 0.00022100122100122098,
"loss": 0.6655,
"step": 920
},
{
"epoch": 1.0116710146917478,
"grad_norm": 0.8093316555023193,
"learning_rate": 0.00022087912087912086,
"loss": 0.4812,
"step": 921
},
{
"epoch": 1.0127694631333242,
"grad_norm": 0.5077763199806213,
"learning_rate": 0.00022075702075702072,
"loss": 0.5357,
"step": 922
},
{
"epoch": 1.0138679115749005,
"grad_norm": 0.4767695963382721,
"learning_rate": 0.0002206349206349206,
"loss": 0.5807,
"step": 923
},
{
"epoch": 1.0149663600164767,
"grad_norm": 0.3215581178665161,
"learning_rate": 0.00022051282051282052,
"loss": 0.5773,
"step": 924
},
{
"epoch": 1.016064808458053,
"grad_norm": 0.425603985786438,
"learning_rate": 0.00022039072039072035,
"loss": 0.5441,
"step": 925
},
{
"epoch": 1.0171632568996292,
"grad_norm": 0.6131730079650879,
"learning_rate": 0.00022026862026862027,
"loss": 0.856,
"step": 926
},
{
"epoch": 1.0182617053412055,
"grad_norm": 0.5472941398620605,
"learning_rate": 0.00022014652014652012,
"loss": 0.8228,
"step": 927
},
{
"epoch": 1.0193601537827819,
"grad_norm": 0.46728211641311646,
"learning_rate": 0.00022002442002442,
"loss": 0.7615,
"step": 928
},
{
"epoch": 1.020458602224358,
"grad_norm": 0.39919501543045044,
"learning_rate": 0.0002199023199023199,
"loss": 0.709,
"step": 929
},
{
"epoch": 1.0215570506659344,
"grad_norm": 0.564400315284729,
"learning_rate": 0.00021978021978021975,
"loss": 0.5941,
"step": 930
},
{
"epoch": 1.0226554991075107,
"grad_norm": 0.39073804020881653,
"learning_rate": 0.00021965811965811964,
"loss": 0.6386,
"step": 931
},
{
"epoch": 1.0237539475490869,
"grad_norm": 0.3725563585758209,
"learning_rate": 0.00021953601953601952,
"loss": 0.4766,
"step": 932
},
{
"epoch": 1.0248523959906632,
"grad_norm": 1.319197654724121,
"learning_rate": 0.00021941391941391938,
"loss": 0.8465,
"step": 933
},
{
"epoch": 1.0259508444322394,
"grad_norm": 0.5126785635948181,
"learning_rate": 0.00021929181929181927,
"loss": 0.5103,
"step": 934
},
{
"epoch": 1.0270492928738157,
"grad_norm": 0.5401897430419922,
"learning_rate": 0.00021916971916971915,
"loss": 0.5879,
"step": 935
},
{
"epoch": 1.028147741315392,
"grad_norm": 0.47014057636260986,
"learning_rate": 0.000219047619047619,
"loss": 0.658,
"step": 936
},
{
"epoch": 1.0292461897569682,
"grad_norm": 0.49227291345596313,
"learning_rate": 0.0002189255189255189,
"loss": 0.5271,
"step": 937
},
{
"epoch": 1.0303446381985446,
"grad_norm": 0.8186778426170349,
"learning_rate": 0.00021880341880341878,
"loss": 0.6491,
"step": 938
},
{
"epoch": 1.0314430866401207,
"grad_norm": 0.46345674991607666,
"learning_rate": 0.00021868131868131864,
"loss": 0.7935,
"step": 939
},
{
"epoch": 1.032541535081697,
"grad_norm": 1.7300915718078613,
"learning_rate": 0.00021855921855921855,
"loss": 0.516,
"step": 940
},
{
"epoch": 1.0336399835232735,
"grad_norm": 0.5100822448730469,
"learning_rate": 0.00021843711843711844,
"loss": 0.8286,
"step": 941
},
{
"epoch": 1.0347384319648496,
"grad_norm": 0.42278483510017395,
"learning_rate": 0.0002183150183150183,
"loss": 0.7312,
"step": 942
},
{
"epoch": 1.035836880406426,
"grad_norm": 0.42105185985565186,
"learning_rate": 0.00021819291819291818,
"loss": 0.5729,
"step": 943
},
{
"epoch": 1.036935328848002,
"grad_norm": 0.5117312669754028,
"learning_rate": 0.00021807081807081807,
"loss": 0.7688,
"step": 944
},
{
"epoch": 1.0380337772895785,
"grad_norm": 0.4982740879058838,
"learning_rate": 0.00021794871794871793,
"loss": 0.5746,
"step": 945
},
{
"epoch": 1.0391322257311548,
"grad_norm": 0.5181052684783936,
"learning_rate": 0.0002178266178266178,
"loss": 0.8446,
"step": 946
},
{
"epoch": 1.040230674172731,
"grad_norm": 5.104315757751465,
"learning_rate": 0.0002177045177045177,
"loss": 0.9641,
"step": 947
},
{
"epoch": 1.0413291226143073,
"grad_norm": 0.7384645938873291,
"learning_rate": 0.00021758241758241756,
"loss": 0.7168,
"step": 948
},
{
"epoch": 1.0424275710558835,
"grad_norm": 0.4367550313472748,
"learning_rate": 0.00021746031746031744,
"loss": 0.7139,
"step": 949
},
{
"epoch": 1.0435260194974598,
"grad_norm": 0.7332566380500793,
"learning_rate": 0.00021733821733821733,
"loss": 0.7082,
"step": 950
},
{
"epoch": 1.0446244679390362,
"grad_norm": 0.4191775918006897,
"learning_rate": 0.00021721611721611719,
"loss": 0.7986,
"step": 951
},
{
"epoch": 1.0457229163806123,
"grad_norm": 0.33929941058158875,
"learning_rate": 0.00021709401709401707,
"loss": 0.3784,
"step": 952
},
{
"epoch": 1.0468213648221887,
"grad_norm": 0.5255181789398193,
"learning_rate": 0.00021697191697191693,
"loss": 0.5842,
"step": 953
},
{
"epoch": 1.047919813263765,
"grad_norm": 0.5401780605316162,
"learning_rate": 0.00021684981684981681,
"loss": 0.7939,
"step": 954
},
{
"epoch": 1.0490182617053412,
"grad_norm": 0.34873855113983154,
"learning_rate": 0.00021672771672771673,
"loss": 0.7957,
"step": 955
},
{
"epoch": 1.0501167101469175,
"grad_norm": 0.33418160676956177,
"learning_rate": 0.00021660561660561656,
"loss": 0.6037,
"step": 956
},
{
"epoch": 1.0512151585884937,
"grad_norm": 0.3197249174118042,
"learning_rate": 0.00021648351648351647,
"loss": 0.5223,
"step": 957
},
{
"epoch": 1.05231360703007,
"grad_norm": 0.5962835550308228,
"learning_rate": 0.00021636141636141636,
"loss": 0.5213,
"step": 958
},
{
"epoch": 1.0534120554716464,
"grad_norm": 1.3891643285751343,
"learning_rate": 0.00021623931623931622,
"loss": 0.6781,
"step": 959
},
{
"epoch": 1.0545105039132225,
"grad_norm": 0.42117932438850403,
"learning_rate": 0.0002161172161172161,
"loss": 0.6363,
"step": 960
},
{
"epoch": 1.055608952354799,
"grad_norm": 0.4514491558074951,
"learning_rate": 0.00021599511599511599,
"loss": 0.6904,
"step": 961
},
{
"epoch": 1.056707400796375,
"grad_norm": 0.4863387644290924,
"learning_rate": 0.00021587301587301584,
"loss": 0.6595,
"step": 962
},
{
"epoch": 1.0578058492379514,
"grad_norm": 0.6178450584411621,
"learning_rate": 0.00021575091575091573,
"loss": 0.8412,
"step": 963
},
{
"epoch": 1.0589042976795278,
"grad_norm": 0.3728642761707306,
"learning_rate": 0.00021562881562881562,
"loss": 0.629,
"step": 964
},
{
"epoch": 1.060002746121104,
"grad_norm": 0.7554892301559448,
"learning_rate": 0.00021550671550671547,
"loss": 0.5804,
"step": 965
},
{
"epoch": 1.0611011945626803,
"grad_norm": 0.550298273563385,
"learning_rate": 0.00021538461538461536,
"loss": 0.476,
"step": 966
},
{
"epoch": 1.0621996430042564,
"grad_norm": 0.4082244336605072,
"learning_rate": 0.00021526251526251524,
"loss": 0.4001,
"step": 967
},
{
"epoch": 1.0632980914458328,
"grad_norm": 1.2327499389648438,
"learning_rate": 0.0002151404151404151,
"loss": 0.4583,
"step": 968
},
{
"epoch": 1.0643965398874091,
"grad_norm": 0.860550045967102,
"learning_rate": 0.000215018315018315,
"loss": 0.6415,
"step": 969
},
{
"epoch": 1.0654949883289853,
"grad_norm": 0.558860182762146,
"learning_rate": 0.0002148962148962149,
"loss": 0.6215,
"step": 970
},
{
"epoch": 1.0665934367705616,
"grad_norm": 0.7794890403747559,
"learning_rate": 0.00021477411477411476,
"loss": 0.5094,
"step": 971
},
{
"epoch": 1.0676918852121378,
"grad_norm": 0.48574942350387573,
"learning_rate": 0.00021465201465201465,
"loss": 0.7385,
"step": 972
},
{
"epoch": 1.0687903336537141,
"grad_norm": 0.4496791660785675,
"learning_rate": 0.00021452991452991453,
"loss": 0.5036,
"step": 973
},
{
"epoch": 1.0698887820952905,
"grad_norm": 0.5360952615737915,
"learning_rate": 0.0002144078144078144,
"loss": 0.6825,
"step": 974
},
{
"epoch": 1.0709872305368666,
"grad_norm": 0.5783904194831848,
"learning_rate": 0.00021428571428571427,
"loss": 0.6736,
"step": 975
},
{
"epoch": 1.072085678978443,
"grad_norm": 2.290815830230713,
"learning_rate": 0.00021416361416361416,
"loss": 0.696,
"step": 976
},
{
"epoch": 1.0731841274200193,
"grad_norm": 1.3432899713516235,
"learning_rate": 0.00021404151404151402,
"loss": 0.5296,
"step": 977
},
{
"epoch": 1.0742825758615955,
"grad_norm": 0.5308722257614136,
"learning_rate": 0.0002139194139194139,
"loss": 0.6642,
"step": 978
},
{
"epoch": 1.0753810243031718,
"grad_norm": 0.7245768904685974,
"learning_rate": 0.00021379731379731376,
"loss": 0.6811,
"step": 979
},
{
"epoch": 1.076479472744748,
"grad_norm": 0.3873349726200104,
"learning_rate": 0.00021367521367521365,
"loss": 0.8503,
"step": 980
},
{
"epoch": 1.0775779211863243,
"grad_norm": 0.5792405605316162,
"learning_rate": 0.00021355311355311353,
"loss": 0.4543,
"step": 981
},
{
"epoch": 1.0786763696279005,
"grad_norm": 0.6543241143226624,
"learning_rate": 0.0002134310134310134,
"loss": 0.7778,
"step": 982
},
{
"epoch": 1.0797748180694768,
"grad_norm": 0.5572071075439453,
"learning_rate": 0.00021330891330891328,
"loss": 0.8446,
"step": 983
},
{
"epoch": 1.0808732665110532,
"grad_norm": 0.5798014402389526,
"learning_rate": 0.0002131868131868132,
"loss": 0.7461,
"step": 984
},
{
"epoch": 1.0819717149526293,
"grad_norm": 0.8282085657119751,
"learning_rate": 0.00021306471306471302,
"loss": 0.612,
"step": 985
},
{
"epoch": 1.0830701633942057,
"grad_norm": 0.5782580971717834,
"learning_rate": 0.00021294261294261293,
"loss": 0.5506,
"step": 986
},
{
"epoch": 1.084168611835782,
"grad_norm": 0.3826775848865509,
"learning_rate": 0.00021282051282051282,
"loss": 0.7859,
"step": 987
},
{
"epoch": 1.0852670602773582,
"grad_norm": 0.534752368927002,
"learning_rate": 0.00021269841269841268,
"loss": 0.8835,
"step": 988
},
{
"epoch": 1.0863655087189346,
"grad_norm": 0.45931264758110046,
"learning_rate": 0.00021257631257631256,
"loss": 0.6694,
"step": 989
},
{
"epoch": 1.0874639571605107,
"grad_norm": 0.6106250286102295,
"learning_rate": 0.00021245421245421245,
"loss": 0.8274,
"step": 990
},
{
"epoch": 1.088562405602087,
"grad_norm": 0.3704061806201935,
"learning_rate": 0.0002123321123321123,
"loss": 0.7449,
"step": 991
},
{
"epoch": 1.0896608540436634,
"grad_norm": 0.3922840356826782,
"learning_rate": 0.0002122100122100122,
"loss": 0.5845,
"step": 992
},
{
"epoch": 1.0907593024852396,
"grad_norm": 0.48152726888656616,
"learning_rate": 0.00021208791208791208,
"loss": 0.6608,
"step": 993
},
{
"epoch": 1.091857750926816,
"grad_norm": 0.42257216572761536,
"learning_rate": 0.00021196581196581194,
"loss": 0.6379,
"step": 994
},
{
"epoch": 1.092956199368392,
"grad_norm": 0.4746345579624176,
"learning_rate": 0.00021184371184371182,
"loss": 0.6467,
"step": 995
},
{
"epoch": 1.0940546478099684,
"grad_norm": 0.3915644884109497,
"learning_rate": 0.0002117216117216117,
"loss": 0.9699,
"step": 996
},
{
"epoch": 1.0951530962515448,
"grad_norm": 0.5957880020141602,
"learning_rate": 0.00021159951159951157,
"loss": 0.6917,
"step": 997
},
{
"epoch": 1.096251544693121,
"grad_norm": 0.4327985942363739,
"learning_rate": 0.00021147741147741145,
"loss": 0.8091,
"step": 998
},
{
"epoch": 1.0973499931346973,
"grad_norm": 0.42600274085998535,
"learning_rate": 0.00021135531135531136,
"loss": 0.7685,
"step": 999
},
{
"epoch": 1.0984484415762734,
"grad_norm": 0.7165039777755737,
"learning_rate": 0.0002112332112332112,
"loss": 0.8646,
"step": 1000
},
{
"epoch": 1.0995468900178498,
"grad_norm": 0.447652131319046,
"learning_rate": 0.0002111111111111111,
"loss": 0.521,
"step": 1001
},
{
"epoch": 1.1006453384594261,
"grad_norm": 0.3022591769695282,
"learning_rate": 0.000210989010989011,
"loss": 0.6099,
"step": 1002
},
{
"epoch": 1.1017437869010023,
"grad_norm": 0.32764387130737305,
"learning_rate": 0.00021086691086691085,
"loss": 0.5624,
"step": 1003
},
{
"epoch": 1.1028422353425786,
"grad_norm": 0.7301959991455078,
"learning_rate": 0.00021074481074481074,
"loss": 0.6091,
"step": 1004
},
{
"epoch": 1.1039406837841548,
"grad_norm": 0.4734131097793579,
"learning_rate": 0.0002106227106227106,
"loss": 0.6849,
"step": 1005
},
{
"epoch": 1.1050391322257311,
"grad_norm": 0.7214820384979248,
"learning_rate": 0.00021050061050061048,
"loss": 0.789,
"step": 1006
},
{
"epoch": 1.1061375806673075,
"grad_norm": 0.31265702843666077,
"learning_rate": 0.00021037851037851037,
"loss": 0.5176,
"step": 1007
},
{
"epoch": 1.1072360291088836,
"grad_norm": 0.5804157257080078,
"learning_rate": 0.00021025641025641022,
"loss": 1.0152,
"step": 1008
},
{
"epoch": 1.10833447755046,
"grad_norm": 0.3624595105648041,
"learning_rate": 0.0002101343101343101,
"loss": 0.6843,
"step": 1009
},
{
"epoch": 1.1094329259920364,
"grad_norm": 0.5099515318870544,
"learning_rate": 0.00021001221001221,
"loss": 0.5568,
"step": 1010
},
{
"epoch": 1.1105313744336125,
"grad_norm": 0.46201249957084656,
"learning_rate": 0.00020989010989010985,
"loss": 0.5883,
"step": 1011
},
{
"epoch": 1.1116298228751889,
"grad_norm": 0.4493483603000641,
"learning_rate": 0.00020976800976800974,
"loss": 0.8338,
"step": 1012
},
{
"epoch": 1.112728271316765,
"grad_norm": 0.4771614968776703,
"learning_rate": 0.00020964590964590963,
"loss": 0.7251,
"step": 1013
},
{
"epoch": 1.1138267197583414,
"grad_norm": 2.073347806930542,
"learning_rate": 0.00020952380952380948,
"loss": 0.8921,
"step": 1014
},
{
"epoch": 1.1149251681999177,
"grad_norm": 0.435680091381073,
"learning_rate": 0.0002094017094017094,
"loss": 0.5444,
"step": 1015
},
{
"epoch": 1.1160236166414939,
"grad_norm": 0.46824783086776733,
"learning_rate": 0.00020927960927960928,
"loss": 0.5591,
"step": 1016
},
{
"epoch": 1.1171220650830702,
"grad_norm": 0.43938374519348145,
"learning_rate": 0.00020915750915750914,
"loss": 0.7476,
"step": 1017
},
{
"epoch": 1.1182205135246464,
"grad_norm": 0.3620377779006958,
"learning_rate": 0.00020903540903540903,
"loss": 0.5763,
"step": 1018
},
{
"epoch": 1.1193189619662227,
"grad_norm": 0.612406313419342,
"learning_rate": 0.0002089133089133089,
"loss": 0.706,
"step": 1019
},
{
"epoch": 1.120417410407799,
"grad_norm": 0.5045173168182373,
"learning_rate": 0.00020879120879120877,
"loss": 0.6799,
"step": 1020
},
{
"epoch": 1.1215158588493752,
"grad_norm": 0.4815331995487213,
"learning_rate": 0.00020866910866910865,
"loss": 0.8845,
"step": 1021
},
{
"epoch": 1.1226143072909516,
"grad_norm": 0.3756159245967865,
"learning_rate": 0.00020854700854700854,
"loss": 0.5545,
"step": 1022
},
{
"epoch": 1.1237127557325277,
"grad_norm": 0.3184347152709961,
"learning_rate": 0.0002084249084249084,
"loss": 0.5109,
"step": 1023
},
{
"epoch": 1.124811204174104,
"grad_norm": 0.4000808298587799,
"learning_rate": 0.00020830280830280828,
"loss": 0.8363,
"step": 1024
},
{
"epoch": 1.1259096526156804,
"grad_norm": 0.3930743336677551,
"learning_rate": 0.00020818070818070817,
"loss": 0.6183,
"step": 1025
},
{
"epoch": 1.1270081010572566,
"grad_norm": 0.7536817789077759,
"learning_rate": 0.00020805860805860803,
"loss": 0.7511,
"step": 1026
},
{
"epoch": 1.128106549498833,
"grad_norm": 0.5012079477310181,
"learning_rate": 0.00020793650793650791,
"loss": 0.6346,
"step": 1027
},
{
"epoch": 1.129204997940409,
"grad_norm": 0.9914690852165222,
"learning_rate": 0.00020781440781440783,
"loss": 0.5827,
"step": 1028
},
{
"epoch": 1.1303034463819854,
"grad_norm": 0.9096476435661316,
"learning_rate": 0.00020769230769230766,
"loss": 1.0235,
"step": 1029
},
{
"epoch": 1.1314018948235618,
"grad_norm": 0.6668229699134827,
"learning_rate": 0.00020757020757020757,
"loss": 0.741,
"step": 1030
},
{
"epoch": 1.132500343265138,
"grad_norm": 0.3232771158218384,
"learning_rate": 0.0002074481074481074,
"loss": 0.6206,
"step": 1031
},
{
"epoch": 1.1335987917067143,
"grad_norm": 0.278003990650177,
"learning_rate": 0.00020732600732600731,
"loss": 0.5661,
"step": 1032
},
{
"epoch": 1.1346972401482907,
"grad_norm": 1.481213927268982,
"learning_rate": 0.0002072039072039072,
"loss": 0.6422,
"step": 1033
},
{
"epoch": 1.1357956885898668,
"grad_norm": 0.4688512682914734,
"learning_rate": 0.00020708180708180706,
"loss": 0.4163,
"step": 1034
},
{
"epoch": 1.1368941370314432,
"grad_norm": 0.6438425779342651,
"learning_rate": 0.00020695970695970694,
"loss": 0.6241,
"step": 1035
},
{
"epoch": 1.1379925854730193,
"grad_norm": 0.5013176798820496,
"learning_rate": 0.00020683760683760683,
"loss": 0.6273,
"step": 1036
},
{
"epoch": 1.1390910339145957,
"grad_norm": 0.5178597569465637,
"learning_rate": 0.0002067155067155067,
"loss": 0.7489,
"step": 1037
},
{
"epoch": 1.1401894823561718,
"grad_norm": 0.5804840922355652,
"learning_rate": 0.00020659340659340657,
"loss": 0.9142,
"step": 1038
},
{
"epoch": 1.1412879307977482,
"grad_norm": 0.47613444924354553,
"learning_rate": 0.00020647130647130646,
"loss": 0.9531,
"step": 1039
},
{
"epoch": 1.1423863792393245,
"grad_norm": 0.4835624694824219,
"learning_rate": 0.00020634920634920632,
"loss": 0.6349,
"step": 1040
},
{
"epoch": 1.1434848276809007,
"grad_norm": 0.38351112604141235,
"learning_rate": 0.0002062271062271062,
"loss": 0.4726,
"step": 1041
},
{
"epoch": 1.144583276122477,
"grad_norm": 0.5533854365348816,
"learning_rate": 0.0002061050061050061,
"loss": 0.5108,
"step": 1042
},
{
"epoch": 1.1456817245640534,
"grad_norm": 0.4842824637889862,
"learning_rate": 0.00020598290598290595,
"loss": 0.6038,
"step": 1043
},
{
"epoch": 1.1467801730056295,
"grad_norm": 0.552798330783844,
"learning_rate": 0.00020586080586080583,
"loss": 0.8056,
"step": 1044
},
{
"epoch": 1.1478786214472059,
"grad_norm": 0.40466025471687317,
"learning_rate": 0.00020573870573870574,
"loss": 0.6234,
"step": 1045
},
{
"epoch": 1.148977069888782,
"grad_norm": 0.6988784074783325,
"learning_rate": 0.0002056166056166056,
"loss": 0.7721,
"step": 1046
},
{
"epoch": 1.1500755183303584,
"grad_norm": 0.4852863550186157,
"learning_rate": 0.0002054945054945055,
"loss": 0.6074,
"step": 1047
},
{
"epoch": 1.1511739667719347,
"grad_norm": 0.4548696279525757,
"learning_rate": 0.00020537240537240537,
"loss": 0.5592,
"step": 1048
},
{
"epoch": 1.1522724152135109,
"grad_norm": 0.9355410933494568,
"learning_rate": 0.00020525030525030523,
"loss": 0.8618,
"step": 1049
},
{
"epoch": 1.1533708636550872,
"grad_norm": 0.5641398429870605,
"learning_rate": 0.00020512820512820512,
"loss": 0.704,
"step": 1050
},
{
"epoch": 1.1544693120966634,
"grad_norm": 0.48187771439552307,
"learning_rate": 0.000205006105006105,
"loss": 0.6008,
"step": 1051
},
{
"epoch": 1.1555677605382397,
"grad_norm": 0.41609904170036316,
"learning_rate": 0.00020488400488400486,
"loss": 0.8812,
"step": 1052
},
{
"epoch": 1.156666208979816,
"grad_norm": 0.919477105140686,
"learning_rate": 0.00020476190476190475,
"loss": 0.6597,
"step": 1053
},
{
"epoch": 1.1577646574213922,
"grad_norm": 0.5008611083030701,
"learning_rate": 0.0002046398046398046,
"loss": 0.6501,
"step": 1054
},
{
"epoch": 1.1588631058629686,
"grad_norm": 0.39832696318626404,
"learning_rate": 0.0002045177045177045,
"loss": 0.6232,
"step": 1055
},
{
"epoch": 1.159961554304545,
"grad_norm": 0.5290446281433105,
"learning_rate": 0.00020439560439560438,
"loss": 0.6123,
"step": 1056
},
{
"epoch": 1.161060002746121,
"grad_norm": 0.40837669372558594,
"learning_rate": 0.00020427350427350423,
"loss": 0.4989,
"step": 1057
},
{
"epoch": 1.1621584511876974,
"grad_norm": 0.43407055735588074,
"learning_rate": 0.00020415140415140412,
"loss": 0.6961,
"step": 1058
},
{
"epoch": 1.1632568996292736,
"grad_norm": 0.7601787447929382,
"learning_rate": 0.00020402930402930403,
"loss": 0.9308,
"step": 1059
},
{
"epoch": 1.16435534807085,
"grad_norm": 0.452628493309021,
"learning_rate": 0.00020390720390720386,
"loss": 0.6478,
"step": 1060
},
{
"epoch": 1.165453796512426,
"grad_norm": 0.4524000287055969,
"learning_rate": 0.00020378510378510378,
"loss": 0.4499,
"step": 1061
},
{
"epoch": 1.1665522449540024,
"grad_norm": 0.5971822142601013,
"learning_rate": 0.00020366300366300366,
"loss": 0.6402,
"step": 1062
},
{
"epoch": 1.1676506933955788,
"grad_norm": 0.36858659982681274,
"learning_rate": 0.00020354090354090352,
"loss": 0.6511,
"step": 1063
},
{
"epoch": 1.168749141837155,
"grad_norm": 0.47295433282852173,
"learning_rate": 0.0002034188034188034,
"loss": 0.5977,
"step": 1064
},
{
"epoch": 1.1698475902787313,
"grad_norm": 0.4402971565723419,
"learning_rate": 0.0002032967032967033,
"loss": 0.4824,
"step": 1065
},
{
"epoch": 1.1709460387203077,
"grad_norm": 0.3752620816230774,
"learning_rate": 0.00020317460317460315,
"loss": 0.6519,
"step": 1066
},
{
"epoch": 1.1720444871618838,
"grad_norm": 0.45207279920578003,
"learning_rate": 0.00020305250305250303,
"loss": 0.6869,
"step": 1067
},
{
"epoch": 1.1731429356034602,
"grad_norm": 0.4255804121494293,
"learning_rate": 0.00020293040293040292,
"loss": 0.7289,
"step": 1068
},
{
"epoch": 1.1742413840450363,
"grad_norm": 0.48725178837776184,
"learning_rate": 0.00020280830280830278,
"loss": 0.5472,
"step": 1069
},
{
"epoch": 1.1753398324866127,
"grad_norm": 0.37094470858573914,
"learning_rate": 0.00020268620268620266,
"loss": 0.558,
"step": 1070
},
{
"epoch": 1.176438280928189,
"grad_norm": 0.4191375970840454,
"learning_rate": 0.00020256410256410255,
"loss": 0.6422,
"step": 1071
},
{
"epoch": 1.1775367293697652,
"grad_norm": 0.4091531038284302,
"learning_rate": 0.0002024420024420024,
"loss": 0.6705,
"step": 1072
},
{
"epoch": 1.1786351778113415,
"grad_norm": 0.4876718521118164,
"learning_rate": 0.0002023199023199023,
"loss": 0.8265,
"step": 1073
},
{
"epoch": 1.1797336262529177,
"grad_norm": 0.43008798360824585,
"learning_rate": 0.0002021978021978022,
"loss": 0.5159,
"step": 1074
},
{
"epoch": 1.180832074694494,
"grad_norm": 0.47896140813827515,
"learning_rate": 0.00020207570207570204,
"loss": 0.5455,
"step": 1075
},
{
"epoch": 1.1819305231360704,
"grad_norm": 0.5313389301300049,
"learning_rate": 0.00020195360195360195,
"loss": 0.7628,
"step": 1076
},
{
"epoch": 1.1830289715776465,
"grad_norm": 0.46337512135505676,
"learning_rate": 0.00020183150183150184,
"loss": 0.6661,
"step": 1077
},
{
"epoch": 1.1841274200192229,
"grad_norm": 0.4304458498954773,
"learning_rate": 0.0002017094017094017,
"loss": 0.7019,
"step": 1078
},
{
"epoch": 1.185225868460799,
"grad_norm": 0.638445258140564,
"learning_rate": 0.00020158730158730158,
"loss": 0.6972,
"step": 1079
},
{
"epoch": 1.1863243169023754,
"grad_norm": 1.8217968940734863,
"learning_rate": 0.00020146520146520144,
"loss": 0.5217,
"step": 1080
},
{
"epoch": 1.1874227653439517,
"grad_norm": 0.4996611773967743,
"learning_rate": 0.00020134310134310132,
"loss": 0.6767,
"step": 1081
},
{
"epoch": 1.1885212137855279,
"grad_norm": 0.43705832958221436,
"learning_rate": 0.0002012210012210012,
"loss": 0.7364,
"step": 1082
},
{
"epoch": 1.1896196622271042,
"grad_norm": 0.4148736596107483,
"learning_rate": 0.00020109890109890107,
"loss": 0.7544,
"step": 1083
},
{
"epoch": 1.1907181106686804,
"grad_norm": 0.5772218108177185,
"learning_rate": 0.00020097680097680095,
"loss": 0.6349,
"step": 1084
},
{
"epoch": 1.1918165591102567,
"grad_norm": 0.9127015471458435,
"learning_rate": 0.00020085470085470084,
"loss": 0.4772,
"step": 1085
},
{
"epoch": 1.192915007551833,
"grad_norm": 0.46906840801239014,
"learning_rate": 0.0002007326007326007,
"loss": 0.6184,
"step": 1086
},
{
"epoch": 1.1940134559934092,
"grad_norm": 0.38405168056488037,
"learning_rate": 0.00020061050061050058,
"loss": 0.5027,
"step": 1087
},
{
"epoch": 1.1951119044349856,
"grad_norm": 0.6352836489677429,
"learning_rate": 0.00020048840048840047,
"loss": 0.6674,
"step": 1088
},
{
"epoch": 1.196210352876562,
"grad_norm": 0.6750807762145996,
"learning_rate": 0.00020036630036630033,
"loss": 0.5707,
"step": 1089
},
{
"epoch": 1.197308801318138,
"grad_norm": 0.5661985874176025,
"learning_rate": 0.00020024420024420024,
"loss": 0.8298,
"step": 1090
},
{
"epoch": 1.1984072497597145,
"grad_norm": 0.6393309831619263,
"learning_rate": 0.00020012210012210012,
"loss": 0.7397,
"step": 1091
},
{
"epoch": 1.1995056982012906,
"grad_norm": 0.5442856550216675,
"learning_rate": 0.00019999999999999998,
"loss": 0.7176,
"step": 1092
},
{
"epoch": 1.200604146642867,
"grad_norm": 1.0100654363632202,
"learning_rate": 0.00019987789987789987,
"loss": 0.8052,
"step": 1093
},
{
"epoch": 1.201702595084443,
"grad_norm": 0.3916209936141968,
"learning_rate": 0.00019975579975579975,
"loss": 0.5951,
"step": 1094
},
{
"epoch": 1.2028010435260195,
"grad_norm": 0.3890608847141266,
"learning_rate": 0.0001996336996336996,
"loss": 0.8129,
"step": 1095
},
{
"epoch": 1.2038994919675958,
"grad_norm": 0.4267507493495941,
"learning_rate": 0.0001995115995115995,
"loss": 0.8741,
"step": 1096
},
{
"epoch": 1.204997940409172,
"grad_norm": 0.49055561423301697,
"learning_rate": 0.00019938949938949938,
"loss": 0.901,
"step": 1097
},
{
"epoch": 1.2060963888507483,
"grad_norm": 0.6662428379058838,
"learning_rate": 0.00019926739926739924,
"loss": 0.4971,
"step": 1098
},
{
"epoch": 1.2071948372923247,
"grad_norm": 0.4469052255153656,
"learning_rate": 0.00019914529914529913,
"loss": 0.6593,
"step": 1099
},
{
"epoch": 1.2082932857339008,
"grad_norm": 0.5514255166053772,
"learning_rate": 0.000199023199023199,
"loss": 0.8033,
"step": 1100
},
{
"epoch": 1.2093917341754772,
"grad_norm": 0.4838184714317322,
"learning_rate": 0.00019890109890109887,
"loss": 0.5533,
"step": 1101
},
{
"epoch": 1.2104901826170533,
"grad_norm": 0.6061891913414001,
"learning_rate": 0.00019877899877899876,
"loss": 0.5837,
"step": 1102
},
{
"epoch": 1.2115886310586297,
"grad_norm": 0.3387523889541626,
"learning_rate": 0.00019865689865689867,
"loss": 0.455,
"step": 1103
},
{
"epoch": 1.212687079500206,
"grad_norm": 0.5204731225967407,
"learning_rate": 0.0001985347985347985,
"loss": 0.6869,
"step": 1104
},
{
"epoch": 1.2137855279417822,
"grad_norm": 0.5747571587562561,
"learning_rate": 0.0001984126984126984,
"loss": 0.7208,
"step": 1105
},
{
"epoch": 1.2148839763833585,
"grad_norm": 0.5382461547851562,
"learning_rate": 0.00019829059829059824,
"loss": 0.6035,
"step": 1106
},
{
"epoch": 1.2159824248249347,
"grad_norm": 0.44335421919822693,
"learning_rate": 0.00019816849816849816,
"loss": 0.8563,
"step": 1107
},
{
"epoch": 1.217080873266511,
"grad_norm": 0.3059934675693512,
"learning_rate": 0.00019804639804639804,
"loss": 0.6422,
"step": 1108
},
{
"epoch": 1.2181793217080874,
"grad_norm": 0.4306177794933319,
"learning_rate": 0.0001979242979242979,
"loss": 0.5347,
"step": 1109
},
{
"epoch": 1.2192777701496635,
"grad_norm": 0.5196095705032349,
"learning_rate": 0.00019780219780219779,
"loss": 0.5996,
"step": 1110
},
{
"epoch": 1.22037621859124,
"grad_norm": 0.4814283549785614,
"learning_rate": 0.00019768009768009767,
"loss": 0.6782,
"step": 1111
},
{
"epoch": 1.2214746670328163,
"grad_norm": 0.2287791222333908,
"learning_rate": 0.00019755799755799753,
"loss": 0.5908,
"step": 1112
},
{
"epoch": 1.2225731154743924,
"grad_norm": 0.43044313788414,
"learning_rate": 0.00019743589743589742,
"loss": 0.6554,
"step": 1113
},
{
"epoch": 1.2236715639159688,
"grad_norm": 0.390874445438385,
"learning_rate": 0.0001973137973137973,
"loss": 0.5777,
"step": 1114
},
{
"epoch": 1.224770012357545,
"grad_norm": 0.5380458235740662,
"learning_rate": 0.00019719169719169716,
"loss": 0.467,
"step": 1115
},
{
"epoch": 1.2258684607991213,
"grad_norm": 0.6176440119743347,
"learning_rate": 0.00019706959706959704,
"loss": 0.5625,
"step": 1116
},
{
"epoch": 1.2269669092406974,
"grad_norm": 0.4321332275867462,
"learning_rate": 0.00019694749694749693,
"loss": 0.7262,
"step": 1117
},
{
"epoch": 1.2280653576822738,
"grad_norm": 0.5679623484611511,
"learning_rate": 0.0001968253968253968,
"loss": 0.8216,
"step": 1118
},
{
"epoch": 1.2291638061238501,
"grad_norm": 0.4741218686103821,
"learning_rate": 0.00019670329670329667,
"loss": 0.7164,
"step": 1119
},
{
"epoch": 1.2302622545654263,
"grad_norm": 0.6570267677307129,
"learning_rate": 0.00019658119658119659,
"loss": 0.7606,
"step": 1120
},
{
"epoch": 1.2313607030070026,
"grad_norm": 0.4256306290626526,
"learning_rate": 0.00019645909645909644,
"loss": 0.5137,
"step": 1121
},
{
"epoch": 1.232459151448579,
"grad_norm": 0.4444984793663025,
"learning_rate": 0.00019633699633699633,
"loss": 0.8863,
"step": 1122
},
{
"epoch": 1.2335575998901551,
"grad_norm": 0.458133339881897,
"learning_rate": 0.00019621489621489622,
"loss": 0.6445,
"step": 1123
},
{
"epoch": 1.2346560483317315,
"grad_norm": 0.6087627410888672,
"learning_rate": 0.00019609279609279607,
"loss": 0.5625,
"step": 1124
},
{
"epoch": 1.2357544967733076,
"grad_norm": 0.42782312631607056,
"learning_rate": 0.00019597069597069596,
"loss": 0.6321,
"step": 1125
},
{
"epoch": 1.236852945214884,
"grad_norm": 0.49623987078666687,
"learning_rate": 0.00019584859584859585,
"loss": 0.6473,
"step": 1126
},
{
"epoch": 1.2379513936564603,
"grad_norm": 0.5348198413848877,
"learning_rate": 0.0001957264957264957,
"loss": 0.6948,
"step": 1127
},
{
"epoch": 1.2390498420980365,
"grad_norm": 0.44476062059402466,
"learning_rate": 0.0001956043956043956,
"loss": 0.5917,
"step": 1128
},
{
"epoch": 1.2401482905396128,
"grad_norm": 0.5777286291122437,
"learning_rate": 0.00019548229548229547,
"loss": 0.7474,
"step": 1129
},
{
"epoch": 1.241246738981189,
"grad_norm": 0.3132689893245697,
"learning_rate": 0.00019536019536019533,
"loss": 0.5827,
"step": 1130
},
{
"epoch": 1.2423451874227653,
"grad_norm": 0.3898192346096039,
"learning_rate": 0.00019523809523809522,
"loss": 0.5469,
"step": 1131
},
{
"epoch": 1.2434436358643417,
"grad_norm": 0.338693767786026,
"learning_rate": 0.00019511599511599508,
"loss": 0.704,
"step": 1132
},
{
"epoch": 1.2445420843059178,
"grad_norm": 0.4276609718799591,
"learning_rate": 0.00019499389499389496,
"loss": 0.7269,
"step": 1133
},
{
"epoch": 1.2456405327474942,
"grad_norm": 0.7320281863212585,
"learning_rate": 0.00019487179487179487,
"loss": 0.62,
"step": 1134
},
{
"epoch": 1.2467389811890706,
"grad_norm": 0.4023820757865906,
"learning_rate": 0.0001947496947496947,
"loss": 0.4234,
"step": 1135
},
{
"epoch": 1.2478374296306467,
"grad_norm": 0.3218212425708771,
"learning_rate": 0.00019462759462759462,
"loss": 0.5325,
"step": 1136
},
{
"epoch": 1.248935878072223,
"grad_norm": 0.45131513476371765,
"learning_rate": 0.0001945054945054945,
"loss": 0.5667,
"step": 1137
},
{
"epoch": 1.2500343265137992,
"grad_norm": 0.604475200176239,
"learning_rate": 0.00019438339438339436,
"loss": 0.9018,
"step": 1138
},
{
"epoch": 1.2511327749553756,
"grad_norm": 0.46968311071395874,
"learning_rate": 0.00019426129426129425,
"loss": 0.7946,
"step": 1139
},
{
"epoch": 1.2522312233969517,
"grad_norm": 0.3960346281528473,
"learning_rate": 0.00019413919413919413,
"loss": 0.7719,
"step": 1140
},
{
"epoch": 1.253329671838528,
"grad_norm": 0.5146461129188538,
"learning_rate": 0.000194017094017094,
"loss": 0.8946,
"step": 1141
},
{
"epoch": 1.2544281202801044,
"grad_norm": 0.6343802809715271,
"learning_rate": 0.00019389499389499388,
"loss": 0.7822,
"step": 1142
},
{
"epoch": 1.2555265687216806,
"grad_norm": 0.4646434485912323,
"learning_rate": 0.00019377289377289376,
"loss": 0.6722,
"step": 1143
},
{
"epoch": 1.256625017163257,
"grad_norm": 0.48127877712249756,
"learning_rate": 0.00019365079365079362,
"loss": 0.9059,
"step": 1144
},
{
"epoch": 1.2577234656048333,
"grad_norm": 0.4040716290473938,
"learning_rate": 0.0001935286935286935,
"loss": 0.7288,
"step": 1145
},
{
"epoch": 1.2588219140464094,
"grad_norm": 0.43992865085601807,
"learning_rate": 0.0001934065934065934,
"loss": 0.5804,
"step": 1146
},
{
"epoch": 1.2599203624879858,
"grad_norm": 0.41578513383865356,
"learning_rate": 0.00019328449328449325,
"loss": 0.5459,
"step": 1147
},
{
"epoch": 1.261018810929562,
"grad_norm": 0.40165719389915466,
"learning_rate": 0.00019316239316239314,
"loss": 0.6001,
"step": 1148
},
{
"epoch": 1.2621172593711383,
"grad_norm": 0.43200212717056274,
"learning_rate": 0.00019304029304029305,
"loss": 0.8712,
"step": 1149
},
{
"epoch": 1.2632157078127144,
"grad_norm": 0.3217264413833618,
"learning_rate": 0.00019291819291819288,
"loss": 0.6074,
"step": 1150
},
{
"epoch": 1.2643141562542908,
"grad_norm": 0.3964528441429138,
"learning_rate": 0.0001927960927960928,
"loss": 0.6131,
"step": 1151
},
{
"epoch": 1.2654126046958671,
"grad_norm": 0.5151070952415466,
"learning_rate": 0.00019267399267399268,
"loss": 0.6992,
"step": 1152
},
{
"epoch": 1.2665110531374433,
"grad_norm": 0.5902129411697388,
"learning_rate": 0.00019255189255189254,
"loss": 0.7311,
"step": 1153
},
{
"epoch": 1.2676095015790196,
"grad_norm": 0.5386108160018921,
"learning_rate": 0.00019242979242979242,
"loss": 0.6469,
"step": 1154
},
{
"epoch": 1.268707950020596,
"grad_norm": 0.384093701839447,
"learning_rate": 0.0001923076923076923,
"loss": 0.7111,
"step": 1155
},
{
"epoch": 1.2698063984621721,
"grad_norm": 0.34160250425338745,
"learning_rate": 0.00019218559218559217,
"loss": 0.5396,
"step": 1156
},
{
"epoch": 1.2709048469037485,
"grad_norm": 0.6590912938117981,
"learning_rate": 0.00019206349206349205,
"loss": 1.1613,
"step": 1157
},
{
"epoch": 1.2720032953453249,
"grad_norm": 0.6230842471122742,
"learning_rate": 0.0001919413919413919,
"loss": 0.7701,
"step": 1158
},
{
"epoch": 1.273101743786901,
"grad_norm": 0.3881864547729492,
"learning_rate": 0.0001918192918192918,
"loss": 0.633,
"step": 1159
},
{
"epoch": 1.2742001922284774,
"grad_norm": 0.4538264274597168,
"learning_rate": 0.00019169719169719168,
"loss": 0.451,
"step": 1160
},
{
"epoch": 1.2752986406700535,
"grad_norm": 0.6188018321990967,
"learning_rate": 0.00019157509157509154,
"loss": 0.9563,
"step": 1161
},
{
"epoch": 1.2763970891116299,
"grad_norm": 0.4172852039337158,
"learning_rate": 0.00019145299145299142,
"loss": 0.8284,
"step": 1162
},
{
"epoch": 1.277495537553206,
"grad_norm": 0.338623583316803,
"learning_rate": 0.0001913308913308913,
"loss": 0.6745,
"step": 1163
},
{
"epoch": 1.2785939859947824,
"grad_norm": 0.3960900902748108,
"learning_rate": 0.00019120879120879117,
"loss": 0.6508,
"step": 1164
},
{
"epoch": 1.2796924344363587,
"grad_norm": 0.37232962250709534,
"learning_rate": 0.00019108669108669108,
"loss": 0.7347,
"step": 1165
},
{
"epoch": 1.2807908828779349,
"grad_norm": 0.47092223167419434,
"learning_rate": 0.00019096459096459097,
"loss": 0.8251,
"step": 1166
},
{
"epoch": 1.2818893313195112,
"grad_norm": 0.4647108316421509,
"learning_rate": 0.00019084249084249082,
"loss": 0.556,
"step": 1167
},
{
"epoch": 1.2829877797610876,
"grad_norm": 0.5812810659408569,
"learning_rate": 0.0001907203907203907,
"loss": 0.6802,
"step": 1168
},
{
"epoch": 1.2840862282026637,
"grad_norm": 0.3731052279472351,
"learning_rate": 0.0001905982905982906,
"loss": 0.6384,
"step": 1169
},
{
"epoch": 1.28518467664424,
"grad_norm": 0.47995856404304504,
"learning_rate": 0.00019047619047619045,
"loss": 0.4914,
"step": 1170
},
{
"epoch": 1.2862831250858162,
"grad_norm": 0.3223705589771271,
"learning_rate": 0.00019035409035409034,
"loss": 0.6676,
"step": 1171
},
{
"epoch": 1.2873815735273926,
"grad_norm": 0.5643377304077148,
"learning_rate": 0.00019023199023199023,
"loss": 0.8224,
"step": 1172
},
{
"epoch": 1.2884800219689687,
"grad_norm": 0.48324450850486755,
"learning_rate": 0.00019010989010989008,
"loss": 0.8005,
"step": 1173
},
{
"epoch": 1.289578470410545,
"grad_norm": 0.40516728162765503,
"learning_rate": 0.00018998778998778997,
"loss": 0.5463,
"step": 1174
},
{
"epoch": 1.2906769188521214,
"grad_norm": 0.45521625876426697,
"learning_rate": 0.00018986568986568985,
"loss": 0.7562,
"step": 1175
},
{
"epoch": 1.2917753672936976,
"grad_norm": 0.38747909665107727,
"learning_rate": 0.0001897435897435897,
"loss": 0.5074,
"step": 1176
},
{
"epoch": 1.292873815735274,
"grad_norm": 0.39688000082969666,
"learning_rate": 0.0001896214896214896,
"loss": 0.3551,
"step": 1177
},
{
"epoch": 1.2939722641768503,
"grad_norm": 0.6891604065895081,
"learning_rate": 0.0001894993894993895,
"loss": 0.601,
"step": 1178
},
{
"epoch": 1.2950707126184264,
"grad_norm": 0.5177300572395325,
"learning_rate": 0.00018937728937728934,
"loss": 0.5188,
"step": 1179
},
{
"epoch": 1.2961691610600028,
"grad_norm": 0.3166979253292084,
"learning_rate": 0.00018925518925518926,
"loss": 0.8411,
"step": 1180
},
{
"epoch": 1.2972676095015792,
"grad_norm": 0.6637437343597412,
"learning_rate": 0.00018913308913308914,
"loss": 0.7256,
"step": 1181
},
{
"epoch": 1.2983660579431553,
"grad_norm": 0.424932599067688,
"learning_rate": 0.000189010989010989,
"loss": 0.783,
"step": 1182
},
{
"epoch": 1.2994645063847314,
"grad_norm": 0.47751033306121826,
"learning_rate": 0.00018888888888888888,
"loss": 0.7039,
"step": 1183
},
{
"epoch": 1.3005629548263078,
"grad_norm": 0.4332704544067383,
"learning_rate": 0.00018876678876678874,
"loss": 0.4797,
"step": 1184
},
{
"epoch": 1.3016614032678842,
"grad_norm": 0.439431756734848,
"learning_rate": 0.00018864468864468863,
"loss": 0.6256,
"step": 1185
},
{
"epoch": 1.3027598517094603,
"grad_norm": 0.4334176480770111,
"learning_rate": 0.00018852258852258851,
"loss": 0.5583,
"step": 1186
},
{
"epoch": 1.3038583001510367,
"grad_norm": 0.42080724239349365,
"learning_rate": 0.00018840048840048837,
"loss": 0.461,
"step": 1187
},
{
"epoch": 1.304956748592613,
"grad_norm": 0.41007399559020996,
"learning_rate": 0.00018827838827838826,
"loss": 0.4746,
"step": 1188
},
{
"epoch": 1.3060551970341892,
"grad_norm": 0.3763822019100189,
"learning_rate": 0.00018815628815628814,
"loss": 0.5352,
"step": 1189
},
{
"epoch": 1.3071536454757655,
"grad_norm": 0.5557730197906494,
"learning_rate": 0.000188034188034188,
"loss": 0.5404,
"step": 1190
},
{
"epoch": 1.3082520939173419,
"grad_norm": 0.43677788972854614,
"learning_rate": 0.0001879120879120879,
"loss": 0.7111,
"step": 1191
},
{
"epoch": 1.309350542358918,
"grad_norm": 0.6084219217300415,
"learning_rate": 0.00018778998778998777,
"loss": 0.7524,
"step": 1192
},
{
"epoch": 1.3104489908004944,
"grad_norm": 0.7219144701957703,
"learning_rate": 0.00018766788766788763,
"loss": 0.6182,
"step": 1193
},
{
"epoch": 1.3115474392420705,
"grad_norm": 0.5280331969261169,
"learning_rate": 0.00018754578754578752,
"loss": 0.8023,
"step": 1194
},
{
"epoch": 1.3126458876836469,
"grad_norm": 0.42130032181739807,
"learning_rate": 0.00018742368742368743,
"loss": 0.5673,
"step": 1195
},
{
"epoch": 1.313744336125223,
"grad_norm": 0.6063292026519775,
"learning_rate": 0.0001873015873015873,
"loss": 0.6438,
"step": 1196
},
{
"epoch": 1.3148427845667994,
"grad_norm": 0.4073690176010132,
"learning_rate": 0.00018717948717948717,
"loss": 0.7099,
"step": 1197
},
{
"epoch": 1.3159412330083757,
"grad_norm": 0.5419113636016846,
"learning_rate": 0.00018705738705738706,
"loss": 0.6451,
"step": 1198
},
{
"epoch": 1.3170396814499519,
"grad_norm": 0.4489867091178894,
"learning_rate": 0.00018693528693528692,
"loss": 0.7522,
"step": 1199
},
{
"epoch": 1.3181381298915282,
"grad_norm": 0.3536837697029114,
"learning_rate": 0.0001868131868131868,
"loss": 0.6201,
"step": 1200
},
{
"epoch": 1.3192365783331046,
"grad_norm": 0.42462313175201416,
"learning_rate": 0.0001866910866910867,
"loss": 0.4804,
"step": 1201
},
{
"epoch": 1.3203350267746807,
"grad_norm": 0.612319827079773,
"learning_rate": 0.00018656898656898655,
"loss": 0.8546,
"step": 1202
},
{
"epoch": 1.321433475216257,
"grad_norm": 0.5242000222206116,
"learning_rate": 0.00018644688644688643,
"loss": 0.7577,
"step": 1203
},
{
"epoch": 1.3225319236578332,
"grad_norm": 0.5688628554344177,
"learning_rate": 0.00018632478632478632,
"loss": 0.6645,
"step": 1204
},
{
"epoch": 1.3236303720994096,
"grad_norm": 0.3695731461048126,
"learning_rate": 0.00018620268620268618,
"loss": 0.4979,
"step": 1205
},
{
"epoch": 1.3247288205409857,
"grad_norm": 0.44525593519210815,
"learning_rate": 0.00018608058608058606,
"loss": 0.807,
"step": 1206
},
{
"epoch": 1.325827268982562,
"grad_norm": 0.37627971172332764,
"learning_rate": 0.00018595848595848595,
"loss": 0.6584,
"step": 1207
},
{
"epoch": 1.3269257174241385,
"grad_norm": 0.39727315306663513,
"learning_rate": 0.0001858363858363858,
"loss": 0.5565,
"step": 1208
},
{
"epoch": 1.3280241658657146,
"grad_norm": 0.4151424169540405,
"learning_rate": 0.00018571428571428572,
"loss": 0.81,
"step": 1209
},
{
"epoch": 1.329122614307291,
"grad_norm": 0.37529075145721436,
"learning_rate": 0.00018559218559218555,
"loss": 0.6188,
"step": 1210
},
{
"epoch": 1.3302210627488673,
"grad_norm": 0.43061408400535583,
"learning_rate": 0.00018547008547008546,
"loss": 0.814,
"step": 1211
},
{
"epoch": 1.3313195111904434,
"grad_norm": 0.437511682510376,
"learning_rate": 0.00018534798534798535,
"loss": 0.55,
"step": 1212
},
{
"epoch": 1.3324179596320198,
"grad_norm": 0.5172685980796814,
"learning_rate": 0.0001852258852258852,
"loss": 0.6551,
"step": 1213
},
{
"epoch": 1.3335164080735962,
"grad_norm": 0.3292716443538666,
"learning_rate": 0.0001851037851037851,
"loss": 0.5108,
"step": 1214
},
{
"epoch": 1.3346148565151723,
"grad_norm": 0.7129474878311157,
"learning_rate": 0.00018498168498168498,
"loss": 0.7197,
"step": 1215
},
{
"epoch": 1.3357133049567487,
"grad_norm": 0.46317145228385925,
"learning_rate": 0.00018485958485958483,
"loss": 0.6553,
"step": 1216
},
{
"epoch": 1.3368117533983248,
"grad_norm": 0.5539398789405823,
"learning_rate": 0.00018473748473748472,
"loss": 0.7057,
"step": 1217
},
{
"epoch": 1.3379102018399012,
"grad_norm": 0.40555253624916077,
"learning_rate": 0.0001846153846153846,
"loss": 0.5976,
"step": 1218
},
{
"epoch": 1.3390086502814773,
"grad_norm": 0.462704062461853,
"learning_rate": 0.00018449328449328446,
"loss": 0.7018,
"step": 1219
},
{
"epoch": 1.3401070987230537,
"grad_norm": 0.407287061214447,
"learning_rate": 0.00018437118437118435,
"loss": 0.4726,
"step": 1220
},
{
"epoch": 1.34120554716463,
"grad_norm": 0.3654995858669281,
"learning_rate": 0.00018424908424908423,
"loss": 0.5811,
"step": 1221
},
{
"epoch": 1.3423039956062062,
"grad_norm": 0.46455878019332886,
"learning_rate": 0.0001841269841269841,
"loss": 0.8998,
"step": 1222
},
{
"epoch": 1.3434024440477825,
"grad_norm": 0.47929346561431885,
"learning_rate": 0.00018400488400488398,
"loss": 0.7348,
"step": 1223
},
{
"epoch": 1.344500892489359,
"grad_norm": 0.7128652930259705,
"learning_rate": 0.0001838827838827839,
"loss": 1.2647,
"step": 1224
},
{
"epoch": 1.345599340930935,
"grad_norm": 0.3956572413444519,
"learning_rate": 0.00018376068376068372,
"loss": 0.6985,
"step": 1225
},
{
"epoch": 1.3466977893725114,
"grad_norm": 0.5585309863090515,
"learning_rate": 0.00018363858363858364,
"loss": 1.0086,
"step": 1226
},
{
"epoch": 1.3477962378140875,
"grad_norm": 1.5960838794708252,
"learning_rate": 0.00018351648351648352,
"loss": 0.644,
"step": 1227
},
{
"epoch": 1.3488946862556639,
"grad_norm": 0.6499342322349548,
"learning_rate": 0.00018339438339438338,
"loss": 0.7698,
"step": 1228
},
{
"epoch": 1.34999313469724,
"grad_norm": 0.42246925830841064,
"learning_rate": 0.00018327228327228326,
"loss": 0.5614,
"step": 1229
},
{
"epoch": 1.3510915831388164,
"grad_norm": 0.42192572355270386,
"learning_rate": 0.00018315018315018315,
"loss": 0.7726,
"step": 1230
},
{
"epoch": 1.3521900315803927,
"grad_norm": 0.6409221887588501,
"learning_rate": 0.000183028083028083,
"loss": 0.5928,
"step": 1231
},
{
"epoch": 1.3532884800219689,
"grad_norm": 1.328852653503418,
"learning_rate": 0.0001829059829059829,
"loss": 0.7861,
"step": 1232
},
{
"epoch": 1.3543869284635452,
"grad_norm": 0.4519331753253937,
"learning_rate": 0.00018278388278388275,
"loss": 0.5938,
"step": 1233
},
{
"epoch": 1.3554853769051216,
"grad_norm": 0.3942720592021942,
"learning_rate": 0.00018266178266178264,
"loss": 0.4781,
"step": 1234
},
{
"epoch": 1.3565838253466977,
"grad_norm": 0.5066869258880615,
"learning_rate": 0.00018253968253968252,
"loss": 0.8069,
"step": 1235
},
{
"epoch": 1.357682273788274,
"grad_norm": 0.37002792954444885,
"learning_rate": 0.00018241758241758238,
"loss": 0.5737,
"step": 1236
},
{
"epoch": 1.3587807222298505,
"grad_norm": 0.3738810122013092,
"learning_rate": 0.00018229548229548227,
"loss": 0.5169,
"step": 1237
},
{
"epoch": 1.3598791706714266,
"grad_norm": 0.44956260919570923,
"learning_rate": 0.00018217338217338215,
"loss": 0.5614,
"step": 1238
},
{
"epoch": 1.3609776191130027,
"grad_norm": 0.34839004278182983,
"learning_rate": 0.000182051282051282,
"loss": 0.5783,
"step": 1239
},
{
"epoch": 1.362076067554579,
"grad_norm": 0.30152127146720886,
"learning_rate": 0.00018192918192918192,
"loss": 0.4321,
"step": 1240
},
{
"epoch": 1.3631745159961555,
"grad_norm": 0.6672345399856567,
"learning_rate": 0.0001818070818070818,
"loss": 0.6073,
"step": 1241
},
{
"epoch": 1.3642729644377316,
"grad_norm": 0.45652687549591064,
"learning_rate": 0.00018168498168498167,
"loss": 0.6193,
"step": 1242
},
{
"epoch": 1.365371412879308,
"grad_norm": 0.6392306089401245,
"learning_rate": 0.00018156288156288155,
"loss": 0.8388,
"step": 1243
},
{
"epoch": 1.3664698613208843,
"grad_norm": 0.5510252714157104,
"learning_rate": 0.00018144078144078144,
"loss": 0.6512,
"step": 1244
},
{
"epoch": 1.3675683097624605,
"grad_norm": 0.38780227303504944,
"learning_rate": 0.0001813186813186813,
"loss": 0.6835,
"step": 1245
},
{
"epoch": 1.3686667582040368,
"grad_norm": 0.47472965717315674,
"learning_rate": 0.00018119658119658118,
"loss": 0.6625,
"step": 1246
},
{
"epoch": 1.3697652066456132,
"grad_norm": 0.3599228262901306,
"learning_rate": 0.00018107448107448107,
"loss": 0.5063,
"step": 1247
},
{
"epoch": 1.3708636550871893,
"grad_norm": 0.3284567892551422,
"learning_rate": 0.00018095238095238093,
"loss": 0.7679,
"step": 1248
},
{
"epoch": 1.3719621035287657,
"grad_norm": 0.5258575081825256,
"learning_rate": 0.0001808302808302808,
"loss": 0.6213,
"step": 1249
},
{
"epoch": 1.3730605519703418,
"grad_norm": 0.3211069405078888,
"learning_rate": 0.0001807081807081807,
"loss": 0.5306,
"step": 1250
},
{
"epoch": 1.3741590004119182,
"grad_norm": 0.6325588822364807,
"learning_rate": 0.00018058608058608056,
"loss": 0.8104,
"step": 1251
},
{
"epoch": 1.3752574488534943,
"grad_norm": 0.4994303584098816,
"learning_rate": 0.00018046398046398044,
"loss": 0.6464,
"step": 1252
},
{
"epoch": 1.3763558972950707,
"grad_norm": 0.3013019263744354,
"learning_rate": 0.00018034188034188035,
"loss": 0.4749,
"step": 1253
},
{
"epoch": 1.377454345736647,
"grad_norm": 1.0342131853103638,
"learning_rate": 0.00018021978021978018,
"loss": 0.7995,
"step": 1254
},
{
"epoch": 1.3785527941782232,
"grad_norm": 0.40213823318481445,
"learning_rate": 0.0001800976800976801,
"loss": 0.8791,
"step": 1255
},
{
"epoch": 1.3796512426197995,
"grad_norm": 0.37126532196998596,
"learning_rate": 0.00017997557997557998,
"loss": 0.551,
"step": 1256
},
{
"epoch": 1.380749691061376,
"grad_norm": 0.3417685031890869,
"learning_rate": 0.00017985347985347984,
"loss": 0.583,
"step": 1257
},
{
"epoch": 1.381848139502952,
"grad_norm": 0.33571329712867737,
"learning_rate": 0.00017973137973137973,
"loss": 0.4927,
"step": 1258
},
{
"epoch": 1.3829465879445284,
"grad_norm": 0.5128073692321777,
"learning_rate": 0.00017960927960927959,
"loss": 0.5903,
"step": 1259
},
{
"epoch": 1.3840450363861048,
"grad_norm": 0.5345245599746704,
"learning_rate": 0.00017948717948717947,
"loss": 0.5828,
"step": 1260
},
{
"epoch": 1.385143484827681,
"grad_norm": 0.312639981508255,
"learning_rate": 0.00017936507936507936,
"loss": 0.6905,
"step": 1261
},
{
"epoch": 1.386241933269257,
"grad_norm": 0.4795394837856293,
"learning_rate": 0.00017924297924297921,
"loss": 0.6193,
"step": 1262
},
{
"epoch": 1.3873403817108334,
"grad_norm": 0.39672231674194336,
"learning_rate": 0.0001791208791208791,
"loss": 0.7833,
"step": 1263
},
{
"epoch": 1.3884388301524098,
"grad_norm": 0.46752655506134033,
"learning_rate": 0.00017899877899877899,
"loss": 0.6385,
"step": 1264
},
{
"epoch": 1.389537278593986,
"grad_norm": 0.5376736521720886,
"learning_rate": 0.00017887667887667884,
"loss": 0.6362,
"step": 1265
},
{
"epoch": 1.3906357270355623,
"grad_norm": 0.5675904750823975,
"learning_rate": 0.00017875457875457873,
"loss": 0.7975,
"step": 1266
},
{
"epoch": 1.3917341754771386,
"grad_norm": 0.5429015755653381,
"learning_rate": 0.00017863247863247861,
"loss": 0.5415,
"step": 1267
},
{
"epoch": 1.3928326239187148,
"grad_norm": 0.3714626729488373,
"learning_rate": 0.00017851037851037847,
"loss": 0.7104,
"step": 1268
},
{
"epoch": 1.3939310723602911,
"grad_norm": 0.7549324035644531,
"learning_rate": 0.00017838827838827836,
"loss": 0.698,
"step": 1269
},
{
"epoch": 1.3950295208018675,
"grad_norm": 0.36867257952690125,
"learning_rate": 0.00017826617826617827,
"loss": 0.6019,
"step": 1270
},
{
"epoch": 1.3961279692434436,
"grad_norm": 0.42439624667167664,
"learning_rate": 0.00017814407814407813,
"loss": 0.4626,
"step": 1271
},
{
"epoch": 1.39722641768502,
"grad_norm": 0.4768877923488617,
"learning_rate": 0.00017802197802197802,
"loss": 0.671,
"step": 1272
},
{
"epoch": 1.3983248661265961,
"grad_norm": 0.3415908217430115,
"learning_rate": 0.0001778998778998779,
"loss": 0.5904,
"step": 1273
},
{
"epoch": 1.3994233145681725,
"grad_norm": 0.5370535850524902,
"learning_rate": 0.00017777777777777776,
"loss": 0.578,
"step": 1274
},
{
"epoch": 1.4005217630097486,
"grad_norm": 0.61114901304245,
"learning_rate": 0.00017765567765567764,
"loss": 0.6498,
"step": 1275
},
{
"epoch": 1.401620211451325,
"grad_norm": 0.3491772711277008,
"learning_rate": 0.00017753357753357753,
"loss": 0.6057,
"step": 1276
},
{
"epoch": 1.4027186598929013,
"grad_norm": 0.4992705285549164,
"learning_rate": 0.0001774114774114774,
"loss": 0.8541,
"step": 1277
},
{
"epoch": 1.4038171083344775,
"grad_norm": 0.5476379990577698,
"learning_rate": 0.00017728937728937727,
"loss": 0.5608,
"step": 1278
},
{
"epoch": 1.4049155567760538,
"grad_norm": 0.6107895374298096,
"learning_rate": 0.00017716727716727716,
"loss": 0.7437,
"step": 1279
},
{
"epoch": 1.4060140052176302,
"grad_norm": 0.510809600353241,
"learning_rate": 0.00017704517704517702,
"loss": 0.6569,
"step": 1280
},
{
"epoch": 1.4071124536592063,
"grad_norm": 0.5050077438354492,
"learning_rate": 0.0001769230769230769,
"loss": 0.6566,
"step": 1281
},
{
"epoch": 1.4082109021007827,
"grad_norm": 0.44812703132629395,
"learning_rate": 0.0001768009768009768,
"loss": 0.6557,
"step": 1282
},
{
"epoch": 1.4093093505423588,
"grad_norm": 0.5216537714004517,
"learning_rate": 0.00017667887667887665,
"loss": 0.7311,
"step": 1283
},
{
"epoch": 1.4104077989839352,
"grad_norm": 0.5608856081962585,
"learning_rate": 0.00017655677655677656,
"loss": 0.9001,
"step": 1284
},
{
"epoch": 1.4115062474255113,
"grad_norm": 0.47205066680908203,
"learning_rate": 0.0001764346764346764,
"loss": 0.5214,
"step": 1285
},
{
"epoch": 1.4126046958670877,
"grad_norm": 0.4073629081249237,
"learning_rate": 0.0001763125763125763,
"loss": 0.483,
"step": 1286
},
{
"epoch": 1.413703144308664,
"grad_norm": 0.42381593585014343,
"learning_rate": 0.0001761904761904762,
"loss": 0.4895,
"step": 1287
},
{
"epoch": 1.4148015927502402,
"grad_norm": 0.629356861114502,
"learning_rate": 0.00017606837606837605,
"loss": 0.4639,
"step": 1288
},
{
"epoch": 1.4159000411918166,
"grad_norm": 0.3123486340045929,
"learning_rate": 0.00017594627594627593,
"loss": 0.4575,
"step": 1289
},
{
"epoch": 1.416998489633393,
"grad_norm": 0.4163682460784912,
"learning_rate": 0.00017582417582417582,
"loss": 0.7511,
"step": 1290
},
{
"epoch": 1.418096938074969,
"grad_norm": 0.5697455406188965,
"learning_rate": 0.00017570207570207568,
"loss": 0.5977,
"step": 1291
},
{
"epoch": 1.4191953865165454,
"grad_norm": 0.39232510328292847,
"learning_rate": 0.00017557997557997556,
"loss": 0.6133,
"step": 1292
},
{
"epoch": 1.4202938349581218,
"grad_norm": 0.5452993512153625,
"learning_rate": 0.00017545787545787545,
"loss": 0.6596,
"step": 1293
},
{
"epoch": 1.421392283399698,
"grad_norm": 0.39080601930618286,
"learning_rate": 0.0001753357753357753,
"loss": 0.7422,
"step": 1294
},
{
"epoch": 1.4224907318412743,
"grad_norm": 0.6513398289680481,
"learning_rate": 0.0001752136752136752,
"loss": 0.5277,
"step": 1295
},
{
"epoch": 1.4235891802828504,
"grad_norm": 0.4627130329608917,
"learning_rate": 0.00017509157509157508,
"loss": 0.6296,
"step": 1296
},
{
"epoch": 1.4246876287244268,
"grad_norm": 0.499700129032135,
"learning_rate": 0.00017496947496947494,
"loss": 0.689,
"step": 1297
},
{
"epoch": 1.425786077166003,
"grad_norm": 0.4668709635734558,
"learning_rate": 0.00017484737484737482,
"loss": 0.784,
"step": 1298
},
{
"epoch": 1.4268845256075793,
"grad_norm": 0.6378145217895508,
"learning_rate": 0.00017472527472527473,
"loss": 0.5077,
"step": 1299
},
{
"epoch": 1.4279829740491556,
"grad_norm": 0.6320174336433411,
"learning_rate": 0.00017460317460317457,
"loss": 1.061,
"step": 1300
},
{
"epoch": 1.4290814224907318,
"grad_norm": 0.48719078302383423,
"learning_rate": 0.00017448107448107448,
"loss": 0.7181,
"step": 1301
},
{
"epoch": 1.4301798709323081,
"grad_norm": 0.5345287919044495,
"learning_rate": 0.00017435897435897436,
"loss": 0.5599,
"step": 1302
},
{
"epoch": 1.4312783193738845,
"grad_norm": 0.567857563495636,
"learning_rate": 0.00017423687423687422,
"loss": 0.6294,
"step": 1303
},
{
"epoch": 1.4323767678154606,
"grad_norm": 0.5715040564537048,
"learning_rate": 0.0001741147741147741,
"loss": 0.5326,
"step": 1304
},
{
"epoch": 1.433475216257037,
"grad_norm": 0.40048834681510925,
"learning_rate": 0.000173992673992674,
"loss": 0.687,
"step": 1305
},
{
"epoch": 1.4345736646986131,
"grad_norm": 0.4964540898799896,
"learning_rate": 0.00017387057387057385,
"loss": 0.6149,
"step": 1306
},
{
"epoch": 1.4356721131401895,
"grad_norm": 0.5018569231033325,
"learning_rate": 0.00017374847374847374,
"loss": 0.4224,
"step": 1307
},
{
"epoch": 1.4367705615817656,
"grad_norm": 0.6026094555854797,
"learning_rate": 0.00017362637362637362,
"loss": 0.8934,
"step": 1308
},
{
"epoch": 1.437869010023342,
"grad_norm": 0.33409950137138367,
"learning_rate": 0.00017350427350427348,
"loss": 0.6725,
"step": 1309
},
{
"epoch": 1.4389674584649184,
"grad_norm": 0.43982234597206116,
"learning_rate": 0.00017338217338217337,
"loss": 0.9203,
"step": 1310
},
{
"epoch": 1.4400659069064945,
"grad_norm": 0.843877911567688,
"learning_rate": 0.00017326007326007322,
"loss": 0.6028,
"step": 1311
},
{
"epoch": 1.4411643553480709,
"grad_norm": 0.35148733854293823,
"learning_rate": 0.0001731379731379731,
"loss": 0.7503,
"step": 1312
},
{
"epoch": 1.4422628037896472,
"grad_norm": 0.4561845362186432,
"learning_rate": 0.000173015873015873,
"loss": 0.6577,
"step": 1313
},
{
"epoch": 1.4433612522312234,
"grad_norm": 0.47295713424682617,
"learning_rate": 0.00017289377289377285,
"loss": 0.8013,
"step": 1314
},
{
"epoch": 1.4444597006727997,
"grad_norm": 0.46340033411979675,
"learning_rate": 0.00017277167277167277,
"loss": 0.73,
"step": 1315
},
{
"epoch": 1.445558149114376,
"grad_norm": 0.49221453070640564,
"learning_rate": 0.00017264957264957265,
"loss": 0.6735,
"step": 1316
},
{
"epoch": 1.4466565975559522,
"grad_norm": 0.36250925064086914,
"learning_rate": 0.0001725274725274725,
"loss": 0.7463,
"step": 1317
},
{
"epoch": 1.4477550459975284,
"grad_norm": 0.3832615911960602,
"learning_rate": 0.0001724053724053724,
"loss": 0.7295,
"step": 1318
},
{
"epoch": 1.4488534944391047,
"grad_norm": 0.7413591742515564,
"learning_rate": 0.00017228327228327228,
"loss": 0.7627,
"step": 1319
},
{
"epoch": 1.449951942880681,
"grad_norm": 0.45626765489578247,
"learning_rate": 0.00017216117216117214,
"loss": 0.727,
"step": 1320
},
{
"epoch": 1.4510503913222572,
"grad_norm": 0.3024120330810547,
"learning_rate": 0.00017203907203907202,
"loss": 0.3986,
"step": 1321
},
{
"epoch": 1.4521488397638336,
"grad_norm": 0.31635284423828125,
"learning_rate": 0.0001719169719169719,
"loss": 0.3469,
"step": 1322
},
{
"epoch": 1.45324728820541,
"grad_norm": 0.36893391609191895,
"learning_rate": 0.00017179487179487177,
"loss": 0.7017,
"step": 1323
},
{
"epoch": 1.454345736646986,
"grad_norm": 0.4804024398326874,
"learning_rate": 0.00017167277167277165,
"loss": 0.8811,
"step": 1324
},
{
"epoch": 1.4554441850885624,
"grad_norm": 0.4446522295475006,
"learning_rate": 0.00017155067155067154,
"loss": 0.8027,
"step": 1325
},
{
"epoch": 1.4565426335301388,
"grad_norm": 0.27936413884162903,
"learning_rate": 0.0001714285714285714,
"loss": 0.3846,
"step": 1326
},
{
"epoch": 1.457641081971715,
"grad_norm": 0.3312259316444397,
"learning_rate": 0.00017130647130647128,
"loss": 0.4852,
"step": 1327
},
{
"epoch": 1.4587395304132913,
"grad_norm": 0.4751642644405365,
"learning_rate": 0.0001711843711843712,
"loss": 0.7337,
"step": 1328
},
{
"epoch": 1.4598379788548674,
"grad_norm": 0.5365067720413208,
"learning_rate": 0.00017106227106227103,
"loss": 0.8052,
"step": 1329
},
{
"epoch": 1.4609364272964438,
"grad_norm": 0.5944942831993103,
"learning_rate": 0.00017094017094017094,
"loss": 0.7673,
"step": 1330
},
{
"epoch": 1.46203487573802,
"grad_norm": 0.48244431614875793,
"learning_rate": 0.00017081807081807083,
"loss": 0.855,
"step": 1331
},
{
"epoch": 1.4631333241795963,
"grad_norm": 0.32348135113716125,
"learning_rate": 0.00017069597069597068,
"loss": 0.5133,
"step": 1332
},
{
"epoch": 1.4642317726211727,
"grad_norm": 0.6455866694450378,
"learning_rate": 0.00017057387057387057,
"loss": 0.6825,
"step": 1333
},
{
"epoch": 1.4653302210627488,
"grad_norm": 0.3937522768974304,
"learning_rate": 0.00017045177045177045,
"loss": 0.6335,
"step": 1334
},
{
"epoch": 1.4664286695043252,
"grad_norm": 0.33579352498054504,
"learning_rate": 0.0001703296703296703,
"loss": 0.4711,
"step": 1335
},
{
"epoch": 1.4675271179459015,
"grad_norm": 0.5055533647537231,
"learning_rate": 0.0001702075702075702,
"loss": 0.6512,
"step": 1336
},
{
"epoch": 1.4686255663874777,
"grad_norm": 0.40702182054519653,
"learning_rate": 0.00017008547008547006,
"loss": 0.8833,
"step": 1337
},
{
"epoch": 1.469724014829054,
"grad_norm": 0.3574135899543762,
"learning_rate": 0.00016996336996336994,
"loss": 0.7127,
"step": 1338
},
{
"epoch": 1.4708224632706302,
"grad_norm": 0.45641472935676575,
"learning_rate": 0.00016984126984126983,
"loss": 0.7258,
"step": 1339
},
{
"epoch": 1.4719209117122065,
"grad_norm": 1.5012352466583252,
"learning_rate": 0.0001697191697191697,
"loss": 0.8065,
"step": 1340
},
{
"epoch": 1.4730193601537827,
"grad_norm": 0.5025885701179504,
"learning_rate": 0.00016959706959706957,
"loss": 0.9377,
"step": 1341
},
{
"epoch": 1.474117808595359,
"grad_norm": 0.2942202687263489,
"learning_rate": 0.00016947496947496946,
"loss": 0.5693,
"step": 1342
},
{
"epoch": 1.4752162570369354,
"grad_norm": 0.48770126700401306,
"learning_rate": 0.00016935286935286932,
"loss": 0.5483,
"step": 1343
},
{
"epoch": 1.4763147054785115,
"grad_norm": 0.3853349983692169,
"learning_rate": 0.0001692307692307692,
"loss": 0.5787,
"step": 1344
},
{
"epoch": 1.4774131539200879,
"grad_norm": 0.3593169152736664,
"learning_rate": 0.00016910866910866911,
"loss": 0.6426,
"step": 1345
},
{
"epoch": 1.4785116023616642,
"grad_norm": 0.5932713150978088,
"learning_rate": 0.00016898656898656897,
"loss": 0.7543,
"step": 1346
},
{
"epoch": 1.4796100508032404,
"grad_norm": 0.43406638503074646,
"learning_rate": 0.00016886446886446886,
"loss": 0.7868,
"step": 1347
},
{
"epoch": 1.4807084992448167,
"grad_norm": 0.38596048951148987,
"learning_rate": 0.00016874236874236874,
"loss": 0.49,
"step": 1348
},
{
"epoch": 1.481806947686393,
"grad_norm": 0.42844533920288086,
"learning_rate": 0.0001686202686202686,
"loss": 0.6485,
"step": 1349
},
{
"epoch": 1.4829053961279692,
"grad_norm": 0.5165280103683472,
"learning_rate": 0.0001684981684981685,
"loss": 0.6924,
"step": 1350
},
{
"epoch": 1.4840038445695456,
"grad_norm": 0.5717988610267639,
"learning_rate": 0.00016837606837606837,
"loss": 0.5624,
"step": 1351
},
{
"epoch": 1.4851022930111217,
"grad_norm": 0.4384293556213379,
"learning_rate": 0.00016825396825396823,
"loss": 0.7895,
"step": 1352
},
{
"epoch": 1.486200741452698,
"grad_norm": 0.5472243428230286,
"learning_rate": 0.00016813186813186812,
"loss": 0.8838,
"step": 1353
},
{
"epoch": 1.4872991898942742,
"grad_norm": 0.3903232216835022,
"learning_rate": 0.000168009768009768,
"loss": 0.5452,
"step": 1354
},
{
"epoch": 1.4883976383358506,
"grad_norm": 0.3799583613872528,
"learning_rate": 0.00016788766788766786,
"loss": 0.8931,
"step": 1355
},
{
"epoch": 1.489496086777427,
"grad_norm": 0.4481349289417267,
"learning_rate": 0.00016776556776556775,
"loss": 0.5956,
"step": 1356
},
{
"epoch": 1.490594535219003,
"grad_norm": 0.45875266194343567,
"learning_rate": 0.00016764346764346763,
"loss": 0.4729,
"step": 1357
},
{
"epoch": 1.4916929836605795,
"grad_norm": 0.494112104177475,
"learning_rate": 0.0001675213675213675,
"loss": 0.6416,
"step": 1358
},
{
"epoch": 1.4927914321021558,
"grad_norm": 0.3976772725582123,
"learning_rate": 0.0001673992673992674,
"loss": 0.6601,
"step": 1359
},
{
"epoch": 1.493889880543732,
"grad_norm": 0.29009610414505005,
"learning_rate": 0.0001672771672771673,
"loss": 0.4261,
"step": 1360
},
{
"epoch": 1.4949883289853083,
"grad_norm": 0.5540419816970825,
"learning_rate": 0.00016715506715506715,
"loss": 0.8206,
"step": 1361
},
{
"epoch": 1.4960867774268845,
"grad_norm": 0.41308313608169556,
"learning_rate": 0.00016703296703296703,
"loss": 0.7862,
"step": 1362
},
{
"epoch": 1.4971852258684608,
"grad_norm": 0.6565150618553162,
"learning_rate": 0.0001669108669108669,
"loss": 0.6963,
"step": 1363
},
{
"epoch": 1.498283674310037,
"grad_norm": 0.4901321530342102,
"learning_rate": 0.00016678876678876678,
"loss": 0.7063,
"step": 1364
},
{
"epoch": 1.4993821227516133,
"grad_norm": 0.4676086902618408,
"learning_rate": 0.00016666666666666666,
"loss": 0.5142,
"step": 1365
},
{
"epoch": 1.5004805711931897,
"grad_norm": 0.4745628833770752,
"learning_rate": 0.00016654456654456652,
"loss": 0.7659,
"step": 1366
},
{
"epoch": 1.5015790196347658,
"grad_norm": 0.42693057656288147,
"learning_rate": 0.0001664224664224664,
"loss": 0.9233,
"step": 1367
},
{
"epoch": 1.5026774680763422,
"grad_norm": 0.4110391139984131,
"learning_rate": 0.0001663003663003663,
"loss": 0.5062,
"step": 1368
},
{
"epoch": 1.5037759165179185,
"grad_norm": 0.3090996742248535,
"learning_rate": 0.00016617826617826615,
"loss": 0.4462,
"step": 1369
},
{
"epoch": 1.5048743649594947,
"grad_norm": 0.42027410864830017,
"learning_rate": 0.00016605616605616603,
"loss": 0.8589,
"step": 1370
},
{
"epoch": 1.505972813401071,
"grad_norm": 0.38396796584129333,
"learning_rate": 0.00016593406593406592,
"loss": 0.6609,
"step": 1371
},
{
"epoch": 1.5070712618426474,
"grad_norm": 0.5236012935638428,
"learning_rate": 0.00016581196581196578,
"loss": 0.6506,
"step": 1372
},
{
"epoch": 1.5081697102842235,
"grad_norm": 0.7232113480567932,
"learning_rate": 0.00016568986568986566,
"loss": 0.6689,
"step": 1373
},
{
"epoch": 1.5092681587257997,
"grad_norm": 0.4777502417564392,
"learning_rate": 0.00016556776556776558,
"loss": 0.5701,
"step": 1374
},
{
"epoch": 1.510366607167376,
"grad_norm": 0.39154767990112305,
"learning_rate": 0.0001654456654456654,
"loss": 0.4906,
"step": 1375
},
{
"epoch": 1.5114650556089524,
"grad_norm": 0.469382107257843,
"learning_rate": 0.00016532356532356532,
"loss": 0.5768,
"step": 1376
},
{
"epoch": 1.5125635040505285,
"grad_norm": 0.3485945761203766,
"learning_rate": 0.0001652014652014652,
"loss": 0.7814,
"step": 1377
},
{
"epoch": 1.513661952492105,
"grad_norm": 0.4375949203968048,
"learning_rate": 0.00016507936507936506,
"loss": 0.6328,
"step": 1378
},
{
"epoch": 1.5147604009336813,
"grad_norm": 0.47778064012527466,
"learning_rate": 0.00016495726495726495,
"loss": 0.635,
"step": 1379
},
{
"epoch": 1.5158588493752574,
"grad_norm": 0.3515126705169678,
"learning_rate": 0.00016483516483516484,
"loss": 0.7014,
"step": 1380
},
{
"epoch": 1.5169572978168337,
"grad_norm": 0.3710018992424011,
"learning_rate": 0.0001647130647130647,
"loss": 0.7903,
"step": 1381
},
{
"epoch": 1.51805574625841,
"grad_norm": 0.37630394101142883,
"learning_rate": 0.00016459096459096458,
"loss": 0.5446,
"step": 1382
},
{
"epoch": 1.5191541946999862,
"grad_norm": 0.4312807321548462,
"learning_rate": 0.00016446886446886446,
"loss": 0.6101,
"step": 1383
},
{
"epoch": 1.5202526431415624,
"grad_norm": 0.399384468793869,
"learning_rate": 0.00016434676434676432,
"loss": 0.5734,
"step": 1384
},
{
"epoch": 1.521351091583139,
"grad_norm": 0.41233471035957336,
"learning_rate": 0.0001642246642246642,
"loss": 0.6525,
"step": 1385
},
{
"epoch": 1.522449540024715,
"grad_norm": 0.5215228199958801,
"learning_rate": 0.0001641025641025641,
"loss": 0.4804,
"step": 1386
},
{
"epoch": 1.5235479884662912,
"grad_norm": 0.42069393396377563,
"learning_rate": 0.00016398046398046395,
"loss": 0.5517,
"step": 1387
},
{
"epoch": 1.5246464369078676,
"grad_norm": 1.7902978658676147,
"learning_rate": 0.00016385836385836384,
"loss": 0.6295,
"step": 1388
},
{
"epoch": 1.525744885349444,
"grad_norm": 0.7353507280349731,
"learning_rate": 0.0001637362637362637,
"loss": 1.0585,
"step": 1389
},
{
"epoch": 1.52684333379102,
"grad_norm": 0.45992404222488403,
"learning_rate": 0.0001636141636141636,
"loss": 0.7671,
"step": 1390
},
{
"epoch": 1.5279417822325965,
"grad_norm": 0.3927334249019623,
"learning_rate": 0.0001634920634920635,
"loss": 0.7479,
"step": 1391
},
{
"epoch": 1.5290402306741728,
"grad_norm": 0.32833003997802734,
"learning_rate": 0.00016336996336996335,
"loss": 0.5774,
"step": 1392
},
{
"epoch": 1.530138679115749,
"grad_norm": 0.4306529462337494,
"learning_rate": 0.00016324786324786324,
"loss": 0.6317,
"step": 1393
},
{
"epoch": 1.5312371275573253,
"grad_norm": 0.5411052703857422,
"learning_rate": 0.00016312576312576312,
"loss": 0.6637,
"step": 1394
},
{
"epoch": 1.5323355759989017,
"grad_norm": 0.633800745010376,
"learning_rate": 0.00016300366300366298,
"loss": 0.7145,
"step": 1395
},
{
"epoch": 1.5334340244404778,
"grad_norm": 0.6986578702926636,
"learning_rate": 0.00016288156288156287,
"loss": 0.7194,
"step": 1396
},
{
"epoch": 1.534532472882054,
"grad_norm": 0.5223686695098877,
"learning_rate": 0.00016275946275946275,
"loss": 0.7849,
"step": 1397
},
{
"epoch": 1.5356309213236303,
"grad_norm": 0.5342483520507812,
"learning_rate": 0.0001626373626373626,
"loss": 0.8885,
"step": 1398
},
{
"epoch": 1.5367293697652067,
"grad_norm": 0.5467656850814819,
"learning_rate": 0.0001625152625152625,
"loss": 0.6265,
"step": 1399
},
{
"epoch": 1.5378278182067828,
"grad_norm": 0.4483658969402313,
"learning_rate": 0.00016239316239316238,
"loss": 0.7133,
"step": 1400
},
{
"epoch": 1.5389262666483592,
"grad_norm": 0.5714216232299805,
"learning_rate": 0.00016227106227106224,
"loss": 0.5212,
"step": 1401
},
{
"epoch": 1.5400247150899355,
"grad_norm": 0.5487145781517029,
"learning_rate": 0.00016214896214896213,
"loss": 0.6276,
"step": 1402
},
{
"epoch": 1.5411231635315117,
"grad_norm": 0.3687078654766083,
"learning_rate": 0.00016202686202686204,
"loss": 0.7512,
"step": 1403
},
{
"epoch": 1.542221611973088,
"grad_norm": 0.3596762418746948,
"learning_rate": 0.00016190476190476187,
"loss": 0.7192,
"step": 1404
},
{
"epoch": 1.5433200604146644,
"grad_norm": 0.4092305898666382,
"learning_rate": 0.00016178266178266178,
"loss": 0.7339,
"step": 1405
},
{
"epoch": 1.5444185088562405,
"grad_norm": 0.4018193483352661,
"learning_rate": 0.00016166056166056167,
"loss": 0.7213,
"step": 1406
},
{
"epoch": 1.5455169572978167,
"grad_norm": 0.4993208646774292,
"learning_rate": 0.00016153846153846153,
"loss": 0.6362,
"step": 1407
},
{
"epoch": 1.5466154057393933,
"grad_norm": 0.3958855867385864,
"learning_rate": 0.0001614163614163614,
"loss": 0.8482,
"step": 1408
},
{
"epoch": 1.5477138541809694,
"grad_norm": 0.32689765095710754,
"learning_rate": 0.0001612942612942613,
"loss": 0.6583,
"step": 1409
},
{
"epoch": 1.5488123026225455,
"grad_norm": 0.48947611451148987,
"learning_rate": 0.00016117216117216116,
"loss": 0.6707,
"step": 1410
},
{
"epoch": 1.549910751064122,
"grad_norm": 0.3446139395236969,
"learning_rate": 0.00016105006105006104,
"loss": 0.8914,
"step": 1411
},
{
"epoch": 1.5510091995056983,
"grad_norm": 0.585746705532074,
"learning_rate": 0.0001609279609279609,
"loss": 0.5413,
"step": 1412
},
{
"epoch": 1.5521076479472744,
"grad_norm": 0.6561328172683716,
"learning_rate": 0.00016080586080586079,
"loss": 0.3728,
"step": 1413
},
{
"epoch": 1.5532060963888508,
"grad_norm": 0.47158828377723694,
"learning_rate": 0.00016068376068376067,
"loss": 0.6525,
"step": 1414
},
{
"epoch": 1.5543045448304271,
"grad_norm": 0.3676914572715759,
"learning_rate": 0.00016056166056166053,
"loss": 0.7395,
"step": 1415
},
{
"epoch": 1.5554029932720033,
"grad_norm": 0.608076810836792,
"learning_rate": 0.00016043956043956041,
"loss": 0.5289,
"step": 1416
},
{
"epoch": 1.5565014417135794,
"grad_norm": 0.44940462708473206,
"learning_rate": 0.0001603174603174603,
"loss": 0.6282,
"step": 1417
},
{
"epoch": 1.557599890155156,
"grad_norm": 0.48062869906425476,
"learning_rate": 0.00016019536019536016,
"loss": 0.7438,
"step": 1418
},
{
"epoch": 1.5586983385967321,
"grad_norm": 0.43834635615348816,
"learning_rate": 0.00016007326007326004,
"loss": 0.4248,
"step": 1419
},
{
"epoch": 1.5597967870383083,
"grad_norm": 0.5203731060028076,
"learning_rate": 0.00015995115995115996,
"loss": 0.91,
"step": 1420
},
{
"epoch": 1.5608952354798846,
"grad_norm": 0.5766960978507996,
"learning_rate": 0.00015982905982905981,
"loss": 0.7211,
"step": 1421
},
{
"epoch": 1.561993683921461,
"grad_norm": 0.3048666715621948,
"learning_rate": 0.0001597069597069597,
"loss": 0.5618,
"step": 1422
},
{
"epoch": 1.5630921323630371,
"grad_norm": 0.3916679322719574,
"learning_rate": 0.00015958485958485959,
"loss": 0.6954,
"step": 1423
},
{
"epoch": 1.5641905808046135,
"grad_norm": 0.6336612105369568,
"learning_rate": 0.00015946275946275944,
"loss": 0.6368,
"step": 1424
},
{
"epoch": 1.5652890292461898,
"grad_norm": 0.8314816355705261,
"learning_rate": 0.00015934065934065933,
"loss": 0.7633,
"step": 1425
},
{
"epoch": 1.566387477687766,
"grad_norm": 0.46973487734794617,
"learning_rate": 0.00015921855921855922,
"loss": 0.6915,
"step": 1426
},
{
"epoch": 1.5674859261293423,
"grad_norm": 0.48737633228302,
"learning_rate": 0.00015909645909645907,
"loss": 0.5346,
"step": 1427
},
{
"epoch": 1.5685843745709187,
"grad_norm": 0.548876941204071,
"learning_rate": 0.00015897435897435896,
"loss": 1.0449,
"step": 1428
},
{
"epoch": 1.5696828230124948,
"grad_norm": 0.5039654970169067,
"learning_rate": 0.00015885225885225884,
"loss": 0.9953,
"step": 1429
},
{
"epoch": 1.570781271454071,
"grad_norm": 0.7233378887176514,
"learning_rate": 0.0001587301587301587,
"loss": 0.7068,
"step": 1430
},
{
"epoch": 1.5718797198956473,
"grad_norm": 0.5767638683319092,
"learning_rate": 0.0001586080586080586,
"loss": 0.8055,
"step": 1431
},
{
"epoch": 1.5729781683372237,
"grad_norm": 0.34450021386146545,
"learning_rate": 0.00015848595848595847,
"loss": 0.726,
"step": 1432
},
{
"epoch": 1.5740766167787998,
"grad_norm": 0.8474962711334229,
"learning_rate": 0.00015836385836385833,
"loss": 0.6974,
"step": 1433
},
{
"epoch": 1.5751750652203762,
"grad_norm": 1.565746545791626,
"learning_rate": 0.00015824175824175824,
"loss": 0.7766,
"step": 1434
},
{
"epoch": 1.5762735136619526,
"grad_norm": 0.4393616020679474,
"learning_rate": 0.00015811965811965813,
"loss": 0.6071,
"step": 1435
},
{
"epoch": 1.5773719621035287,
"grad_norm": 0.5209214091300964,
"learning_rate": 0.000157997557997558,
"loss": 0.7546,
"step": 1436
},
{
"epoch": 1.578470410545105,
"grad_norm": 0.6069398522377014,
"learning_rate": 0.00015787545787545787,
"loss": 0.7322,
"step": 1437
},
{
"epoch": 1.5795688589866814,
"grad_norm": 0.6168296337127686,
"learning_rate": 0.00015775335775335773,
"loss": 0.5169,
"step": 1438
},
{
"epoch": 1.5806673074282576,
"grad_norm": 0.25368016958236694,
"learning_rate": 0.00015763125763125762,
"loss": 0.4838,
"step": 1439
},
{
"epoch": 1.5817657558698337,
"grad_norm": 0.4165039360523224,
"learning_rate": 0.0001575091575091575,
"loss": 1.0135,
"step": 1440
},
{
"epoch": 1.5828642043114103,
"grad_norm": 0.4596197307109833,
"learning_rate": 0.00015738705738705736,
"loss": 0.5545,
"step": 1441
},
{
"epoch": 1.5839626527529864,
"grad_norm": 0.5077592730522156,
"learning_rate": 0.00015726495726495725,
"loss": 0.7754,
"step": 1442
},
{
"epoch": 1.5850611011945626,
"grad_norm": 0.5041285157203674,
"learning_rate": 0.00015714285714285713,
"loss": 0.8384,
"step": 1443
},
{
"epoch": 1.586159549636139,
"grad_norm": 0.40924420952796936,
"learning_rate": 0.000157020757020757,
"loss": 0.5511,
"step": 1444
},
{
"epoch": 1.5872579980777153,
"grad_norm": 0.4800551235675812,
"learning_rate": 0.00015689865689865688,
"loss": 0.6154,
"step": 1445
},
{
"epoch": 1.5883564465192914,
"grad_norm": 0.433174729347229,
"learning_rate": 0.00015677655677655676,
"loss": 0.6158,
"step": 1446
},
{
"epoch": 1.5894548949608678,
"grad_norm": 0.29649895429611206,
"learning_rate": 0.00015665445665445662,
"loss": 0.5729,
"step": 1447
},
{
"epoch": 1.5905533434024441,
"grad_norm": 0.3815969228744507,
"learning_rate": 0.0001565323565323565,
"loss": 0.6748,
"step": 1448
},
{
"epoch": 1.5916517918440203,
"grad_norm": 0.4933919608592987,
"learning_rate": 0.00015641025641025642,
"loss": 0.7683,
"step": 1449
},
{
"epoch": 1.5927502402855966,
"grad_norm": 0.5053071975708008,
"learning_rate": 0.00015628815628815625,
"loss": 0.6779,
"step": 1450
},
{
"epoch": 1.593848688727173,
"grad_norm": 0.3900013566017151,
"learning_rate": 0.00015616605616605616,
"loss": 0.6326,
"step": 1451
},
{
"epoch": 1.5949471371687491,
"grad_norm": 0.5823982357978821,
"learning_rate": 0.00015604395604395605,
"loss": 0.6104,
"step": 1452
},
{
"epoch": 1.5960455856103253,
"grad_norm": 0.5277792811393738,
"learning_rate": 0.0001559218559218559,
"loss": 0.6647,
"step": 1453
},
{
"epoch": 1.5971440340519016,
"grad_norm": 0.32926440238952637,
"learning_rate": 0.0001557997557997558,
"loss": 0.6064,
"step": 1454
},
{
"epoch": 1.598242482493478,
"grad_norm": 0.7350378036499023,
"learning_rate": 0.00015567765567765568,
"loss": 0.7951,
"step": 1455
},
{
"epoch": 1.5993409309350541,
"grad_norm": 0.4125807285308838,
"learning_rate": 0.00015555555555555554,
"loss": 0.7761,
"step": 1456
},
{
"epoch": 1.6004393793766305,
"grad_norm": 0.49707722663879395,
"learning_rate": 0.00015543345543345542,
"loss": 0.7299,
"step": 1457
},
{
"epoch": 1.6015378278182069,
"grad_norm": 0.3240358829498291,
"learning_rate": 0.0001553113553113553,
"loss": 0.4832,
"step": 1458
},
{
"epoch": 1.602636276259783,
"grad_norm": 0.44430434703826904,
"learning_rate": 0.00015518925518925517,
"loss": 0.5968,
"step": 1459
},
{
"epoch": 1.6037347247013594,
"grad_norm": 0.3702992796897888,
"learning_rate": 0.00015506715506715505,
"loss": 0.7177,
"step": 1460
},
{
"epoch": 1.6048331731429357,
"grad_norm": 0.5001052618026733,
"learning_rate": 0.00015494505494505494,
"loss": 0.7448,
"step": 1461
},
{
"epoch": 1.6059316215845119,
"grad_norm": 0.45969969034194946,
"learning_rate": 0.0001548229548229548,
"loss": 0.8292,
"step": 1462
},
{
"epoch": 1.607030070026088,
"grad_norm": 0.46075674891471863,
"learning_rate": 0.00015470085470085468,
"loss": 0.5624,
"step": 1463
},
{
"epoch": 1.6081285184676646,
"grad_norm": 2.077080488204956,
"learning_rate": 0.00015457875457875454,
"loss": 0.6643,
"step": 1464
},
{
"epoch": 1.6092269669092407,
"grad_norm": 0.46008172631263733,
"learning_rate": 0.00015445665445665445,
"loss": 0.6329,
"step": 1465
},
{
"epoch": 1.6103254153508169,
"grad_norm": 0.5016405582427979,
"learning_rate": 0.00015433455433455434,
"loss": 0.7692,
"step": 1466
},
{
"epoch": 1.6114238637923932,
"grad_norm": 0.46292269229888916,
"learning_rate": 0.0001542124542124542,
"loss": 0.6485,
"step": 1467
},
{
"epoch": 1.6125223122339696,
"grad_norm": 0.4498538672924042,
"learning_rate": 0.00015409035409035408,
"loss": 0.598,
"step": 1468
},
{
"epoch": 1.6136207606755457,
"grad_norm": 0.3537295162677765,
"learning_rate": 0.00015396825396825397,
"loss": 0.6356,
"step": 1469
},
{
"epoch": 1.614719209117122,
"grad_norm": 0.9966747164726257,
"learning_rate": 0.00015384615384615382,
"loss": 0.6627,
"step": 1470
},
{
"epoch": 1.6158176575586984,
"grad_norm": 0.9386951327323914,
"learning_rate": 0.0001537240537240537,
"loss": 0.8148,
"step": 1471
},
{
"epoch": 1.6169161060002746,
"grad_norm": 0.3452979028224945,
"learning_rate": 0.0001536019536019536,
"loss": 0.5778,
"step": 1472
},
{
"epoch": 1.618014554441851,
"grad_norm": 0.3443523049354553,
"learning_rate": 0.00015347985347985345,
"loss": 0.9228,
"step": 1473
},
{
"epoch": 1.6191130028834273,
"grad_norm": 0.5345872044563293,
"learning_rate": 0.00015335775335775334,
"loss": 0.4682,
"step": 1474
},
{
"epoch": 1.6202114513250034,
"grad_norm": 0.35112351179122925,
"learning_rate": 0.00015323565323565322,
"loss": 0.5482,
"step": 1475
},
{
"epoch": 1.6213098997665796,
"grad_norm": 0.39090535044670105,
"learning_rate": 0.00015311355311355308,
"loss": 0.825,
"step": 1476
},
{
"epoch": 1.622408348208156,
"grad_norm": 1.1684538125991821,
"learning_rate": 0.00015299145299145297,
"loss": 0.6561,
"step": 1477
},
{
"epoch": 1.6235067966497323,
"grad_norm": 0.4006233513355255,
"learning_rate": 0.00015286935286935288,
"loss": 0.3647,
"step": 1478
},
{
"epoch": 1.6246052450913084,
"grad_norm": 0.30577126145362854,
"learning_rate": 0.0001527472527472527,
"loss": 0.4934,
"step": 1479
},
{
"epoch": 1.6257036935328848,
"grad_norm": 0.39927995204925537,
"learning_rate": 0.00015262515262515263,
"loss": 0.6028,
"step": 1480
},
{
"epoch": 1.6268021419744612,
"grad_norm": 0.49143150448799133,
"learning_rate": 0.0001525030525030525,
"loss": 0.4595,
"step": 1481
},
{
"epoch": 1.6279005904160373,
"grad_norm": 0.8603225946426392,
"learning_rate": 0.00015238095238095237,
"loss": 0.8617,
"step": 1482
},
{
"epoch": 1.6289990388576137,
"grad_norm": 0.534269392490387,
"learning_rate": 0.00015225885225885225,
"loss": 0.6648,
"step": 1483
},
{
"epoch": 1.63009748729919,
"grad_norm": 0.4987354278564453,
"learning_rate": 0.00015213675213675214,
"loss": 0.5908,
"step": 1484
},
{
"epoch": 1.6311959357407662,
"grad_norm": 0.5739774107933044,
"learning_rate": 0.000152014652014652,
"loss": 0.7652,
"step": 1485
},
{
"epoch": 1.6322943841823423,
"grad_norm": 0.5343801975250244,
"learning_rate": 0.00015189255189255188,
"loss": 0.6864,
"step": 1486
},
{
"epoch": 1.6333928326239189,
"grad_norm": 0.45683905482292175,
"learning_rate": 0.00015177045177045177,
"loss": 0.7179,
"step": 1487
},
{
"epoch": 1.634491281065495,
"grad_norm": 0.5020450949668884,
"learning_rate": 0.00015164835164835163,
"loss": 0.4356,
"step": 1488
},
{
"epoch": 1.6355897295070712,
"grad_norm": 0.3870914876461029,
"learning_rate": 0.0001515262515262515,
"loss": 0.692,
"step": 1489
},
{
"epoch": 1.6366881779486475,
"grad_norm": 0.5256255269050598,
"learning_rate": 0.00015140415140415137,
"loss": 0.7184,
"step": 1490
},
{
"epoch": 1.6377866263902239,
"grad_norm": 0.27588197588920593,
"learning_rate": 0.00015128205128205126,
"loss": 0.6928,
"step": 1491
},
{
"epoch": 1.6388850748318,
"grad_norm": 0.43336692452430725,
"learning_rate": 0.00015115995115995114,
"loss": 0.7357,
"step": 1492
},
{
"epoch": 1.6399835232733764,
"grad_norm": 0.7952486872673035,
"learning_rate": 0.000151037851037851,
"loss": 0.5536,
"step": 1493
},
{
"epoch": 1.6410819717149527,
"grad_norm": 3.8659090995788574,
"learning_rate": 0.00015091575091575089,
"loss": 0.6409,
"step": 1494
},
{
"epoch": 1.6421804201565289,
"grad_norm": 0.3824027478694916,
"learning_rate": 0.0001507936507936508,
"loss": 0.5988,
"step": 1495
},
{
"epoch": 1.643278868598105,
"grad_norm": 0.45106491446495056,
"learning_rate": 0.00015067155067155066,
"loss": 0.7568,
"step": 1496
},
{
"epoch": 1.6443773170396816,
"grad_norm": 0.719417154788971,
"learning_rate": 0.00015054945054945054,
"loss": 0.8191,
"step": 1497
},
{
"epoch": 1.6454757654812577,
"grad_norm": 0.4702167212963104,
"learning_rate": 0.00015042735042735043,
"loss": 0.6761,
"step": 1498
},
{
"epoch": 1.6465742139228339,
"grad_norm": 0.49441996216773987,
"learning_rate": 0.0001503052503052503,
"loss": 0.7323,
"step": 1499
},
{
"epoch": 1.6476726623644102,
"grad_norm": 0.623470664024353,
"learning_rate": 0.00015018315018315017,
"loss": 0.8384,
"step": 1500
},
{
"epoch": 1.6487711108059866,
"grad_norm": 0.5583334565162659,
"learning_rate": 0.00015006105006105006,
"loss": 0.8238,
"step": 1501
},
{
"epoch": 1.6498695592475627,
"grad_norm": 0.4803924560546875,
"learning_rate": 0.00014993894993894994,
"loss": 0.5322,
"step": 1502
},
{
"epoch": 1.650968007689139,
"grad_norm": 0.709605872631073,
"learning_rate": 0.0001498168498168498,
"loss": 0.8254,
"step": 1503
},
{
"epoch": 1.6520664561307155,
"grad_norm": 0.48047375679016113,
"learning_rate": 0.0001496947496947497,
"loss": 0.5263,
"step": 1504
},
{
"epoch": 1.6531649045722916,
"grad_norm": 0.41796261072158813,
"learning_rate": 0.00014957264957264957,
"loss": 0.5803,
"step": 1505
},
{
"epoch": 1.654263353013868,
"grad_norm": 0.7576707601547241,
"learning_rate": 0.00014945054945054943,
"loss": 0.545,
"step": 1506
},
{
"epoch": 1.6553618014554443,
"grad_norm": 0.4668630063533783,
"learning_rate": 0.00014932844932844932,
"loss": 0.6213,
"step": 1507
},
{
"epoch": 1.6564602498970205,
"grad_norm": 0.9730806350708008,
"learning_rate": 0.00014920634920634917,
"loss": 0.5415,
"step": 1508
},
{
"epoch": 1.6575586983385966,
"grad_norm": 0.39670151472091675,
"learning_rate": 0.0001490842490842491,
"loss": 0.7931,
"step": 1509
},
{
"epoch": 1.658657146780173,
"grad_norm": 0.6003556847572327,
"learning_rate": 0.00014896214896214895,
"loss": 0.7494,
"step": 1510
},
{
"epoch": 1.6597555952217493,
"grad_norm": 0.4335152506828308,
"learning_rate": 0.00014884004884004883,
"loss": 0.7003,
"step": 1511
},
{
"epoch": 1.6608540436633255,
"grad_norm": 0.34025630354881287,
"learning_rate": 0.00014871794871794872,
"loss": 0.9012,
"step": 1512
},
{
"epoch": 1.6619524921049018,
"grad_norm": 0.403934508562088,
"learning_rate": 0.00014859584859584858,
"loss": 0.717,
"step": 1513
},
{
"epoch": 1.6630509405464782,
"grad_norm": 0.45691147446632385,
"learning_rate": 0.00014847374847374846,
"loss": 0.4833,
"step": 1514
},
{
"epoch": 1.6641493889880543,
"grad_norm": 0.42266151309013367,
"learning_rate": 0.00014835164835164835,
"loss": 0.5892,
"step": 1515
},
{
"epoch": 1.6652478374296307,
"grad_norm": 0.392337441444397,
"learning_rate": 0.0001482295482295482,
"loss": 0.7748,
"step": 1516
},
{
"epoch": 1.666346285871207,
"grad_norm": 0.352081298828125,
"learning_rate": 0.0001481074481074481,
"loss": 0.6018,
"step": 1517
},
{
"epoch": 1.6674447343127832,
"grad_norm": 0.46293389797210693,
"learning_rate": 0.00014798534798534798,
"loss": 0.4696,
"step": 1518
},
{
"epoch": 1.6685431827543593,
"grad_norm": 0.6427372097969055,
"learning_rate": 0.00014786324786324786,
"loss": 0.7279,
"step": 1519
},
{
"epoch": 1.669641631195936,
"grad_norm": 0.500382125377655,
"learning_rate": 0.00014774114774114772,
"loss": 0.7395,
"step": 1520
},
{
"epoch": 1.670740079637512,
"grad_norm": 0.4410606920719147,
"learning_rate": 0.0001476190476190476,
"loss": 0.501,
"step": 1521
},
{
"epoch": 1.6718385280790882,
"grad_norm": 0.5587645769119263,
"learning_rate": 0.0001474969474969475,
"loss": 0.8655,
"step": 1522
},
{
"epoch": 1.6729369765206645,
"grad_norm": 0.4312286376953125,
"learning_rate": 0.00014737484737484735,
"loss": 0.9578,
"step": 1523
},
{
"epoch": 1.674035424962241,
"grad_norm": 0.48694175481796265,
"learning_rate": 0.00014725274725274723,
"loss": 0.6806,
"step": 1524
},
{
"epoch": 1.675133873403817,
"grad_norm": 0.39892563223838806,
"learning_rate": 0.00014713064713064712,
"loss": 0.598,
"step": 1525
},
{
"epoch": 1.6762323218453934,
"grad_norm": 0.4714735150337219,
"learning_rate": 0.000147008547008547,
"loss": 0.9637,
"step": 1526
},
{
"epoch": 1.6773307702869698,
"grad_norm": 0.8308823108673096,
"learning_rate": 0.00014688644688644686,
"loss": 0.7886,
"step": 1527
},
{
"epoch": 1.678429218728546,
"grad_norm": 0.5142358541488647,
"learning_rate": 0.00014676434676434675,
"loss": 0.8028,
"step": 1528
},
{
"epoch": 1.6795276671701223,
"grad_norm": 0.4001234471797943,
"learning_rate": 0.00014664224664224663,
"loss": 0.59,
"step": 1529
},
{
"epoch": 1.6806261156116986,
"grad_norm": 0.4112735688686371,
"learning_rate": 0.0001465201465201465,
"loss": 0.6523,
"step": 1530
},
{
"epoch": 1.6817245640532748,
"grad_norm": 0.4391016960144043,
"learning_rate": 0.0001463980463980464,
"loss": 0.7372,
"step": 1531
},
{
"epoch": 1.682823012494851,
"grad_norm": 0.7199782133102417,
"learning_rate": 0.00014627594627594626,
"loss": 0.8493,
"step": 1532
},
{
"epoch": 1.6839214609364273,
"grad_norm": 0.42379269003868103,
"learning_rate": 0.00014615384615384615,
"loss": 0.6609,
"step": 1533
},
{
"epoch": 1.6850199093780036,
"grad_norm": 0.41174909472465515,
"learning_rate": 0.000146031746031746,
"loss": 0.7021,
"step": 1534
},
{
"epoch": 1.6861183578195797,
"grad_norm": 0.4856640100479126,
"learning_rate": 0.0001459096459096459,
"loss": 0.6055,
"step": 1535
},
{
"epoch": 1.687216806261156,
"grad_norm": 0.5789656043052673,
"learning_rate": 0.00014578754578754578,
"loss": 0.7003,
"step": 1536
},
{
"epoch": 1.6883152547027325,
"grad_norm": 0.5711427330970764,
"learning_rate": 0.00014566544566544564,
"loss": 0.5762,
"step": 1537
},
{
"epoch": 1.6894137031443086,
"grad_norm": 0.3285518288612366,
"learning_rate": 0.00014554334554334552,
"loss": 0.6232,
"step": 1538
},
{
"epoch": 1.690512151585885,
"grad_norm": 0.48425230383872986,
"learning_rate": 0.0001454212454212454,
"loss": 0.5515,
"step": 1539
},
{
"epoch": 1.6916106000274613,
"grad_norm": 0.573079526424408,
"learning_rate": 0.0001452991452991453,
"loss": 0.7776,
"step": 1540
},
{
"epoch": 1.6927090484690375,
"grad_norm": 0.49084943532943726,
"learning_rate": 0.00014517704517704518,
"loss": 0.6504,
"step": 1541
},
{
"epoch": 1.6938074969106136,
"grad_norm": 0.46472617983818054,
"learning_rate": 0.00014505494505494504,
"loss": 0.6971,
"step": 1542
},
{
"epoch": 1.6949059453521902,
"grad_norm": 0.4890255033969879,
"learning_rate": 0.00014493284493284492,
"loss": 0.9292,
"step": 1543
},
{
"epoch": 1.6960043937937663,
"grad_norm": 0.42868301272392273,
"learning_rate": 0.0001448107448107448,
"loss": 0.6024,
"step": 1544
},
{
"epoch": 1.6971028422353425,
"grad_norm": 0.5118973255157471,
"learning_rate": 0.00014468864468864467,
"loss": 0.7598,
"step": 1545
},
{
"epoch": 1.6982012906769188,
"grad_norm": 0.40809181332588196,
"learning_rate": 0.00014456654456654455,
"loss": 0.5157,
"step": 1546
},
{
"epoch": 1.6992997391184952,
"grad_norm": 0.5236404538154602,
"learning_rate": 0.0001444444444444444,
"loss": 0.84,
"step": 1547
},
{
"epoch": 1.7003981875600713,
"grad_norm": 0.5712966322898865,
"learning_rate": 0.00014432234432234432,
"loss": 0.7208,
"step": 1548
},
{
"epoch": 1.7014966360016477,
"grad_norm": 0.2910475730895996,
"learning_rate": 0.00014420024420024418,
"loss": 0.4998,
"step": 1549
},
{
"epoch": 1.702595084443224,
"grad_norm": 0.5326736569404602,
"learning_rate": 0.00014407814407814407,
"loss": 0.5492,
"step": 1550
},
{
"epoch": 1.7036935328848002,
"grad_norm": 0.5454451441764832,
"learning_rate": 0.00014395604395604395,
"loss": 0.9016,
"step": 1551
},
{
"epoch": 1.7047919813263763,
"grad_norm": 0.45031625032424927,
"learning_rate": 0.0001438339438339438,
"loss": 0.671,
"step": 1552
},
{
"epoch": 1.705890429767953,
"grad_norm": 0.5496229529380798,
"learning_rate": 0.0001437118437118437,
"loss": 0.6333,
"step": 1553
},
{
"epoch": 1.706988878209529,
"grad_norm": 0.4200669825077057,
"learning_rate": 0.00014358974358974358,
"loss": 0.6158,
"step": 1554
},
{
"epoch": 1.7080873266511052,
"grad_norm": 0.7623536586761475,
"learning_rate": 0.00014346764346764347,
"loss": 0.686,
"step": 1555
},
{
"epoch": 1.7091857750926815,
"grad_norm": 0.3363445997238159,
"learning_rate": 0.00014334554334554333,
"loss": 0.305,
"step": 1556
},
{
"epoch": 1.710284223534258,
"grad_norm": 0.5042807459831238,
"learning_rate": 0.0001432234432234432,
"loss": 0.72,
"step": 1557
},
{
"epoch": 1.711382671975834,
"grad_norm": 0.5264353156089783,
"learning_rate": 0.0001431013431013431,
"loss": 0.6778,
"step": 1558
},
{
"epoch": 1.7124811204174104,
"grad_norm": 0.48960715532302856,
"learning_rate": 0.00014297924297924296,
"loss": 0.4935,
"step": 1559
},
{
"epoch": 1.7135795688589868,
"grad_norm": 0.4308861792087555,
"learning_rate": 0.00014285714285714284,
"loss": 0.6527,
"step": 1560
},
{
"epoch": 1.714678017300563,
"grad_norm": 0.42890703678131104,
"learning_rate": 0.00014273504273504273,
"loss": 0.4846,
"step": 1561
},
{
"epoch": 1.7157764657421393,
"grad_norm": 0.5222750902175903,
"learning_rate": 0.0001426129426129426,
"loss": 0.764,
"step": 1562
},
{
"epoch": 1.7168749141837156,
"grad_norm": 0.49664998054504395,
"learning_rate": 0.00014249084249084247,
"loss": 0.5728,
"step": 1563
},
{
"epoch": 1.7179733626252918,
"grad_norm": 0.3131520748138428,
"learning_rate": 0.00014236874236874236,
"loss": 0.5089,
"step": 1564
},
{
"epoch": 1.719071811066868,
"grad_norm": 0.5098987221717834,
"learning_rate": 0.00014224664224664224,
"loss": 0.781,
"step": 1565
},
{
"epoch": 1.7201702595084445,
"grad_norm": 0.4040893316268921,
"learning_rate": 0.0001421245421245421,
"loss": 0.7358,
"step": 1566
},
{
"epoch": 1.7212687079500206,
"grad_norm": 0.3601396679878235,
"learning_rate": 0.00014200244200244198,
"loss": 0.5531,
"step": 1567
},
{
"epoch": 1.7223671563915968,
"grad_norm": 0.6634377837181091,
"learning_rate": 0.00014188034188034187,
"loss": 0.6548,
"step": 1568
},
{
"epoch": 1.7234656048331731,
"grad_norm": 0.35935553908348083,
"learning_rate": 0.00014175824175824173,
"loss": 0.5653,
"step": 1569
},
{
"epoch": 1.7245640532747495,
"grad_norm": 0.4607802927494049,
"learning_rate": 0.00014163614163614164,
"loss": 0.9111,
"step": 1570
},
{
"epoch": 1.7256625017163256,
"grad_norm": 1.0116467475891113,
"learning_rate": 0.0001415140415140415,
"loss": 0.9226,
"step": 1571
},
{
"epoch": 1.726760950157902,
"grad_norm": 0.9484761953353882,
"learning_rate": 0.00014139194139194139,
"loss": 0.7536,
"step": 1572
},
{
"epoch": 1.7278593985994783,
"grad_norm": 0.3684981167316437,
"learning_rate": 0.00014126984126984124,
"loss": 0.5013,
"step": 1573
},
{
"epoch": 1.7289578470410545,
"grad_norm": 0.40037083625793457,
"learning_rate": 0.00014114774114774113,
"loss": 0.8069,
"step": 1574
},
{
"epoch": 1.7300562954826306,
"grad_norm": 0.42828282713890076,
"learning_rate": 0.00014102564102564101,
"loss": 0.5586,
"step": 1575
},
{
"epoch": 1.7311547439242072,
"grad_norm": 0.3461548686027527,
"learning_rate": 0.00014090354090354087,
"loss": 0.6045,
"step": 1576
},
{
"epoch": 1.7322531923657833,
"grad_norm": 0.622982919216156,
"learning_rate": 0.00014078144078144079,
"loss": 0.8943,
"step": 1577
},
{
"epoch": 1.7333516408073595,
"grad_norm": 0.3318479359149933,
"learning_rate": 0.00014065934065934064,
"loss": 0.4058,
"step": 1578
},
{
"epoch": 1.7344500892489358,
"grad_norm": 0.5178685188293457,
"learning_rate": 0.00014053724053724053,
"loss": 0.5839,
"step": 1579
},
{
"epoch": 1.7355485376905122,
"grad_norm": 0.44273868203163147,
"learning_rate": 0.00014041514041514042,
"loss": 0.5394,
"step": 1580
},
{
"epoch": 1.7366469861320883,
"grad_norm": 0.60169517993927,
"learning_rate": 0.00014029304029304027,
"loss": 0.6753,
"step": 1581
},
{
"epoch": 1.7377454345736647,
"grad_norm": 0.7691718339920044,
"learning_rate": 0.00014017094017094016,
"loss": 0.9618,
"step": 1582
},
{
"epoch": 1.738843883015241,
"grad_norm": 0.3900390565395355,
"learning_rate": 0.00014004884004884004,
"loss": 0.5809,
"step": 1583
},
{
"epoch": 1.7399423314568172,
"grad_norm": 0.6272429823875427,
"learning_rate": 0.00013992673992673993,
"loss": 0.8579,
"step": 1584
},
{
"epoch": 1.7410407798983936,
"grad_norm": 0.30017220973968506,
"learning_rate": 0.0001398046398046398,
"loss": 0.5335,
"step": 1585
},
{
"epoch": 1.74213922833997,
"grad_norm": 0.4937066435813904,
"learning_rate": 0.00013968253968253967,
"loss": 0.7941,
"step": 1586
},
{
"epoch": 1.743237676781546,
"grad_norm": 0.47317594289779663,
"learning_rate": 0.00013956043956043956,
"loss": 0.6013,
"step": 1587
},
{
"epoch": 1.7443361252231222,
"grad_norm": 1.9155733585357666,
"learning_rate": 0.00013943833943833942,
"loss": 0.6708,
"step": 1588
},
{
"epoch": 1.7454345736646986,
"grad_norm": 0.3844835162162781,
"learning_rate": 0.0001393162393162393,
"loss": 0.7176,
"step": 1589
},
{
"epoch": 1.746533022106275,
"grad_norm": 0.42810145020484924,
"learning_rate": 0.0001391941391941392,
"loss": 0.9255,
"step": 1590
},
{
"epoch": 1.747631470547851,
"grad_norm": 3.846015691757202,
"learning_rate": 0.00013907203907203905,
"loss": 0.6202,
"step": 1591
},
{
"epoch": 1.7487299189894274,
"grad_norm": 0.42783257365226746,
"learning_rate": 0.00013894993894993893,
"loss": 0.7451,
"step": 1592
},
{
"epoch": 1.7498283674310038,
"grad_norm": 0.5237023234367371,
"learning_rate": 0.00013882783882783882,
"loss": 0.7961,
"step": 1593
},
{
"epoch": 1.75092681587258,
"grad_norm": 2.5639729499816895,
"learning_rate": 0.0001387057387057387,
"loss": 0.7026,
"step": 1594
},
{
"epoch": 1.7520252643141563,
"grad_norm": 0.5686498284339905,
"learning_rate": 0.00013858363858363856,
"loss": 0.4916,
"step": 1595
},
{
"epoch": 1.7531237127557326,
"grad_norm": 0.561611533164978,
"learning_rate": 0.00013846153846153845,
"loss": 0.772,
"step": 1596
},
{
"epoch": 1.7542221611973088,
"grad_norm": 0.6220077872276306,
"learning_rate": 0.00013833943833943833,
"loss": 0.5694,
"step": 1597
},
{
"epoch": 1.755320609638885,
"grad_norm": 0.6902570724487305,
"learning_rate": 0.0001382173382173382,
"loss": 0.7963,
"step": 1598
},
{
"epoch": 1.7564190580804615,
"grad_norm": 2.0417702198028564,
"learning_rate": 0.00013809523809523808,
"loss": 0.6721,
"step": 1599
},
{
"epoch": 1.7575175065220376,
"grad_norm": 0.36764901876449585,
"learning_rate": 0.00013797313797313796,
"loss": 0.5714,
"step": 1600
},
{
"epoch": 1.7586159549636138,
"grad_norm": 0.6679022908210754,
"learning_rate": 0.00013785103785103785,
"loss": 0.7025,
"step": 1601
},
{
"epoch": 1.7597144034051901,
"grad_norm": 0.5749796628952026,
"learning_rate": 0.0001377289377289377,
"loss": 0.7381,
"step": 1602
},
{
"epoch": 1.7608128518467665,
"grad_norm": 0.9285687208175659,
"learning_rate": 0.0001376068376068376,
"loss": 0.6,
"step": 1603
},
{
"epoch": 1.7619113002883426,
"grad_norm": 0.8209772706031799,
"learning_rate": 0.00013748473748473748,
"loss": 0.5701,
"step": 1604
},
{
"epoch": 1.763009748729919,
"grad_norm": 0.7823337912559509,
"learning_rate": 0.00013736263736263734,
"loss": 0.6695,
"step": 1605
},
{
"epoch": 1.7641081971714954,
"grad_norm": 0.4885605275630951,
"learning_rate": 0.00013724053724053725,
"loss": 0.6487,
"step": 1606
},
{
"epoch": 1.7652066456130715,
"grad_norm": 0.36517488956451416,
"learning_rate": 0.0001371184371184371,
"loss": 0.5798,
"step": 1607
},
{
"epoch": 1.7663050940546479,
"grad_norm": 0.49961966276168823,
"learning_rate": 0.000136996336996337,
"loss": 0.4373,
"step": 1608
},
{
"epoch": 1.7674035424962242,
"grad_norm": 0.495263010263443,
"learning_rate": 0.00013687423687423688,
"loss": 0.5868,
"step": 1609
},
{
"epoch": 1.7685019909378004,
"grad_norm": 0.7384648323059082,
"learning_rate": 0.00013675213675213674,
"loss": 0.4957,
"step": 1610
},
{
"epoch": 1.7696004393793765,
"grad_norm": 0.465440034866333,
"learning_rate": 0.00013663003663003662,
"loss": 0.7424,
"step": 1611
},
{
"epoch": 1.7706988878209529,
"grad_norm": 0.68381667137146,
"learning_rate": 0.00013650793650793648,
"loss": 1.0421,
"step": 1612
},
{
"epoch": 1.7717973362625292,
"grad_norm": 4.455906867980957,
"learning_rate": 0.00013638583638583637,
"loss": 0.6626,
"step": 1613
},
{
"epoch": 1.7728957847041054,
"grad_norm": 0.6165801286697388,
"learning_rate": 0.00013626373626373625,
"loss": 0.6072,
"step": 1614
},
{
"epoch": 1.7739942331456817,
"grad_norm": 0.8296604156494141,
"learning_rate": 0.00013614163614163614,
"loss": 0.6507,
"step": 1615
},
{
"epoch": 1.775092681587258,
"grad_norm": 0.4678190350532532,
"learning_rate": 0.00013601953601953602,
"loss": 0.8466,
"step": 1616
},
{
"epoch": 1.7761911300288342,
"grad_norm": 1.2141482830047607,
"learning_rate": 0.00013589743589743588,
"loss": 0.513,
"step": 1617
},
{
"epoch": 1.7772895784704106,
"grad_norm": 0.4522024691104889,
"learning_rate": 0.00013577533577533577,
"loss": 0.7571,
"step": 1618
},
{
"epoch": 1.778388026911987,
"grad_norm": 2.0903220176696777,
"learning_rate": 0.00013565323565323565,
"loss": 0.7359,
"step": 1619
},
{
"epoch": 1.779486475353563,
"grad_norm": 0.5292307734489441,
"learning_rate": 0.0001355311355311355,
"loss": 0.6526,
"step": 1620
},
{
"epoch": 1.7805849237951392,
"grad_norm": 0.5047786235809326,
"learning_rate": 0.0001354090354090354,
"loss": 0.7056,
"step": 1621
},
{
"epoch": 1.7816833722367158,
"grad_norm": 0.4102507531642914,
"learning_rate": 0.00013528693528693528,
"loss": 0.8673,
"step": 1622
},
{
"epoch": 1.782781820678292,
"grad_norm": 0.471556693315506,
"learning_rate": 0.00013516483516483517,
"loss": 0.9424,
"step": 1623
},
{
"epoch": 1.783880269119868,
"grad_norm": 0.6595687866210938,
"learning_rate": 0.00013504273504273502,
"loss": 0.661,
"step": 1624
},
{
"epoch": 1.7849787175614444,
"grad_norm": 0.6221860647201538,
"learning_rate": 0.0001349206349206349,
"loss": 0.5457,
"step": 1625
},
{
"epoch": 1.7860771660030208,
"grad_norm": 0.9256211519241333,
"learning_rate": 0.0001347985347985348,
"loss": 0.9216,
"step": 1626
},
{
"epoch": 1.787175614444597,
"grad_norm": 0.31376492977142334,
"learning_rate": 0.00013467643467643465,
"loss": 0.7071,
"step": 1627
},
{
"epoch": 1.7882740628861733,
"grad_norm": 0.5313776135444641,
"learning_rate": 0.00013455433455433454,
"loss": 0.8111,
"step": 1628
},
{
"epoch": 1.7893725113277497,
"grad_norm": 0.8203330636024475,
"learning_rate": 0.00013443223443223442,
"loss": 0.5301,
"step": 1629
},
{
"epoch": 1.7904709597693258,
"grad_norm": 0.42774948477745056,
"learning_rate": 0.0001343101343101343,
"loss": 0.8359,
"step": 1630
},
{
"epoch": 1.791569408210902,
"grad_norm": 0.8165685534477234,
"learning_rate": 0.00013418803418803417,
"loss": 0.4894,
"step": 1631
},
{
"epoch": 1.7926678566524785,
"grad_norm": 0.5739139318466187,
"learning_rate": 0.00013406593406593405,
"loss": 0.7009,
"step": 1632
},
{
"epoch": 1.7937663050940547,
"grad_norm": 0.5102986097335815,
"learning_rate": 0.00013394383394383394,
"loss": 0.7174,
"step": 1633
},
{
"epoch": 1.7948647535356308,
"grad_norm": 1.1377652883529663,
"learning_rate": 0.0001338217338217338,
"loss": 0.79,
"step": 1634
},
{
"epoch": 1.7959632019772072,
"grad_norm": 0.44272491335868835,
"learning_rate": 0.00013369963369963368,
"loss": 0.6761,
"step": 1635
},
{
"epoch": 1.7970616504187835,
"grad_norm": 0.5084714889526367,
"learning_rate": 0.00013357753357753357,
"loss": 0.6848,
"step": 1636
},
{
"epoch": 1.7981600988603597,
"grad_norm": 0.752017080783844,
"learning_rate": 0.00013345543345543345,
"loss": 0.6107,
"step": 1637
},
{
"epoch": 1.799258547301936,
"grad_norm": 0.4430617690086365,
"learning_rate": 0.0001333333333333333,
"loss": 0.7639,
"step": 1638
},
{
"epoch": 1.8003569957435124,
"grad_norm": 0.8098049759864807,
"learning_rate": 0.0001332112332112332,
"loss": 0.8172,
"step": 1639
},
{
"epoch": 1.8014554441850885,
"grad_norm": 0.6817697286605835,
"learning_rate": 0.00013308913308913308,
"loss": 0.8274,
"step": 1640
},
{
"epoch": 1.8025538926266649,
"grad_norm": 0.5132669806480408,
"learning_rate": 0.00013296703296703294,
"loss": 0.6269,
"step": 1641
},
{
"epoch": 1.8036523410682412,
"grad_norm": 0.8487284183502197,
"learning_rate": 0.00013284493284493283,
"loss": 0.6734,
"step": 1642
},
{
"epoch": 1.8047507895098174,
"grad_norm": 0.7084116339683533,
"learning_rate": 0.0001327228327228327,
"loss": 0.703,
"step": 1643
},
{
"epoch": 1.8058492379513935,
"grad_norm": 0.39045432209968567,
"learning_rate": 0.00013260073260073257,
"loss": 0.5466,
"step": 1644
},
{
"epoch": 1.8069476863929699,
"grad_norm": 0.4408475160598755,
"learning_rate": 0.00013247863247863248,
"loss": 0.4998,
"step": 1645
},
{
"epoch": 1.8080461348345462,
"grad_norm": 0.41640380024909973,
"learning_rate": 0.00013235653235653234,
"loss": 0.49,
"step": 1646
},
{
"epoch": 1.8091445832761224,
"grad_norm": 0.6760729551315308,
"learning_rate": 0.00013223443223443223,
"loss": 0.4537,
"step": 1647
},
{
"epoch": 1.8102430317176987,
"grad_norm": 0.42953255772590637,
"learning_rate": 0.0001321123321123321,
"loss": 0.489,
"step": 1648
},
{
"epoch": 1.811341480159275,
"grad_norm": 0.3260825574398041,
"learning_rate": 0.00013199023199023197,
"loss": 0.6633,
"step": 1649
},
{
"epoch": 1.8124399286008512,
"grad_norm": 0.7073171138763428,
"learning_rate": 0.00013186813186813186,
"loss": 0.4953,
"step": 1650
},
{
"epoch": 1.8135383770424276,
"grad_norm": 0.36153069138526917,
"learning_rate": 0.00013174603174603172,
"loss": 0.7641,
"step": 1651
},
{
"epoch": 1.814636825484004,
"grad_norm": 0.4233636260032654,
"learning_rate": 0.00013162393162393163,
"loss": 0.7119,
"step": 1652
},
{
"epoch": 1.81573527392558,
"grad_norm": 0.5262153148651123,
"learning_rate": 0.0001315018315018315,
"loss": 0.4516,
"step": 1653
},
{
"epoch": 1.8168337223671562,
"grad_norm": 0.5263295769691467,
"learning_rate": 0.00013137973137973137,
"loss": 0.7786,
"step": 1654
},
{
"epoch": 1.8179321708087328,
"grad_norm": 0.3681116998195648,
"learning_rate": 0.00013125763125763126,
"loss": 0.5295,
"step": 1655
},
{
"epoch": 1.819030619250309,
"grad_norm": 0.5075433254241943,
"learning_rate": 0.00013113553113553112,
"loss": 0.6017,
"step": 1656
},
{
"epoch": 1.820129067691885,
"grad_norm": 0.2960616946220398,
"learning_rate": 0.000131013431013431,
"loss": 0.4951,
"step": 1657
},
{
"epoch": 1.8212275161334615,
"grad_norm": 0.4010205864906311,
"learning_rate": 0.0001308913308913309,
"loss": 0.8916,
"step": 1658
},
{
"epoch": 1.8223259645750378,
"grad_norm": 0.9112391471862793,
"learning_rate": 0.00013076923076923077,
"loss": 0.4978,
"step": 1659
},
{
"epoch": 1.823424413016614,
"grad_norm": 0.7214633226394653,
"learning_rate": 0.00013064713064713063,
"loss": 0.791,
"step": 1660
},
{
"epoch": 1.8245228614581903,
"grad_norm": 0.4174933433532715,
"learning_rate": 0.00013052503052503052,
"loss": 0.4099,
"step": 1661
},
{
"epoch": 1.8256213098997667,
"grad_norm": 0.4622137248516083,
"learning_rate": 0.0001304029304029304,
"loss": 1.1726,
"step": 1662
},
{
"epoch": 1.8267197583413428,
"grad_norm": 0.5991957783699036,
"learning_rate": 0.00013028083028083026,
"loss": 0.6713,
"step": 1663
},
{
"epoch": 1.8278182067829192,
"grad_norm": 0.43959730863571167,
"learning_rate": 0.00013015873015873015,
"loss": 0.5676,
"step": 1664
},
{
"epoch": 1.8289166552244955,
"grad_norm": 0.6271671056747437,
"learning_rate": 0.00013003663003663003,
"loss": 0.7399,
"step": 1665
},
{
"epoch": 1.8300151036660717,
"grad_norm": 0.6412084102630615,
"learning_rate": 0.0001299145299145299,
"loss": 0.7585,
"step": 1666
},
{
"epoch": 1.8311135521076478,
"grad_norm": 0.4066605269908905,
"learning_rate": 0.00012979242979242977,
"loss": 0.5756,
"step": 1667
},
{
"epoch": 1.8322120005492242,
"grad_norm": 0.3568172752857208,
"learning_rate": 0.00012967032967032966,
"loss": 0.968,
"step": 1668
},
{
"epoch": 1.8333104489908005,
"grad_norm": 0.5061100721359253,
"learning_rate": 0.00012954822954822955,
"loss": 0.5089,
"step": 1669
},
{
"epoch": 1.8344088974323767,
"grad_norm": 3.013622522354126,
"learning_rate": 0.0001294261294261294,
"loss": 0.5101,
"step": 1670
},
{
"epoch": 1.835507345873953,
"grad_norm": 0.40078219771385193,
"learning_rate": 0.0001293040293040293,
"loss": 0.5602,
"step": 1671
},
{
"epoch": 1.8366057943155294,
"grad_norm": 0.4108009338378906,
"learning_rate": 0.00012918192918192918,
"loss": 0.6338,
"step": 1672
},
{
"epoch": 1.8377042427571055,
"grad_norm": 0.5452212691307068,
"learning_rate": 0.00012905982905982903,
"loss": 0.5358,
"step": 1673
},
{
"epoch": 1.838802691198682,
"grad_norm": 0.4694603979587555,
"learning_rate": 0.00012893772893772895,
"loss": 0.7031,
"step": 1674
},
{
"epoch": 1.8399011396402583,
"grad_norm": 0.3787671625614166,
"learning_rate": 0.0001288156288156288,
"loss": 0.5667,
"step": 1675
},
{
"epoch": 1.8409995880818344,
"grad_norm": 0.4842737317085266,
"learning_rate": 0.0001286935286935287,
"loss": 0.5082,
"step": 1676
},
{
"epoch": 1.8420980365234105,
"grad_norm": 0.7690992951393127,
"learning_rate": 0.00012857142857142855,
"loss": 0.706,
"step": 1677
},
{
"epoch": 1.8431964849649871,
"grad_norm": 1.0891668796539307,
"learning_rate": 0.00012844932844932843,
"loss": 0.7162,
"step": 1678
},
{
"epoch": 1.8442949334065633,
"grad_norm": 0.4118032157421112,
"learning_rate": 0.00012832722832722832,
"loss": 0.7019,
"step": 1679
},
{
"epoch": 1.8453933818481394,
"grad_norm": 0.513157308101654,
"learning_rate": 0.00012820512820512818,
"loss": 0.4359,
"step": 1680
},
{
"epoch": 1.8464918302897158,
"grad_norm": 1.3229504823684692,
"learning_rate": 0.0001280830280830281,
"loss": 0.5555,
"step": 1681
},
{
"epoch": 1.8475902787312921,
"grad_norm": 0.6301699876785278,
"learning_rate": 0.00012796092796092795,
"loss": 0.5211,
"step": 1682
},
{
"epoch": 1.8486887271728683,
"grad_norm": 0.6125632524490356,
"learning_rate": 0.00012783882783882783,
"loss": 0.6287,
"step": 1683
},
{
"epoch": 1.8497871756144446,
"grad_norm": 1.806593418121338,
"learning_rate": 0.00012771672771672772,
"loss": 0.5794,
"step": 1684
},
{
"epoch": 1.850885624056021,
"grad_norm": 1.2972358465194702,
"learning_rate": 0.00012759462759462758,
"loss": 0.9205,
"step": 1685
},
{
"epoch": 1.8519840724975971,
"grad_norm": 1.0519033670425415,
"learning_rate": 0.00012747252747252746,
"loss": 0.7103,
"step": 1686
},
{
"epoch": 1.8530825209391735,
"grad_norm": 1.6489734649658203,
"learning_rate": 0.00012735042735042735,
"loss": 0.7585,
"step": 1687
},
{
"epoch": 1.8541809693807498,
"grad_norm": 0.7229527235031128,
"learning_rate": 0.0001272283272283272,
"loss": 0.8109,
"step": 1688
},
{
"epoch": 1.855279417822326,
"grad_norm": 0.35257261991500854,
"learning_rate": 0.0001271062271062271,
"loss": 0.8014,
"step": 1689
},
{
"epoch": 1.856377866263902,
"grad_norm": 0.4653327167034149,
"learning_rate": 0.00012698412698412698,
"loss": 0.6404,
"step": 1690
},
{
"epoch": 1.8574763147054785,
"grad_norm": 0.5230842232704163,
"learning_rate": 0.00012686202686202686,
"loss": 0.7413,
"step": 1691
},
{
"epoch": 1.8585747631470548,
"grad_norm": 0.42130210995674133,
"learning_rate": 0.00012673992673992672,
"loss": 0.7283,
"step": 1692
},
{
"epoch": 1.859673211588631,
"grad_norm": 1.4667960405349731,
"learning_rate": 0.0001266178266178266,
"loss": 0.5656,
"step": 1693
},
{
"epoch": 1.8607716600302073,
"grad_norm": 0.4077359139919281,
"learning_rate": 0.0001264957264957265,
"loss": 0.5891,
"step": 1694
},
{
"epoch": 1.8618701084717837,
"grad_norm": 0.503654956817627,
"learning_rate": 0.00012637362637362635,
"loss": 0.5912,
"step": 1695
},
{
"epoch": 1.8629685569133598,
"grad_norm": 1.6315315961837769,
"learning_rate": 0.00012625152625152624,
"loss": 0.5588,
"step": 1696
},
{
"epoch": 1.8640670053549362,
"grad_norm": 0.783920407295227,
"learning_rate": 0.00012612942612942612,
"loss": 0.6585,
"step": 1697
},
{
"epoch": 1.8651654537965126,
"grad_norm": 0.7186728715896606,
"learning_rate": 0.000126007326007326,
"loss": 0.9174,
"step": 1698
},
{
"epoch": 1.8662639022380887,
"grad_norm": 0.8784156441688538,
"learning_rate": 0.00012588522588522587,
"loss": 0.5835,
"step": 1699
},
{
"epoch": 1.8673623506796648,
"grad_norm": 0.7090787887573242,
"learning_rate": 0.00012576312576312575,
"loss": 0.7555,
"step": 1700
},
{
"epoch": 1.8684607991212414,
"grad_norm": 0.5508129596710205,
"learning_rate": 0.00012564102564102564,
"loss": 0.6168,
"step": 1701
},
{
"epoch": 1.8695592475628175,
"grad_norm": 0.40403681993484497,
"learning_rate": 0.0001255189255189255,
"loss": 0.4528,
"step": 1702
},
{
"epoch": 1.8706576960043937,
"grad_norm": 0.9553635716438293,
"learning_rate": 0.00012539682539682538,
"loss": 0.654,
"step": 1703
},
{
"epoch": 1.87175614444597,
"grad_norm": 1.0610092878341675,
"learning_rate": 0.00012527472527472527,
"loss": 0.6115,
"step": 1704
},
{
"epoch": 1.8728545928875464,
"grad_norm": 0.32898634672164917,
"learning_rate": 0.00012515262515262515,
"loss": 0.5651,
"step": 1705
},
{
"epoch": 1.8739530413291225,
"grad_norm": 0.4018780589103699,
"learning_rate": 0.000125030525030525,
"loss": 0.5919,
"step": 1706
},
{
"epoch": 1.875051489770699,
"grad_norm": 1.6521873474121094,
"learning_rate": 0.0001249084249084249,
"loss": 0.7137,
"step": 1707
},
{
"epoch": 1.8761499382122753,
"grad_norm": 0.5515930652618408,
"learning_rate": 0.00012478632478632478,
"loss": 0.4471,
"step": 1708
},
{
"epoch": 1.8772483866538514,
"grad_norm": 0.4156915545463562,
"learning_rate": 0.00012466422466422464,
"loss": 0.6575,
"step": 1709
},
{
"epoch": 1.8783468350954275,
"grad_norm": 0.41263312101364136,
"learning_rate": 0.00012454212454212453,
"loss": 0.542,
"step": 1710
},
{
"epoch": 1.8794452835370041,
"grad_norm": 1.0169517993927002,
"learning_rate": 0.0001244200244200244,
"loss": 1.1631,
"step": 1711
},
{
"epoch": 1.8805437319785803,
"grad_norm": 0.49169981479644775,
"learning_rate": 0.0001242979242979243,
"loss": 0.6707,
"step": 1712
},
{
"epoch": 1.8816421804201564,
"grad_norm": 0.44801297783851624,
"learning_rate": 0.00012417582417582416,
"loss": 1.0036,
"step": 1713
},
{
"epoch": 1.8827406288617328,
"grad_norm": 0.47181040048599243,
"learning_rate": 0.00012405372405372404,
"loss": 0.6693,
"step": 1714
},
{
"epoch": 1.8838390773033091,
"grad_norm": 0.39900457859039307,
"learning_rate": 0.00012393162393162393,
"loss": 0.6421,
"step": 1715
},
{
"epoch": 1.8849375257448853,
"grad_norm": 1.1160179376602173,
"learning_rate": 0.00012380952380952378,
"loss": 0.6599,
"step": 1716
},
{
"epoch": 1.8860359741864616,
"grad_norm": 0.6951555609703064,
"learning_rate": 0.00012368742368742367,
"loss": 0.743,
"step": 1717
},
{
"epoch": 1.887134422628038,
"grad_norm": 0.5381472706794739,
"learning_rate": 0.00012356532356532356,
"loss": 0.5051,
"step": 1718
},
{
"epoch": 1.8882328710696141,
"grad_norm": 0.48717793822288513,
"learning_rate": 0.00012344322344322341,
"loss": 0.7015,
"step": 1719
},
{
"epoch": 1.8893313195111905,
"grad_norm": 0.3720596432685852,
"learning_rate": 0.00012332112332112333,
"loss": 0.6743,
"step": 1720
},
{
"epoch": 1.8904297679527668,
"grad_norm": 1.1850451231002808,
"learning_rate": 0.00012319902319902318,
"loss": 0.6132,
"step": 1721
},
{
"epoch": 1.891528216394343,
"grad_norm": 0.4546525180339813,
"learning_rate": 0.00012307692307692307,
"loss": 0.5465,
"step": 1722
},
{
"epoch": 1.8926266648359191,
"grad_norm": 0.41415080428123474,
"learning_rate": 0.00012295482295482296,
"loss": 0.7259,
"step": 1723
},
{
"epoch": 1.8937251132774955,
"grad_norm": 0.44278842210769653,
"learning_rate": 0.00012283272283272281,
"loss": 0.7244,
"step": 1724
},
{
"epoch": 1.8948235617190718,
"grad_norm": 0.3887364864349365,
"learning_rate": 0.0001227106227106227,
"loss": 0.7124,
"step": 1725
},
{
"epoch": 1.895922010160648,
"grad_norm": 0.5405781269073486,
"learning_rate": 0.00012258852258852256,
"loss": 0.5153,
"step": 1726
},
{
"epoch": 1.8970204586022243,
"grad_norm": 0.3530559837818146,
"learning_rate": 0.00012246642246642247,
"loss": 0.5429,
"step": 1727
},
{
"epoch": 1.8981189070438007,
"grad_norm": 0.523621678352356,
"learning_rate": 0.00012234432234432233,
"loss": 0.5645,
"step": 1728
},
{
"epoch": 1.8992173554853768,
"grad_norm": 0.3893704116344452,
"learning_rate": 0.00012222222222222221,
"loss": 0.6419,
"step": 1729
},
{
"epoch": 1.9003158039269532,
"grad_norm": 0.7010704278945923,
"learning_rate": 0.0001221001221001221,
"loss": 0.5202,
"step": 1730
},
{
"epoch": 1.9014142523685296,
"grad_norm": 0.45551490783691406,
"learning_rate": 0.00012197802197802197,
"loss": 0.8492,
"step": 1731
},
{
"epoch": 1.9025127008101057,
"grad_norm": 1.0112484693527222,
"learning_rate": 0.00012185592185592184,
"loss": 0.8602,
"step": 1732
},
{
"epoch": 1.9036111492516818,
"grad_norm": 0.4509601294994354,
"learning_rate": 0.00012173382173382173,
"loss": 0.6138,
"step": 1733
},
{
"epoch": 1.9047095976932584,
"grad_norm": 0.4303388297557831,
"learning_rate": 0.0001216117216117216,
"loss": 0.4748,
"step": 1734
},
{
"epoch": 1.9058080461348346,
"grad_norm": 0.4452000558376312,
"learning_rate": 0.00012148962148962147,
"loss": 0.5869,
"step": 1735
},
{
"epoch": 1.9069064945764107,
"grad_norm": 0.5915077924728394,
"learning_rate": 0.00012136752136752136,
"loss": 0.8057,
"step": 1736
},
{
"epoch": 1.908004943017987,
"grad_norm": 0.38761547207832336,
"learning_rate": 0.00012124542124542123,
"loss": 0.5772,
"step": 1737
},
{
"epoch": 1.9091033914595634,
"grad_norm": 0.517752468585968,
"learning_rate": 0.00012112332112332112,
"loss": 0.7865,
"step": 1738
},
{
"epoch": 1.9102018399011396,
"grad_norm": 0.5325546860694885,
"learning_rate": 0.00012100122100122099,
"loss": 0.5934,
"step": 1739
},
{
"epoch": 1.911300288342716,
"grad_norm": 0.3930620551109314,
"learning_rate": 0.00012087912087912087,
"loss": 0.5974,
"step": 1740
},
{
"epoch": 1.9123987367842923,
"grad_norm": 1.1001818180084229,
"learning_rate": 0.00012075702075702075,
"loss": 0.6524,
"step": 1741
},
{
"epoch": 1.9134971852258684,
"grad_norm": 0.3690165877342224,
"learning_rate": 0.00012063492063492062,
"loss": 0.36,
"step": 1742
},
{
"epoch": 1.9145956336674448,
"grad_norm": 0.4403206408023834,
"learning_rate": 0.0001205128205128205,
"loss": 0.5737,
"step": 1743
},
{
"epoch": 1.9156940821090211,
"grad_norm": 0.651498019695282,
"learning_rate": 0.00012039072039072037,
"loss": 0.657,
"step": 1744
},
{
"epoch": 1.9167925305505973,
"grad_norm": 0.6880660057067871,
"learning_rate": 0.00012026862026862025,
"loss": 0.6891,
"step": 1745
},
{
"epoch": 1.9178909789921734,
"grad_norm": 0.4968664348125458,
"learning_rate": 0.00012014652014652015,
"loss": 0.841,
"step": 1746
},
{
"epoch": 1.9189894274337498,
"grad_norm": 0.4392407536506653,
"learning_rate": 0.00012002442002442002,
"loss": 0.7096,
"step": 1747
},
{
"epoch": 1.9200878758753261,
"grad_norm": 0.41028741002082825,
"learning_rate": 0.00011990231990231989,
"loss": 0.5838,
"step": 1748
},
{
"epoch": 1.9211863243169023,
"grad_norm": 0.7928158640861511,
"learning_rate": 0.00011978021978021978,
"loss": 0.6633,
"step": 1749
},
{
"epoch": 1.9222847727584786,
"grad_norm": 0.4970681071281433,
"learning_rate": 0.00011965811965811965,
"loss": 0.7764,
"step": 1750
},
{
"epoch": 1.923383221200055,
"grad_norm": 0.49581378698349,
"learning_rate": 0.00011953601953601952,
"loss": 0.7204,
"step": 1751
},
{
"epoch": 1.9244816696416311,
"grad_norm": 1.309241771697998,
"learning_rate": 0.00011941391941391939,
"loss": 0.5859,
"step": 1752
},
{
"epoch": 1.9255801180832075,
"grad_norm": 0.4651016592979431,
"learning_rate": 0.00011929181929181929,
"loss": 0.6425,
"step": 1753
},
{
"epoch": 1.9266785665247839,
"grad_norm": 0.5377634167671204,
"learning_rate": 0.00011916971916971916,
"loss": 0.8244,
"step": 1754
},
{
"epoch": 1.92777701496636,
"grad_norm": 0.6809287667274475,
"learning_rate": 0.00011904761904761903,
"loss": 0.5711,
"step": 1755
},
{
"epoch": 1.9288754634079361,
"grad_norm": 0.650701105594635,
"learning_rate": 0.00011892551892551892,
"loss": 0.8341,
"step": 1756
},
{
"epoch": 1.9299739118495127,
"grad_norm": 1.1710751056671143,
"learning_rate": 0.00011880341880341879,
"loss": 0.8093,
"step": 1757
},
{
"epoch": 1.9310723602910889,
"grad_norm": 0.4244484603404999,
"learning_rate": 0.00011868131868131866,
"loss": 0.5556,
"step": 1758
},
{
"epoch": 1.932170808732665,
"grad_norm": 0.43999040126800537,
"learning_rate": 0.00011855921855921855,
"loss": 0.4582,
"step": 1759
},
{
"epoch": 1.9332692571742414,
"grad_norm": 0.4197145700454712,
"learning_rate": 0.00011843711843711843,
"loss": 0.6475,
"step": 1760
},
{
"epoch": 1.9343677056158177,
"grad_norm": 0.36619749665260315,
"learning_rate": 0.0001183150183150183,
"loss": 0.5804,
"step": 1761
},
{
"epoch": 1.9354661540573939,
"grad_norm": 1.7230706214904785,
"learning_rate": 0.00011819291819291819,
"loss": 0.7064,
"step": 1762
},
{
"epoch": 1.9365646024989702,
"grad_norm": 0.7621874213218689,
"learning_rate": 0.00011807081807081806,
"loss": 0.6766,
"step": 1763
},
{
"epoch": 1.9376630509405466,
"grad_norm": 0.5920525789260864,
"learning_rate": 0.00011794871794871794,
"loss": 0.7092,
"step": 1764
},
{
"epoch": 1.9387614993821227,
"grad_norm": 1.5368432998657227,
"learning_rate": 0.00011782661782661781,
"loss": 0.3366,
"step": 1765
},
{
"epoch": 1.9398599478236989,
"grad_norm": 0.43197643756866455,
"learning_rate": 0.00011770451770451769,
"loss": 0.6158,
"step": 1766
},
{
"epoch": 1.9409583962652754,
"grad_norm": 0.4623143970966339,
"learning_rate": 0.00011758241758241756,
"loss": 0.6574,
"step": 1767
},
{
"epoch": 1.9420568447068516,
"grad_norm": 0.40638601779937744,
"learning_rate": 0.00011746031746031744,
"loss": 0.4385,
"step": 1768
},
{
"epoch": 1.9431552931484277,
"grad_norm": 0.5941652655601501,
"learning_rate": 0.00011733821733821734,
"loss": 0.8634,
"step": 1769
},
{
"epoch": 1.944253741590004,
"grad_norm": 0.9646288156509399,
"learning_rate": 0.00011721611721611721,
"loss": 0.7107,
"step": 1770
},
{
"epoch": 1.9453521900315804,
"grad_norm": 1.6859776973724365,
"learning_rate": 0.00011709401709401708,
"loss": 0.5544,
"step": 1771
},
{
"epoch": 1.9464506384731566,
"grad_norm": 0.4034999907016754,
"learning_rate": 0.00011697191697191697,
"loss": 0.559,
"step": 1772
},
{
"epoch": 1.947549086914733,
"grad_norm": 0.3644643723964691,
"learning_rate": 0.00011684981684981684,
"loss": 0.535,
"step": 1773
},
{
"epoch": 1.9486475353563093,
"grad_norm": 0.5826202034950256,
"learning_rate": 0.00011672771672771671,
"loss": 0.6405,
"step": 1774
},
{
"epoch": 1.9497459837978854,
"grad_norm": 0.5501505136489868,
"learning_rate": 0.00011660561660561661,
"loss": 0.5702,
"step": 1775
},
{
"epoch": 1.9508444322394618,
"grad_norm": 0.7928853631019592,
"learning_rate": 0.00011648351648351648,
"loss": 0.666,
"step": 1776
},
{
"epoch": 1.9519428806810382,
"grad_norm": 0.8168489933013916,
"learning_rate": 0.00011636141636141635,
"loss": 0.4451,
"step": 1777
},
{
"epoch": 1.9530413291226143,
"grad_norm": 0.3752410113811493,
"learning_rate": 0.00011623931623931622,
"loss": 0.6552,
"step": 1778
},
{
"epoch": 1.9541397775641904,
"grad_norm": 0.9020218849182129,
"learning_rate": 0.00011611721611721611,
"loss": 0.5994,
"step": 1779
},
{
"epoch": 1.9552382260057668,
"grad_norm": 0.7668479084968567,
"learning_rate": 0.00011599511599511598,
"loss": 0.5007,
"step": 1780
},
{
"epoch": 1.9563366744473432,
"grad_norm": 0.5034022331237793,
"learning_rate": 0.00011587301587301585,
"loss": 0.5211,
"step": 1781
},
{
"epoch": 1.9574351228889193,
"grad_norm": 1.0153850317001343,
"learning_rate": 0.00011575091575091575,
"loss": 0.5953,
"step": 1782
},
{
"epoch": 1.9585335713304957,
"grad_norm": 0.40088045597076416,
"learning_rate": 0.00011562881562881562,
"loss": 0.568,
"step": 1783
},
{
"epoch": 1.959632019772072,
"grad_norm": 1.4017099142074585,
"learning_rate": 0.0001155067155067155,
"loss": 0.7058,
"step": 1784
},
{
"epoch": 1.9607304682136482,
"grad_norm": 0.6009597778320312,
"learning_rate": 0.00011538461538461538,
"loss": 0.6239,
"step": 1785
},
{
"epoch": 1.9618289166552245,
"grad_norm": 0.5155071020126343,
"learning_rate": 0.00011526251526251525,
"loss": 0.6089,
"step": 1786
},
{
"epoch": 1.9629273650968009,
"grad_norm": 0.4248057007789612,
"learning_rate": 0.00011514041514041513,
"loss": 0.6481,
"step": 1787
},
{
"epoch": 1.964025813538377,
"grad_norm": 0.6521177887916565,
"learning_rate": 0.00011501831501831501,
"loss": 0.6598,
"step": 1788
},
{
"epoch": 1.9651242619799532,
"grad_norm": 0.44697993993759155,
"learning_rate": 0.00011489621489621488,
"loss": 0.8944,
"step": 1789
},
{
"epoch": 1.9662227104215297,
"grad_norm": 0.41537097096443176,
"learning_rate": 0.00011477411477411476,
"loss": 0.5304,
"step": 1790
},
{
"epoch": 1.9673211588631059,
"grad_norm": 0.48793885111808777,
"learning_rate": 0.00011465201465201464,
"loss": 0.7262,
"step": 1791
},
{
"epoch": 1.968419607304682,
"grad_norm": 0.8768893480300903,
"learning_rate": 0.00011452991452991453,
"loss": 0.6748,
"step": 1792
},
{
"epoch": 1.9695180557462584,
"grad_norm": 0.39224761724472046,
"learning_rate": 0.0001144078144078144,
"loss": 0.5503,
"step": 1793
},
{
"epoch": 1.9706165041878347,
"grad_norm": 0.5617446899414062,
"learning_rate": 0.00011428571428571427,
"loss": 0.7329,
"step": 1794
},
{
"epoch": 1.9717149526294109,
"grad_norm": 0.3787171542644501,
"learning_rate": 0.00011416361416361416,
"loss": 0.545,
"step": 1795
},
{
"epoch": 1.9728134010709872,
"grad_norm": 1.5167701244354248,
"learning_rate": 0.00011404151404151403,
"loss": 0.492,
"step": 1796
},
{
"epoch": 1.9739118495125636,
"grad_norm": 0.6436883807182312,
"learning_rate": 0.0001139194139194139,
"loss": 0.5644,
"step": 1797
},
{
"epoch": 1.9750102979541397,
"grad_norm": 0.7104658484458923,
"learning_rate": 0.0001137973137973138,
"loss": 0.7485,
"step": 1798
},
{
"epoch": 1.976108746395716,
"grad_norm": 0.7996894717216492,
"learning_rate": 0.00011367521367521367,
"loss": 0.6918,
"step": 1799
},
{
"epoch": 1.9772071948372925,
"grad_norm": 0.6419106721878052,
"learning_rate": 0.00011355311355311354,
"loss": 0.5945,
"step": 1800
},
{
"epoch": 1.9783056432788686,
"grad_norm": 0.5158131718635559,
"learning_rate": 0.00011343101343101343,
"loss": 0.6685,
"step": 1801
},
{
"epoch": 1.9794040917204447,
"grad_norm": 1.0825144052505493,
"learning_rate": 0.0001133089133089133,
"loss": 0.6774,
"step": 1802
},
{
"epoch": 1.980502540162021,
"grad_norm": 0.3999088704586029,
"learning_rate": 0.00011318681318681317,
"loss": 0.632,
"step": 1803
},
{
"epoch": 1.9816009886035975,
"grad_norm": 0.8866069316864014,
"learning_rate": 0.00011306471306471304,
"loss": 0.6541,
"step": 1804
},
{
"epoch": 1.9826994370451736,
"grad_norm": 0.3858928978443146,
"learning_rate": 0.00011294261294261294,
"loss": 0.6608,
"step": 1805
},
{
"epoch": 1.98379788548675,
"grad_norm": 0.513117790222168,
"learning_rate": 0.00011282051282051281,
"loss": 0.7598,
"step": 1806
},
{
"epoch": 1.9848963339283263,
"grad_norm": 0.3166581392288208,
"learning_rate": 0.00011269841269841269,
"loss": 0.781,
"step": 1807
},
{
"epoch": 1.9859947823699025,
"grad_norm": 0.3982362151145935,
"learning_rate": 0.00011257631257631257,
"loss": 0.873,
"step": 1808
},
{
"epoch": 1.9870932308114788,
"grad_norm": 0.3784008026123047,
"learning_rate": 0.00011245421245421244,
"loss": 0.7286,
"step": 1809
},
{
"epoch": 1.9881916792530552,
"grad_norm": 0.7578315138816833,
"learning_rate": 0.00011233211233211232,
"loss": 0.5958,
"step": 1810
},
{
"epoch": 1.9892901276946313,
"grad_norm": 0.8509061932563782,
"learning_rate": 0.0001122100122100122,
"loss": 0.557,
"step": 1811
},
{
"epoch": 1.9903885761362075,
"grad_norm": 0.5107323527336121,
"learning_rate": 0.00011208791208791207,
"loss": 0.6994,
"step": 1812
},
{
"epoch": 1.991487024577784,
"grad_norm": 0.5421388149261475,
"learning_rate": 0.00011196581196581196,
"loss": 0.8839,
"step": 1813
},
{
"epoch": 1.9925854730193602,
"grad_norm": 0.7442356944084167,
"learning_rate": 0.00011184371184371184,
"loss": 0.6676,
"step": 1814
},
{
"epoch": 1.9936839214609363,
"grad_norm": 0.34132111072540283,
"learning_rate": 0.00011172161172161172,
"loss": 0.5714,
"step": 1815
},
{
"epoch": 1.9947823699025127,
"grad_norm": 0.3995620906352997,
"learning_rate": 0.00011159951159951159,
"loss": 0.4811,
"step": 1816
},
{
"epoch": 1.995880818344089,
"grad_norm": 0.5613861083984375,
"learning_rate": 0.00011147741147741146,
"loss": 0.7495,
"step": 1817
},
{
"epoch": 1.9969792667856652,
"grad_norm": 0.4366309642791748,
"learning_rate": 0.00011135531135531135,
"loss": 0.6512,
"step": 1818
},
{
"epoch": 1.9980777152272415,
"grad_norm": 0.889916718006134,
"learning_rate": 0.00011123321123321122,
"loss": 0.5544,
"step": 1819
},
{
"epoch": 1.999176163668818,
"grad_norm": 0.512112021446228,
"learning_rate": 0.00011111111111111109,
"loss": 1.136,
"step": 1820
},
{
"epoch": 2.000274612110394,
"grad_norm": 0.5241844654083252,
"learning_rate": 0.00011098901098901099,
"loss": 0.5898,
"step": 1821
},
{
"epoch": 2.00137306055197,
"grad_norm": 0.38159477710723877,
"learning_rate": 0.00011086691086691086,
"loss": 0.5523,
"step": 1822
},
{
"epoch": 2.0024715089935468,
"grad_norm": 1.0415009260177612,
"learning_rate": 0.00011074481074481073,
"loss": 0.6963,
"step": 1823
},
{
"epoch": 2.003569957435123,
"grad_norm": 0.5349957942962646,
"learning_rate": 0.00011062271062271062,
"loss": 0.4422,
"step": 1824
},
{
"epoch": 2.004668405876699,
"grad_norm": 0.4512043297290802,
"learning_rate": 0.00011050061050061049,
"loss": 0.5467,
"step": 1825
},
{
"epoch": 2.0057668543182756,
"grad_norm": 0.8268045783042908,
"learning_rate": 0.00011037851037851036,
"loss": 0.6931,
"step": 1826
},
{
"epoch": 2.0068653027598518,
"grad_norm": 0.47922319173812866,
"learning_rate": 0.00011025641025641026,
"loss": 0.707,
"step": 1827
},
{
"epoch": 2.007963751201428,
"grad_norm": 1.352858304977417,
"learning_rate": 0.00011013431013431013,
"loss": 0.5658,
"step": 1828
},
{
"epoch": 2.0090621996430045,
"grad_norm": 0.6304643154144287,
"learning_rate": 0.00011001221001221,
"loss": 0.6526,
"step": 1829
},
{
"epoch": 2.0101606480845806,
"grad_norm": 0.3759060502052307,
"learning_rate": 0.00010989010989010988,
"loss": 0.627,
"step": 1830
},
{
"epoch": 2.0112590965261568,
"grad_norm": 0.5676531195640564,
"learning_rate": 0.00010976800976800976,
"loss": 0.7568,
"step": 1831
},
{
"epoch": 2.012357544967733,
"grad_norm": 0.7481321692466736,
"learning_rate": 0.00010964590964590963,
"loss": 0.7304,
"step": 1832
},
{
"epoch": 2.0134559934093095,
"grad_norm": 1.0350905656814575,
"learning_rate": 0.0001095238095238095,
"loss": 0.7414,
"step": 1833
},
{
"epoch": 2.0145544418508856,
"grad_norm": 0.7817292809486389,
"learning_rate": 0.00010940170940170939,
"loss": 0.7742,
"step": 1834
},
{
"epoch": 2.0156528902924618,
"grad_norm": 0.44659602642059326,
"learning_rate": 0.00010927960927960928,
"loss": 0.7872,
"step": 1835
},
{
"epoch": 2.0167513387340383,
"grad_norm": 0.46931198239326477,
"learning_rate": 0.00010915750915750915,
"loss": 0.5596,
"step": 1836
},
{
"epoch": 2.0178497871756145,
"grad_norm": 0.34634560346603394,
"learning_rate": 0.00010903540903540903,
"loss": 0.6861,
"step": 1837
},
{
"epoch": 2.0189482356171906,
"grad_norm": 0.36579200625419617,
"learning_rate": 0.0001089133089133089,
"loss": 0.6586,
"step": 1838
},
{
"epoch": 2.020046684058767,
"grad_norm": 0.9167144894599915,
"learning_rate": 0.00010879120879120878,
"loss": 0.7125,
"step": 1839
},
{
"epoch": 2.0211451325003433,
"grad_norm": 0.4107789993286133,
"learning_rate": 0.00010866910866910866,
"loss": 0.6089,
"step": 1840
},
{
"epoch": 2.0222435809419195,
"grad_norm": 1.0845204591751099,
"learning_rate": 0.00010854700854700854,
"loss": 0.499,
"step": 1841
},
{
"epoch": 2.0233420293834956,
"grad_norm": 0.382376492023468,
"learning_rate": 0.00010842490842490841,
"loss": 0.5505,
"step": 1842
},
{
"epoch": 2.024440477825072,
"grad_norm": 0.38339781761169434,
"learning_rate": 0.00010830280830280828,
"loss": 0.4593,
"step": 1843
},
{
"epoch": 2.0255389262666483,
"grad_norm": 0.45328769087791443,
"learning_rate": 0.00010818070818070818,
"loss": 0.8437,
"step": 1844
},
{
"epoch": 2.0266373747082245,
"grad_norm": 0.3051920533180237,
"learning_rate": 0.00010805860805860805,
"loss": 0.6096,
"step": 1845
},
{
"epoch": 2.027735823149801,
"grad_norm": 0.4249560236930847,
"learning_rate": 0.00010793650793650792,
"loss": 0.6441,
"step": 1846
},
{
"epoch": 2.028834271591377,
"grad_norm": 0.6639708280563354,
"learning_rate": 0.00010781440781440781,
"loss": 0.716,
"step": 1847
},
{
"epoch": 2.0299327200329533,
"grad_norm": 0.4324635863304138,
"learning_rate": 0.00010769230769230768,
"loss": 0.5288,
"step": 1848
},
{
"epoch": 2.03103116847453,
"grad_norm": 0.46487629413604736,
"learning_rate": 0.00010757020757020755,
"loss": 0.4908,
"step": 1849
},
{
"epoch": 2.032129616916106,
"grad_norm": 0.5104641318321228,
"learning_rate": 0.00010744810744810745,
"loss": 0.6367,
"step": 1850
},
{
"epoch": 2.033228065357682,
"grad_norm": 0.4010922312736511,
"learning_rate": 0.00010732600732600732,
"loss": 0.4266,
"step": 1851
},
{
"epoch": 2.0343265137992583,
"grad_norm": 0.6835510730743408,
"learning_rate": 0.0001072039072039072,
"loss": 1.0077,
"step": 1852
},
{
"epoch": 2.035424962240835,
"grad_norm": 0.7012602686882019,
"learning_rate": 0.00010708180708180708,
"loss": 0.7656,
"step": 1853
},
{
"epoch": 2.036523410682411,
"grad_norm": 0.8202001452445984,
"learning_rate": 0.00010695970695970695,
"loss": 0.9796,
"step": 1854
},
{
"epoch": 2.037621859123987,
"grad_norm": 0.37708353996276855,
"learning_rate": 0.00010683760683760682,
"loss": 0.3664,
"step": 1855
},
{
"epoch": 2.0387203075655638,
"grad_norm": 0.34818801283836365,
"learning_rate": 0.0001067155067155067,
"loss": 0.5365,
"step": 1856
},
{
"epoch": 2.03981875600714,
"grad_norm": 0.46427440643310547,
"learning_rate": 0.0001065934065934066,
"loss": 0.7503,
"step": 1857
},
{
"epoch": 2.040917204448716,
"grad_norm": 0.4782754182815552,
"learning_rate": 0.00010647130647130647,
"loss": 0.9247,
"step": 1858
},
{
"epoch": 2.0420156528902926,
"grad_norm": 0.6814667582511902,
"learning_rate": 0.00010634920634920634,
"loss": 0.5365,
"step": 1859
},
{
"epoch": 2.0431141013318688,
"grad_norm": 0.4782056510448456,
"learning_rate": 0.00010622710622710622,
"loss": 0.7444,
"step": 1860
},
{
"epoch": 2.044212549773445,
"grad_norm": 0.768439769744873,
"learning_rate": 0.0001061050061050061,
"loss": 0.6386,
"step": 1861
},
{
"epoch": 2.0453109982150215,
"grad_norm": 0.9991740584373474,
"learning_rate": 0.00010598290598290597,
"loss": 0.4762,
"step": 1862
},
{
"epoch": 2.0464094466565976,
"grad_norm": 0.4244922995567322,
"learning_rate": 0.00010586080586080585,
"loss": 0.4469,
"step": 1863
},
{
"epoch": 2.0475078950981738,
"grad_norm": 0.4085465371608734,
"learning_rate": 0.00010573870573870573,
"loss": 0.7215,
"step": 1864
},
{
"epoch": 2.04860634353975,
"grad_norm": 1.3068008422851562,
"learning_rate": 0.0001056166056166056,
"loss": 0.7781,
"step": 1865
},
{
"epoch": 2.0497047919813265,
"grad_norm": 0.3995974659919739,
"learning_rate": 0.0001054945054945055,
"loss": 0.6114,
"step": 1866
},
{
"epoch": 2.0508032404229026,
"grad_norm": 0.47944560647010803,
"learning_rate": 0.00010537240537240537,
"loss": 0.7355,
"step": 1867
},
{
"epoch": 2.0519016888644788,
"grad_norm": 1.6718720197677612,
"learning_rate": 0.00010525030525030524,
"loss": 0.5987,
"step": 1868
},
{
"epoch": 2.0530001373060554,
"grad_norm": 0.46015220880508423,
"learning_rate": 0.00010512820512820511,
"loss": 0.481,
"step": 1869
},
{
"epoch": 2.0540985857476315,
"grad_norm": 0.4863795042037964,
"learning_rate": 0.000105006105006105,
"loss": 0.5877,
"step": 1870
},
{
"epoch": 2.0551970341892076,
"grad_norm": 0.9190402030944824,
"learning_rate": 0.00010488400488400487,
"loss": 0.7941,
"step": 1871
},
{
"epoch": 2.056295482630784,
"grad_norm": 0.6056554317474365,
"learning_rate": 0.00010476190476190474,
"loss": 0.5455,
"step": 1872
},
{
"epoch": 2.0573939310723603,
"grad_norm": 0.7070736289024353,
"learning_rate": 0.00010463980463980464,
"loss": 0.6112,
"step": 1873
},
{
"epoch": 2.0584923795139365,
"grad_norm": 0.5415268540382385,
"learning_rate": 0.00010451770451770451,
"loss": 0.7141,
"step": 1874
},
{
"epoch": 2.0595908279555126,
"grad_norm": 0.45696091651916504,
"learning_rate": 0.00010439560439560438,
"loss": 0.7825,
"step": 1875
},
{
"epoch": 2.060689276397089,
"grad_norm": 0.5728979706764221,
"learning_rate": 0.00010427350427350427,
"loss": 0.5869,
"step": 1876
},
{
"epoch": 2.0617877248386653,
"grad_norm": 0.5910143852233887,
"learning_rate": 0.00010415140415140414,
"loss": 0.728,
"step": 1877
},
{
"epoch": 2.0628861732802415,
"grad_norm": 0.530915379524231,
"learning_rate": 0.00010402930402930401,
"loss": 0.6459,
"step": 1878
},
{
"epoch": 2.063984621721818,
"grad_norm": 0.36358964443206787,
"learning_rate": 0.00010390720390720391,
"loss": 0.7536,
"step": 1879
},
{
"epoch": 2.065083070163394,
"grad_norm": 2.7523410320281982,
"learning_rate": 0.00010378510378510379,
"loss": 0.6347,
"step": 1880
},
{
"epoch": 2.0661815186049703,
"grad_norm": 0.6842527389526367,
"learning_rate": 0.00010366300366300366,
"loss": 0.4943,
"step": 1881
},
{
"epoch": 2.067279967046547,
"grad_norm": 0.5830293297767639,
"learning_rate": 0.00010354090354090353,
"loss": 0.5855,
"step": 1882
},
{
"epoch": 2.068378415488123,
"grad_norm": 0.981920599937439,
"learning_rate": 0.00010341880341880341,
"loss": 0.4425,
"step": 1883
},
{
"epoch": 2.069476863929699,
"grad_norm": 2.0826029777526855,
"learning_rate": 0.00010329670329670329,
"loss": 0.5399,
"step": 1884
},
{
"epoch": 2.0705753123712753,
"grad_norm": 0.4648442268371582,
"learning_rate": 0.00010317460317460316,
"loss": 0.6203,
"step": 1885
},
{
"epoch": 2.071673760812852,
"grad_norm": 0.5086346864700317,
"learning_rate": 0.00010305250305250304,
"loss": 0.6091,
"step": 1886
},
{
"epoch": 2.072772209254428,
"grad_norm": 0.40404266119003296,
"learning_rate": 0.00010293040293040292,
"loss": 0.5013,
"step": 1887
},
{
"epoch": 2.073870657696004,
"grad_norm": 2.0507569313049316,
"learning_rate": 0.0001028083028083028,
"loss": 0.7822,
"step": 1888
},
{
"epoch": 2.074969106137581,
"grad_norm": 0.9318211078643799,
"learning_rate": 0.00010268620268620269,
"loss": 0.6638,
"step": 1889
},
{
"epoch": 2.076067554579157,
"grad_norm": 0.7601054310798645,
"learning_rate": 0.00010256410256410256,
"loss": 0.6085,
"step": 1890
},
{
"epoch": 2.077166003020733,
"grad_norm": 1.1299306154251099,
"learning_rate": 0.00010244200244200243,
"loss": 0.682,
"step": 1891
},
{
"epoch": 2.0782644514623096,
"grad_norm": 0.5009475350379944,
"learning_rate": 0.0001023199023199023,
"loss": 0.7229,
"step": 1892
},
{
"epoch": 2.079362899903886,
"grad_norm": 0.3432561159133911,
"learning_rate": 0.00010219780219780219,
"loss": 0.5991,
"step": 1893
},
{
"epoch": 2.080461348345462,
"grad_norm": 0.5224031805992126,
"learning_rate": 0.00010207570207570206,
"loss": 0.3687,
"step": 1894
},
{
"epoch": 2.0815597967870385,
"grad_norm": 0.4849548935890198,
"learning_rate": 0.00010195360195360193,
"loss": 0.507,
"step": 1895
},
{
"epoch": 2.0826582452286146,
"grad_norm": 0.6093185544013977,
"learning_rate": 0.00010183150183150183,
"loss": 0.7019,
"step": 1896
},
{
"epoch": 2.083756693670191,
"grad_norm": 0.7408457398414612,
"learning_rate": 0.0001017094017094017,
"loss": 0.6331,
"step": 1897
},
{
"epoch": 2.084855142111767,
"grad_norm": 0.67701655626297,
"learning_rate": 0.00010158730158730157,
"loss": 0.6685,
"step": 1898
},
{
"epoch": 2.0859535905533435,
"grad_norm": 0.2880030870437622,
"learning_rate": 0.00010146520146520146,
"loss": 0.4043,
"step": 1899
},
{
"epoch": 2.0870520389949196,
"grad_norm": 0.45890796184539795,
"learning_rate": 0.00010134310134310133,
"loss": 0.3695,
"step": 1900
},
{
"epoch": 2.088150487436496,
"grad_norm": 0.7898344397544861,
"learning_rate": 0.0001012210012210012,
"loss": 0.7875,
"step": 1901
},
{
"epoch": 2.0892489358780724,
"grad_norm": 0.5648753046989441,
"learning_rate": 0.0001010989010989011,
"loss": 0.6058,
"step": 1902
},
{
"epoch": 2.0903473843196485,
"grad_norm": 0.7880465984344482,
"learning_rate": 0.00010097680097680098,
"loss": 0.6403,
"step": 1903
},
{
"epoch": 2.0914458327612246,
"grad_norm": 0.4169737696647644,
"learning_rate": 0.00010085470085470085,
"loss": 0.71,
"step": 1904
},
{
"epoch": 2.0925442812028012,
"grad_norm": 0.33653560280799866,
"learning_rate": 0.00010073260073260072,
"loss": 0.6278,
"step": 1905
},
{
"epoch": 2.0936427296443774,
"grad_norm": 0.6861558556556702,
"learning_rate": 0.0001006105006105006,
"loss": 0.8463,
"step": 1906
},
{
"epoch": 2.0947411780859535,
"grad_norm": 0.29407018423080444,
"learning_rate": 0.00010048840048840048,
"loss": 0.5644,
"step": 1907
},
{
"epoch": 2.09583962652753,
"grad_norm": 0.673083484172821,
"learning_rate": 0.00010036630036630035,
"loss": 0.8353,
"step": 1908
},
{
"epoch": 2.0969380749691062,
"grad_norm": 0.429061621427536,
"learning_rate": 0.00010024420024420023,
"loss": 0.6381,
"step": 1909
},
{
"epoch": 2.0980365234106824,
"grad_norm": 0.5113368630409241,
"learning_rate": 0.00010012210012210012,
"loss": 0.7603,
"step": 1910
},
{
"epoch": 2.0991349718522585,
"grad_norm": 0.9005820751190186,
"learning_rate": 9.999999999999999e-05,
"loss": 0.6331,
"step": 1911
},
{
"epoch": 2.100233420293835,
"grad_norm": 0.489851176738739,
"learning_rate": 9.987789987789988e-05,
"loss": 0.8564,
"step": 1912
},
{
"epoch": 2.1013318687354112,
"grad_norm": 0.42647236585617065,
"learning_rate": 9.975579975579975e-05,
"loss": 0.5496,
"step": 1913
},
{
"epoch": 2.1024303171769874,
"grad_norm": 0.9061693549156189,
"learning_rate": 9.963369963369962e-05,
"loss": 0.4478,
"step": 1914
},
{
"epoch": 2.103528765618564,
"grad_norm": 0.4721933901309967,
"learning_rate": 9.95115995115995e-05,
"loss": 0.6066,
"step": 1915
},
{
"epoch": 2.10462721406014,
"grad_norm": 0.7265921831130981,
"learning_rate": 9.938949938949938e-05,
"loss": 0.7195,
"step": 1916
},
{
"epoch": 2.1057256625017162,
"grad_norm": 0.4521386921405792,
"learning_rate": 9.926739926739925e-05,
"loss": 0.6476,
"step": 1917
},
{
"epoch": 2.106824110943293,
"grad_norm": 0.42982912063598633,
"learning_rate": 9.914529914529912e-05,
"loss": 0.535,
"step": 1918
},
{
"epoch": 2.107922559384869,
"grad_norm": 0.4758259952068329,
"learning_rate": 9.902319902319902e-05,
"loss": 0.8106,
"step": 1919
},
{
"epoch": 2.109021007826445,
"grad_norm": 0.69195157289505,
"learning_rate": 9.890109890109889e-05,
"loss": 0.6643,
"step": 1920
},
{
"epoch": 2.110119456268021,
"grad_norm": 0.8207395672798157,
"learning_rate": 9.877899877899876e-05,
"loss": 0.7535,
"step": 1921
},
{
"epoch": 2.111217904709598,
"grad_norm": 1.4245035648345947,
"learning_rate": 9.865689865689865e-05,
"loss": 0.6721,
"step": 1922
},
{
"epoch": 2.112316353151174,
"grad_norm": 0.5496362447738647,
"learning_rate": 9.853479853479852e-05,
"loss": 0.5367,
"step": 1923
},
{
"epoch": 2.11341480159275,
"grad_norm": 0.5466665625572205,
"learning_rate": 9.84126984126984e-05,
"loss": 0.6083,
"step": 1924
},
{
"epoch": 2.1145132500343267,
"grad_norm": 0.7750464677810669,
"learning_rate": 9.829059829059829e-05,
"loss": 0.663,
"step": 1925
},
{
"epoch": 2.115611698475903,
"grad_norm": 0.4978208541870117,
"learning_rate": 9.816849816849817e-05,
"loss": 0.6334,
"step": 1926
},
{
"epoch": 2.116710146917479,
"grad_norm": 0.6415550708770752,
"learning_rate": 9.804639804639804e-05,
"loss": 0.6477,
"step": 1927
},
{
"epoch": 2.1178085953590555,
"grad_norm": 0.644123911857605,
"learning_rate": 9.792429792429792e-05,
"loss": 0.668,
"step": 1928
},
{
"epoch": 2.1189070438006317,
"grad_norm": 0.39706236124038696,
"learning_rate": 9.78021978021978e-05,
"loss": 0.5875,
"step": 1929
},
{
"epoch": 2.120005492242208,
"grad_norm": 1.3733233213424683,
"learning_rate": 9.768009768009767e-05,
"loss": 0.6023,
"step": 1930
},
{
"epoch": 2.121103940683784,
"grad_norm": 0.48839983344078064,
"learning_rate": 9.755799755799754e-05,
"loss": 0.5693,
"step": 1931
},
{
"epoch": 2.1222023891253605,
"grad_norm": 0.3107692301273346,
"learning_rate": 9.743589743589744e-05,
"loss": 0.5822,
"step": 1932
},
{
"epoch": 2.1233008375669367,
"grad_norm": 0.3988654911518097,
"learning_rate": 9.731379731379731e-05,
"loss": 0.5989,
"step": 1933
},
{
"epoch": 2.124399286008513,
"grad_norm": 1.1887754201889038,
"learning_rate": 9.719169719169718e-05,
"loss": 0.6382,
"step": 1934
},
{
"epoch": 2.1254977344500894,
"grad_norm": 0.43282651901245117,
"learning_rate": 9.706959706959707e-05,
"loss": 0.5649,
"step": 1935
},
{
"epoch": 2.1265961828916655,
"grad_norm": 0.39243975281715393,
"learning_rate": 9.694749694749694e-05,
"loss": 0.7005,
"step": 1936
},
{
"epoch": 2.1276946313332417,
"grad_norm": 0.7401454448699951,
"learning_rate": 9.682539682539681e-05,
"loss": 1.0632,
"step": 1937
},
{
"epoch": 2.1287930797748182,
"grad_norm": 0.6976983547210693,
"learning_rate": 9.67032967032967e-05,
"loss": 0.562,
"step": 1938
},
{
"epoch": 2.1298915282163944,
"grad_norm": 0.9784336686134338,
"learning_rate": 9.658119658119657e-05,
"loss": 0.8115,
"step": 1939
},
{
"epoch": 2.1309899766579705,
"grad_norm": 0.5289125442504883,
"learning_rate": 9.645909645909644e-05,
"loss": 0.6161,
"step": 1940
},
{
"epoch": 2.132088425099547,
"grad_norm": 1.414559006690979,
"learning_rate": 9.633699633699634e-05,
"loss": 0.7115,
"step": 1941
},
{
"epoch": 2.1331868735411232,
"grad_norm": 0.5444177389144897,
"learning_rate": 9.621489621489621e-05,
"loss": 0.6211,
"step": 1942
},
{
"epoch": 2.1342853219826994,
"grad_norm": 0.637030839920044,
"learning_rate": 9.609279609279608e-05,
"loss": 0.8747,
"step": 1943
},
{
"epoch": 2.1353837704242755,
"grad_norm": 0.5926198363304138,
"learning_rate": 9.597069597069595e-05,
"loss": 0.8673,
"step": 1944
},
{
"epoch": 2.136482218865852,
"grad_norm": 0.3638801872730255,
"learning_rate": 9.584859584859584e-05,
"loss": 0.4698,
"step": 1945
},
{
"epoch": 2.1375806673074282,
"grad_norm": 0.5823031067848206,
"learning_rate": 9.572649572649571e-05,
"loss": 0.6988,
"step": 1946
},
{
"epoch": 2.1386791157490044,
"grad_norm": 0.44348934292793274,
"learning_rate": 9.560439560439558e-05,
"loss": 0.6667,
"step": 1947
},
{
"epoch": 2.139777564190581,
"grad_norm": 3.177112579345703,
"learning_rate": 9.548229548229548e-05,
"loss": 0.8738,
"step": 1948
},
{
"epoch": 2.140876012632157,
"grad_norm": 1.3834997415542603,
"learning_rate": 9.536019536019536e-05,
"loss": 0.528,
"step": 1949
},
{
"epoch": 2.1419744610737332,
"grad_norm": 0.5514722466468811,
"learning_rate": 9.523809523809523e-05,
"loss": 0.5058,
"step": 1950
},
{
"epoch": 2.14307290951531,
"grad_norm": 0.8795000314712524,
"learning_rate": 9.511599511599511e-05,
"loss": 0.6368,
"step": 1951
},
{
"epoch": 2.144171357956886,
"grad_norm": 1.0043178796768188,
"learning_rate": 9.499389499389498e-05,
"loss": 0.5701,
"step": 1952
},
{
"epoch": 2.145269806398462,
"grad_norm": 1.8537780046463013,
"learning_rate": 9.487179487179486e-05,
"loss": 0.6978,
"step": 1953
},
{
"epoch": 2.1463682548400387,
"grad_norm": 0.5239475965499878,
"learning_rate": 9.474969474969476e-05,
"loss": 0.7093,
"step": 1954
},
{
"epoch": 2.147466703281615,
"grad_norm": 0.7944377064704895,
"learning_rate": 9.462759462759463e-05,
"loss": 0.7625,
"step": 1955
},
{
"epoch": 2.148565151723191,
"grad_norm": 0.7356003522872925,
"learning_rate": 9.45054945054945e-05,
"loss": 0.6845,
"step": 1956
},
{
"epoch": 2.149663600164767,
"grad_norm": 1.3590694665908813,
"learning_rate": 9.438339438339437e-05,
"loss": 0.6964,
"step": 1957
},
{
"epoch": 2.1507620486063437,
"grad_norm": 0.40889453887939453,
"learning_rate": 9.426129426129426e-05,
"loss": 0.6643,
"step": 1958
},
{
"epoch": 2.15186049704792,
"grad_norm": 0.6347643136978149,
"learning_rate": 9.413919413919413e-05,
"loss": 1.0002,
"step": 1959
},
{
"epoch": 2.152958945489496,
"grad_norm": 0.3661377429962158,
"learning_rate": 9.4017094017094e-05,
"loss": 0.5084,
"step": 1960
},
{
"epoch": 2.1540573939310725,
"grad_norm": 0.8262574672698975,
"learning_rate": 9.389499389499389e-05,
"loss": 0.5658,
"step": 1961
},
{
"epoch": 2.1551558423726487,
"grad_norm": 0.6054818034172058,
"learning_rate": 9.377289377289376e-05,
"loss": 0.6349,
"step": 1962
},
{
"epoch": 2.156254290814225,
"grad_norm": 0.3696078658103943,
"learning_rate": 9.365079365079364e-05,
"loss": 0.5746,
"step": 1963
},
{
"epoch": 2.157352739255801,
"grad_norm": 0.7613049745559692,
"learning_rate": 9.352869352869353e-05,
"loss": 0.5204,
"step": 1964
},
{
"epoch": 2.1584511876973775,
"grad_norm": 0.6841816306114197,
"learning_rate": 9.34065934065934e-05,
"loss": 0.813,
"step": 1965
},
{
"epoch": 2.1595496361389537,
"grad_norm": 0.902998685836792,
"learning_rate": 9.328449328449327e-05,
"loss": 0.6288,
"step": 1966
},
{
"epoch": 2.16064808458053,
"grad_norm": 0.5367470979690552,
"learning_rate": 9.316239316239316e-05,
"loss": 0.6689,
"step": 1967
},
{
"epoch": 2.1617465330221064,
"grad_norm": 0.9443572163581848,
"learning_rate": 9.304029304029303e-05,
"loss": 0.6864,
"step": 1968
},
{
"epoch": 2.1628449814636825,
"grad_norm": 0.42191457748413086,
"learning_rate": 9.29181929181929e-05,
"loss": 0.6509,
"step": 1969
},
{
"epoch": 2.1639434299052587,
"grad_norm": 0.6019404530525208,
"learning_rate": 9.279609279609277e-05,
"loss": 0.5252,
"step": 1970
},
{
"epoch": 2.1650418783468353,
"grad_norm": 1.9933907985687256,
"learning_rate": 9.267399267399267e-05,
"loss": 0.6042,
"step": 1971
},
{
"epoch": 2.1661403267884114,
"grad_norm": 0.33075836300849915,
"learning_rate": 9.255189255189255e-05,
"loss": 0.579,
"step": 1972
},
{
"epoch": 2.1672387752299875,
"grad_norm": 0.37899547815322876,
"learning_rate": 9.242979242979242e-05,
"loss": 0.5006,
"step": 1973
},
{
"epoch": 2.168337223671564,
"grad_norm": 0.6482734680175781,
"learning_rate": 9.23076923076923e-05,
"loss": 0.4844,
"step": 1974
},
{
"epoch": 2.1694356721131403,
"grad_norm": 0.47632062435150146,
"learning_rate": 9.218559218559217e-05,
"loss": 0.5844,
"step": 1975
},
{
"epoch": 2.1705341205547164,
"grad_norm": 0.3402813971042633,
"learning_rate": 9.206349206349205e-05,
"loss": 0.6397,
"step": 1976
},
{
"epoch": 2.1716325689962925,
"grad_norm": 0.47405871748924255,
"learning_rate": 9.194139194139195e-05,
"loss": 0.6436,
"step": 1977
},
{
"epoch": 2.172731017437869,
"grad_norm": 0.5474234223365784,
"learning_rate": 9.181929181929182e-05,
"loss": 0.5758,
"step": 1978
},
{
"epoch": 2.1738294658794453,
"grad_norm": 0.5423378348350525,
"learning_rate": 9.169719169719169e-05,
"loss": 0.5882,
"step": 1979
},
{
"epoch": 2.1749279143210214,
"grad_norm": 0.32848963141441345,
"learning_rate": 9.157509157509158e-05,
"loss": 0.5828,
"step": 1980
},
{
"epoch": 2.176026362762598,
"grad_norm": 0.6646802425384521,
"learning_rate": 9.145299145299145e-05,
"loss": 0.551,
"step": 1981
},
{
"epoch": 2.177124811204174,
"grad_norm": 0.4560980200767517,
"learning_rate": 9.133089133089132e-05,
"loss": 0.705,
"step": 1982
},
{
"epoch": 2.1782232596457503,
"grad_norm": 0.4531053304672241,
"learning_rate": 9.120879120879119e-05,
"loss": 0.7471,
"step": 1983
},
{
"epoch": 2.179321708087327,
"grad_norm": 0.5881507992744446,
"learning_rate": 9.108669108669108e-05,
"loss": 0.7559,
"step": 1984
},
{
"epoch": 2.180420156528903,
"grad_norm": 0.41462886333465576,
"learning_rate": 9.096459096459096e-05,
"loss": 0.5674,
"step": 1985
},
{
"epoch": 2.181518604970479,
"grad_norm": 0.46718108654022217,
"learning_rate": 9.084249084249083e-05,
"loss": 0.7149,
"step": 1986
},
{
"epoch": 2.1826170534120557,
"grad_norm": 0.49290111660957336,
"learning_rate": 9.072039072039072e-05,
"loss": 0.5641,
"step": 1987
},
{
"epoch": 2.183715501853632,
"grad_norm": 0.398296594619751,
"learning_rate": 9.059829059829059e-05,
"loss": 0.5177,
"step": 1988
},
{
"epoch": 2.184813950295208,
"grad_norm": 0.8241115212440491,
"learning_rate": 9.047619047619046e-05,
"loss": 0.7864,
"step": 1989
},
{
"epoch": 2.185912398736784,
"grad_norm": 1.1335865259170532,
"learning_rate": 9.035409035409035e-05,
"loss": 0.6167,
"step": 1990
},
{
"epoch": 2.1870108471783607,
"grad_norm": 0.4479789435863495,
"learning_rate": 9.023199023199022e-05,
"loss": 0.6365,
"step": 1991
},
{
"epoch": 2.188109295619937,
"grad_norm": 0.4892582297325134,
"learning_rate": 9.010989010989009e-05,
"loss": 0.6283,
"step": 1992
},
{
"epoch": 2.189207744061513,
"grad_norm": 0.8397974371910095,
"learning_rate": 8.998778998778999e-05,
"loss": 0.7123,
"step": 1993
},
{
"epoch": 2.1903061925030896,
"grad_norm": 0.5295377969741821,
"learning_rate": 8.986568986568986e-05,
"loss": 0.4033,
"step": 1994
},
{
"epoch": 2.1914046409446657,
"grad_norm": 0.464832067489624,
"learning_rate": 8.974358974358974e-05,
"loss": 0.8228,
"step": 1995
},
{
"epoch": 2.192503089386242,
"grad_norm": 0.381369024515152,
"learning_rate": 8.962148962148961e-05,
"loss": 0.6267,
"step": 1996
},
{
"epoch": 2.193601537827818,
"grad_norm": 0.7176710963249207,
"learning_rate": 8.949938949938949e-05,
"loss": 0.7008,
"step": 1997
},
{
"epoch": 2.1946999862693946,
"grad_norm": 2.569753885269165,
"learning_rate": 8.937728937728936e-05,
"loss": 0.6899,
"step": 1998
},
{
"epoch": 2.1957984347109707,
"grad_norm": 0.5020056962966919,
"learning_rate": 8.925518925518924e-05,
"loss": 0.527,
"step": 1999
},
{
"epoch": 2.196896883152547,
"grad_norm": 1.7054524421691895,
"learning_rate": 8.913308913308914e-05,
"loss": 0.5455,
"step": 2000
},
{
"epoch": 2.1979953315941234,
"grad_norm": 0.5037225484848022,
"learning_rate": 8.901098901098901e-05,
"loss": 0.7445,
"step": 2001
},
{
"epoch": 2.1990937800356996,
"grad_norm": 0.8109555840492249,
"learning_rate": 8.888888888888888e-05,
"loss": 0.624,
"step": 2002
},
{
"epoch": 2.2001922284772757,
"grad_norm": 0.47120043635368347,
"learning_rate": 8.876678876678877e-05,
"loss": 0.6858,
"step": 2003
},
{
"epoch": 2.2012906769188523,
"grad_norm": 0.6166191101074219,
"learning_rate": 8.864468864468864e-05,
"loss": 0.4528,
"step": 2004
},
{
"epoch": 2.2023891253604284,
"grad_norm": 0.4999128580093384,
"learning_rate": 8.852258852258851e-05,
"loss": 0.712,
"step": 2005
},
{
"epoch": 2.2034875738020046,
"grad_norm": 1.1858354806900024,
"learning_rate": 8.84004884004884e-05,
"loss": 0.7647,
"step": 2006
},
{
"epoch": 2.204586022243581,
"grad_norm": 0.4223528206348419,
"learning_rate": 8.827838827838828e-05,
"loss": 0.6553,
"step": 2007
},
{
"epoch": 2.2056844706851573,
"grad_norm": 0.41678956151008606,
"learning_rate": 8.815628815628815e-05,
"loss": 0.6033,
"step": 2008
},
{
"epoch": 2.2067829191267334,
"grad_norm": 0.5812666416168213,
"learning_rate": 8.803418803418802e-05,
"loss": 0.6016,
"step": 2009
},
{
"epoch": 2.2078813675683095,
"grad_norm": 0.5553560256958008,
"learning_rate": 8.791208791208791e-05,
"loss": 0.7621,
"step": 2010
},
{
"epoch": 2.208979816009886,
"grad_norm": 0.6392796635627747,
"learning_rate": 8.778998778998778e-05,
"loss": 0.567,
"step": 2011
},
{
"epoch": 2.2100782644514623,
"grad_norm": 1.0086902379989624,
"learning_rate": 8.766788766788765e-05,
"loss": 0.9432,
"step": 2012
},
{
"epoch": 2.2111767128930384,
"grad_norm": 1.3578602075576782,
"learning_rate": 8.754578754578754e-05,
"loss": 0.5107,
"step": 2013
},
{
"epoch": 2.212275161334615,
"grad_norm": 0.5530524849891663,
"learning_rate": 8.742368742368741e-05,
"loss": 0.6078,
"step": 2014
},
{
"epoch": 2.213373609776191,
"grad_norm": 0.3795104920864105,
"learning_rate": 8.730158730158728e-05,
"loss": 0.4889,
"step": 2015
},
{
"epoch": 2.2144720582177673,
"grad_norm": 0.40977227687835693,
"learning_rate": 8.717948717948718e-05,
"loss": 0.6295,
"step": 2016
},
{
"epoch": 2.215570506659344,
"grad_norm": 0.4882934093475342,
"learning_rate": 8.705738705738705e-05,
"loss": 0.7219,
"step": 2017
},
{
"epoch": 2.21666895510092,
"grad_norm": 0.7966530919075012,
"learning_rate": 8.693528693528693e-05,
"loss": 0.5342,
"step": 2018
},
{
"epoch": 2.217767403542496,
"grad_norm": 0.6992311477661133,
"learning_rate": 8.681318681318681e-05,
"loss": 0.5932,
"step": 2019
},
{
"epoch": 2.2188658519840727,
"grad_norm": 0.396427720785141,
"learning_rate": 8.669108669108668e-05,
"loss": 0.5838,
"step": 2020
},
{
"epoch": 2.219964300425649,
"grad_norm": 0.5625690817832947,
"learning_rate": 8.656898656898655e-05,
"loss": 0.7605,
"step": 2021
},
{
"epoch": 2.221062748867225,
"grad_norm": 0.6052583456039429,
"learning_rate": 8.644688644688643e-05,
"loss": 0.6572,
"step": 2022
},
{
"epoch": 2.222161197308801,
"grad_norm": 0.7201973795890808,
"learning_rate": 8.632478632478633e-05,
"loss": 0.4924,
"step": 2023
},
{
"epoch": 2.2232596457503777,
"grad_norm": 0.4222647249698639,
"learning_rate": 8.62026862026862e-05,
"loss": 0.7764,
"step": 2024
},
{
"epoch": 2.224358094191954,
"grad_norm": 0.5168121457099915,
"learning_rate": 8.608058608058607e-05,
"loss": 0.5766,
"step": 2025
},
{
"epoch": 2.22545654263353,
"grad_norm": 0.886203408241272,
"learning_rate": 8.595848595848596e-05,
"loss": 0.3804,
"step": 2026
},
{
"epoch": 2.2265549910751066,
"grad_norm": 1.7365875244140625,
"learning_rate": 8.583638583638583e-05,
"loss": 0.6583,
"step": 2027
},
{
"epoch": 2.2276534395166827,
"grad_norm": 0.44519639015197754,
"learning_rate": 8.57142857142857e-05,
"loss": 0.7322,
"step": 2028
},
{
"epoch": 2.228751887958259,
"grad_norm": 0.4888206422328949,
"learning_rate": 8.55921855921856e-05,
"loss": 0.6645,
"step": 2029
},
{
"epoch": 2.2298503363998354,
"grad_norm": 0.598225474357605,
"learning_rate": 8.547008547008547e-05,
"loss": 0.7903,
"step": 2030
},
{
"epoch": 2.2309487848414116,
"grad_norm": 0.8521910905838013,
"learning_rate": 8.534798534798534e-05,
"loss": 0.8573,
"step": 2031
},
{
"epoch": 2.2320472332829877,
"grad_norm": 1.6346311569213867,
"learning_rate": 8.522588522588523e-05,
"loss": 0.5653,
"step": 2032
},
{
"epoch": 2.233145681724564,
"grad_norm": 0.6574315428733826,
"learning_rate": 8.51037851037851e-05,
"loss": 0.5289,
"step": 2033
},
{
"epoch": 2.2342441301661404,
"grad_norm": 0.3821216821670532,
"learning_rate": 8.498168498168497e-05,
"loss": 0.4627,
"step": 2034
},
{
"epoch": 2.2353425786077166,
"grad_norm": 0.28965023159980774,
"learning_rate": 8.485958485958484e-05,
"loss": 0.3696,
"step": 2035
},
{
"epoch": 2.2364410270492927,
"grad_norm": 0.8256242275238037,
"learning_rate": 8.473748473748473e-05,
"loss": 0.6305,
"step": 2036
},
{
"epoch": 2.2375394754908693,
"grad_norm": 0.8374451398849487,
"learning_rate": 8.46153846153846e-05,
"loss": 0.5038,
"step": 2037
},
{
"epoch": 2.2386379239324454,
"grad_norm": 0.5931464433670044,
"learning_rate": 8.449328449328449e-05,
"loss": 0.6928,
"step": 2038
},
{
"epoch": 2.2397363723740216,
"grad_norm": 0.5120035409927368,
"learning_rate": 8.437118437118437e-05,
"loss": 0.6004,
"step": 2039
},
{
"epoch": 2.240834820815598,
"grad_norm": 0.6345282196998596,
"learning_rate": 8.424908424908424e-05,
"loss": 0.866,
"step": 2040
},
{
"epoch": 2.2419332692571743,
"grad_norm": 0.5632284283638,
"learning_rate": 8.412698412698412e-05,
"loss": 0.406,
"step": 2041
},
{
"epoch": 2.2430317176987504,
"grad_norm": 0.4784685969352722,
"learning_rate": 8.4004884004884e-05,
"loss": 0.4732,
"step": 2042
},
{
"epoch": 2.2441301661403266,
"grad_norm": 0.47678086161613464,
"learning_rate": 8.388278388278387e-05,
"loss": 0.502,
"step": 2043
},
{
"epoch": 2.245228614581903,
"grad_norm": 0.6543307304382324,
"learning_rate": 8.376068376068374e-05,
"loss": 0.7183,
"step": 2044
},
{
"epoch": 2.2463270630234793,
"grad_norm": 0.6147063374519348,
"learning_rate": 8.363858363858364e-05,
"loss": 0.618,
"step": 2045
},
{
"epoch": 2.2474255114650554,
"grad_norm": 0.5867168307304382,
"learning_rate": 8.351648351648352e-05,
"loss": 0.7749,
"step": 2046
},
{
"epoch": 2.248523959906632,
"grad_norm": 1.164838433265686,
"learning_rate": 8.339438339438339e-05,
"loss": 0.6261,
"step": 2047
},
{
"epoch": 2.249622408348208,
"grad_norm": 0.6695102453231812,
"learning_rate": 8.327228327228326e-05,
"loss": 0.6172,
"step": 2048
},
{
"epoch": 2.2507208567897843,
"grad_norm": 0.43873751163482666,
"learning_rate": 8.315018315018315e-05,
"loss": 0.7032,
"step": 2049
},
{
"epoch": 2.251819305231361,
"grad_norm": 0.439897745847702,
"learning_rate": 8.302808302808302e-05,
"loss": 0.7744,
"step": 2050
},
{
"epoch": 2.252917753672937,
"grad_norm": 0.6671053767204285,
"learning_rate": 8.290598290598289e-05,
"loss": 0.6877,
"step": 2051
},
{
"epoch": 2.254016202114513,
"grad_norm": 0.37354105710983276,
"learning_rate": 8.278388278388279e-05,
"loss": 0.5653,
"step": 2052
},
{
"epoch": 2.2551146505560897,
"grad_norm": 0.5615684390068054,
"learning_rate": 8.266178266178266e-05,
"loss": 0.5961,
"step": 2053
},
{
"epoch": 2.256213098997666,
"grad_norm": 2.0932323932647705,
"learning_rate": 8.253968253968253e-05,
"loss": 0.6139,
"step": 2054
},
{
"epoch": 2.257311547439242,
"grad_norm": 0.5486952066421509,
"learning_rate": 8.241758241758242e-05,
"loss": 0.7816,
"step": 2055
},
{
"epoch": 2.258409995880818,
"grad_norm": 0.7377699017524719,
"learning_rate": 8.229548229548229e-05,
"loss": 0.5036,
"step": 2056
},
{
"epoch": 2.2595084443223947,
"grad_norm": 0.7057545781135559,
"learning_rate": 8.217338217338216e-05,
"loss": 0.5788,
"step": 2057
},
{
"epoch": 2.260606892763971,
"grad_norm": 0.5388674736022949,
"learning_rate": 8.205128205128205e-05,
"loss": 0.7079,
"step": 2058
},
{
"epoch": 2.261705341205547,
"grad_norm": 0.620943546295166,
"learning_rate": 8.192918192918192e-05,
"loss": 0.6223,
"step": 2059
},
{
"epoch": 2.2628037896471236,
"grad_norm": 0.6159489154815674,
"learning_rate": 8.18070818070818e-05,
"loss": 0.7277,
"step": 2060
},
{
"epoch": 2.2639022380886997,
"grad_norm": 0.5745131373405457,
"learning_rate": 8.168498168498168e-05,
"loss": 0.6356,
"step": 2061
},
{
"epoch": 2.265000686530276,
"grad_norm": 0.4925720989704132,
"learning_rate": 8.156288156288156e-05,
"loss": 0.6342,
"step": 2062
},
{
"epoch": 2.2660991349718524,
"grad_norm": 0.410692036151886,
"learning_rate": 8.144078144078143e-05,
"loss": 0.5903,
"step": 2063
},
{
"epoch": 2.2671975834134286,
"grad_norm": 0.8246005177497864,
"learning_rate": 8.13186813186813e-05,
"loss": 0.4048,
"step": 2064
},
{
"epoch": 2.2682960318550047,
"grad_norm": 0.5054492950439453,
"learning_rate": 8.119658119658119e-05,
"loss": 0.5797,
"step": 2065
},
{
"epoch": 2.2693944802965813,
"grad_norm": 0.6249692440032959,
"learning_rate": 8.107448107448106e-05,
"loss": 0.5434,
"step": 2066
},
{
"epoch": 2.2704929287381574,
"grad_norm": 0.5582659244537354,
"learning_rate": 8.095238095238093e-05,
"loss": 0.5925,
"step": 2067
},
{
"epoch": 2.2715913771797336,
"grad_norm": 0.38472238183021545,
"learning_rate": 8.083028083028083e-05,
"loss": 0.7325,
"step": 2068
},
{
"epoch": 2.2726898256213097,
"grad_norm": 0.4649077355861664,
"learning_rate": 8.07081807081807e-05,
"loss": 0.6244,
"step": 2069
},
{
"epoch": 2.2737882740628863,
"grad_norm": 0.38582849502563477,
"learning_rate": 8.058608058608058e-05,
"loss": 0.7696,
"step": 2070
},
{
"epoch": 2.2748867225044624,
"grad_norm": 0.4612105190753937,
"learning_rate": 8.046398046398045e-05,
"loss": 0.6453,
"step": 2071
},
{
"epoch": 2.2759851709460386,
"grad_norm": 0.6572852730751038,
"learning_rate": 8.034188034188034e-05,
"loss": 0.7417,
"step": 2072
},
{
"epoch": 2.277083619387615,
"grad_norm": 0.6322109699249268,
"learning_rate": 8.021978021978021e-05,
"loss": 0.2827,
"step": 2073
},
{
"epoch": 2.2781820678291913,
"grad_norm": 1.2452771663665771,
"learning_rate": 8.009768009768008e-05,
"loss": 0.7441,
"step": 2074
},
{
"epoch": 2.2792805162707674,
"grad_norm": 0.32154834270477295,
"learning_rate": 7.997557997557998e-05,
"loss": 0.4606,
"step": 2075
},
{
"epoch": 2.2803789647123436,
"grad_norm": 1.0170034170150757,
"learning_rate": 7.985347985347985e-05,
"loss": 0.7003,
"step": 2076
},
{
"epoch": 2.28147741315392,
"grad_norm": 0.7780435085296631,
"learning_rate": 7.973137973137972e-05,
"loss": 0.5847,
"step": 2077
},
{
"epoch": 2.2825758615954963,
"grad_norm": 0.6422854661941528,
"learning_rate": 7.960927960927961e-05,
"loss": 0.6278,
"step": 2078
},
{
"epoch": 2.2836743100370724,
"grad_norm": 0.5440393090248108,
"learning_rate": 7.948717948717948e-05,
"loss": 0.6313,
"step": 2079
},
{
"epoch": 2.284772758478649,
"grad_norm": 0.5774940848350525,
"learning_rate": 7.936507936507935e-05,
"loss": 0.7504,
"step": 2080
},
{
"epoch": 2.285871206920225,
"grad_norm": 0.44180789589881897,
"learning_rate": 7.924297924297924e-05,
"loss": 0.5806,
"step": 2081
},
{
"epoch": 2.2869696553618013,
"grad_norm": 0.8452728390693665,
"learning_rate": 7.912087912087912e-05,
"loss": 0.5753,
"step": 2082
},
{
"epoch": 2.288068103803378,
"grad_norm": 0.40172943472862244,
"learning_rate": 7.8998778998779e-05,
"loss": 0.5565,
"step": 2083
},
{
"epoch": 2.289166552244954,
"grad_norm": 0.3919180929660797,
"learning_rate": 7.887667887667887e-05,
"loss": 0.4951,
"step": 2084
},
{
"epoch": 2.29026500068653,
"grad_norm": 1.0796260833740234,
"learning_rate": 7.875457875457875e-05,
"loss": 0.733,
"step": 2085
},
{
"epoch": 2.2913634491281067,
"grad_norm": 0.5640047788619995,
"learning_rate": 7.863247863247862e-05,
"loss": 0.4625,
"step": 2086
},
{
"epoch": 2.292461897569683,
"grad_norm": 0.8736083507537842,
"learning_rate": 7.85103785103785e-05,
"loss": 0.5532,
"step": 2087
},
{
"epoch": 2.293560346011259,
"grad_norm": 0.5358221530914307,
"learning_rate": 7.838827838827838e-05,
"loss": 0.6397,
"step": 2088
},
{
"epoch": 2.294658794452835,
"grad_norm": 5.207391262054443,
"learning_rate": 7.826617826617825e-05,
"loss": 0.6402,
"step": 2089
},
{
"epoch": 2.2957572428944117,
"grad_norm": 0.4122523069381714,
"learning_rate": 7.814407814407813e-05,
"loss": 0.474,
"step": 2090
},
{
"epoch": 2.296855691335988,
"grad_norm": 2.8296186923980713,
"learning_rate": 7.802197802197802e-05,
"loss": 0.5197,
"step": 2091
},
{
"epoch": 2.297954139777564,
"grad_norm": 0.6898410320281982,
"learning_rate": 7.78998778998779e-05,
"loss": 0.782,
"step": 2092
},
{
"epoch": 2.2990525882191406,
"grad_norm": 0.37363025546073914,
"learning_rate": 7.777777777777777e-05,
"loss": 0.5824,
"step": 2093
},
{
"epoch": 2.3001510366607167,
"grad_norm": 0.5120764374732971,
"learning_rate": 7.765567765567765e-05,
"loss": 0.7326,
"step": 2094
},
{
"epoch": 2.301249485102293,
"grad_norm": 0.6517985463142395,
"learning_rate": 7.753357753357753e-05,
"loss": 0.6274,
"step": 2095
},
{
"epoch": 2.3023479335438695,
"grad_norm": 0.8033846020698547,
"learning_rate": 7.74114774114774e-05,
"loss": 0.7093,
"step": 2096
},
{
"epoch": 2.3034463819854456,
"grad_norm": 0.896397590637207,
"learning_rate": 7.728937728937727e-05,
"loss": 0.6685,
"step": 2097
},
{
"epoch": 2.3045448304270217,
"grad_norm": 0.4606597423553467,
"learning_rate": 7.716727716727717e-05,
"loss": 0.5821,
"step": 2098
},
{
"epoch": 2.3056432788685983,
"grad_norm": 0.9286845922470093,
"learning_rate": 7.704517704517704e-05,
"loss": 0.7537,
"step": 2099
},
{
"epoch": 2.3067417273101745,
"grad_norm": 0.6514043211936951,
"learning_rate": 7.692307692307691e-05,
"loss": 0.5644,
"step": 2100
},
{
"epoch": 2.3078401757517506,
"grad_norm": 0.4881083369255066,
"learning_rate": 7.68009768009768e-05,
"loss": 0.5348,
"step": 2101
},
{
"epoch": 2.3089386241933267,
"grad_norm": 2.688716173171997,
"learning_rate": 7.667887667887667e-05,
"loss": 0.6732,
"step": 2102
},
{
"epoch": 2.3100370726349033,
"grad_norm": 0.4597708582878113,
"learning_rate": 7.655677655677654e-05,
"loss": 0.6166,
"step": 2103
},
{
"epoch": 2.3111355210764795,
"grad_norm": 0.7629315853118896,
"learning_rate": 7.643467643467644e-05,
"loss": 0.4677,
"step": 2104
},
{
"epoch": 2.3122339695180556,
"grad_norm": 0.7282788753509521,
"learning_rate": 7.631257631257631e-05,
"loss": 0.6841,
"step": 2105
},
{
"epoch": 2.313332417959632,
"grad_norm": 0.5421862006187439,
"learning_rate": 7.619047619047618e-05,
"loss": 0.7274,
"step": 2106
},
{
"epoch": 2.3144308664012083,
"grad_norm": 0.7396867871284485,
"learning_rate": 7.606837606837607e-05,
"loss": 0.6546,
"step": 2107
},
{
"epoch": 2.3155293148427845,
"grad_norm": 0.34731313586235046,
"learning_rate": 7.594627594627594e-05,
"loss": 0.72,
"step": 2108
},
{
"epoch": 2.3166277632843606,
"grad_norm": 1.1024978160858154,
"learning_rate": 7.582417582417581e-05,
"loss": 0.7304,
"step": 2109
},
{
"epoch": 2.317726211725937,
"grad_norm": 0.5866183638572693,
"learning_rate": 7.570207570207569e-05,
"loss": 0.4912,
"step": 2110
},
{
"epoch": 2.3188246601675133,
"grad_norm": 0.8068836331367493,
"learning_rate": 7.557997557997557e-05,
"loss": 0.5342,
"step": 2111
},
{
"epoch": 2.31992310860909,
"grad_norm": 0.6417646408081055,
"learning_rate": 7.545787545787544e-05,
"loss": 0.7642,
"step": 2112
},
{
"epoch": 2.321021557050666,
"grad_norm": 0.4545808434486389,
"learning_rate": 7.533577533577533e-05,
"loss": 0.5681,
"step": 2113
},
{
"epoch": 2.322120005492242,
"grad_norm": 0.3567211329936981,
"learning_rate": 7.521367521367521e-05,
"loss": 0.6368,
"step": 2114
},
{
"epoch": 2.3232184539338183,
"grad_norm": 0.5747010707855225,
"learning_rate": 7.509157509157509e-05,
"loss": 0.5848,
"step": 2115
},
{
"epoch": 2.324316902375395,
"grad_norm": 0.46303555369377136,
"learning_rate": 7.496947496947497e-05,
"loss": 0.6577,
"step": 2116
},
{
"epoch": 2.325415350816971,
"grad_norm": 0.5343080759048462,
"learning_rate": 7.484737484737484e-05,
"loss": 0.8531,
"step": 2117
},
{
"epoch": 2.326513799258547,
"grad_norm": 0.9027140736579895,
"learning_rate": 7.472527472527472e-05,
"loss": 0.6271,
"step": 2118
},
{
"epoch": 2.3276122477001238,
"grad_norm": 0.6390063166618347,
"learning_rate": 7.460317460317459e-05,
"loss": 0.5669,
"step": 2119
},
{
"epoch": 2.3287106961417,
"grad_norm": 0.4965013563632965,
"learning_rate": 7.448107448107447e-05,
"loss": 0.6362,
"step": 2120
},
{
"epoch": 2.329809144583276,
"grad_norm": 0.49252766370773315,
"learning_rate": 7.435897435897436e-05,
"loss": 0.6703,
"step": 2121
},
{
"epoch": 2.330907593024852,
"grad_norm": 0.7043023705482483,
"learning_rate": 7.423687423687423e-05,
"loss": 0.7114,
"step": 2122
},
{
"epoch": 2.3320060414664288,
"grad_norm": 0.4373185634613037,
"learning_rate": 7.41147741147741e-05,
"loss": 0.5656,
"step": 2123
},
{
"epoch": 2.333104489908005,
"grad_norm": 1.0036537647247314,
"learning_rate": 7.399267399267399e-05,
"loss": 0.6652,
"step": 2124
},
{
"epoch": 2.334202938349581,
"grad_norm": 2.06589937210083,
"learning_rate": 7.387057387057386e-05,
"loss": 0.6502,
"step": 2125
},
{
"epoch": 2.3353013867911576,
"grad_norm": 1.1616554260253906,
"learning_rate": 7.374847374847375e-05,
"loss": 0.7288,
"step": 2126
},
{
"epoch": 2.3363998352327338,
"grad_norm": 0.4532950520515442,
"learning_rate": 7.362637362637362e-05,
"loss": 0.7696,
"step": 2127
},
{
"epoch": 2.33749828367431,
"grad_norm": 1.0143449306488037,
"learning_rate": 7.35042735042735e-05,
"loss": 1.0185,
"step": 2128
},
{
"epoch": 2.3385967321158865,
"grad_norm": 2.2059850692749023,
"learning_rate": 7.338217338217337e-05,
"loss": 0.6267,
"step": 2129
},
{
"epoch": 2.3396951805574626,
"grad_norm": 0.4883456826210022,
"learning_rate": 7.326007326007325e-05,
"loss": 0.6081,
"step": 2130
},
{
"epoch": 2.3407936289990388,
"grad_norm": 0.42373138666152954,
"learning_rate": 7.313797313797313e-05,
"loss": 0.6204,
"step": 2131
},
{
"epoch": 2.3418920774406153,
"grad_norm": 0.43958979845046997,
"learning_rate": 7.3015873015873e-05,
"loss": 0.7608,
"step": 2132
},
{
"epoch": 2.3429905258821915,
"grad_norm": 0.4493010342121124,
"learning_rate": 7.289377289377289e-05,
"loss": 0.5985,
"step": 2133
},
{
"epoch": 2.3440889743237676,
"grad_norm": 0.38533085584640503,
"learning_rate": 7.277167277167276e-05,
"loss": 0.445,
"step": 2134
},
{
"epoch": 2.3451874227653438,
"grad_norm": 0.37900710105895996,
"learning_rate": 7.264957264957265e-05,
"loss": 0.8466,
"step": 2135
},
{
"epoch": 2.3462858712069203,
"grad_norm": 1.7598285675048828,
"learning_rate": 7.252747252747252e-05,
"loss": 0.6881,
"step": 2136
},
{
"epoch": 2.3473843196484965,
"grad_norm": 0.5551338791847229,
"learning_rate": 7.24053724053724e-05,
"loss": 0.5908,
"step": 2137
},
{
"epoch": 2.3484827680900726,
"grad_norm": 0.42995861172676086,
"learning_rate": 7.228327228327228e-05,
"loss": 0.689,
"step": 2138
},
{
"epoch": 2.349581216531649,
"grad_norm": 0.6428760290145874,
"learning_rate": 7.216117216117216e-05,
"loss": 0.5879,
"step": 2139
},
{
"epoch": 2.3506796649732253,
"grad_norm": 0.6199445724487305,
"learning_rate": 7.203907203907203e-05,
"loss": 0.5275,
"step": 2140
},
{
"epoch": 2.3517781134148015,
"grad_norm": 0.4687311053276062,
"learning_rate": 7.19169719169719e-05,
"loss": 0.7046,
"step": 2141
},
{
"epoch": 2.352876561856378,
"grad_norm": 0.47645121812820435,
"learning_rate": 7.179487179487179e-05,
"loss": 0.4787,
"step": 2142
},
{
"epoch": 2.353975010297954,
"grad_norm": 1.3774843215942383,
"learning_rate": 7.167277167277166e-05,
"loss": 0.565,
"step": 2143
},
{
"epoch": 2.3550734587395303,
"grad_norm": 0.9585548043251038,
"learning_rate": 7.155067155067155e-05,
"loss": 0.7496,
"step": 2144
},
{
"epoch": 2.356171907181107,
"grad_norm": 0.9073938131332397,
"learning_rate": 7.142857142857142e-05,
"loss": 0.6785,
"step": 2145
},
{
"epoch": 2.357270355622683,
"grad_norm": 1.4543087482452393,
"learning_rate": 7.13064713064713e-05,
"loss": 0.4827,
"step": 2146
},
{
"epoch": 2.358368804064259,
"grad_norm": 0.49685895442962646,
"learning_rate": 7.118437118437118e-05,
"loss": 0.5624,
"step": 2147
},
{
"epoch": 2.3594672525058353,
"grad_norm": 0.3820716142654419,
"learning_rate": 7.106227106227105e-05,
"loss": 0.5326,
"step": 2148
},
{
"epoch": 2.360565700947412,
"grad_norm": 0.6018278002738953,
"learning_rate": 7.094017094017094e-05,
"loss": 0.7372,
"step": 2149
},
{
"epoch": 2.361664149388988,
"grad_norm": 0.49245381355285645,
"learning_rate": 7.081807081807082e-05,
"loss": 0.714,
"step": 2150
},
{
"epoch": 2.362762597830564,
"grad_norm": 0.5913417339324951,
"learning_rate": 7.069597069597069e-05,
"loss": 0.6395,
"step": 2151
},
{
"epoch": 2.3638610462721408,
"grad_norm": 0.3142958879470825,
"learning_rate": 7.057387057387056e-05,
"loss": 0.4363,
"step": 2152
},
{
"epoch": 2.364959494713717,
"grad_norm": 0.44251006841659546,
"learning_rate": 7.045177045177044e-05,
"loss": 0.5751,
"step": 2153
},
{
"epoch": 2.366057943155293,
"grad_norm": 0.7642143964767456,
"learning_rate": 7.032967032967032e-05,
"loss": 0.9707,
"step": 2154
},
{
"epoch": 2.367156391596869,
"grad_norm": 0.3676380217075348,
"learning_rate": 7.020757020757021e-05,
"loss": 0.6142,
"step": 2155
},
{
"epoch": 2.3682548400384458,
"grad_norm": 0.43112027645111084,
"learning_rate": 7.008547008547008e-05,
"loss": 0.6194,
"step": 2156
},
{
"epoch": 2.369353288480022,
"grad_norm": 0.5463792681694031,
"learning_rate": 6.996336996336996e-05,
"loss": 0.5478,
"step": 2157
},
{
"epoch": 2.370451736921598,
"grad_norm": 0.5498053431510925,
"learning_rate": 6.984126984126984e-05,
"loss": 0.8373,
"step": 2158
},
{
"epoch": 2.3715501853631746,
"grad_norm": 0.5144299268722534,
"learning_rate": 6.971916971916971e-05,
"loss": 0.7033,
"step": 2159
},
{
"epoch": 2.3726486338047508,
"grad_norm": 0.4049033522605896,
"learning_rate": 6.95970695970696e-05,
"loss": 0.6257,
"step": 2160
},
{
"epoch": 2.373747082246327,
"grad_norm": 0.8007866740226746,
"learning_rate": 6.947496947496947e-05,
"loss": 1.1859,
"step": 2161
},
{
"epoch": 2.3748455306879035,
"grad_norm": 0.6302816867828369,
"learning_rate": 6.935286935286935e-05,
"loss": 0.4972,
"step": 2162
},
{
"epoch": 2.3759439791294796,
"grad_norm": 0.4181542694568634,
"learning_rate": 6.923076923076922e-05,
"loss": 0.5543,
"step": 2163
},
{
"epoch": 2.3770424275710558,
"grad_norm": 0.45409703254699707,
"learning_rate": 6.91086691086691e-05,
"loss": 0.6237,
"step": 2164
},
{
"epoch": 2.3781408760126324,
"grad_norm": 0.5172666907310486,
"learning_rate": 6.898656898656898e-05,
"loss": 0.5798,
"step": 2165
},
{
"epoch": 2.3792393244542085,
"grad_norm": 0.7849127054214478,
"learning_rate": 6.886446886446885e-05,
"loss": 0.8282,
"step": 2166
},
{
"epoch": 2.3803377728957846,
"grad_norm": 0.4041041135787964,
"learning_rate": 6.874236874236874e-05,
"loss": 0.5046,
"step": 2167
},
{
"epoch": 2.3814362213373608,
"grad_norm": 0.35880064964294434,
"learning_rate": 6.862026862026862e-05,
"loss": 0.4096,
"step": 2168
},
{
"epoch": 2.3825346697789374,
"grad_norm": 0.5949457883834839,
"learning_rate": 6.84981684981685e-05,
"loss": 0.6666,
"step": 2169
},
{
"epoch": 2.3836331182205135,
"grad_norm": 0.6332186460494995,
"learning_rate": 6.837606837606837e-05,
"loss": 0.9715,
"step": 2170
},
{
"epoch": 2.3847315666620896,
"grad_norm": 0.3173432946205139,
"learning_rate": 6.825396825396824e-05,
"loss": 0.6792,
"step": 2171
},
{
"epoch": 2.385830015103666,
"grad_norm": 0.7556782364845276,
"learning_rate": 6.813186813186813e-05,
"loss": 0.7267,
"step": 2172
},
{
"epoch": 2.3869284635452424,
"grad_norm": 0.43191683292388916,
"learning_rate": 6.800976800976801e-05,
"loss": 0.5841,
"step": 2173
},
{
"epoch": 2.3880269119868185,
"grad_norm": 0.4010660946369171,
"learning_rate": 6.788766788766788e-05,
"loss": 0.7491,
"step": 2174
},
{
"epoch": 2.389125360428395,
"grad_norm": 0.6889204382896423,
"learning_rate": 6.776556776556775e-05,
"loss": 0.4539,
"step": 2175
},
{
"epoch": 2.390223808869971,
"grad_norm": 0.4509136974811554,
"learning_rate": 6.764346764346764e-05,
"loss": 0.7066,
"step": 2176
},
{
"epoch": 2.3913222573115474,
"grad_norm": 0.4313298463821411,
"learning_rate": 6.752136752136751e-05,
"loss": 0.6292,
"step": 2177
},
{
"epoch": 2.392420705753124,
"grad_norm": 0.7713265419006348,
"learning_rate": 6.73992673992674e-05,
"loss": 0.8392,
"step": 2178
},
{
"epoch": 2.3935191541947,
"grad_norm": 0.5283428430557251,
"learning_rate": 6.727716727716727e-05,
"loss": 0.6912,
"step": 2179
},
{
"epoch": 2.394617602636276,
"grad_norm": 0.40429314970970154,
"learning_rate": 6.715506715506716e-05,
"loss": 0.4335,
"step": 2180
},
{
"epoch": 2.3957160510778523,
"grad_norm": 0.6888754367828369,
"learning_rate": 6.703296703296703e-05,
"loss": 0.6276,
"step": 2181
},
{
"epoch": 2.396814499519429,
"grad_norm": 0.5595026612281799,
"learning_rate": 6.69108669108669e-05,
"loss": 0.7806,
"step": 2182
},
{
"epoch": 2.397912947961005,
"grad_norm": 0.32394587993621826,
"learning_rate": 6.678876678876678e-05,
"loss": 0.5531,
"step": 2183
},
{
"epoch": 2.399011396402581,
"grad_norm": 0.5909039974212646,
"learning_rate": 6.666666666666666e-05,
"loss": 0.4932,
"step": 2184
},
{
"epoch": 2.400109844844158,
"grad_norm": 0.4148501455783844,
"learning_rate": 6.654456654456654e-05,
"loss": 0.5637,
"step": 2185
},
{
"epoch": 2.401208293285734,
"grad_norm": 0.558403491973877,
"learning_rate": 6.642246642246641e-05,
"loss": 0.5733,
"step": 2186
},
{
"epoch": 2.40230674172731,
"grad_norm": 0.5171149373054504,
"learning_rate": 6.630036630036629e-05,
"loss": 0.6931,
"step": 2187
},
{
"epoch": 2.403405190168886,
"grad_norm": 0.44966164231300354,
"learning_rate": 6.617826617826617e-05,
"loss": 0.5061,
"step": 2188
},
{
"epoch": 2.404503638610463,
"grad_norm": 0.45499417185783386,
"learning_rate": 6.605616605616606e-05,
"loss": 0.3726,
"step": 2189
},
{
"epoch": 2.405602087052039,
"grad_norm": 0.5790139436721802,
"learning_rate": 6.593406593406593e-05,
"loss": 0.6647,
"step": 2190
},
{
"epoch": 2.4067005354936155,
"grad_norm": 0.5948793292045593,
"learning_rate": 6.581196581196581e-05,
"loss": 0.765,
"step": 2191
},
{
"epoch": 2.4077989839351917,
"grad_norm": 0.5925643444061279,
"learning_rate": 6.568986568986569e-05,
"loss": 0.889,
"step": 2192
},
{
"epoch": 2.408897432376768,
"grad_norm": 0.5776219964027405,
"learning_rate": 6.556776556776556e-05,
"loss": 0.5506,
"step": 2193
},
{
"epoch": 2.409995880818344,
"grad_norm": 0.44397997856140137,
"learning_rate": 6.544566544566544e-05,
"loss": 0.5372,
"step": 2194
},
{
"epoch": 2.4110943292599205,
"grad_norm": 0.45733606815338135,
"learning_rate": 6.532356532356532e-05,
"loss": 0.7207,
"step": 2195
},
{
"epoch": 2.4121927777014966,
"grad_norm": 0.38223645091056824,
"learning_rate": 6.52014652014652e-05,
"loss": 0.5888,
"step": 2196
},
{
"epoch": 2.413291226143073,
"grad_norm": 0.3642580211162567,
"learning_rate": 6.507936507936507e-05,
"loss": 0.5687,
"step": 2197
},
{
"epoch": 2.4143896745846494,
"grad_norm": 0.42435723543167114,
"learning_rate": 6.495726495726494e-05,
"loss": 0.6056,
"step": 2198
},
{
"epoch": 2.4154881230262255,
"grad_norm": 0.4998740255832672,
"learning_rate": 6.483516483516483e-05,
"loss": 0.6813,
"step": 2199
},
{
"epoch": 2.4165865714678016,
"grad_norm": 0.47158849239349365,
"learning_rate": 6.47130647130647e-05,
"loss": 0.5585,
"step": 2200
},
{
"epoch": 2.417685019909378,
"grad_norm": 0.4780612289905548,
"learning_rate": 6.459096459096459e-05,
"loss": 0.4941,
"step": 2201
},
{
"epoch": 2.4187834683509544,
"grad_norm": 0.5073630809783936,
"learning_rate": 6.446886446886447e-05,
"loss": 0.4549,
"step": 2202
},
{
"epoch": 2.4198819167925305,
"grad_norm": 0.4311310052871704,
"learning_rate": 6.434676434676435e-05,
"loss": 0.4419,
"step": 2203
},
{
"epoch": 2.4209803652341066,
"grad_norm": 0.3557896316051483,
"learning_rate": 6.422466422466422e-05,
"loss": 0.6973,
"step": 2204
},
{
"epoch": 2.4220788136756832,
"grad_norm": 0.6171516180038452,
"learning_rate": 6.410256410256409e-05,
"loss": 0.7554,
"step": 2205
},
{
"epoch": 2.4231772621172594,
"grad_norm": 0.4687957465648651,
"learning_rate": 6.398046398046397e-05,
"loss": 0.7429,
"step": 2206
},
{
"epoch": 2.4242757105588355,
"grad_norm": 0.8685696125030518,
"learning_rate": 6.385836385836386e-05,
"loss": 0.5896,
"step": 2207
},
{
"epoch": 2.425374159000412,
"grad_norm": 0.39599040150642395,
"learning_rate": 6.373626373626373e-05,
"loss": 0.4744,
"step": 2208
},
{
"epoch": 2.4264726074419882,
"grad_norm": 0.9079630970954895,
"learning_rate": 6.36141636141636e-05,
"loss": 0.6067,
"step": 2209
},
{
"epoch": 2.4275710558835644,
"grad_norm": 0.5051462054252625,
"learning_rate": 6.349206349206349e-05,
"loss": 0.7314,
"step": 2210
},
{
"epoch": 2.428669504325141,
"grad_norm": 0.4899844825267792,
"learning_rate": 6.336996336996336e-05,
"loss": 0.7086,
"step": 2211
},
{
"epoch": 2.429767952766717,
"grad_norm": 0.5135432481765747,
"learning_rate": 6.324786324786325e-05,
"loss": 0.5261,
"step": 2212
},
{
"epoch": 2.4308664012082932,
"grad_norm": 0.6025048494338989,
"learning_rate": 6.312576312576312e-05,
"loss": 0.5276,
"step": 2213
},
{
"epoch": 2.4319648496498694,
"grad_norm": 0.6931442022323608,
"learning_rate": 6.3003663003663e-05,
"loss": 0.6535,
"step": 2214
},
{
"epoch": 2.433063298091446,
"grad_norm": 0.695106565952301,
"learning_rate": 6.288156288156288e-05,
"loss": 0.9183,
"step": 2215
},
{
"epoch": 2.434161746533022,
"grad_norm": 0.450100302696228,
"learning_rate": 6.275946275946275e-05,
"loss": 0.5049,
"step": 2216
},
{
"epoch": 2.4352601949745982,
"grad_norm": 0.5539785623550415,
"learning_rate": 6.263736263736263e-05,
"loss": 0.5735,
"step": 2217
},
{
"epoch": 2.436358643416175,
"grad_norm": 0.5560977458953857,
"learning_rate": 6.25152625152625e-05,
"loss": 0.7364,
"step": 2218
},
{
"epoch": 2.437457091857751,
"grad_norm": 0.740195095539093,
"learning_rate": 6.239316239316239e-05,
"loss": 0.7839,
"step": 2219
},
{
"epoch": 2.438555540299327,
"grad_norm": 0.9324271082878113,
"learning_rate": 6.227106227106226e-05,
"loss": 0.6365,
"step": 2220
},
{
"epoch": 2.4396539887409037,
"grad_norm": 0.5540104508399963,
"learning_rate": 6.214896214896215e-05,
"loss": 0.6586,
"step": 2221
},
{
"epoch": 2.44075243718248,
"grad_norm": 0.5028054714202881,
"learning_rate": 6.202686202686202e-05,
"loss": 0.4422,
"step": 2222
},
{
"epoch": 2.441850885624056,
"grad_norm": 0.7052125930786133,
"learning_rate": 6.190476190476189e-05,
"loss": 0.7248,
"step": 2223
},
{
"epoch": 2.4429493340656325,
"grad_norm": 0.6705207824707031,
"learning_rate": 6.178266178266178e-05,
"loss": 0.81,
"step": 2224
},
{
"epoch": 2.4440477825072087,
"grad_norm": 0.7996514439582825,
"learning_rate": 6.166056166056166e-05,
"loss": 0.382,
"step": 2225
},
{
"epoch": 2.445146230948785,
"grad_norm": 1.5169689655303955,
"learning_rate": 6.153846153846154e-05,
"loss": 0.7373,
"step": 2226
},
{
"epoch": 2.446244679390361,
"grad_norm": 0.8039339780807495,
"learning_rate": 6.141636141636141e-05,
"loss": 0.8609,
"step": 2227
},
{
"epoch": 2.4473431278319375,
"grad_norm": 0.6489125490188599,
"learning_rate": 6.129426129426128e-05,
"loss": 0.6309,
"step": 2228
},
{
"epoch": 2.4484415762735137,
"grad_norm": 0.533184826374054,
"learning_rate": 6.117216117216116e-05,
"loss": 0.5166,
"step": 2229
},
{
"epoch": 2.44954002471509,
"grad_norm": 0.5699225068092346,
"learning_rate": 6.105006105006105e-05,
"loss": 0.7276,
"step": 2230
},
{
"epoch": 2.4506384731566664,
"grad_norm": 0.5552012324333191,
"learning_rate": 6.092796092796092e-05,
"loss": 0.636,
"step": 2231
},
{
"epoch": 2.4517369215982425,
"grad_norm": 0.4785599112510681,
"learning_rate": 6.08058608058608e-05,
"loss": 0.6362,
"step": 2232
},
{
"epoch": 2.4528353700398187,
"grad_norm": 0.740872859954834,
"learning_rate": 6.068376068376068e-05,
"loss": 0.5603,
"step": 2233
},
{
"epoch": 2.453933818481395,
"grad_norm": 0.5217441916465759,
"learning_rate": 6.056166056166056e-05,
"loss": 0.6306,
"step": 2234
},
{
"epoch": 2.4550322669229714,
"grad_norm": 0.446481853723526,
"learning_rate": 6.043956043956044e-05,
"loss": 0.8156,
"step": 2235
},
{
"epoch": 2.4561307153645475,
"grad_norm": 0.6527410745620728,
"learning_rate": 6.031746031746031e-05,
"loss": 0.7057,
"step": 2236
},
{
"epoch": 2.4572291638061237,
"grad_norm": 0.6801958680152893,
"learning_rate": 6.019536019536019e-05,
"loss": 0.7718,
"step": 2237
},
{
"epoch": 2.4583276122477002,
"grad_norm": 1.0723007917404175,
"learning_rate": 6.007326007326007e-05,
"loss": 0.5552,
"step": 2238
},
{
"epoch": 2.4594260606892764,
"grad_norm": 0.4058208763599396,
"learning_rate": 5.9951159951159945e-05,
"loss": 0.5035,
"step": 2239
},
{
"epoch": 2.4605245091308525,
"grad_norm": 0.5384330153465271,
"learning_rate": 5.9829059829059824e-05,
"loss": 0.5059,
"step": 2240
},
{
"epoch": 2.461622957572429,
"grad_norm": 0.7797716856002808,
"learning_rate": 5.9706959706959696e-05,
"loss": 0.5613,
"step": 2241
},
{
"epoch": 2.4627214060140052,
"grad_norm": 2.9689226150512695,
"learning_rate": 5.958485958485958e-05,
"loss": 0.6219,
"step": 2242
},
{
"epoch": 2.4638198544555814,
"grad_norm": 0.47863152623176575,
"learning_rate": 5.946275946275946e-05,
"loss": 0.5498,
"step": 2243
},
{
"epoch": 2.464918302897158,
"grad_norm": 0.49707144498825073,
"learning_rate": 5.934065934065933e-05,
"loss": 0.775,
"step": 2244
},
{
"epoch": 2.466016751338734,
"grad_norm": 0.3437495529651642,
"learning_rate": 5.921855921855922e-05,
"loss": 0.4592,
"step": 2245
},
{
"epoch": 2.4671151997803102,
"grad_norm": 0.7298309206962585,
"learning_rate": 5.9096459096459096e-05,
"loss": 0.5374,
"step": 2246
},
{
"epoch": 2.4682136482218864,
"grad_norm": 0.6666691303253174,
"learning_rate": 5.897435897435897e-05,
"loss": 0.424,
"step": 2247
},
{
"epoch": 2.469312096663463,
"grad_norm": 0.5841661691665649,
"learning_rate": 5.8852258852258847e-05,
"loss": 0.5316,
"step": 2248
},
{
"epoch": 2.470410545105039,
"grad_norm": 0.4921081066131592,
"learning_rate": 5.873015873015872e-05,
"loss": 0.6901,
"step": 2249
},
{
"epoch": 2.4715089935466152,
"grad_norm": 0.4779987633228302,
"learning_rate": 5.8608058608058604e-05,
"loss": 0.8976,
"step": 2250
},
{
"epoch": 2.472607441988192,
"grad_norm": 0.43142780661582947,
"learning_rate": 5.848595848595848e-05,
"loss": 0.4915,
"step": 2251
},
{
"epoch": 2.473705890429768,
"grad_norm": 1.132870078086853,
"learning_rate": 5.8363858363858355e-05,
"loss": 0.6633,
"step": 2252
},
{
"epoch": 2.474804338871344,
"grad_norm": 0.5674893856048584,
"learning_rate": 5.824175824175824e-05,
"loss": 0.5023,
"step": 2253
},
{
"epoch": 2.4759027873129207,
"grad_norm": 0.42495957016944885,
"learning_rate": 5.811965811965811e-05,
"loss": 0.6544,
"step": 2254
},
{
"epoch": 2.477001235754497,
"grad_norm": 0.8031434416770935,
"learning_rate": 5.799755799755799e-05,
"loss": 0.892,
"step": 2255
},
{
"epoch": 2.478099684196073,
"grad_norm": 0.7715115547180176,
"learning_rate": 5.7875457875457876e-05,
"loss": 0.5659,
"step": 2256
},
{
"epoch": 2.4791981326376495,
"grad_norm": 0.6882114410400391,
"learning_rate": 5.775335775335775e-05,
"loss": 0.5154,
"step": 2257
},
{
"epoch": 2.4802965810792257,
"grad_norm": 0.4994114935398102,
"learning_rate": 5.763125763125763e-05,
"loss": 0.6001,
"step": 2258
},
{
"epoch": 2.481395029520802,
"grad_norm": 0.45008450746536255,
"learning_rate": 5.7509157509157506e-05,
"loss": 0.7076,
"step": 2259
},
{
"epoch": 2.482493477962378,
"grad_norm": 0.654270350933075,
"learning_rate": 5.738705738705738e-05,
"loss": 0.5809,
"step": 2260
},
{
"epoch": 2.4835919264039545,
"grad_norm": 0.6344896554946899,
"learning_rate": 5.726495726495726e-05,
"loss": 0.6059,
"step": 2261
},
{
"epoch": 2.4846903748455307,
"grad_norm": 0.44090238213539124,
"learning_rate": 5.7142857142857135e-05,
"loss": 0.7953,
"step": 2262
},
{
"epoch": 2.485788823287107,
"grad_norm": 0.47564128041267395,
"learning_rate": 5.7020757020757014e-05,
"loss": 0.5062,
"step": 2263
},
{
"epoch": 2.4868872717286834,
"grad_norm": 0.3644583225250244,
"learning_rate": 5.68986568986569e-05,
"loss": 0.6417,
"step": 2264
},
{
"epoch": 2.4879857201702595,
"grad_norm": 0.5264548659324646,
"learning_rate": 5.677655677655677e-05,
"loss": 0.5971,
"step": 2265
},
{
"epoch": 2.4890841686118357,
"grad_norm": 0.7300589680671692,
"learning_rate": 5.665445665445665e-05,
"loss": 0.6249,
"step": 2266
},
{
"epoch": 2.490182617053412,
"grad_norm": 0.9016311764717102,
"learning_rate": 5.653235653235652e-05,
"loss": 0.5761,
"step": 2267
},
{
"epoch": 2.4912810654949884,
"grad_norm": 0.7480237483978271,
"learning_rate": 5.641025641025641e-05,
"loss": 0.4026,
"step": 2268
},
{
"epoch": 2.4923795139365645,
"grad_norm": 0.5738864541053772,
"learning_rate": 5.6288156288156286e-05,
"loss": 0.8657,
"step": 2269
},
{
"epoch": 2.493477962378141,
"grad_norm": 0.7320820093154907,
"learning_rate": 5.616605616605616e-05,
"loss": 0.7341,
"step": 2270
},
{
"epoch": 2.4945764108197173,
"grad_norm": 0.7029497623443604,
"learning_rate": 5.6043956043956037e-05,
"loss": 0.7597,
"step": 2271
},
{
"epoch": 2.4956748592612934,
"grad_norm": 0.5160001516342163,
"learning_rate": 5.592185592185592e-05,
"loss": 0.6488,
"step": 2272
},
{
"epoch": 2.4967733077028695,
"grad_norm": 0.5425933003425598,
"learning_rate": 5.5799755799755794e-05,
"loss": 0.7102,
"step": 2273
},
{
"epoch": 2.497871756144446,
"grad_norm": 0.5881295204162598,
"learning_rate": 5.567765567765567e-05,
"loss": 0.8123,
"step": 2274
},
{
"epoch": 2.4989702045860223,
"grad_norm": 0.6021397113800049,
"learning_rate": 5.5555555555555545e-05,
"loss": 0.8887,
"step": 2275
},
{
"epoch": 2.5000686530275984,
"grad_norm": 0.4754411578178406,
"learning_rate": 5.543345543345543e-05,
"loss": 0.8162,
"step": 2276
},
{
"epoch": 2.501167101469175,
"grad_norm": 0.46976983547210693,
"learning_rate": 5.531135531135531e-05,
"loss": 0.4177,
"step": 2277
},
{
"epoch": 2.502265549910751,
"grad_norm": 0.4946482181549072,
"learning_rate": 5.518925518925518e-05,
"loss": 0.6997,
"step": 2278
},
{
"epoch": 2.5033639983523273,
"grad_norm": 0.49166280031204224,
"learning_rate": 5.5067155067155066e-05,
"loss": 0.6436,
"step": 2279
},
{
"epoch": 2.5044624467939034,
"grad_norm": 0.40157628059387207,
"learning_rate": 5.494505494505494e-05,
"loss": 0.6998,
"step": 2280
},
{
"epoch": 2.50556089523548,
"grad_norm": 0.4139937162399292,
"learning_rate": 5.482295482295482e-05,
"loss": 0.4021,
"step": 2281
},
{
"epoch": 2.506659343677056,
"grad_norm": 3.6814892292022705,
"learning_rate": 5.4700854700854696e-05,
"loss": 0.6402,
"step": 2282
},
{
"epoch": 2.5077577921186327,
"grad_norm": 0.3136257529258728,
"learning_rate": 5.4578754578754574e-05,
"loss": 0.5364,
"step": 2283
},
{
"epoch": 2.508856240560209,
"grad_norm": 0.42901432514190674,
"learning_rate": 5.445665445665445e-05,
"loss": 0.6838,
"step": 2284
},
{
"epoch": 2.509954689001785,
"grad_norm": 0.8462406992912292,
"learning_rate": 5.433455433455433e-05,
"loss": 0.4232,
"step": 2285
},
{
"epoch": 2.511053137443361,
"grad_norm": 1.244150996208191,
"learning_rate": 5.4212454212454204e-05,
"loss": 0.6192,
"step": 2286
},
{
"epoch": 2.5121515858849373,
"grad_norm": 0.834296703338623,
"learning_rate": 5.409035409035409e-05,
"loss": 0.548,
"step": 2287
},
{
"epoch": 2.513250034326514,
"grad_norm": 0.4279276430606842,
"learning_rate": 5.396825396825396e-05,
"loss": 0.7549,
"step": 2288
},
{
"epoch": 2.51434848276809,
"grad_norm": 0.5770757794380188,
"learning_rate": 5.384615384615384e-05,
"loss": 0.6156,
"step": 2289
},
{
"epoch": 2.5154469312096666,
"grad_norm": 0.41763821244239807,
"learning_rate": 5.3724053724053725e-05,
"loss": 0.5019,
"step": 2290
},
{
"epoch": 2.5165453796512427,
"grad_norm": 0.5212944746017456,
"learning_rate": 5.36019536019536e-05,
"loss": 0.6132,
"step": 2291
},
{
"epoch": 2.517643828092819,
"grad_norm": 0.44493457674980164,
"learning_rate": 5.3479853479853476e-05,
"loss": 0.4162,
"step": 2292
},
{
"epoch": 2.518742276534395,
"grad_norm": 0.46922022104263306,
"learning_rate": 5.335775335775335e-05,
"loss": 0.4624,
"step": 2293
},
{
"epoch": 2.5198407249759716,
"grad_norm": 0.41906213760375977,
"learning_rate": 5.3235653235653233e-05,
"loss": 0.612,
"step": 2294
},
{
"epoch": 2.5209391734175477,
"grad_norm": 0.620276153087616,
"learning_rate": 5.311355311355311e-05,
"loss": 0.6322,
"step": 2295
},
{
"epoch": 2.522037621859124,
"grad_norm": 0.6597051620483398,
"learning_rate": 5.2991452991452984e-05,
"loss": 0.7659,
"step": 2296
},
{
"epoch": 2.5231360703007004,
"grad_norm": 4.377660274505615,
"learning_rate": 5.286935286935286e-05,
"loss": 0.8294,
"step": 2297
},
{
"epoch": 2.5242345187422766,
"grad_norm": 0.6086331009864807,
"learning_rate": 5.274725274725275e-05,
"loss": 0.5164,
"step": 2298
},
{
"epoch": 2.5253329671838527,
"grad_norm": 0.5100352168083191,
"learning_rate": 5.262515262515262e-05,
"loss": 0.6319,
"step": 2299
},
{
"epoch": 2.526431415625429,
"grad_norm": 0.6642487049102783,
"learning_rate": 5.25030525030525e-05,
"loss": 0.533,
"step": 2300
},
{
"epoch": 2.5275298640670054,
"grad_norm": 0.5834927558898926,
"learning_rate": 5.238095238095237e-05,
"loss": 0.5669,
"step": 2301
},
{
"epoch": 2.5286283125085816,
"grad_norm": 0.530815064907074,
"learning_rate": 5.2258852258852256e-05,
"loss": 0.6189,
"step": 2302
},
{
"epoch": 2.529726760950158,
"grad_norm": 0.6275864243507385,
"learning_rate": 5.2136752136752135e-05,
"loss": 0.8403,
"step": 2303
},
{
"epoch": 2.5308252093917343,
"grad_norm": 0.5878366827964783,
"learning_rate": 5.201465201465201e-05,
"loss": 0.6176,
"step": 2304
},
{
"epoch": 2.5319236578333104,
"grad_norm": 0.37410980463027954,
"learning_rate": 5.189255189255189e-05,
"loss": 0.6337,
"step": 2305
},
{
"epoch": 2.5330221062748866,
"grad_norm": 0.43912917375564575,
"learning_rate": 5.1770451770451764e-05,
"loss": 0.5348,
"step": 2306
},
{
"epoch": 2.534120554716463,
"grad_norm": 1.4737471342086792,
"learning_rate": 5.164835164835164e-05,
"loss": 0.4862,
"step": 2307
},
{
"epoch": 2.5352190031580393,
"grad_norm": 0.3978705108165741,
"learning_rate": 5.152625152625152e-05,
"loss": 0.7929,
"step": 2308
},
{
"epoch": 2.5363174515996154,
"grad_norm": 0.3852058947086334,
"learning_rate": 5.14041514041514e-05,
"loss": 0.5895,
"step": 2309
},
{
"epoch": 2.537415900041192,
"grad_norm": 17.968448638916016,
"learning_rate": 5.128205128205128e-05,
"loss": 0.4661,
"step": 2310
},
{
"epoch": 2.538514348482768,
"grad_norm": 0.9369175434112549,
"learning_rate": 5.115995115995115e-05,
"loss": 0.5957,
"step": 2311
},
{
"epoch": 2.5396127969243443,
"grad_norm": 0.612750768661499,
"learning_rate": 5.103785103785103e-05,
"loss": 0.6786,
"step": 2312
},
{
"epoch": 2.5407112453659204,
"grad_norm": 0.588512659072876,
"learning_rate": 5.0915750915750915e-05,
"loss": 1.0482,
"step": 2313
},
{
"epoch": 2.541809693807497,
"grad_norm": 0.4964143633842468,
"learning_rate": 5.079365079365079e-05,
"loss": 0.5673,
"step": 2314
},
{
"epoch": 2.542908142249073,
"grad_norm": 0.5807982683181763,
"learning_rate": 5.0671550671550666e-05,
"loss": 0.5493,
"step": 2315
},
{
"epoch": 2.5440065906906497,
"grad_norm": 0.5131386518478394,
"learning_rate": 5.054945054945055e-05,
"loss": 0.5947,
"step": 2316
},
{
"epoch": 2.545105039132226,
"grad_norm": 0.4521124064922333,
"learning_rate": 5.0427350427350424e-05,
"loss": 0.5554,
"step": 2317
},
{
"epoch": 2.546203487573802,
"grad_norm": 0.9441378712654114,
"learning_rate": 5.03052503052503e-05,
"loss": 0.6991,
"step": 2318
},
{
"epoch": 2.547301936015378,
"grad_norm": 0.6353013515472412,
"learning_rate": 5.0183150183150174e-05,
"loss": 0.5308,
"step": 2319
},
{
"epoch": 2.5484003844569547,
"grad_norm": 0.5940631628036499,
"learning_rate": 5.006105006105006e-05,
"loss": 0.6536,
"step": 2320
},
{
"epoch": 2.549498832898531,
"grad_norm": 0.5457591414451599,
"learning_rate": 4.993894993894994e-05,
"loss": 0.6927,
"step": 2321
},
{
"epoch": 2.550597281340107,
"grad_norm": 0.6265937685966492,
"learning_rate": 4.981684981684981e-05,
"loss": 0.6341,
"step": 2322
},
{
"epoch": 2.5516957297816836,
"grad_norm": 0.5842925310134888,
"learning_rate": 4.969474969474969e-05,
"loss": 0.4583,
"step": 2323
},
{
"epoch": 2.5527941782232597,
"grad_norm": 0.5363351106643677,
"learning_rate": 4.957264957264956e-05,
"loss": 0.6882,
"step": 2324
},
{
"epoch": 2.553892626664836,
"grad_norm": 0.3677682876586914,
"learning_rate": 4.9450549450549446e-05,
"loss": 0.5671,
"step": 2325
},
{
"epoch": 2.554991075106412,
"grad_norm": 1.222985863685608,
"learning_rate": 4.9328449328449325e-05,
"loss": 0.4936,
"step": 2326
},
{
"epoch": 2.5560895235479886,
"grad_norm": 1.187898874282837,
"learning_rate": 4.92063492063492e-05,
"loss": 0.4893,
"step": 2327
},
{
"epoch": 2.5571879719895647,
"grad_norm": 0.38843801617622375,
"learning_rate": 4.908424908424908e-05,
"loss": 0.6512,
"step": 2328
},
{
"epoch": 2.558286420431141,
"grad_norm": 0.9550036191940308,
"learning_rate": 4.896214896214896e-05,
"loss": 0.6055,
"step": 2329
},
{
"epoch": 2.5593848688727174,
"grad_norm": 0.80762779712677,
"learning_rate": 4.884004884004883e-05,
"loss": 0.8852,
"step": 2330
},
{
"epoch": 2.5604833173142936,
"grad_norm": 0.7496643662452698,
"learning_rate": 4.871794871794872e-05,
"loss": 0.6535,
"step": 2331
},
{
"epoch": 2.5615817657558697,
"grad_norm": 0.5532578825950623,
"learning_rate": 4.859584859584859e-05,
"loss": 0.6336,
"step": 2332
},
{
"epoch": 2.562680214197446,
"grad_norm": 0.4058012366294861,
"learning_rate": 4.847374847374847e-05,
"loss": 0.6529,
"step": 2333
},
{
"epoch": 2.5637786626390224,
"grad_norm": 3.1913115978240967,
"learning_rate": 4.835164835164835e-05,
"loss": 0.548,
"step": 2334
},
{
"epoch": 2.5648771110805986,
"grad_norm": 0.47375988960266113,
"learning_rate": 4.822954822954822e-05,
"loss": 0.7567,
"step": 2335
},
{
"epoch": 2.565975559522175,
"grad_norm": 0.5287726521492004,
"learning_rate": 4.8107448107448106e-05,
"loss": 0.6009,
"step": 2336
},
{
"epoch": 2.5670740079637513,
"grad_norm": 0.43966931104660034,
"learning_rate": 4.798534798534798e-05,
"loss": 0.5538,
"step": 2337
},
{
"epoch": 2.5681724564053274,
"grad_norm": 0.6683239340782166,
"learning_rate": 4.7863247863247856e-05,
"loss": 0.3999,
"step": 2338
},
{
"epoch": 2.5692709048469036,
"grad_norm": 0.5260687470436096,
"learning_rate": 4.774114774114774e-05,
"loss": 0.7212,
"step": 2339
},
{
"epoch": 2.57036935328848,
"grad_norm": 1.086850881576538,
"learning_rate": 4.7619047619047614e-05,
"loss": 0.7439,
"step": 2340
},
{
"epoch": 2.5714678017300563,
"grad_norm": 0.9744517207145691,
"learning_rate": 4.749694749694749e-05,
"loss": 0.5625,
"step": 2341
},
{
"epoch": 2.5725662501716324,
"grad_norm": 0.6829352974891663,
"learning_rate": 4.737484737484738e-05,
"loss": 0.5241,
"step": 2342
},
{
"epoch": 2.573664698613209,
"grad_norm": 0.9441612958908081,
"learning_rate": 4.725274725274725e-05,
"loss": 0.8815,
"step": 2343
},
{
"epoch": 2.574763147054785,
"grad_norm": 0.9406607151031494,
"learning_rate": 4.713064713064713e-05,
"loss": 0.7176,
"step": 2344
},
{
"epoch": 2.5758615954963613,
"grad_norm": 0.6601364016532898,
"learning_rate": 4.7008547008547e-05,
"loss": 0.7713,
"step": 2345
},
{
"epoch": 2.5769600439379374,
"grad_norm": 2.5189599990844727,
"learning_rate": 4.688644688644688e-05,
"loss": 0.5572,
"step": 2346
},
{
"epoch": 2.578058492379514,
"grad_norm": 0.7295210957527161,
"learning_rate": 4.6764346764346765e-05,
"loss": 0.4431,
"step": 2347
},
{
"epoch": 2.57915694082109,
"grad_norm": 0.5053385496139526,
"learning_rate": 4.6642246642246637e-05,
"loss": 0.4881,
"step": 2348
},
{
"epoch": 2.5802553892626667,
"grad_norm": 0.6556063890457153,
"learning_rate": 4.6520146520146515e-05,
"loss": 0.5168,
"step": 2349
},
{
"epoch": 2.581353837704243,
"grad_norm": 0.37052014470100403,
"learning_rate": 4.639804639804639e-05,
"loss": 0.3954,
"step": 2350
},
{
"epoch": 2.582452286145819,
"grad_norm": 0.5975561738014221,
"learning_rate": 4.627594627594627e-05,
"loss": 0.5714,
"step": 2351
},
{
"epoch": 2.583550734587395,
"grad_norm": 0.7273014187812805,
"learning_rate": 4.615384615384615e-05,
"loss": 0.7287,
"step": 2352
},
{
"epoch": 2.5846491830289717,
"grad_norm": 0.566586971282959,
"learning_rate": 4.603174603174602e-05,
"loss": 0.5589,
"step": 2353
},
{
"epoch": 2.585747631470548,
"grad_norm": 0.5846517086029053,
"learning_rate": 4.590964590964591e-05,
"loss": 0.5061,
"step": 2354
},
{
"epoch": 2.586846079912124,
"grad_norm": 0.7470859885215759,
"learning_rate": 4.578754578754579e-05,
"loss": 0.5433,
"step": 2355
},
{
"epoch": 2.5879445283537006,
"grad_norm": 0.5419175624847412,
"learning_rate": 4.566544566544566e-05,
"loss": 0.5502,
"step": 2356
},
{
"epoch": 2.5890429767952767,
"grad_norm": 1.507851004600525,
"learning_rate": 4.554334554334554e-05,
"loss": 0.7399,
"step": 2357
},
{
"epoch": 2.590141425236853,
"grad_norm": 1.4420006275177002,
"learning_rate": 4.542124542124542e-05,
"loss": 0.4233,
"step": 2358
},
{
"epoch": 2.591239873678429,
"grad_norm": 0.6471789479255676,
"learning_rate": 4.5299145299145296e-05,
"loss": 0.4052,
"step": 2359
},
{
"epoch": 2.5923383221200056,
"grad_norm": 0.5886567831039429,
"learning_rate": 4.5177045177045174e-05,
"loss": 0.7197,
"step": 2360
},
{
"epoch": 2.5934367705615817,
"grad_norm": 0.843024492263794,
"learning_rate": 4.5054945054945046e-05,
"loss": 0.7636,
"step": 2361
},
{
"epoch": 2.5945352190031583,
"grad_norm": 0.8689064979553223,
"learning_rate": 4.493284493284493e-05,
"loss": 0.6694,
"step": 2362
},
{
"epoch": 2.5956336674447344,
"grad_norm": 0.5112485289573669,
"learning_rate": 4.4810744810744804e-05,
"loss": 0.5338,
"step": 2363
},
{
"epoch": 2.5967321158863106,
"grad_norm": 0.4828614294528961,
"learning_rate": 4.468864468864468e-05,
"loss": 0.8519,
"step": 2364
},
{
"epoch": 2.5978305643278867,
"grad_norm": 0.5644575357437134,
"learning_rate": 4.456654456654457e-05,
"loss": 0.5605,
"step": 2365
},
{
"epoch": 2.598929012769463,
"grad_norm": 0.7749584913253784,
"learning_rate": 4.444444444444444e-05,
"loss": 0.6697,
"step": 2366
},
{
"epoch": 2.6000274612110394,
"grad_norm": 0.9038271307945251,
"learning_rate": 4.432234432234432e-05,
"loss": 0.7242,
"step": 2367
},
{
"epoch": 2.6011259096526156,
"grad_norm": 0.5102944374084473,
"learning_rate": 4.42002442002442e-05,
"loss": 0.5841,
"step": 2368
},
{
"epoch": 2.602224358094192,
"grad_norm": 0.5072823762893677,
"learning_rate": 4.4078144078144076e-05,
"loss": 0.4927,
"step": 2369
},
{
"epoch": 2.6033228065357683,
"grad_norm": 0.3654184341430664,
"learning_rate": 4.3956043956043955e-05,
"loss": 0.6449,
"step": 2370
},
{
"epoch": 2.6044212549773444,
"grad_norm": 1.7309939861297607,
"learning_rate": 4.3833943833943827e-05,
"loss": 0.6979,
"step": 2371
},
{
"epoch": 2.6055197034189206,
"grad_norm": 0.7982075214385986,
"learning_rate": 4.3711843711843705e-05,
"loss": 0.6589,
"step": 2372
},
{
"epoch": 2.606618151860497,
"grad_norm": 0.6989462375640869,
"learning_rate": 4.358974358974359e-05,
"loss": 0.7104,
"step": 2373
},
{
"epoch": 2.6077166003020733,
"grad_norm": 0.7331676483154297,
"learning_rate": 4.346764346764346e-05,
"loss": 0.7565,
"step": 2374
},
{
"epoch": 2.6088150487436494,
"grad_norm": 1.0566400289535522,
"learning_rate": 4.334554334554334e-05,
"loss": 0.6967,
"step": 2375
},
{
"epoch": 2.609913497185226,
"grad_norm": 0.5988017320632935,
"learning_rate": 4.322344322344321e-05,
"loss": 0.7871,
"step": 2376
},
{
"epoch": 2.611011945626802,
"grad_norm": 0.4248102307319641,
"learning_rate": 4.31013431013431e-05,
"loss": 0.6891,
"step": 2377
},
{
"epoch": 2.6121103940683783,
"grad_norm": 1.9839611053466797,
"learning_rate": 4.297924297924298e-05,
"loss": 0.6647,
"step": 2378
},
{
"epoch": 2.6132088425099544,
"grad_norm": 0.4382665455341339,
"learning_rate": 4.285714285714285e-05,
"loss": 0.5969,
"step": 2379
},
{
"epoch": 2.614307290951531,
"grad_norm": 1.1918715238571167,
"learning_rate": 4.2735042735042735e-05,
"loss": 0.7788,
"step": 2380
},
{
"epoch": 2.615405739393107,
"grad_norm": 0.38117820024490356,
"learning_rate": 4.2612942612942614e-05,
"loss": 0.4967,
"step": 2381
},
{
"epoch": 2.6165041878346837,
"grad_norm": 0.6454489827156067,
"learning_rate": 4.2490842490842486e-05,
"loss": 0.7724,
"step": 2382
},
{
"epoch": 2.61760263627626,
"grad_norm": 1.0696319341659546,
"learning_rate": 4.2368742368742364e-05,
"loss": 0.5292,
"step": 2383
},
{
"epoch": 2.618701084717836,
"grad_norm": 0.5887579321861267,
"learning_rate": 4.224664224664224e-05,
"loss": 0.5317,
"step": 2384
},
{
"epoch": 2.619799533159412,
"grad_norm": 0.557188093662262,
"learning_rate": 4.212454212454212e-05,
"loss": 0.7172,
"step": 2385
},
{
"epoch": 2.6208979816009887,
"grad_norm": 0.5122195482254028,
"learning_rate": 4.2002442002442e-05,
"loss": 0.6398,
"step": 2386
},
{
"epoch": 2.621996430042565,
"grad_norm": 0.520722508430481,
"learning_rate": 4.188034188034187e-05,
"loss": 0.3984,
"step": 2387
},
{
"epoch": 2.623094878484141,
"grad_norm": 1.2077422142028809,
"learning_rate": 4.175824175824176e-05,
"loss": 0.6686,
"step": 2388
},
{
"epoch": 2.6241933269257176,
"grad_norm": 1.1437829732894897,
"learning_rate": 4.163614163614163e-05,
"loss": 0.6653,
"step": 2389
},
{
"epoch": 2.6252917753672937,
"grad_norm": 0.6157158017158508,
"learning_rate": 4.151404151404151e-05,
"loss": 0.7074,
"step": 2390
},
{
"epoch": 2.62639022380887,
"grad_norm": 1.8944931030273438,
"learning_rate": 4.1391941391941394e-05,
"loss": 0.5991,
"step": 2391
},
{
"epoch": 2.627488672250446,
"grad_norm": 0.6598528623580933,
"learning_rate": 4.1269841269841266e-05,
"loss": 0.6051,
"step": 2392
},
{
"epoch": 2.6285871206920226,
"grad_norm": 0.9341129660606384,
"learning_rate": 4.1147741147741145e-05,
"loss": 0.3795,
"step": 2393
},
{
"epoch": 2.6296855691335987,
"grad_norm": 0.4246079921722412,
"learning_rate": 4.1025641025641023e-05,
"loss": 0.4603,
"step": 2394
},
{
"epoch": 2.6307840175751753,
"grad_norm": 0.6639881134033203,
"learning_rate": 4.09035409035409e-05,
"loss": 0.5862,
"step": 2395
},
{
"epoch": 2.6318824660167515,
"grad_norm": 1.297917366027832,
"learning_rate": 4.078144078144078e-05,
"loss": 0.6175,
"step": 2396
},
{
"epoch": 2.6329809144583276,
"grad_norm": 0.7880698442459106,
"learning_rate": 4.065934065934065e-05,
"loss": 0.7034,
"step": 2397
},
{
"epoch": 2.6340793628999037,
"grad_norm": 0.6197066903114319,
"learning_rate": 4.053724053724053e-05,
"loss": 0.659,
"step": 2398
},
{
"epoch": 2.6351778113414803,
"grad_norm": 0.7560408711433411,
"learning_rate": 4.041514041514042e-05,
"loss": 0.5543,
"step": 2399
},
{
"epoch": 2.6362762597830565,
"grad_norm": 2.2571635246276855,
"learning_rate": 4.029304029304029e-05,
"loss": 0.712,
"step": 2400
},
{
"epoch": 2.6373747082246326,
"grad_norm": 0.8119613528251648,
"learning_rate": 4.017094017094017e-05,
"loss": 0.6407,
"step": 2401
},
{
"epoch": 2.638473156666209,
"grad_norm": 3.9773592948913574,
"learning_rate": 4.004884004884004e-05,
"loss": 0.6434,
"step": 2402
},
{
"epoch": 2.6395716051077853,
"grad_norm": 1.2648125886917114,
"learning_rate": 3.9926739926739925e-05,
"loss": 0.689,
"step": 2403
},
{
"epoch": 2.6406700535493615,
"grad_norm": 0.7015364170074463,
"learning_rate": 3.9804639804639804e-05,
"loss": 0.4175,
"step": 2404
},
{
"epoch": 2.6417685019909376,
"grad_norm": 0.941303551197052,
"learning_rate": 3.9682539682539676e-05,
"loss": 0.4126,
"step": 2405
},
{
"epoch": 2.642866950432514,
"grad_norm": 0.7533726096153259,
"learning_rate": 3.956043956043956e-05,
"loss": 0.7401,
"step": 2406
},
{
"epoch": 2.6439653988740903,
"grad_norm": 0.5480525493621826,
"learning_rate": 3.943833943833943e-05,
"loss": 0.5567,
"step": 2407
},
{
"epoch": 2.6450638473156665,
"grad_norm": 0.6171422004699707,
"learning_rate": 3.931623931623931e-05,
"loss": 0.721,
"step": 2408
},
{
"epoch": 2.646162295757243,
"grad_norm": 0.6719728708267212,
"learning_rate": 3.919413919413919e-05,
"loss": 0.5015,
"step": 2409
},
{
"epoch": 2.647260744198819,
"grad_norm": 1.8106555938720703,
"learning_rate": 3.907203907203906e-05,
"loss": 0.6954,
"step": 2410
},
{
"epoch": 2.6483591926403953,
"grad_norm": 0.42534878849983215,
"learning_rate": 3.894993894993895e-05,
"loss": 0.5241,
"step": 2411
},
{
"epoch": 2.6494576410819715,
"grad_norm": 0.8733202219009399,
"learning_rate": 3.882783882783883e-05,
"loss": 0.4485,
"step": 2412
},
{
"epoch": 2.650556089523548,
"grad_norm": 0.9050257802009583,
"learning_rate": 3.87057387057387e-05,
"loss": 0.6202,
"step": 2413
},
{
"epoch": 2.651654537965124,
"grad_norm": 0.650347888469696,
"learning_rate": 3.8583638583638584e-05,
"loss": 0.621,
"step": 2414
},
{
"epoch": 2.6527529864067008,
"grad_norm": 6.092042446136475,
"learning_rate": 3.8461538461538456e-05,
"loss": 0.5143,
"step": 2415
},
{
"epoch": 2.653851434848277,
"grad_norm": 0.7801241874694824,
"learning_rate": 3.8339438339438335e-05,
"loss": 0.5424,
"step": 2416
},
{
"epoch": 2.654949883289853,
"grad_norm": 0.5492686629295349,
"learning_rate": 3.821733821733822e-05,
"loss": 0.642,
"step": 2417
},
{
"epoch": 2.656048331731429,
"grad_norm": 0.4257514774799347,
"learning_rate": 3.809523809523809e-05,
"loss": 0.8273,
"step": 2418
},
{
"epoch": 2.6571467801730058,
"grad_norm": 1.0180964469909668,
"learning_rate": 3.797313797313797e-05,
"loss": 0.6962,
"step": 2419
},
{
"epoch": 2.658245228614582,
"grad_norm": 0.3844882547855377,
"learning_rate": 3.785103785103784e-05,
"loss": 0.7315,
"step": 2420
},
{
"epoch": 2.659343677056158,
"grad_norm": 0.46182385087013245,
"learning_rate": 3.772893772893772e-05,
"loss": 0.3889,
"step": 2421
},
{
"epoch": 2.6604421254977346,
"grad_norm": 0.562627375125885,
"learning_rate": 3.760683760683761e-05,
"loss": 0.6415,
"step": 2422
},
{
"epoch": 2.6615405739393108,
"grad_norm": 0.3234645128250122,
"learning_rate": 3.7484737484737486e-05,
"loss": 0.4819,
"step": 2423
},
{
"epoch": 2.662639022380887,
"grad_norm": 0.6804205775260925,
"learning_rate": 3.736263736263736e-05,
"loss": 0.4248,
"step": 2424
},
{
"epoch": 2.663737470822463,
"grad_norm": 0.5543864369392395,
"learning_rate": 3.7240537240537236e-05,
"loss": 0.5259,
"step": 2425
},
{
"epoch": 2.6648359192640396,
"grad_norm": 0.8411497473716736,
"learning_rate": 3.7118437118437115e-05,
"loss": 0.5448,
"step": 2426
},
{
"epoch": 2.6659343677056158,
"grad_norm": 0.4386245608329773,
"learning_rate": 3.6996336996336994e-05,
"loss": 0.9601,
"step": 2427
},
{
"epoch": 2.6670328161471923,
"grad_norm": 0.773210346698761,
"learning_rate": 3.687423687423687e-05,
"loss": 0.8601,
"step": 2428
},
{
"epoch": 2.6681312645887685,
"grad_norm": 0.4636232852935791,
"learning_rate": 3.675213675213675e-05,
"loss": 0.6322,
"step": 2429
},
{
"epoch": 2.6692297130303446,
"grad_norm": 1.6318496465682983,
"learning_rate": 3.663003663003662e-05,
"loss": 0.4402,
"step": 2430
},
{
"epoch": 2.6703281614719208,
"grad_norm": 0.5299782156944275,
"learning_rate": 3.65079365079365e-05,
"loss": 0.5622,
"step": 2431
},
{
"epoch": 2.6714266099134973,
"grad_norm": 1.1223825216293335,
"learning_rate": 3.638583638583638e-05,
"loss": 0.5994,
"step": 2432
},
{
"epoch": 2.6725250583550735,
"grad_norm": 1.8495402336120605,
"learning_rate": 3.626373626373626e-05,
"loss": 0.669,
"step": 2433
},
{
"epoch": 2.6736235067966496,
"grad_norm": 0.4963383972644806,
"learning_rate": 3.614163614163614e-05,
"loss": 0.5412,
"step": 2434
},
{
"epoch": 2.674721955238226,
"grad_norm": 0.5644822716712952,
"learning_rate": 3.601953601953602e-05,
"loss": 0.5768,
"step": 2435
},
{
"epoch": 2.6758204036798023,
"grad_norm": 0.5272318720817566,
"learning_rate": 3.5897435897435896e-05,
"loss": 0.5909,
"step": 2436
},
{
"epoch": 2.6769188521213785,
"grad_norm": 0.29838863015174866,
"learning_rate": 3.5775335775335774e-05,
"loss": 0.5625,
"step": 2437
},
{
"epoch": 2.6780173005629546,
"grad_norm": 0.5375344157218933,
"learning_rate": 3.565323565323565e-05,
"loss": 0.5932,
"step": 2438
},
{
"epoch": 2.679115749004531,
"grad_norm": 0.7850833535194397,
"learning_rate": 3.5531135531135525e-05,
"loss": 0.6706,
"step": 2439
},
{
"epoch": 2.6802141974461073,
"grad_norm": 0.5286651253700256,
"learning_rate": 3.540903540903541e-05,
"loss": 0.6865,
"step": 2440
},
{
"epoch": 2.681312645887684,
"grad_norm": 0.9832364320755005,
"learning_rate": 3.528693528693528e-05,
"loss": 0.7941,
"step": 2441
},
{
"epoch": 2.68241109432926,
"grad_norm": 0.4431805908679962,
"learning_rate": 3.516483516483516e-05,
"loss": 0.4706,
"step": 2442
},
{
"epoch": 2.683509542770836,
"grad_norm": 1.7264482975006104,
"learning_rate": 3.504273504273504e-05,
"loss": 0.6308,
"step": 2443
},
{
"epoch": 2.6846079912124123,
"grad_norm": 0.6196084022521973,
"learning_rate": 3.492063492063492e-05,
"loss": 1.0233,
"step": 2444
},
{
"epoch": 2.6857064396539885,
"grad_norm": 0.855876088142395,
"learning_rate": 3.47985347985348e-05,
"loss": 0.5522,
"step": 2445
},
{
"epoch": 2.686804888095565,
"grad_norm": 0.45323798060417175,
"learning_rate": 3.4676434676434676e-05,
"loss": 0.6232,
"step": 2446
},
{
"epoch": 2.687903336537141,
"grad_norm": 0.577273964881897,
"learning_rate": 3.455433455433455e-05,
"loss": 0.5051,
"step": 2447
},
{
"epoch": 2.689001784978718,
"grad_norm": 0.4999620020389557,
"learning_rate": 3.4432234432234427e-05,
"loss": 0.4881,
"step": 2448
},
{
"epoch": 2.690100233420294,
"grad_norm": 0.5028046369552612,
"learning_rate": 3.431013431013431e-05,
"loss": 0.6575,
"step": 2449
},
{
"epoch": 2.69119868186187,
"grad_norm": 2.122028350830078,
"learning_rate": 3.4188034188034184e-05,
"loss": 0.7226,
"step": 2450
},
{
"epoch": 2.692297130303446,
"grad_norm": 0.4979703426361084,
"learning_rate": 3.406593406593406e-05,
"loss": 0.5768,
"step": 2451
},
{
"epoch": 2.693395578745023,
"grad_norm": 0.9270527958869934,
"learning_rate": 3.394383394383394e-05,
"loss": 0.6464,
"step": 2452
},
{
"epoch": 2.694494027186599,
"grad_norm": 1.0739809274673462,
"learning_rate": 3.382173382173382e-05,
"loss": 0.753,
"step": 2453
},
{
"epoch": 2.695592475628175,
"grad_norm": 0.6039335131645203,
"learning_rate": 3.36996336996337e-05,
"loss": 0.7909,
"step": 2454
},
{
"epoch": 2.6966909240697516,
"grad_norm": 0.49040424823760986,
"learning_rate": 3.357753357753358e-05,
"loss": 0.6112,
"step": 2455
},
{
"epoch": 2.6977893725113278,
"grad_norm": 0.6890440583229065,
"learning_rate": 3.345543345543345e-05,
"loss": 0.6849,
"step": 2456
},
{
"epoch": 2.698887820952904,
"grad_norm": 0.7819212675094604,
"learning_rate": 3.333333333333333e-05,
"loss": 0.6797,
"step": 2457
},
{
"epoch": 2.69998626939448,
"grad_norm": 1.0147050619125366,
"learning_rate": 3.321123321123321e-05,
"loss": 0.6867,
"step": 2458
},
{
"epoch": 2.7010847178360566,
"grad_norm": 1.3562036752700806,
"learning_rate": 3.3089133089133086e-05,
"loss": 0.7811,
"step": 2459
},
{
"epoch": 2.7021831662776328,
"grad_norm": 0.5813838839530945,
"learning_rate": 3.2967032967032964e-05,
"loss": 0.5405,
"step": 2460
},
{
"epoch": 2.7032816147192094,
"grad_norm": 0.6152640581130981,
"learning_rate": 3.284493284493284e-05,
"loss": 0.425,
"step": 2461
},
{
"epoch": 2.7043800631607855,
"grad_norm": 1.1984590291976929,
"learning_rate": 3.272283272283272e-05,
"loss": 0.592,
"step": 2462
},
{
"epoch": 2.7054785116023616,
"grad_norm": 0.48487693071365356,
"learning_rate": 3.26007326007326e-05,
"loss": 0.5223,
"step": 2463
},
{
"epoch": 2.7065769600439378,
"grad_norm": 0.47191065549850464,
"learning_rate": 3.247863247863247e-05,
"loss": 0.6479,
"step": 2464
},
{
"epoch": 2.7076754084855144,
"grad_norm": 1.3167297840118408,
"learning_rate": 3.235653235653235e-05,
"loss": 0.4552,
"step": 2465
},
{
"epoch": 2.7087738569270905,
"grad_norm": 1.3219714164733887,
"learning_rate": 3.2234432234432237e-05,
"loss": 0.5839,
"step": 2466
},
{
"epoch": 2.7098723053686666,
"grad_norm": 0.8047394752502441,
"learning_rate": 3.211233211233211e-05,
"loss": 0.795,
"step": 2467
},
{
"epoch": 2.710970753810243,
"grad_norm": 0.6053475737571716,
"learning_rate": 3.199023199023199e-05,
"loss": 0.743,
"step": 2468
},
{
"epoch": 2.7120692022518194,
"grad_norm": 0.4619985818862915,
"learning_rate": 3.1868131868131866e-05,
"loss": 0.642,
"step": 2469
},
{
"epoch": 2.7131676506933955,
"grad_norm": 0.8241426944732666,
"learning_rate": 3.1746031746031745e-05,
"loss": 0.521,
"step": 2470
},
{
"epoch": 2.7142660991349716,
"grad_norm": 0.4344565272331238,
"learning_rate": 3.162393162393162e-05,
"loss": 0.4615,
"step": 2471
},
{
"epoch": 2.715364547576548,
"grad_norm": 0.9640605449676514,
"learning_rate": 3.15018315018315e-05,
"loss": 0.4735,
"step": 2472
},
{
"epoch": 2.7164629960181244,
"grad_norm": 0.49423810839653015,
"learning_rate": 3.1379731379731374e-05,
"loss": 0.7547,
"step": 2473
},
{
"epoch": 2.717561444459701,
"grad_norm": 0.7234408855438232,
"learning_rate": 3.125763125763125e-05,
"loss": 0.464,
"step": 2474
},
{
"epoch": 2.718659892901277,
"grad_norm": 0.542647123336792,
"learning_rate": 3.113553113553113e-05,
"loss": 0.5563,
"step": 2475
},
{
"epoch": 2.719758341342853,
"grad_norm": 0.555722177028656,
"learning_rate": 3.101343101343101e-05,
"loss": 0.6899,
"step": 2476
},
{
"epoch": 2.7208567897844294,
"grad_norm": 0.6171600222587585,
"learning_rate": 3.089133089133089e-05,
"loss": 0.6088,
"step": 2477
},
{
"epoch": 2.7219552382260055,
"grad_norm": 0.9118738770484924,
"learning_rate": 3.076923076923077e-05,
"loss": 0.7778,
"step": 2478
},
{
"epoch": 2.723053686667582,
"grad_norm": 0.6610655784606934,
"learning_rate": 3.064713064713064e-05,
"loss": 0.6935,
"step": 2479
},
{
"epoch": 2.724152135109158,
"grad_norm": 0.6729289889335632,
"learning_rate": 3.0525030525030525e-05,
"loss": 0.792,
"step": 2480
},
{
"epoch": 2.725250583550735,
"grad_norm": 0.4955647587776184,
"learning_rate": 3.04029304029304e-05,
"loss": 0.6746,
"step": 2481
},
{
"epoch": 2.726349031992311,
"grad_norm": 0.42975953221321106,
"learning_rate": 3.028083028083028e-05,
"loss": 0.5318,
"step": 2482
},
{
"epoch": 2.727447480433887,
"grad_norm": 0.3555055856704712,
"learning_rate": 3.0158730158730154e-05,
"loss": 0.6377,
"step": 2483
},
{
"epoch": 2.728545928875463,
"grad_norm": 3.138209342956543,
"learning_rate": 3.0036630036630036e-05,
"loss": 0.6296,
"step": 2484
},
{
"epoch": 2.72964437731704,
"grad_norm": 0.5710242390632629,
"learning_rate": 2.9914529914529912e-05,
"loss": 0.8987,
"step": 2485
},
{
"epoch": 2.730742825758616,
"grad_norm": 0.5200769305229187,
"learning_rate": 2.979242979242979e-05,
"loss": 0.5154,
"step": 2486
},
{
"epoch": 2.731841274200192,
"grad_norm": 0.797572910785675,
"learning_rate": 2.9670329670329666e-05,
"loss": 0.8039,
"step": 2487
},
{
"epoch": 2.7329397226417687,
"grad_norm": 0.4667447805404663,
"learning_rate": 2.9548229548229548e-05,
"loss": 0.586,
"step": 2488
},
{
"epoch": 2.734038171083345,
"grad_norm": 0.5500869154930115,
"learning_rate": 2.9426129426129423e-05,
"loss": 0.7007,
"step": 2489
},
{
"epoch": 2.735136619524921,
"grad_norm": 0.5311625003814697,
"learning_rate": 2.9304029304029302e-05,
"loss": 0.4257,
"step": 2490
},
{
"epoch": 2.736235067966497,
"grad_norm": 0.6474941968917847,
"learning_rate": 2.9181929181929177e-05,
"loss": 0.4747,
"step": 2491
},
{
"epoch": 2.7373335164080737,
"grad_norm": 1.1186646223068237,
"learning_rate": 2.9059829059829056e-05,
"loss": 0.8177,
"step": 2492
},
{
"epoch": 2.73843196484965,
"grad_norm": 2.455371379852295,
"learning_rate": 2.8937728937728938e-05,
"loss": 0.6535,
"step": 2493
},
{
"epoch": 2.7395304132912264,
"grad_norm": 0.5033484101295471,
"learning_rate": 2.8815628815628813e-05,
"loss": 0.525,
"step": 2494
},
{
"epoch": 2.7406288617328025,
"grad_norm": 0.5826357007026672,
"learning_rate": 2.869352869352869e-05,
"loss": 0.476,
"step": 2495
},
{
"epoch": 2.7417273101743787,
"grad_norm": 0.5875104665756226,
"learning_rate": 2.8571428571428567e-05,
"loss": 0.6903,
"step": 2496
},
{
"epoch": 2.742825758615955,
"grad_norm": 0.6006028056144714,
"learning_rate": 2.844932844932845e-05,
"loss": 0.8522,
"step": 2497
},
{
"epoch": 2.7439242070575314,
"grad_norm": 0.5605003833770752,
"learning_rate": 2.8327228327228325e-05,
"loss": 0.5312,
"step": 2498
},
{
"epoch": 2.7450226554991075,
"grad_norm": 0.7641153931617737,
"learning_rate": 2.8205128205128204e-05,
"loss": 0.6841,
"step": 2499
},
{
"epoch": 2.7461211039406836,
"grad_norm": 0.5523414015769958,
"learning_rate": 2.808302808302808e-05,
"loss": 0.6582,
"step": 2500
},
{
"epoch": 2.7472195523822602,
"grad_norm": 0.40714672207832336,
"learning_rate": 2.796092796092796e-05,
"loss": 0.7493,
"step": 2501
},
{
"epoch": 2.7483180008238364,
"grad_norm": 0.6960926651954651,
"learning_rate": 2.7838827838827836e-05,
"loss": 0.7104,
"step": 2502
},
{
"epoch": 2.7494164492654125,
"grad_norm": 0.42409783601760864,
"learning_rate": 2.7716727716727715e-05,
"loss": 0.5643,
"step": 2503
},
{
"epoch": 2.7505148977069886,
"grad_norm": 0.5174455046653748,
"learning_rate": 2.759462759462759e-05,
"loss": 0.4545,
"step": 2504
},
{
"epoch": 2.7516133461485652,
"grad_norm": 0.6353528499603271,
"learning_rate": 2.747252747252747e-05,
"loss": 0.5068,
"step": 2505
},
{
"epoch": 2.7527117945901414,
"grad_norm": 0.46814125776290894,
"learning_rate": 2.7350427350427348e-05,
"loss": 0.7979,
"step": 2506
},
{
"epoch": 2.753810243031718,
"grad_norm": 0.7229417562484741,
"learning_rate": 2.7228327228327227e-05,
"loss": 0.6212,
"step": 2507
},
{
"epoch": 2.754908691473294,
"grad_norm": 1.2155603170394897,
"learning_rate": 2.7106227106227102e-05,
"loss": 0.8444,
"step": 2508
},
{
"epoch": 2.7560071399148702,
"grad_norm": 0.462703138589859,
"learning_rate": 2.698412698412698e-05,
"loss": 0.8263,
"step": 2509
},
{
"epoch": 2.7571055883564464,
"grad_norm": 0.9474642872810364,
"learning_rate": 2.6862026862026863e-05,
"loss": 0.7586,
"step": 2510
},
{
"epoch": 2.758204036798023,
"grad_norm": 4.502622127532959,
"learning_rate": 2.6739926739926738e-05,
"loss": 0.5806,
"step": 2511
},
{
"epoch": 2.759302485239599,
"grad_norm": 1.1251213550567627,
"learning_rate": 2.6617826617826617e-05,
"loss": 0.6333,
"step": 2512
},
{
"epoch": 2.7604009336811752,
"grad_norm": 0.7035579681396484,
"learning_rate": 2.6495726495726492e-05,
"loss": 0.4739,
"step": 2513
},
{
"epoch": 2.761499382122752,
"grad_norm": 0.5279493927955627,
"learning_rate": 2.6373626373626374e-05,
"loss": 0.597,
"step": 2514
},
{
"epoch": 2.762597830564328,
"grad_norm": 0.5512554049491882,
"learning_rate": 2.625152625152625e-05,
"loss": 0.6471,
"step": 2515
},
{
"epoch": 2.763696279005904,
"grad_norm": 0.857778012752533,
"learning_rate": 2.6129426129426128e-05,
"loss": 0.6172,
"step": 2516
},
{
"epoch": 2.7647947274474802,
"grad_norm": 0.5348466634750366,
"learning_rate": 2.6007326007326004e-05,
"loss": 0.8074,
"step": 2517
},
{
"epoch": 2.765893175889057,
"grad_norm": 0.5413629412651062,
"learning_rate": 2.5885225885225882e-05,
"loss": 0.3879,
"step": 2518
},
{
"epoch": 2.766991624330633,
"grad_norm": 0.569411039352417,
"learning_rate": 2.576312576312576e-05,
"loss": 0.4392,
"step": 2519
},
{
"epoch": 2.7680900727722095,
"grad_norm": 0.5127429962158203,
"learning_rate": 2.564102564102564e-05,
"loss": 0.6566,
"step": 2520
},
{
"epoch": 2.7691885212137857,
"grad_norm": 0.7328614592552185,
"learning_rate": 2.5518925518925515e-05,
"loss": 0.6801,
"step": 2521
},
{
"epoch": 2.770286969655362,
"grad_norm": 0.615686297416687,
"learning_rate": 2.5396825396825394e-05,
"loss": 0.6366,
"step": 2522
},
{
"epoch": 2.771385418096938,
"grad_norm": 0.5250161290168762,
"learning_rate": 2.5274725274725276e-05,
"loss": 0.5737,
"step": 2523
},
{
"epoch": 2.772483866538514,
"grad_norm": 0.6708832383155823,
"learning_rate": 2.515262515262515e-05,
"loss": 0.6681,
"step": 2524
},
{
"epoch": 2.7735823149800907,
"grad_norm": 0.6120278835296631,
"learning_rate": 2.503052503052503e-05,
"loss": 0.4964,
"step": 2525
},
{
"epoch": 2.774680763421667,
"grad_norm": 0.7024976015090942,
"learning_rate": 2.4908424908424905e-05,
"loss": 0.7984,
"step": 2526
},
{
"epoch": 2.7757792118632434,
"grad_norm": 7.281716823577881,
"learning_rate": 2.478632478632478e-05,
"loss": 0.7191,
"step": 2527
},
{
"epoch": 2.7768776603048195,
"grad_norm": 0.7347024083137512,
"learning_rate": 2.4664224664224663e-05,
"loss": 0.8684,
"step": 2528
},
{
"epoch": 2.7779761087463957,
"grad_norm": 1.1338274478912354,
"learning_rate": 2.454212454212454e-05,
"loss": 0.5936,
"step": 2529
},
{
"epoch": 2.779074557187972,
"grad_norm": 0.4176536202430725,
"learning_rate": 2.4420024420024417e-05,
"loss": 0.445,
"step": 2530
},
{
"epoch": 2.7801730056295484,
"grad_norm": 0.9390072822570801,
"learning_rate": 2.4297924297924295e-05,
"loss": 0.5821,
"step": 2531
},
{
"epoch": 2.7812714540711245,
"grad_norm": 1.1045840978622437,
"learning_rate": 2.4175824175824174e-05,
"loss": 0.7372,
"step": 2532
},
{
"epoch": 2.7823699025127007,
"grad_norm": 0.5568689703941345,
"learning_rate": 2.4053724053724053e-05,
"loss": 0.5005,
"step": 2533
},
{
"epoch": 2.7834683509542772,
"grad_norm": 0.2747582793235779,
"learning_rate": 2.3931623931623928e-05,
"loss": 0.5778,
"step": 2534
},
{
"epoch": 2.7845667993958534,
"grad_norm": 1.4027804136276245,
"learning_rate": 2.3809523809523807e-05,
"loss": 0.5368,
"step": 2535
},
{
"epoch": 2.7856652478374295,
"grad_norm": 0.7523220777511597,
"learning_rate": 2.368742368742369e-05,
"loss": 0.58,
"step": 2536
},
{
"epoch": 2.7867636962790057,
"grad_norm": 0.33777353167533875,
"learning_rate": 2.3565323565323564e-05,
"loss": 0.5269,
"step": 2537
},
{
"epoch": 2.7878621447205822,
"grad_norm": 0.5818787217140198,
"learning_rate": 2.344322344322344e-05,
"loss": 0.4459,
"step": 2538
},
{
"epoch": 2.7889605931621584,
"grad_norm": 0.36858034133911133,
"learning_rate": 2.3321123321123318e-05,
"loss": 0.712,
"step": 2539
},
{
"epoch": 2.790059041603735,
"grad_norm": 0.5299241542816162,
"learning_rate": 2.3199023199023194e-05,
"loss": 0.6086,
"step": 2540
},
{
"epoch": 2.791157490045311,
"grad_norm": 2.432325601577759,
"learning_rate": 2.3076923076923076e-05,
"loss": 1.0386,
"step": 2541
},
{
"epoch": 2.7922559384868872,
"grad_norm": 0.746638834476471,
"learning_rate": 2.2954822954822954e-05,
"loss": 0.7372,
"step": 2542
},
{
"epoch": 2.7933543869284634,
"grad_norm": 0.6017647981643677,
"learning_rate": 2.283272283272283e-05,
"loss": 0.9134,
"step": 2543
},
{
"epoch": 2.79445283537004,
"grad_norm": 0.7385385036468506,
"learning_rate": 2.271062271062271e-05,
"loss": 0.6827,
"step": 2544
},
{
"epoch": 2.795551283811616,
"grad_norm": 0.6607246994972229,
"learning_rate": 2.2588522588522587e-05,
"loss": 0.6333,
"step": 2545
},
{
"epoch": 2.7966497322531922,
"grad_norm": 0.40185117721557617,
"learning_rate": 2.2466422466422466e-05,
"loss": 0.6589,
"step": 2546
},
{
"epoch": 2.797748180694769,
"grad_norm": 0.48225662112236023,
"learning_rate": 2.234432234432234e-05,
"loss": 0.6571,
"step": 2547
},
{
"epoch": 2.798846629136345,
"grad_norm": 0.8996065855026245,
"learning_rate": 2.222222222222222e-05,
"loss": 0.7518,
"step": 2548
},
{
"epoch": 2.799945077577921,
"grad_norm": 0.7139112949371338,
"learning_rate": 2.21001221001221e-05,
"loss": 0.6517,
"step": 2549
},
{
"epoch": 2.8010435260194972,
"grad_norm": 0.5433416366577148,
"learning_rate": 2.1978021978021977e-05,
"loss": 0.3799,
"step": 2550
},
{
"epoch": 2.802141974461074,
"grad_norm": 0.3883088231086731,
"learning_rate": 2.1855921855921853e-05,
"loss": 0.9269,
"step": 2551
},
{
"epoch": 2.80324042290265,
"grad_norm": 0.5275357961654663,
"learning_rate": 2.173382173382173e-05,
"loss": 0.6606,
"step": 2552
},
{
"epoch": 2.8043388713442265,
"grad_norm": 0.4666341543197632,
"learning_rate": 2.1611721611721607e-05,
"loss": 0.6982,
"step": 2553
},
{
"epoch": 2.8054373197858027,
"grad_norm": 0.9221529364585876,
"learning_rate": 2.148962148962149e-05,
"loss": 0.4769,
"step": 2554
},
{
"epoch": 2.806535768227379,
"grad_norm": 0.7469640374183655,
"learning_rate": 2.1367521367521368e-05,
"loss": 0.6985,
"step": 2555
},
{
"epoch": 2.807634216668955,
"grad_norm": 0.6858775615692139,
"learning_rate": 2.1245421245421243e-05,
"loss": 0.4511,
"step": 2556
},
{
"epoch": 2.808732665110531,
"grad_norm": 1.266801357269287,
"learning_rate": 2.112332112332112e-05,
"loss": 0.421,
"step": 2557
},
{
"epoch": 2.8098311135521077,
"grad_norm": 0.5506262183189392,
"learning_rate": 2.1001221001221e-05,
"loss": 0.6082,
"step": 2558
},
{
"epoch": 2.810929561993684,
"grad_norm": 0.5359029173851013,
"learning_rate": 2.087912087912088e-05,
"loss": 0.8111,
"step": 2559
},
{
"epoch": 2.8120280104352604,
"grad_norm": 0.6969206929206848,
"learning_rate": 2.0757020757020754e-05,
"loss": 0.8331,
"step": 2560
},
{
"epoch": 2.8131264588768365,
"grad_norm": 0.6040379405021667,
"learning_rate": 2.0634920634920633e-05,
"loss": 0.575,
"step": 2561
},
{
"epoch": 2.8142249073184127,
"grad_norm": 1.3847273588180542,
"learning_rate": 2.0512820512820512e-05,
"loss": 0.5442,
"step": 2562
},
{
"epoch": 2.815323355759989,
"grad_norm": 0.8050490617752075,
"learning_rate": 2.039072039072039e-05,
"loss": 0.6267,
"step": 2563
},
{
"epoch": 2.8164218042015654,
"grad_norm": 0.5663136839866638,
"learning_rate": 2.0268620268620266e-05,
"loss": 0.5246,
"step": 2564
},
{
"epoch": 2.8175202526431415,
"grad_norm": 0.3316130042076111,
"learning_rate": 2.0146520146520144e-05,
"loss": 0.5175,
"step": 2565
},
{
"epoch": 2.8186187010847177,
"grad_norm": 0.4782855808734894,
"learning_rate": 2.002442002442002e-05,
"loss": 0.5111,
"step": 2566
},
{
"epoch": 2.8197171495262943,
"grad_norm": 0.44766396284103394,
"learning_rate": 1.9902319902319902e-05,
"loss": 0.5825,
"step": 2567
},
{
"epoch": 2.8208155979678704,
"grad_norm": 0.6830618977546692,
"learning_rate": 1.978021978021978e-05,
"loss": 0.5685,
"step": 2568
},
{
"epoch": 2.8219140464094465,
"grad_norm": 0.5860748887062073,
"learning_rate": 1.9658119658119656e-05,
"loss": 0.7557,
"step": 2569
},
{
"epoch": 2.8230124948510227,
"grad_norm": 0.49533459544181824,
"learning_rate": 1.953601953601953e-05,
"loss": 0.7326,
"step": 2570
},
{
"epoch": 2.8241109432925993,
"grad_norm": 0.4989941418170929,
"learning_rate": 1.9413919413919413e-05,
"loss": 0.5757,
"step": 2571
},
{
"epoch": 2.8252093917341754,
"grad_norm": 0.4973461627960205,
"learning_rate": 1.9291819291819292e-05,
"loss": 0.5357,
"step": 2572
},
{
"epoch": 2.826307840175752,
"grad_norm": 0.7442370057106018,
"learning_rate": 1.9169719169719167e-05,
"loss": 0.7283,
"step": 2573
},
{
"epoch": 2.827406288617328,
"grad_norm": 1.3321865797042847,
"learning_rate": 1.9047619047619046e-05,
"loss": 0.5107,
"step": 2574
},
{
"epoch": 2.8285047370589043,
"grad_norm": 0.47394871711730957,
"learning_rate": 1.892551892551892e-05,
"loss": 0.5495,
"step": 2575
},
{
"epoch": 2.8296031855004804,
"grad_norm": 0.6102151274681091,
"learning_rate": 1.8803418803418804e-05,
"loss": 0.5983,
"step": 2576
},
{
"epoch": 2.830701633942057,
"grad_norm": 0.4657471179962158,
"learning_rate": 1.868131868131868e-05,
"loss": 0.5937,
"step": 2577
},
{
"epoch": 2.831800082383633,
"grad_norm": 0.41180238127708435,
"learning_rate": 1.8559218559218558e-05,
"loss": 0.7775,
"step": 2578
},
{
"epoch": 2.8328985308252093,
"grad_norm": 3.5043845176696777,
"learning_rate": 1.8437118437118436e-05,
"loss": 0.5304,
"step": 2579
},
{
"epoch": 2.833996979266786,
"grad_norm": 0.4502231776714325,
"learning_rate": 1.831501831501831e-05,
"loss": 0.6556,
"step": 2580
},
{
"epoch": 2.835095427708362,
"grad_norm": 0.6165898442268372,
"learning_rate": 1.819291819291819e-05,
"loss": 0.8434,
"step": 2581
},
{
"epoch": 2.836193876149938,
"grad_norm": 0.5112649202346802,
"learning_rate": 1.807081807081807e-05,
"loss": 0.7429,
"step": 2582
},
{
"epoch": 2.8372923245915143,
"grad_norm": 0.4834790527820587,
"learning_rate": 1.7948717948717948e-05,
"loss": 0.5772,
"step": 2583
},
{
"epoch": 2.838390773033091,
"grad_norm": 0.4251219630241394,
"learning_rate": 1.7826617826617826e-05,
"loss": 0.5192,
"step": 2584
},
{
"epoch": 2.839489221474667,
"grad_norm": 0.7645363807678223,
"learning_rate": 1.7704517704517705e-05,
"loss": 0.6624,
"step": 2585
},
{
"epoch": 2.8405876699162436,
"grad_norm": 0.5651314854621887,
"learning_rate": 1.758241758241758e-05,
"loss": 0.5829,
"step": 2586
},
{
"epoch": 2.8416861183578197,
"grad_norm": 1.059164047241211,
"learning_rate": 1.746031746031746e-05,
"loss": 0.6688,
"step": 2587
},
{
"epoch": 2.842784566799396,
"grad_norm": 2.2424001693725586,
"learning_rate": 1.7338217338217338e-05,
"loss": 0.4515,
"step": 2588
},
{
"epoch": 2.843883015240972,
"grad_norm": 0.6211466789245605,
"learning_rate": 1.7216117216117213e-05,
"loss": 0.836,
"step": 2589
},
{
"epoch": 2.8449814636825486,
"grad_norm": 0.4224345088005066,
"learning_rate": 1.7094017094017092e-05,
"loss": 0.536,
"step": 2590
},
{
"epoch": 2.8460799121241247,
"grad_norm": 0.7985780239105225,
"learning_rate": 1.697191697191697e-05,
"loss": 0.7433,
"step": 2591
},
{
"epoch": 2.847178360565701,
"grad_norm": 1.4033039808273315,
"learning_rate": 1.684981684981685e-05,
"loss": 0.7479,
"step": 2592
},
{
"epoch": 2.8482768090072774,
"grad_norm": 1.1432255506515503,
"learning_rate": 1.6727716727716725e-05,
"loss": 0.652,
"step": 2593
},
{
"epoch": 2.8493752574488536,
"grad_norm": 0.9324535727500916,
"learning_rate": 1.6605616605616603e-05,
"loss": 0.5225,
"step": 2594
},
{
"epoch": 2.8504737058904297,
"grad_norm": 0.5573447942733765,
"learning_rate": 1.6483516483516482e-05,
"loss": 0.6649,
"step": 2595
},
{
"epoch": 2.851572154332006,
"grad_norm": 0.6875207424163818,
"learning_rate": 1.636141636141636e-05,
"loss": 0.7334,
"step": 2596
},
{
"epoch": 2.8526706027735824,
"grad_norm": 0.32099124789237976,
"learning_rate": 1.6239316239316236e-05,
"loss": 0.5732,
"step": 2597
},
{
"epoch": 2.8537690512151586,
"grad_norm": 0.4142940938472748,
"learning_rate": 1.6117216117216118e-05,
"loss": 0.6605,
"step": 2598
},
{
"epoch": 2.8548674996567347,
"grad_norm": 0.5377205610275269,
"learning_rate": 1.5995115995115994e-05,
"loss": 0.5556,
"step": 2599
},
{
"epoch": 2.8559659480983113,
"grad_norm": 0.43509960174560547,
"learning_rate": 1.5873015873015872e-05,
"loss": 0.8321,
"step": 2600
},
{
"epoch": 2.8570643965398874,
"grad_norm": 0.4376494586467743,
"learning_rate": 1.575091575091575e-05,
"loss": 0.6392,
"step": 2601
},
{
"epoch": 2.8581628449814636,
"grad_norm": 0.507837176322937,
"learning_rate": 1.5628815628815626e-05,
"loss": 0.5326,
"step": 2602
},
{
"epoch": 2.8592612934230397,
"grad_norm": 29.0502986907959,
"learning_rate": 1.5506715506715505e-05,
"loss": 0.5478,
"step": 2603
},
{
"epoch": 2.8603597418646163,
"grad_norm": 0.6940420866012573,
"learning_rate": 1.5384615384615384e-05,
"loss": 1.3063,
"step": 2604
},
{
"epoch": 2.8614581903061924,
"grad_norm": 0.7178813219070435,
"learning_rate": 1.5262515262515263e-05,
"loss": 0.7447,
"step": 2605
},
{
"epoch": 2.862556638747769,
"grad_norm": 0.6209506392478943,
"learning_rate": 1.514041514041514e-05,
"loss": 0.5496,
"step": 2606
},
{
"epoch": 2.863655087189345,
"grad_norm": 0.5526819825172424,
"learning_rate": 1.5018315018315018e-05,
"loss": 0.4224,
"step": 2607
},
{
"epoch": 2.8647535356309213,
"grad_norm": 0.5056405663490295,
"learning_rate": 1.4896214896214895e-05,
"loss": 0.6248,
"step": 2608
},
{
"epoch": 2.8658519840724974,
"grad_norm": 2.416952610015869,
"learning_rate": 1.4774114774114774e-05,
"loss": 0.7551,
"step": 2609
},
{
"epoch": 2.866950432514074,
"grad_norm": 0.52223140001297,
"learning_rate": 1.4652014652014651e-05,
"loss": 1.1146,
"step": 2610
},
{
"epoch": 2.86804888095565,
"grad_norm": 0.685767650604248,
"learning_rate": 1.4529914529914528e-05,
"loss": 0.715,
"step": 2611
},
{
"epoch": 2.8691473293972263,
"grad_norm": 0.650374174118042,
"learning_rate": 1.4407814407814407e-05,
"loss": 0.8844,
"step": 2612
},
{
"epoch": 2.870245777838803,
"grad_norm": 0.46946465969085693,
"learning_rate": 1.4285714285714284e-05,
"loss": 0.9545,
"step": 2613
},
{
"epoch": 2.871344226280379,
"grad_norm": 0.5312052369117737,
"learning_rate": 1.4163614163614162e-05,
"loss": 0.5204,
"step": 2614
},
{
"epoch": 2.872442674721955,
"grad_norm": 0.41921889781951904,
"learning_rate": 1.404151404151404e-05,
"loss": 0.4614,
"step": 2615
},
{
"epoch": 2.8735411231635313,
"grad_norm": 0.513203501701355,
"learning_rate": 1.3919413919413918e-05,
"loss": 0.613,
"step": 2616
},
{
"epoch": 2.874639571605108,
"grad_norm": 1.1020901203155518,
"learning_rate": 1.3797313797313795e-05,
"loss": 0.525,
"step": 2617
},
{
"epoch": 2.875738020046684,
"grad_norm": 0.39301392436027527,
"learning_rate": 1.3675213675213674e-05,
"loss": 0.5799,
"step": 2618
},
{
"epoch": 2.8768364684882606,
"grad_norm": 1.576910376548767,
"learning_rate": 1.3553113553113551e-05,
"loss": 0.6286,
"step": 2619
},
{
"epoch": 2.8779349169298367,
"grad_norm": 0.36711424589157104,
"learning_rate": 1.3431013431013431e-05,
"loss": 0.7542,
"step": 2620
},
{
"epoch": 2.879033365371413,
"grad_norm": 1.2777636051177979,
"learning_rate": 1.3308913308913308e-05,
"loss": 0.6269,
"step": 2621
},
{
"epoch": 2.880131813812989,
"grad_norm": 0.5584180355072021,
"learning_rate": 1.3186813186813187e-05,
"loss": 0.5633,
"step": 2622
},
{
"epoch": 2.8812302622545656,
"grad_norm": 1.2418673038482666,
"learning_rate": 1.3064713064713064e-05,
"loss": 0.537,
"step": 2623
},
{
"epoch": 2.8823287106961417,
"grad_norm": 0.5850531458854675,
"learning_rate": 1.2942612942612941e-05,
"loss": 0.595,
"step": 2624
},
{
"epoch": 2.883427159137718,
"grad_norm": 1.054592251777649,
"learning_rate": 1.282051282051282e-05,
"loss": 0.8308,
"step": 2625
},
{
"epoch": 2.8845256075792944,
"grad_norm": 0.3231412470340729,
"learning_rate": 1.2698412698412697e-05,
"loss": 0.4044,
"step": 2626
},
{
"epoch": 2.8856240560208706,
"grad_norm": 0.47942933440208435,
"learning_rate": 1.2576312576312576e-05,
"loss": 0.6299,
"step": 2627
},
{
"epoch": 2.8867225044624467,
"grad_norm": 0.4884187579154968,
"learning_rate": 1.2454212454212453e-05,
"loss": 0.6606,
"step": 2628
},
{
"epoch": 2.887820952904023,
"grad_norm": 0.6658734083175659,
"learning_rate": 1.2332112332112331e-05,
"loss": 0.642,
"step": 2629
},
{
"epoch": 2.8889194013455994,
"grad_norm": 0.24990247189998627,
"learning_rate": 1.2210012210012208e-05,
"loss": 0.4041,
"step": 2630
},
{
"epoch": 2.8900178497871756,
"grad_norm": 0.6446508169174194,
"learning_rate": 1.2087912087912087e-05,
"loss": 0.7126,
"step": 2631
},
{
"epoch": 2.891116298228752,
"grad_norm": 0.7800988554954529,
"learning_rate": 1.1965811965811964e-05,
"loss": 0.6733,
"step": 2632
},
{
"epoch": 2.8922147466703283,
"grad_norm": 0.5319482684135437,
"learning_rate": 1.1843711843711844e-05,
"loss": 0.6445,
"step": 2633
},
{
"epoch": 2.8933131951119044,
"grad_norm": 0.6029678583145142,
"learning_rate": 1.172161172161172e-05,
"loss": 0.7642,
"step": 2634
},
{
"epoch": 2.8944116435534806,
"grad_norm": 0.9029693007469177,
"learning_rate": 1.1599511599511597e-05,
"loss": 0.635,
"step": 2635
},
{
"epoch": 2.8955100919950567,
"grad_norm": 0.6022691130638123,
"learning_rate": 1.1477411477411477e-05,
"loss": 0.5361,
"step": 2636
},
{
"epoch": 2.8966085404366333,
"grad_norm": 0.6777801513671875,
"learning_rate": 1.1355311355311354e-05,
"loss": 0.5099,
"step": 2637
},
{
"epoch": 2.8977069888782094,
"grad_norm": 0.4157528877258301,
"learning_rate": 1.1233211233211233e-05,
"loss": 0.5038,
"step": 2638
},
{
"epoch": 2.898805437319786,
"grad_norm": 2.6101133823394775,
"learning_rate": 1.111111111111111e-05,
"loss": 0.6324,
"step": 2639
},
{
"epoch": 2.899903885761362,
"grad_norm": 0.6885612607002258,
"learning_rate": 1.0989010989010989e-05,
"loss": 0.4931,
"step": 2640
},
{
"epoch": 2.9010023342029383,
"grad_norm": 0.5510079264640808,
"learning_rate": 1.0866910866910866e-05,
"loss": 0.5088,
"step": 2641
},
{
"epoch": 2.9021007826445144,
"grad_norm": 0.6099854111671448,
"learning_rate": 1.0744810744810744e-05,
"loss": 0.4647,
"step": 2642
},
{
"epoch": 2.903199231086091,
"grad_norm": 0.4390881657600403,
"learning_rate": 1.0622710622710621e-05,
"loss": 0.6787,
"step": 2643
},
{
"epoch": 2.904297679527667,
"grad_norm": 0.46238628029823303,
"learning_rate": 1.05006105006105e-05,
"loss": 0.5655,
"step": 2644
},
{
"epoch": 2.9053961279692433,
"grad_norm": 0.479106605052948,
"learning_rate": 1.0378510378510377e-05,
"loss": 0.7833,
"step": 2645
},
{
"epoch": 2.90649457641082,
"grad_norm": 0.4643683135509491,
"learning_rate": 1.0256410256410256e-05,
"loss": 0.4563,
"step": 2646
},
{
"epoch": 2.907593024852396,
"grad_norm": 0.4173976480960846,
"learning_rate": 1.0134310134310133e-05,
"loss": 0.6614,
"step": 2647
},
{
"epoch": 2.908691473293972,
"grad_norm": 0.7158990502357483,
"learning_rate": 1.001221001221001e-05,
"loss": 0.7342,
"step": 2648
},
{
"epoch": 2.9097899217355483,
"grad_norm": 0.7276301980018616,
"learning_rate": 9.89010989010989e-06,
"loss": 0.6883,
"step": 2649
},
{
"epoch": 2.910888370177125,
"grad_norm": 0.63588947057724,
"learning_rate": 9.768009768009766e-06,
"loss": 0.7533,
"step": 2650
},
{
"epoch": 2.911986818618701,
"grad_norm": 1.8038127422332764,
"learning_rate": 9.645909645909646e-06,
"loss": 0.6238,
"step": 2651
},
{
"epoch": 2.9130852670602776,
"grad_norm": 0.7289617657661438,
"learning_rate": 9.523809523809523e-06,
"loss": 0.4767,
"step": 2652
},
{
"epoch": 2.9141837155018537,
"grad_norm": 0.3828502893447876,
"learning_rate": 9.401709401709402e-06,
"loss": 0.4812,
"step": 2653
},
{
"epoch": 2.91528216394343,
"grad_norm": 0.5157826542854309,
"learning_rate": 9.279609279609279e-06,
"loss": 0.703,
"step": 2654
},
{
"epoch": 2.916380612385006,
"grad_norm": 0.6833345890045166,
"learning_rate": 9.157509157509156e-06,
"loss": 0.7471,
"step": 2655
},
{
"epoch": 2.9174790608265826,
"grad_norm": 1.0189886093139648,
"learning_rate": 9.035409035409035e-06,
"loss": 0.6065,
"step": 2656
},
{
"epoch": 2.9185775092681587,
"grad_norm": 0.5197221040725708,
"learning_rate": 8.913308913308913e-06,
"loss": 0.5904,
"step": 2657
},
{
"epoch": 2.919675957709735,
"grad_norm": 0.6265780925750732,
"learning_rate": 8.79120879120879e-06,
"loss": 0.5622,
"step": 2658
},
{
"epoch": 2.9207744061513115,
"grad_norm": 0.5703533887863159,
"learning_rate": 8.669108669108669e-06,
"loss": 0.8005,
"step": 2659
},
{
"epoch": 2.9218728545928876,
"grad_norm": 0.8656613230705261,
"learning_rate": 8.547008547008546e-06,
"loss": 0.4942,
"step": 2660
},
{
"epoch": 2.9229713030344637,
"grad_norm": 0.6180423498153687,
"learning_rate": 8.424908424908425e-06,
"loss": 0.8163,
"step": 2661
},
{
"epoch": 2.92406975147604,
"grad_norm": 0.7308143377304077,
"learning_rate": 8.302808302808302e-06,
"loss": 0.7639,
"step": 2662
},
{
"epoch": 2.9251681999176165,
"grad_norm": 0.585617184638977,
"learning_rate": 8.18070818070818e-06,
"loss": 0.7614,
"step": 2663
},
{
"epoch": 2.9262666483591926,
"grad_norm": 0.5277345776557922,
"learning_rate": 8.058608058608059e-06,
"loss": 0.6489,
"step": 2664
},
{
"epoch": 2.927365096800769,
"grad_norm": 0.3540293574333191,
"learning_rate": 7.936507936507936e-06,
"loss": 0.4503,
"step": 2665
},
{
"epoch": 2.9284635452423453,
"grad_norm": 0.554492175579071,
"learning_rate": 7.814407814407813e-06,
"loss": 0.5785,
"step": 2666
},
{
"epoch": 2.9295619936839215,
"grad_norm": 0.5547875761985779,
"learning_rate": 7.692307692307692e-06,
"loss": 0.5763,
"step": 2667
},
{
"epoch": 2.9306604421254976,
"grad_norm": 0.745947003364563,
"learning_rate": 7.57020757020757e-06,
"loss": 0.512,
"step": 2668
},
{
"epoch": 2.931758890567074,
"grad_norm": 0.47691571712493896,
"learning_rate": 7.448107448107448e-06,
"loss": 0.7018,
"step": 2669
},
{
"epoch": 2.9328573390086503,
"grad_norm": 0.9611607789993286,
"learning_rate": 7.3260073260073255e-06,
"loss": 0.7419,
"step": 2670
},
{
"epoch": 2.9339557874502264,
"grad_norm": 0.5495268106460571,
"learning_rate": 7.203907203907203e-06,
"loss": 0.6096,
"step": 2671
},
{
"epoch": 2.935054235891803,
"grad_norm": 0.8863226771354675,
"learning_rate": 7.081807081807081e-06,
"loss": 0.7149,
"step": 2672
},
{
"epoch": 2.936152684333379,
"grad_norm": 0.4234665334224701,
"learning_rate": 6.959706959706959e-06,
"loss": 0.6913,
"step": 2673
},
{
"epoch": 2.9372511327749553,
"grad_norm": 0.9667326211929321,
"learning_rate": 6.837606837606837e-06,
"loss": 0.4181,
"step": 2674
},
{
"epoch": 2.9383495812165314,
"grad_norm": 0.543683648109436,
"learning_rate": 6.715506715506716e-06,
"loss": 0.6329,
"step": 2675
},
{
"epoch": 2.939448029658108,
"grad_norm": 0.5083779692649841,
"learning_rate": 6.5934065934065935e-06,
"loss": 0.8742,
"step": 2676
},
{
"epoch": 2.940546478099684,
"grad_norm": 0.7212001085281372,
"learning_rate": 6.4713064713064706e-06,
"loss": 0.6912,
"step": 2677
},
{
"epoch": 2.9416449265412603,
"grad_norm": 0.9474835991859436,
"learning_rate": 6.349206349206348e-06,
"loss": 0.649,
"step": 2678
},
{
"epoch": 2.942743374982837,
"grad_norm": 0.8142021298408508,
"learning_rate": 6.227106227106226e-06,
"loss": 0.6136,
"step": 2679
},
{
"epoch": 2.943841823424413,
"grad_norm": 2.9018187522888184,
"learning_rate": 6.105006105006104e-06,
"loss": 0.7157,
"step": 2680
},
{
"epoch": 2.944940271865989,
"grad_norm": 0.4023605287075043,
"learning_rate": 5.982905982905982e-06,
"loss": 0.5675,
"step": 2681
},
{
"epoch": 2.9460387203075653,
"grad_norm": 0.3693840801715851,
"learning_rate": 5.86080586080586e-06,
"loss": 0.5982,
"step": 2682
},
{
"epoch": 2.947137168749142,
"grad_norm": 0.4298234283924103,
"learning_rate": 5.738705738705739e-06,
"loss": 0.5379,
"step": 2683
},
{
"epoch": 2.948235617190718,
"grad_norm": 0.6495395302772522,
"learning_rate": 5.6166056166056165e-06,
"loss": 0.5411,
"step": 2684
},
{
"epoch": 2.9493340656322946,
"grad_norm": 0.44857510924339294,
"learning_rate": 5.494505494505494e-06,
"loss": 0.5154,
"step": 2685
},
{
"epoch": 2.9504325140738707,
"grad_norm": 0.7485830187797546,
"learning_rate": 5.372405372405372e-06,
"loss": 0.6595,
"step": 2686
},
{
"epoch": 2.951530962515447,
"grad_norm": 0.5141469836235046,
"learning_rate": 5.25030525030525e-06,
"loss": 0.6289,
"step": 2687
},
{
"epoch": 2.952629410957023,
"grad_norm": 0.8847435712814331,
"learning_rate": 5.128205128205128e-06,
"loss": 0.6734,
"step": 2688
},
{
"epoch": 2.9537278593985996,
"grad_norm": 0.570573091506958,
"learning_rate": 5.006105006105005e-06,
"loss": 0.7013,
"step": 2689
},
{
"epoch": 2.9548263078401757,
"grad_norm": 0.4376991391181946,
"learning_rate": 4.884004884004883e-06,
"loss": 0.5918,
"step": 2690
},
{
"epoch": 2.955924756281752,
"grad_norm": 0.5480318069458008,
"learning_rate": 4.7619047619047615e-06,
"loss": 0.6227,
"step": 2691
},
{
"epoch": 2.9570232047233285,
"grad_norm": 0.5831297636032104,
"learning_rate": 4.639804639804639e-06,
"loss": 0.6264,
"step": 2692
},
{
"epoch": 2.9581216531649046,
"grad_norm": 1.5778921842575073,
"learning_rate": 4.517704517704517e-06,
"loss": 0.6352,
"step": 2693
},
{
"epoch": 2.9592201016064807,
"grad_norm": 0.9567496180534363,
"learning_rate": 4.395604395604395e-06,
"loss": 0.6067,
"step": 2694
},
{
"epoch": 2.960318550048057,
"grad_norm": 0.5237869620323181,
"learning_rate": 4.273504273504273e-06,
"loss": 0.8241,
"step": 2695
},
{
"epoch": 2.9614169984896335,
"grad_norm": 0.3452164828777313,
"learning_rate": 4.151404151404151e-06,
"loss": 0.5718,
"step": 2696
},
{
"epoch": 2.9625154469312096,
"grad_norm": 0.42237767577171326,
"learning_rate": 4.0293040293040296e-06,
"loss": 0.5199,
"step": 2697
},
{
"epoch": 2.963613895372786,
"grad_norm": 0.7035055756568909,
"learning_rate": 3.907203907203907e-06,
"loss": 0.7078,
"step": 2698
},
{
"epoch": 2.9647123438143623,
"grad_norm": 0.39236482977867126,
"learning_rate": 3.785103785103785e-06,
"loss": 0.59,
"step": 2699
},
{
"epoch": 2.9658107922559385,
"grad_norm": 1.1658680438995361,
"learning_rate": 3.6630036630036627e-06,
"loss": 0.53,
"step": 2700
},
{
"epoch": 2.9669092406975146,
"grad_norm": 0.6797634363174438,
"learning_rate": 3.5409035409035406e-06,
"loss": 0.6763,
"step": 2701
},
{
"epoch": 2.968007689139091,
"grad_norm": 1.0421425104141235,
"learning_rate": 3.4188034188034185e-06,
"loss": 0.4,
"step": 2702
},
{
"epoch": 2.9691061375806673,
"grad_norm": 0.36937475204467773,
"learning_rate": 3.2967032967032968e-06,
"loss": 0.5401,
"step": 2703
},
{
"epoch": 2.9702045860222435,
"grad_norm": 0.4324638843536377,
"learning_rate": 3.174603174603174e-06,
"loss": 0.5882,
"step": 2704
},
{
"epoch": 2.97130303446382,
"grad_norm": 1.2700526714324951,
"learning_rate": 3.052503052503052e-06,
"loss": 0.613,
"step": 2705
},
{
"epoch": 2.972401482905396,
"grad_norm": 0.5261131525039673,
"learning_rate": 2.93040293040293e-06,
"loss": 0.6279,
"step": 2706
},
{
"epoch": 2.9734999313469723,
"grad_norm": 0.42924660444259644,
"learning_rate": 2.8083028083028082e-06,
"loss": 1.0058,
"step": 2707
},
{
"epoch": 2.9745983797885485,
"grad_norm": 3.100399971008301,
"learning_rate": 2.686202686202686e-06,
"loss": 0.5209,
"step": 2708
},
{
"epoch": 2.975696828230125,
"grad_norm": 0.3666403293609619,
"learning_rate": 2.564102564102564e-06,
"loss": 0.5231,
"step": 2709
},
{
"epoch": 2.976795276671701,
"grad_norm": 1.1315009593963623,
"learning_rate": 2.4420024420024414e-06,
"loss": 0.4449,
"step": 2710
},
{
"epoch": 2.9778937251132778,
"grad_norm": 0.3323412537574768,
"learning_rate": 2.3199023199023197e-06,
"loss": 0.4806,
"step": 2711
},
{
"epoch": 2.978992173554854,
"grad_norm": 0.7348967790603638,
"learning_rate": 2.1978021978021976e-06,
"loss": 0.7521,
"step": 2712
},
{
"epoch": 2.98009062199643,
"grad_norm": 1.018898606300354,
"learning_rate": 2.0757020757020754e-06,
"loss": 0.8468,
"step": 2713
},
{
"epoch": 2.981189070438006,
"grad_norm": 0.46808505058288574,
"learning_rate": 1.9536019536019533e-06,
"loss": 0.6992,
"step": 2714
},
{
"epoch": 2.9822875188795823,
"grad_norm": 0.5411276817321777,
"learning_rate": 1.8315018315018314e-06,
"loss": 0.5949,
"step": 2715
},
{
"epoch": 2.983385967321159,
"grad_norm": 0.45061302185058594,
"learning_rate": 1.7094017094017092e-06,
"loss": 0.4617,
"step": 2716
},
{
"epoch": 2.984484415762735,
"grad_norm": 0.44529294967651367,
"learning_rate": 1.587301587301587e-06,
"loss": 0.5811,
"step": 2717
},
{
"epoch": 2.9855828642043116,
"grad_norm": 1.255299687385559,
"learning_rate": 1.465201465201465e-06,
"loss": 1.1899,
"step": 2718
},
{
"epoch": 2.9866813126458878,
"grad_norm": 0.8325234651565552,
"learning_rate": 1.343101343101343e-06,
"loss": 0.6344,
"step": 2719
},
{
"epoch": 2.987779761087464,
"grad_norm": 1.0692095756530762,
"learning_rate": 1.2210012210012207e-06,
"loss": 0.5136,
"step": 2720
},
{
"epoch": 2.98887820952904,
"grad_norm": 0.4980855882167816,
"learning_rate": 1.0989010989010988e-06,
"loss": 0.6352,
"step": 2721
},
{
"epoch": 2.9899766579706166,
"grad_norm": 0.8502411246299744,
"learning_rate": 9.768009768009766e-07,
"loss": 0.599,
"step": 2722
},
{
"epoch": 2.9910751064121928,
"grad_norm": 0.4849570691585541,
"learning_rate": 8.547008547008546e-07,
"loss": 0.5862,
"step": 2723
},
{
"epoch": 2.992173554853769,
"grad_norm": 0.5491626858711243,
"learning_rate": 7.326007326007325e-07,
"loss": 0.5634,
"step": 2724
},
{
"epoch": 2.9932720032953455,
"grad_norm": 0.7289263606071472,
"learning_rate": 6.105006105006104e-07,
"loss": 0.6643,
"step": 2725
},
{
"epoch": 2.9943704517369216,
"grad_norm": 1.5343972444534302,
"learning_rate": 4.884004884004883e-07,
"loss": 0.71,
"step": 2726
},
{
"epoch": 2.9954689001784978,
"grad_norm": 0.5619814395904541,
"learning_rate": 3.6630036630036624e-07,
"loss": 0.721,
"step": 2727
},
{
"epoch": 2.996567348620074,
"grad_norm": 0.500442624092102,
"learning_rate": 2.4420024420024416e-07,
"loss": 0.6571,
"step": 2728
},
{
"epoch": 2.9976657970616505,
"grad_norm": 0.42292630672454834,
"learning_rate": 1.2210012210012208e-07,
"loss": 0.4772,
"step": 2729
},
{
"epoch": 2.9987642455032266,
"grad_norm": 0.4350331425666809,
"learning_rate": 0.0,
"loss": 0.7493,
"step": 2730
},
{
"epoch": 2.9987642455032266,
"step": 2730,
"total_flos": 1.0372510312766669e+18,
"train_loss": 0.674373844124022,
"train_runtime": 11584.4184,
"train_samples_per_second": 1.886,
"train_steps_per_second": 0.236
}
],
"logging_steps": 1.0,
"max_steps": 2730,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0372510312766669e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}