From 666fb6269d88b4790bc0f3eb4936f1f72f934fac Mon Sep 17 00:00:00 2001
From: LinqiY <100989140+LinqiY@users.noreply.github.com>
Date: Fri, 26 Apr 2024 15:24:03 +0800
Subject: [PATCH 01/16] Add Mistral tp&pp model

---
 tests/models/mistral2/__init__.py             |    2 +
 .../__pycache__/__init__.cpython-310.pyc      |  Bin 0 -> 295 bytes
 .../configuration_mistraltp.cpython-310.pyc   |  Bin 0 -> 6283 bytes
 .../__pycache__/model.cpython-310.pyc         |  Bin 0 -> 49178 bytes
 .../__pycache__/modeltp.cpython-310.pyc       |  Bin 0 -> 52277 bytes
 .../mistral2/configuration_mistraltp.py       |  155 ++
 tests/models/mistral2/model.py                | 2026 +++++++++++++++
 tests/models/mistral2/modelpp.py              | 1922 ++++++++++++++
 tests/models/mistral2/modeltp.py              | 2254 +++++++++++++++++
 9 files changed, 6359 insertions(+)
 create mode 100644 tests/models/mistral2/__init__.py
 create mode 100644 tests/models/mistral2/__pycache__/__init__.cpython-310.pyc
 create mode 100644 tests/models/mistral2/__pycache__/configuration_mistraltp.cpython-310.pyc
 create mode 100644 tests/models/mistral2/__pycache__/model.cpython-310.pyc
 create mode 100644 tests/models/mistral2/__pycache__/modeltp.cpython-310.pyc
 create mode 100644 tests/models/mistral2/configuration_mistraltp.py
 create mode 100644 tests/models/mistral2/model.py
 create mode 100644 tests/models/mistral2/modelpp.py
 create mode 100644 tests/models/mistral2/modeltp.py

diff --git a/tests/models/mistral2/__init__.py b/tests/models/mistral2/__init__.py
new file mode 100644
index 0000000..9dc3f79
--- /dev/null
+++ b/tests/models/mistral2/__init__.py
@@ -0,0 +1,2 @@
+from .modeltp import MistralForCausalLM
+from .configuration_mistraltp import MistralConfig
\ No newline at end of file

[GIT binary patches for the four committed __pycache__/*.cpython-310.pyc files omitted]
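For orientation, a minimal usage sketch of what this patch exports. The `tests.models.mistral2` import path and the direct `MistralForCausalLM(config)` construction are illustrative assumptions, not part of the patch; the distributed tensor/pipeline-parallel environment that the `modeltp`/`modelpp` variants expect must already be initialized and is not shown here.

```python
from tests.models.mistral2 import MistralConfig, MistralForCausalLM

# Mistral-7B-v0.1 style defaults: 32 layers, hidden size 4096, 32 attention heads,
# 8 key/value heads (grouped-query attention), sliding window of 4096 tokens.
config = MistralConfig()

# Assumes the distributed setup (megatron parallel_state / deepspeed pipeline) is already in place.
model = MistralForCausalLM(config)
```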
z$Nd`{dBlT07xWXJ{%8VFKg9Cv9EAB;r_n%|4+4YBV=q7w4rlZDHAv*QnQFq$U$uY{ zgz0|_ux1}%%|35ZV9f&aY4-)NCK!I|dvL!ouWZ1T2h0sd7L)3`y{XOJ0>z!eUOa6N z0Q>r1#5ai3?8hVpu#>J7Kh{YRmL0Tr4}R~liwFhtZX#h1{g8!g2hg|T8~G$OPoRA= zP)lqKq9>qxfnFE`M8a$PhNy=D?TMk~o30q6WDt_oS415L+QZOh7`X78zfQKQ9hi)2r zSzI$VWlDpX4nhL&+UOP-P3e;=Pn&AN&8bynqi<8X8|4MihDc?@FMB+9ydHStPrAS} ztMDp?mD8_o3z+V-J#7b=0+vhKIF9=ORzcAOH@o5?wWL~WtLkr2xh;Lct(Sn5`YPV) zDiUN;U!(I<6ei~r!6XwTybhS8t~18VaMlN6SjSO6dHgPW-8$M@JgWW@?;+=q7w}R; zEIdTF)P-BC0Ki0<8}o9y#NA_RvS>j)hdhVruz z>cky807Docn8sO5lzHR}d+p%qigh^O$gBU1_v=Hk1`+y3rUWczW);Ca^$K$_Sb#}~ zkeR;P_h#m}pF$p*(HUB=i%urBKwEw*8blHk>Nl7;!E6ZoDFNs*_NqklGpm6o&00FP z>Sx3Zoe?^`DA!Nx5m%goe%4!so&pDbh!IE?S7Y>z(;?8MEcyy?{1i1D>Plw8y^L9A z7X85tYMrBLW02&+90i)QI4J{w(Ozq;y83`lGt&t=dztf~p1KhpJ~O%PVJ^_xBvb97 zbBNA9I+96W(21PNYCnT-h69jlW!0;QRA4m7P^hJppnzq40WM3s-;^9DJ)YPO;N-{; zF>(Ocoi|9k1hTk9)A^IapeNlymMqKQ0DQm1)%#g=R=&Rg7DpC}2%0|#+W4y`7$Vc4 zl2iQv$_ZBb6$>)ce2+E$N*=dPm?JcqXjyQfV_-f9RffP<7|)t}5fAs|lEVm;4IrOR z36%#6+(q71#`oIzA|l^H;}Om9ACUfuh0;WD3J%!lIDb93rj zFvK&)sTuQ>`j7M;>=C&3hbxdew?n(qvNALw{N0=(KgSvL^K>YQrzNpJPv0-o`87Jf zPKVOj>YvfKi$1Pvf3R}$@wrovzP)nl@h2&%y{(Kkx_FC>HX%lBRrvRjBUrmM;JXk= zYX3VH@)LAOS682*LwH&yv*M8LEXVzmh?gha{p%4X7iw+gR7 zBK$!!rNhBFLtGth<_N`GFEf58+yp#7`m$o1sy>=6$M#temjaSUVCEH+u~4KXG;AB% zs?ip5Yq+>xWE;BeyOs`7bE6vEy&l?@i!396#D69*CwY=m^idx3&(HvjHmyc2GA$ZH zp(MevNCmUC31N}GkJ3Q^1p+}l$shp%E5R@#OnE0)8gB zj%@+h3WEJ-SSTgMk=8g!(G%I{!1S;bhSyN{jOU0)&>)eT8mKVl(S6Y}a}sM+s&$Da zOS#&7)AmJ6DK4*L0(B8uk6ndnLr7g2hK&c4-X^B-T|5ja3VxSm$9! zX+B^L3vI~N;$E~5Bt77Iv>b^3)3HE2p#80IXxIfpgFVDrc6ubNq?8%9JXX=bp#K6^ zYwR->Jh{IBPpBRt=nDvr&u5_|K_e##*dEr_g?Tk$2F6V?TmOI(WS#6XFEY&Imqtx^ zi3mdA>w~WVKiDjIg838ehuM=b28^FL19&$W{1$`Xq2PDe8Kh!Gpx8i#3m_ilVNnQx zWa7-YA1>D&msE_m%I%Vn93d$8t#Ge+J3+ zUoyh`>2%;V9wUo{*YfaCgO9d@3(3gn$BymZS#lpW&?3yAV_gJX6#4Bde3X(q#q-!| z5tRJ}c-HUgkb}41zaVP1jVw2NJ^awm?@4^s{~E8rP6Ff?gn#>Z?IZY&xTX+y$rQkO z?FZgUaogK4@ho{a&;&O&^ovL@loTQ9^8iJHaed(fVCG{1d|2e~f#LHVcx)Sk&6|U~ z&U+bN`JmHYGdWRVhawHGodx+I@DHV|^sdCXJZ-{~#4_tL2+SGzwELk5iEAMl3L#%y zip!-3%McVN*5b*q=n$m>-x%5m5MD+-ejNTm)Y4QRhms@p0|uH?L71gJ(@rlJkvA?@ zbUzjNL8nolfhweqYuW5Qq@s&Fvv~Vk-mr)*hdKW$1(<2g0(F23LFS?6Z6eg17XK|l zTIT#G-e@TD3_BRC>EJaJ>EH(khAG36@`(LqZ#<0Mmb}_|(2p|hA_SrP!0VQ4hDAhr zh+_uYL8?6z6Ox|bb(OdTa}r-%R=GT7_lb~np-PQp*`!POe8VEqGdLFz47sHty z2<*9j|7xpM2MQL*>6@i?Vtbd)zI_WnU?wU0~#sKlA0Am2la z8Bcq`9022hgnS93!d%|ik1t}%y>O!-`ZMmdqySq1jDX2r8P_hwqa99A@a1V*LOp^J zKv#Nf#2TH8bOfvH;F2A@QA|7J*lfZ!T7efiY>ynusIdoj#`PXW;`gwlh{|UZg`q6< z^h1r^Sf_mhxLslNZxnOJLLd-2<0&dEgK6EQ47sPi!a98kj-RF#Uzn(nHb|2MttId# z^SL|{-OJLs!&-w_Uu7H~)Bd=Qqpbo$2QRpM2xOn&{%a+ZSmrdRczLpG2D-CVTvc^Z za64BK{OWnFr9>t3jscGHEwL;F4MRIeBSboLx9k%Flt#xT@gM%hQFM2B=ZkPe~WQiWav$V_BD>5F94C?`Oka3dBluENzb zo)l;Gb|xFrV?`#fs-0T{SUHFLVxfFx{PW?xJ%yoyXw#>Bn^WRKr)#()l4epQ7_=I$7=?LTCLYKK?SD zK8D>z=Omr`;glyrI|66ULNun!jKEWjB6KW4dq`Z7_1?7&N4SNby-)YY1BjJye;AH6 zqA!}A#HKndggb7kMUd#AJYbZJLN=YGf7Hywa!Em~ns9?A4;Ei+zXiOVOp&*m%?$Ss zPiJq*=Z2@Jjt>_SA1pqc_#oUl^IN&)r(Ye;3>On0wARcaIRd^uY+2*xDeWJ%5Oag^ P`DZ?k(1Q7u@#6mn+3oyM literal 0 HcmV?d00001 diff --git a/tests/models/mistral2/configuration_mistraltp.py b/tests/models/mistral2/configuration_mistraltp.py new file mode 100644 index 0000000..ad6691b --- /dev/null +++ b/tests/models/mistral2/configuration_mistraltp.py @@ -0,0 +1,155 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Mistral model configuration""" + +from transformers.configuration_utils import PretrainedConfig +# from transformers.utils import logging +from collie.log.logger import logger + + +# logger = logging.get_logger(__name__) + +MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", + "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json", +} + + +class MistralConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an + Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. + + [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) + [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MistralModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 14336): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to `4096*32`): + The maximum sequence length that this model might ever be used with. Mistral's sliding window attention + allows sequence of up to 4096*32 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention window size. If not specified, will default to `4096`. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + + ```python + >>> from transformers import MistralModel, MistralConfig + + >>> # Initializing a Mistral 7B style configuration + >>> configuration = MistralConfig() + + >>> # Initializing a model from the Mistral 7B style configuration + >>> model = MistralModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mistral" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + max_position_embeddings=4096 * 32, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + sliding_window=4096, + attention_dropout=0.0, + attn_implementation="flash_attention_2", + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + # 调用父类的初始化函数,将一些公共参数传递给父类处理 + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/tests/models/mistral2/model.py b/tests/models/mistral2/model.py new file mode 100644 index 0000000..60d9553 --- /dev/null +++ b/tests/models/mistral2/model.py @@ -0,0 +1,2026 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Mistral model."""
+import inspect
+import math
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel, dtype_byte_size
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+    replace_return_docstrings,
+)
+# configuration_mistraltp.py defines `MistralConfig`; alias it to the `Mistral2Config` name used in this file.
+from .configuration_mistraltp import MistralConfig as Mistral2Config
+
+
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+
+    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "Mistral2Config"
+
+# modified for CoLLiE
+import torch.distributed as dist
+import gc
+import json
+import os
+from collections import OrderedDict
+from megatron.core import parallel_state, tensor_parallel
+from einops import rearrange
+from deepspeed.pipe import LayerSpec, TiedLayerSpec
+
+from collie.config import CollieConfig
+from collie.driver.io import IODriver
+from collie.log.logger import logger
+from collie.module import (
+    ColumnParallelLinearWithoutBias,
+    ColumnParallelLMHead,
+    RowParallelLinearWithoutBias,
+)
+from collie.utils import concat_tensor, dict_as_params, env, progress
+from collie.models.base import CollieModelForCausalLM
+from collie.models.utils import (
+    kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer,
+    kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model,
+)
+
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
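+# Tensor-parallel layout used by the modules further below (the usual Megatron pattern):
+# the q/k/v/gate/up projections are column-parallel with gather_output=False, so each
+# rank keeps only its slice of the attention heads / intermediate features, while the
+# o/down projections are row-parallel with input_is_parallel=True, reducing the partial
+# results back to the full hidden size.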
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral
+class Mistral2RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        MistralRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        ans = self.weight * hidden_states.to(input_dtype)
+        # --------------------------------------------------------
+        # (commented-out debugging leftover that dumped intermediate RMSNorm outputs to a JSON file)
+        # # Convert the tensor to a list
+        # ans_list = ans.tolist()
+        # # Path of the .json file
+        # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/rms_ans.json'
+
+        # # Try to open the existing .json file and read its contents; create a new list if the file does not exist
+        # try:
+        #     with open(file_path, 'r', encoding='utf-8') as file:
+        #         results_list = json.load(file)
+        # except FileNotFoundError:
+        #     results_list = []
+        # # Append the current result to the list
+        # results_list.append(ans_list)
+        # # Write the updated list back to the .json file
+        # with open(file_path, 'w', encoding='utf-8') as file:
+        #     json.dump(results_list, file, ensure_ascii=False, indent=4)
+        #     file.write('\n')  # add a newline at the end of the file
+        # --------------------------------------------------------
+        return ans
+
+
+# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral
+# TODO @Arthur no longer copied from LLama after static cache
+class Mistral2RotaryEmbedding(nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
+
+        freqs = torch.outer(t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+        return (
+            self.cos_cached[:seq_len].to(dtype=x.dtype),
+            self.sin_cached[:seq_len].to(dtype=x.dtype),
+        )
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+# TODO @Arthur no longer copied from LLama after static cache
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`):
+            The position indices of the tokens corresponding to the query and key tensors.
For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Mistral2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + self.up_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.gate_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.down_proj = RowParallelLinearWithoutBias( + self.intermediate_size, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Mistral2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.q_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.k_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.v_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.o_proj = RowParallelLinearWithoutBias( + self.num_heads * self.head_dim, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.rotary_emb = Mistral2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
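Concretely, with hypothetical Mistral-7B-like numbers (hidden_size=4096, 32 query heads, 8 key/value heads, head_dim=128) and tp_size=4, the column-parallel q/k/v projections above leave each rank with only its share of heads, and the row-parallel o_proj consumes hidden_size // tp_size features:

hidden_size, num_heads, num_kv_heads, head_dim = 4096, 32, 8, 128   # illustrative values, not read from any config
tp_size = 4

heads_per_rank = num_heads // tp_size            # 8 query heads per rank (num_heads_tp in the forward pass)
kv_heads_per_rank = num_kv_heads // tp_size      # 2 key/value heads per rank
o_proj_in_per_rank = heads_per_rank * head_dim   # 1024 == hidden_size // tp_size, the reshape target before o_proj
print(heads_per_rank, kv_heads_per_rank, o_proj_in_per_rank)        # 8 2 1024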
Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads_tp, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads_tp, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads_tp, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads_tp, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # 
-------------------------------------------------------- + # 将Tensor转换为列表 + ans_list = attn_output.tolist() + # 指定.json文件的路径 + file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/attn_output_1.json' + + # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + try: + with open(file_path, 'r', encoding='utf-8') as file: + results_list = json.load(file) + except FileNotFoundError: + results_list = [] + # 将当前结果添加到列表中 + results_list.append(ans_list) + # 将更新后的列表写回.json文件 + with open(file_path, 'w', encoding='utf-8') as file: + json.dump(results_list, file, ensure_ascii=False, indent=4) + file.write('\n\n\n') # 在文件末尾添加一个换行符 + # -------------------------------------------------------- + + + + return attn_output, attn_weights, past_key_value + + +class Mistral2FlashAttention2(Mistral2Attention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
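A rough numeric check of the sliding-window trigger above, with made-up lengths (the released Mistral-7B config ships sliding_window=4096):

kv_seq_len, sliding_window = 5000, 4096          # invented lengths
flash_supports_window = True                     # stands in for _flash_supports_window_size
use_sliding_windows = flash_supports_window and sliding_window is not None and kv_seq_len > sliding_window
print(use_sliding_windows)                       # True: only sequences longer than the window take the windowed path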
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
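A small sketch of the cache trim above: once the cached length exceeds the window, only the most recent `sliding_window - 1` positions are kept (tensor sizes here are invented):

import torch

batch, kv_heads, cached_len, head_dim, window = 1, 2, 4100, 8, 4096
past_key = torch.randn(batch, kv_heads, cached_len, head_dim)
slicing_tokens = 1 - window                      # -4095
trimmed = past_key[:, :, slicing_tokens:, :]     # shape (1, 2, 4095, 8)
assert trimmed.shape[-2] == window - 1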
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # -------------------------------------------------------- + # 将Tensor转换为列表 + ans_list = attn_output.tolist() + # 指定.json文件的路径 + file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/attn_output_1.json' + + # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + try: + with open(file_path, 'r', encoding='utf-8') as file: + results_list = json.load(file) + except FileNotFoundError: + results_list = [] + # 将当前结果添加到列表中 + results_list.append(ans_list) + # 将更新后的列表写回.json文件 + with open(file_path, 'w', encoding='utf-8') as file: + json.dump(results_list, file, ensure_ascii=False, indent=4) + file.write('\n\n\n') # 在文件末尾添加一个换行符 + # -------------------------------------------------------- + + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
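When a padding mask is present, `_flash_attention_forward` takes the unpad path and hands flash-attn cumulative sequence lengths instead of a padded batch. A sketch of what those `cu_seqlens` look like, mirroring the `_get_unpad_data`-style computation used by `_upad_input` below (the mask is made up):

import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])            # two sequences, right padding
seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)     # tensor([3, 5])
cu_seqlens = torch.nn.functional.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
max_seqlen = int(seqlens.max())                             # 5
print(cu_seqlens)   # tensor([0, 3, 8]): token boundaries inside the packed (total_tokens, heads, head_dim) tensor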
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class Mistral2SdpaAttention(Mistral2Attention): + """ + Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MistralAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, 
value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.tp_size)) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +MISTRAL_ATTENTION_CLASSES = { + "eager": Mistral2Attention, + "flash_attention_2": Mistral2FlashAttention2, + "sdpa": Mistral2SdpaAttention, +} + + +class MistralDecoderLayer(nn.Module): + def __init__(self, config: CollieConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + + self.mlp = Mistral2MLP(config) + self.input_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
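As a sanity reference for the SDPA variant above: with no mask it computes the same softmax(QK^T / sqrt(d)) V as the eager path (shapes below are arbitrary):

import math
import torch

q, k, v = (torch.randn(1, 2, 4, 8) for _ in range(3))       # (batch, heads, seq, head_dim)
eager = torch.softmax(q @ k.transpose(-2, -1) / math.sqrt(8), dim=-1) @ v
sdpa = torch.nn.functional.scaled_dot_product_attention(q, k, v)   # attn_mask=None, is_causal=False
assert torch.allclose(eager, sdpa, atol=1e-5)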
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + # -------------------------------------------------------- + # # 将Tensor转换为列表 + # ans_list = [tensor.tolist() for tensor in outputs] + # # 指定.json文件的路径 + # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/decoder_outputs.json' + + # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + # try: + # with open(file_path, 'r', encoding='utf-8') as file: + # results_list = json.load(file) + # except FileNotFoundError: + # results_list = [] + # # 将当前结果添加到列表中 + # results_list.append(ans_list) + # # 将更新后的列表写回.json文件 + # with open(file_path, 'w', encoding='utf-8') as file: + # json.dump(results_list, file, ensure_ascii=False, indent=4) + # file.write('\n') # 在文件末尾添加一个换行符 + # -------------------------------------------------------- + + return outputs + + +MISTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MistralConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class Mistral2PreTrainedModel(PreTrainedModel): + config_class = Mistral2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
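A sketch of the "legacy" cache format described above, with invented sizes; the `DynamicCache` class used later in `Mistral2Model.forward` converts to and from it:

import torch
from transformers.cache_utils import DynamicCache

num_layers, batch, kv_heads, seq, head_dim = 2, 1, 2, 5, 8   # invented sizes
legacy = tuple(
    (torch.zeros(batch, kv_heads, seq, head_dim), torch.zeros(batch, kv_heads, seq, head_dim))
    for _ in range(num_layers)
)
cache = DynamicCache.from_legacy_cache(legacy)
assert cache.to_legacy_cache()[0][0].shape == (batch, kv_heads, seq, head_dim)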
+""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class Mistral2Model(nn.Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + def __init__(self, config: CollieConfig): + # super().__init__(config) + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + # self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + + # -------------------------------------------------------- + # # 将Tensor转换为列表 + # ans_list = inputs_embeds.tolist() + # # 指定.json文件的路径 + # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/inputs_embeds.json' + + # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + # try: + # with open(file_path, 'r', encoding='utf-8') as file: + # results_list = json.load(file) + # except FileNotFoundError: + # results_list = [] + # # 将当前结果添加到列表中 + # results_list.append(ans_list) + # # 将更新后的列表写回.json文件 + # with open(file_path, 'w', encoding='utf-8') as file: + # json.dump(results_list, file, ensure_ascii=False, indent=4) + # file.write('\n') # 在文件末尾添加一个换行符 + # # -------------------------------------------------------- + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] 
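The `_prepare_4d_causal_attention_mask(..., sliding_window=...)` call above produces an additive causal mask restricted to a window; a pure-torch sketch of that idea (invented sizes, and not the transformers utility itself, which also folds in padding and past length):

import torch

seq_len, window = 6, 3
i = torch.arange(seq_len).unsqueeze(-1)
j = torch.arange(seq_len)
visible = (j <= i) & (j > i - window)                        # causal, limited to roughly the last `window` positions
additive_mask = torch.where(visible, 0.0, float("-inf"))     # (seq_len, seq_len), added to attention scores
print(additive_mask)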
+ + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Mistral2ForCausalLM(CollieModelForCausalLM): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config:CollieConfig): + super().__init__(config) + self.model = Mistral2Model(config) + self.vocab_size = config.vocab_size + # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.lm_head = ColumnParallelLinearWithoutBias( + self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False + ) + # Initialize weights and apply final processing + # self.post_init() + # GenerationMixin 需要的额外参数 + self.config.is_decoder = True + if config.model_config.tie_word_embeddings: + self.lm_head.weight = self.embed_tokens.weight + self.main_input_name = "input_ids" + + def clean_cache(self): + self._clean_hidden_states([*self.model.layers, self.lm_head]) + self._set_use_cache(self.model.layers, False) + + def set_cache(self, use_cache): + self._set_use_cache(self.model.layers, use_cache) + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" 
+ >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
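The loss above uses the standard next-token shift; a tiny self-contained version of that computation (vocab size and shapes invented):

import torch
import torch.nn.functional as F

vocab = 11
logits = torch.randn(1, 5, vocab)                       # (batch, seq, vocab)
labels = torch.randint(0, vocab, (1, 5))
shift_logits = logits[..., :-1, :].reshape(-1, vocab)   # position t predicts token t + 1
shift_labels = labels[..., 1:].reshape(-1)
loss = F.cross_entropy(shift_logits, shift_labels)      # equivalent to CrossEntropyLoss()(shift_logits, shift_labels)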
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. + :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + output = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + else: + output = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + + return [("model", Mistral2Model.pipeline_layers(config)), ("lm_head", output)] + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + protocol: str = "file", # 指定加载state_dict时使用的协议 + **kwargs, + ): + """ + Load state_dict from ``path``. + The format of pretrained model should be the same as that of + `huggingface`. + :return: state_dict. Note that the state_dict should be processed + properly to match the current rank. 
+ """ + # 配置加载 + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + # IO驱动初始化 + io_driver = IODriver.from_protocol(protocol) + # 检查文件路径是否存在 + if not io_driver.exists(path): + raise FileNotFoundError(f"folder {path} not found.") + # 初始化存储和处理变量 + state_dict = OrderedDict() + weights = [] + parts = None # 变量用于存储模型分割的部分信息 + # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 + hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 + rank_order = range(dist.get_world_size()) + else: + # 不开启只进行一次循环 + rank_order = range(1) + # 权重文件加载和处理 + for rank in rank_order: + # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 + if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: + # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 + if env.is_pipeline: + # 保存的是 json 格式 + parts = env.pipeline_parts + if hasattr(config, "num_key_value_heads"): + # llama2 (transformers >= 4.31.0) + num_key_value_heads = config.num_key_value_heads + else: + num_key_value_heads = config.num_attention_heads + head_dim = config.hidden_size // config.num_attention_heads + # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 + if ( + io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) + and "COLLIE_PP_PARTS" in os.environ.keys() + ): + weight_map = json.loads( + io_driver.load( + os.path.join(path, "pytorch_model.bin.index.json"), mode="r" + ) + )["weight_map"] + # layers 表示自己需要的层 + layers = env.pipeline_layers_idx + # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) + weights.extend( + [ + value + for key, value in weight_map.items() + if len(key.split(".")) > 2 + and key.split(".")[2].isdigit() + and (int(key.split(".")[2]) + 1) in layers + ] + ) + # 去重 + weights = list(set(weights)) + # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm + if 0 in layers: + weights.append(weight_map["model.tok_embeddings.weight"]) + if max(parts) - 1 in layers: + weights.append(weight_map["output.weight"]) + if max(parts) - 2 in layers: + weights.append(weight_map["model.norm.weight"]) + else: + # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 + weights = [ + weight + for weight in io_driver.list(path) + if weight.endswith(".bin") + ] + with progress( + weights, + desc="Loading state dict", + total=len(weights), + disable=hide_progress, + ) as pbar: + for weight in pbar: + part_state_dict = io_driver.load( + os.path.join(path, weight), mode="rb" + ) + # for key in list(part_state_dict.keys()): + # if "attention.wqkv.weight" in key: + # # qkv_weights = part_state_dict.pop(key) + # qkv_weights = part_state_dict[key] + # print(qkv_weights.shape) + # (wq, wk, wv) = qkv_weights.split( + # [ + # config.hidden_size, + # config.num_key_value_heads * head_dim, + # config.num_key_value_heads * head_dim, + # ], + # dim=0, + # ) + # wq_name = key.replace("wqkv", "wq") + # wk_name = key.replace("wqkv", "wk") + # wv_name = key.replace("wqkv", "wv") + # part_state_dict[wq_name] = wq + # part_state_dict[wk_name] = wk + # part_state_dict[wv_name] = wv + state_dict.update(part_state_dict) + del part_state_dict + if parts is not None: + # 这一步是 pp 的复筛 + layers = env.pipeline_layers_idx + for key in list(state_dict.keys()): + if key.startswith("layers"): + layer = int(key.split(".")[1]) + if layer + 1 not in layers: + state_dict.pop(key) + # if key.endswith("tok_embeddings.weight"): + if 
key.endswith("embed_tokens.weight"): + if 0 not in layers: + state_dict.pop(key) + if key == "norm.weight": + if max(parts) - 2 not in layers: + state_dict.pop(key) + # if key.endswith("output.weight"): + if key.endswith("lm_head.weight"): + if max(parts) - 1 not in layers: + state_dict.pop(key) + # 根据用户配置的新的 tp size 进行分割 + for key in list(state_dict.keys()): + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + "embed_tokens.weight", + ] + col_split = any([key.endswith(filter) for filter in col_filter]) + + if col_split: + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + if dist.is_initialized() and process_exclusion: + # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 + dist.barrier() + return state_dict + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + protocol: str = "file", + ): + """ + Save state_dict to ``path``. + The format of saved state dict should be the same as that of + `huggingface`. + """ + io_driver = IODriver.from_protocol(protocol) + # gather to tp rank 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 pp_size 次循环 + rank_order = range(config.pp_size) + else: + # 不开启只进行一次循环 + rank_order = range(1) + dst = parallel_state.get_tensor_model_parallel_src_rank() + with progress( + rank_order, + desc="Saving model", + disable=int(os.environ.get("RANK", "0")) != 0, + ) as pbar: + for rank in pbar: + if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): + for key in sorted(list(state_dict.keys())): + tensor_list = None + if env.tp_rank == 0: + tensor_list = [ + torch.zeros_like(state_dict[key]) + .to(state_dict[key].dtype) + .cuda() + for _ in range(config.tp_size) + ] + dist.gather( + state_dict[key].cuda(), + dst=dst, + gather_list=tensor_list, + group=env.tp_group, + ) + if env.tp_rank == 0: + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + "embed_tokens.weight", + ] + col_split = any( + [key.endswith(filter) for filter in col_filter] + ) + + if col_split: + state_dict[key] = concat_tensor(tensor_list, dim=0) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + state_dict[key] = concat_tensor(tensor_list, dim=1) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + # 似乎不需要? 
+ # state_dict_keys = state_dict.keys() + # for layer_id in range(config.num_layers): + # qkv_names = [None, None, None] + # for key in state_dict_keys: + # if f"layers.{layer_id}.attention.wq.weight" in key: + # qkv_names[0] = key + # elif f"layers.{layer_id}.attention.wk.weight" in key: + # qkv_names[1] = key + # elif f"layers.{layer_id}.attention.wv.weight" in key: + # qkv_names[2] = key + # qkv_name = qkv_names[0].replace("wq", "wqkv") + # state_dict[qkv_name] = torch.cat( + # [ + # state_dict.pop(qkv_names[0]), + # state_dict.pop(qkv_names[1]), + # state_dict.pop(qkv_names[2]), + # ], + # dim=0 + # ) + + if env.tp_rank == 0: + # Save gathered weights + if env.is_pipeline: + ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" + total_size = 0 + weight_map = {} + for name, weight in state_dict.items(): + weight_size = weight.numel() * dtype_byte_size( + weight.dtype + ) + weight_map[name] = ckpt_name + total_size += weight_size + index_dict = dict( + total_size=total_size, weight_map=weight_map + ) + index_dicts = [None for _ in range(env.pp_size)] + dist.gather_object( + index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group + ) + if env.pp_rank == 0: + total_size = 0 + weight_map = {} + for _index_dict in index_dicts: + total_size += _index_dict["total_size"] + weight_map.update(_index_dict["weight_map"]) + merged_dict = { + "metadata": {"total_size": total_size}, + "weight_map": weight_map, + } + io_driver.save( + json.dumps(merged_dict, indent=2, sort_keys=True) + + "\n", + os.path.join(path, "pytorch_model.bin.index.json"), + ) + + else: + ckpt_name = f"pytorch_model.bin" + ckpt_path = os.path.join(path, ckpt_name) + io_driver.save(state_dict, ckpt_path) + if dist.is_initialized() and process_exclusion: + dist.barrier() + if env.rank == 0: + config.save_pretrained(path, protocol=protocol) + dist.barrier() + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
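The index file assembled above follows the usual sharded-checkpoint layout; roughly (file names and sizes invented):

index_json = {
    "metadata": {"total_size": 14_483_464_192},  # sum of weight sizes across all shards, in bytes
    "weight_map": {
        "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
        "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
        "model.norm.weight": "pytorch_model-00002-of-00002.bin",
        "lm_head.weight": "pytorch_model-00002-of-00002.bin",
    },
}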
+ """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForSequenceClassification(Mistral2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Mistral2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + 
loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/tests/models/mistral2/modelpp.py b/tests/models/mistral2/modelpp.py new file mode 100644 index 0000000..1180a10 --- /dev/null +++ b/tests/models/mistral2/modelpp.py @@ -0,0 +1,1922 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Mistral model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel, dtype_byte_size +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_mistraltp import Mistral2Config + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "Mistral2Config" + +#modified for collie +import torch.distributed as dist +import gc +import json +import os +from collections import OrderedDict +from megatron.core import parallel_state, tensor_parallel +from einops import rearrange +from deepspeed.pipe import LayerSpec, TiedLayerSpec + +from collie.config import CollieConfig +from collie.driver.io import IODriver +from collie.log.logger import logger +from collie.module import ( + 
ColumnParallelLinearWithoutBias, + ColumnParallelLMHead, + RowParallelLinearWithoutBias, +) +from collie.utils import concat_tensor, dict_as_params, env, progress +from collie.models.base import CollieModelForCausalLM +from collie.models.utils import ( + kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, + kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, +) + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral +class Mistral2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + MistralRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class Mistral2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
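+        # The cos/sin tables are precomputed once for positions [0, max_position_embeddings) and are
+        # lazily regrown by forward() when a longer sequence is seen. Shapes, for reference:
+        #   inv_freq:   (dim // 2,)
+        #   cos_cached: (max_seq_len_cached, dim)   # emb = cat((freqs, freqs), dim=-1)
+        #   sin_cached: (max_seq_len_cached, dim)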
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +# TODO @Arthur no longer copied from LLama after static cache +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
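+
+    Example (an illustrative sketch; `rope` is assumed to be a `Mistral2RotaryEmbedding(64)`
+    instance and is not part of this function):
+
+    ```python
+    q = torch.randn(1, 4, 10, 64)  # (batch, num_heads, seq_len, head_dim)
+    k = torch.randn(1, 4, 10, 64)
+    cos, sin = rope(q, seq_len=10)                # each of shape (10, 64)
+    position_ids = torch.arange(10).unsqueeze(0)  # (1, 10)
+    q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
+    ```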
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Mistral2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + self.up_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.gate_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.down_proj = RowParallelLinearWithoutBias( + self.intermediate_size, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Mistral2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.q_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.k_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.v_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.o_proj = RowParallelLinearWithoutBias( + self.num_heads * self.head_dim, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.rotary_emb = Mistral2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads_tp, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads_tp, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads_tp, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads_tp, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class Mistral2FlashAttention2(Mistral2Attention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
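+        Two code paths are used below: `flash_attn_varlen_func` on unpadded (packed) sequences when an
+        `attention_mask` is provided, and the dense `flash_attn_func` otherwise; in both cases attention
+        can additionally be restricted to a window of `config.sliding_window` past tokens.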
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class Mistral2SdpaAttention(Mistral2Attention): + """ + Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MistralAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
+ is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.tp_size)) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +MISTRAL_ATTENTION_CLASSES = { + "eager": Mistral2Attention, + "flash_attention_2": Mistral2FlashAttention2, + "sdpa": Mistral2SdpaAttention, +} + + +class MistralDecoderLayer(nn.Module): + def __init__(self, config: CollieConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + + self.mlp = Mistral2MLP(config) + self.input_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +MISTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. 
+ + Parameters: + config ([`MistralConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class Mistral2PreTrainedModel(PreTrainedModel): + config_class = Mistral2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. 
+ + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class Mistral2Model(nn.Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + def __init__(self, config: CollieConfig): + # super().__init__(config) + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + # self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if 
input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
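+            # For SDPA the 2-D padding mask is expanded into a 4-D causal mask here; the sliding window
+            # is only folded into the mask in the eager fallback below.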
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Mistral2ForCausalLM(CollieModelForCausalLM): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config:CollieConfig): + super().__init__(config) + self.model = Mistral2Model(config) + self.vocab_size = config.vocab_size + # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.lm_head = ColumnParallelLinearWithoutBias( + self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False + ) + # Initialize weights and apply final processing + # self.post_init() + # GenerationMixin 需要的额外参数 + self.config.is_decoder = True + if config.model_config.tie_word_embeddings: + self.lm_head.weight = self.embed_tokens.weight + self.main_input_name = "input_ids" + + def clean_cache(self): + self._clean_hidden_states([*self.model.layers, self.lm_head]) + self._set_use_cache(self.model.layers, False) + + def set_cache(self, use_cache): + self._set_use_cache(self.model.layers, use_cache) + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = 
None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = 
past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. + :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + output = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + else: + output = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + + return [("model", Mistral2Model.pipeline_layers(config)), ("lm_head", output)] + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + protocol: str = "file", # 指定加载state_dict时使用的协议 + **kwargs, + ): + """ + Load state_dict from ``path``. + The format of pretrained model should be the same as that of + `huggingface`. + :return: state_dict. Note that the state_dict should be processed + properly to match the current rank. 
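+
+        A minimal usage sketch (the checkpoint path is a placeholder; a distributed
+        environment already initialized by CoLLiE is assumed):
+
+        ```python
+        config = CollieConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
+        config.tp_size, config.pp_size = 2, 1
+        # Returns only the shard of the checkpoint that the current tp/pp rank needs.
+        state_dict = Mistral2ForCausalLM.load_parallel_state_dict(
+            path="/path/to/hf_checkpoint", config=config
+        )
+        ```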
+ """ + # 配置加载 + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + # IO驱动初始化 + io_driver = IODriver.from_protocol(protocol) + # 检查文件路径是否存在 + if not io_driver.exists(path): + raise FileNotFoundError(f"folder {path} not found.") + # 初始化存储和处理变量 + state_dict = OrderedDict() + weights = [] + parts = None # 变量用于存储模型分割的部分信息 + # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 + hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 + rank_order = range(dist.get_world_size()) + else: + # 不开启只进行一次循环 + rank_order = range(1) + # 权重文件加载和处理 + for rank in rank_order: + # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 + if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: + # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 + if env.is_pipeline: + # 保存的是 json 格式 + parts = env.pipeline_parts + if hasattr(config, "num_key_value_heads"): + # llama2 (transformers >= 4.31.0) + num_key_value_heads = config.num_key_value_heads + else: + num_key_value_heads = config.num_attention_heads + head_dim = config.hidden_size // config.num_attention_heads + # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 + if ( + io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) + and "COLLIE_PP_PARTS" in os.environ.keys() + ): + weight_map = json.loads( + io_driver.load( + os.path.join(path, "pytorch_model.bin.index.json"), mode="r" + ) + )["weight_map"] + # layers 表示自己需要的层 + layers = env.pipeline_layers_idx + # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) + weights.extend( + [ + value + for key, value in weight_map.items() + if len(key.split(".")) > 2 + and key.split(".")[2].isdigit() + and (int(key.split(".")[2]) + 1) in layers + ] + ) + # 去重 + weights = list(set(weights)) + # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm + if 0 in layers: + weights.append(weight_map["model.tok_embeddings.weight"]) + if max(parts) - 1 in layers: + weights.append(weight_map["output.weight"]) + if max(parts) - 2 in layers: + weights.append(weight_map["model.norm.weight"]) + else: + # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 + weights = [ + weight + for weight in io_driver.list(path) + if weight.endswith(".bin") + ] + with progress( + weights, + desc="Loading state dict", + total=len(weights), + disable=hide_progress, + ) as pbar: + for weight in pbar: + part_state_dict = io_driver.load( + os.path.join(path, weight), mode="rb" + ) + # for key in list(part_state_dict.keys()): + # if "attention.wqkv.weight" in key: + # # qkv_weights = part_state_dict.pop(key) + # qkv_weights = part_state_dict[key] + # print(qkv_weights.shape) + # (wq, wk, wv) = qkv_weights.split( + # [ + # config.hidden_size, + # config.num_key_value_heads * head_dim, + # config.num_key_value_heads * head_dim, + # ], + # dim=0, + # ) + # wq_name = key.replace("wqkv", "wq") + # wk_name = key.replace("wqkv", "wk") + # wv_name = key.replace("wqkv", "wv") + # part_state_dict[wq_name] = wq + # part_state_dict[wk_name] = wk + # part_state_dict[wv_name] = wv + state_dict.update(part_state_dict) + del part_state_dict + if parts is not None: + # 这一步是 pp 的复筛 + layers = env.pipeline_layers_idx + for key in list(state_dict.keys()): + if key.startswith("layers"): + layer = int(key.split(".")[1]) + if layer + 1 not in layers: + state_dict.pop(key) + # if key.endswith("tok_embeddings.weight"): + if 
key.endswith("embed_tokens.weight"): + if 0 not in layers: + state_dict.pop(key) + if key == "norm.weight": + if max(parts) - 2 not in layers: + state_dict.pop(key) + # if key.endswith("output.weight"): + if key.endswith("lm_head.weight"): + if max(parts) - 1 not in layers: + state_dict.pop(key) + # 根据用户配置的新的 tp size 进行分割 + for key in list(state_dict.keys()): + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + "embed_tokens.weight", + ] + col_split = any([key.endswith(filter) for filter in col_filter]) + + if col_split: + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + if dist.is_initialized() and process_exclusion: + # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 + dist.barrier() + return state_dict + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + protocol: str = "file", + ): + """ + Save state_dict to ``path``. + The format of saved state dict should be the same as that of + `huggingface`. + """ + io_driver = IODriver.from_protocol(protocol) + # gather to tp rank 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 pp_size 次循环 + rank_order = range(config.pp_size) + else: + # 不开启只进行一次循环 + rank_order = range(1) + dst = parallel_state.get_tensor_model_parallel_src_rank() + with progress( + rank_order, + desc="Saving model", + disable=int(os.environ.get("RANK", "0")) != 0, + ) as pbar: + for rank in pbar: + if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): + for key in sorted(list(state_dict.keys())): + tensor_list = None + if env.tp_rank == 0: + tensor_list = [ + torch.zeros_like(state_dict[key]) + .to(state_dict[key].dtype) + .cuda() + for _ in range(config.tp_size) + ] + dist.gather( + state_dict[key].cuda(), + dst=dst, + gather_list=tensor_list, + group=env.tp_group, + ) + if env.tp_rank == 0: + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + "embed_tokens.weight", + ] + col_split = any( + [key.endswith(filter) for filter in col_filter] + ) + + if col_split: + state_dict[key] = concat_tensor(tensor_list, dim=0) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + state_dict[key] = concat_tensor(tensor_list, dim=1) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + # 似乎不需要? 
+ # state_dict_keys = state_dict.keys() + # for layer_id in range(config.num_layers): + # qkv_names = [None, None, None] + # for key in state_dict_keys: + # if f"layers.{layer_id}.attention.wq.weight" in key: + # qkv_names[0] = key + # elif f"layers.{layer_id}.attention.wk.weight" in key: + # qkv_names[1] = key + # elif f"layers.{layer_id}.attention.wv.weight" in key: + # qkv_names[2] = key + # qkv_name = qkv_names[0].replace("wq", "wqkv") + # state_dict[qkv_name] = torch.cat( + # [ + # state_dict.pop(qkv_names[0]), + # state_dict.pop(qkv_names[1]), + # state_dict.pop(qkv_names[2]), + # ], + # dim=0 + # ) + + if env.tp_rank == 0: + # Save gathered weights + if env.is_pipeline: + ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" + total_size = 0 + weight_map = {} + for name, weight in state_dict.items(): + weight_size = weight.numel() * dtype_byte_size( + weight.dtype + ) + weight_map[name] = ckpt_name + total_size += weight_size + index_dict = dict( + total_size=total_size, weight_map=weight_map + ) + index_dicts = [None for _ in range(env.pp_size)] + dist.gather_object( + index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group + ) + if env.pp_rank == 0: + total_size = 0 + weight_map = {} + for _index_dict in index_dicts: + total_size += _index_dict["total_size"] + weight_map.update(_index_dict["weight_map"]) + merged_dict = { + "metadata": {"total_size": total_size}, + "weight_map": weight_map, + } + io_driver.save( + json.dumps(merged_dict, indent=2, sort_keys=True) + + "\n", + os.path.join(path, "pytorch_model.bin.index.json"), + ) + + else: + ckpt_name = f"pytorch_model.bin" + ckpt_path = os.path.join(path, ckpt_name) + io_driver.save(state_dict, ckpt_path) + if dist.is_initialized() and process_exclusion: + dist.barrier() + if env.rank == 0: + config.save_pretrained(path, protocol=protocol) + dist.barrier() + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForSequenceClassification(Mistral2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Mistral2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + 
loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/tests/models/mistral2/modeltp.py b/tests/models/mistral2/modeltp.py new file mode 100644 index 0000000..e91037f --- /dev/null +++ b/tests/models/mistral2/modeltp.py @@ -0,0 +1,2254 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Mistral model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel, dtype_byte_size +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_mistraltp import MistralConfig + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MistralConfig" + +#modified for collie +import torch.distributed as dist +import gc +import json +import os +from collections import OrderedDict +from megatron.core import parallel_state, tensor_parallel +from einops import rearrange +from deepspeed.pipe import LayerSpec, TiedLayerSpec + +from collie.config import CollieConfig +from collie.driver.io import IODriver +from collie.log.logger import logger +from collie.module import ( + 
    ColumnParallelLinearWithoutBias,
+    ColumnParallelLMHead,
+    RowParallelLinearWithoutBias,
+)
+from collie.utils import concat_tensor, dict_as_params, env, progress
+from collie.models.base import CollieModelForCausalLM
+from collie.models.utils import (
+    kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer,
+    kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model,
+)
+
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral
+class MistralRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        MistralRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        ans = self.weight * hidden_states.to(input_dtype)
+
+        # Dump the layer-norm output for debugging
+        hidden_states_output = ans.detach().cpu().tolist()
+        data_to_save = {"Layer Norm Output": hidden_states_output}
+        # Write the output to a JSON file
+        with open('a_rms_output.json', 'w') as f:
+            json.dump(data_to_save, f, indent=4)
+
+        return ans
+
+
+# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral
+# TODO @Arthur no longer copied from LLama after static cache
+class MistralRotaryEmbedding(nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
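+        # Shape note: `inv_freq` holds dim // 2 inverse frequencies (base ** (-2 * i / dim));
+        # `_set_cos_sin_cache` expands them into `cos_cached` / `sin_cached` buffers of shape
+        # [max_position_embeddings, dim] by concatenating the frequencies with themselves
+        # along the last axis (`emb = torch.cat((freqs, freqs), dim=-1)` below).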
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +# TODO @Arthur no longer copied from LLama after static cache +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
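+
+    Example (an illustrative sketch only; the batch size, head count and `head_dim` below are
+    arbitrary and not tied to any particular checkpoint):
+
+    ```python
+    >>> import torch
+    >>> bsz, num_heads, seq_len, head_dim = 1, 8, 16, 64
+    >>> q = torch.randn(bsz, num_heads, seq_len, head_dim)
+    >>> k = torch.randn(bsz, num_heads, seq_len, head_dim)
+    >>> rotary_emb = MistralRotaryEmbedding(head_dim, max_position_embeddings=seq_len)
+    >>> cos, sin = rotary_emb(q, seq_len=seq_len)
+    >>> position_ids = torch.arange(seq_len).unsqueeze(0)
+    >>> q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
+    >>> q_rot.shape
+    torch.Size([1, 8, 16, 64])
+    ```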
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class MistralMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + self.up_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.gate_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.down_proj = RowParallelLinearWithoutBias( + self.intermediate_size, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + # 打印MLP层输出 + mlp_output = output.detach().cpu().tolist() + data_to_save = {"MLP Output": mlp_output} + # 将输出写入 JSON 文件 + with open('a_mlp_output.json', 'w') as f: + json.dump(data_to_save, f, indent=4) + + return output + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class MistralAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.q_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.k_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.v_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + # aaaa + self.o_proj = RowParallelLinearWithoutBias( + self.num_heads * self.head_dim, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.rotary_emb = MistralRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, # 输入维度 [bsz, q_len, hidden_size] + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure to use `attention_mask` instead."
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)  # [bsz, q_len, num_heads * head_dim]
+        key_states = self.k_proj(hidden_states)  # [bsz, q_len, num_key_value_heads * head_dim]
+        value_states = self.v_proj(hidden_states)  # [bsz, q_len, num_key_value_heads * head_dim]
+
+        # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        query_states, key_states, value_states = (
+            rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim),  # [bsz, q_len, num_heads, head_dim]
+            rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim),  # [bsz, q_len, num_key_value_heads, head_dim]
+            rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim),  # [bsz, q_len, num_key_value_heads, head_dim]
+        )
+
+        query_states = query_states.transpose(1, 2)  # [bsz, num_heads, q_len, head_dim]
+        key_states = key_states.transpose(1, 2)  # [bsz, num_key_value_heads, q_len, head_dim]
+        value_states = value_states.transpose(1, 2)  # [bsz, num_key_value_heads, q_len, head_dim]
+
+        # Dump the projected Q/K/V of the attention module for debugging
+        # Prepare the data to be written to a JSON file
+        attention_outputs = {
+            "Query states": query_states.detach().cpu().tolist(),
+            "Key states": key_states.detach().cpu().tolist(),
+            "Value states": value_states.detach().cpu().tolist()
+        }
+        # Write the data to a JSON file
+        with open("a_attention_outputs.json", "w") as f:
+            json.dump(attention_outputs, f, indent=4)
+
+        if self.config.pp_size > 1:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)) + + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # 打印注意力模块的输出 + attention_result = { + "Output weights:": attn_output.detach().cpu().tolist(), + # "Attention weights:": attn_weights.detach().cpu().tolist(), + } + # 将数据写入 JSON 文件 + with open("a_attention_outputs.json", "w") as f: + json.dump(attention_result, f, indent=4) + + return attn_output, attn_weights, past_key_value + + +class MistralFlashAttention2(MistralAttention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + # 打印注意力模块的输出 + # 准备数据以写入 JSON 文件 + attention_outputs = { + "Query states": query_states.detach().cpu().tolist(), + "Key states": key_states.detach().cpu().tolist(), + "Value states": value_states.detach().cpu().tolist() + } + # 将数据写入 JSON 文件 + with open("a_flash_attention_outputs.json", "w") as f: + json.dump(attention_outputs, f, indent=4) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # 打印注意力模块的输出 + attention_result = { + "Output weights:": attn_output.detach().cpu().tolist(), + # "Attention weights:": attn_weights.detach().cpu().tolist(), + } + # 将数据写入 JSON 文件 + with open("a_flash_attention_outputs.json", "w") as f: + json.dump(attention_result, f, indent=4) + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
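+            # e.g. with left padding, `attention_mask = [[0, 0, 1, 1, 1]]` and `query_length = 3`
+            # keep only the mask columns aligned with the query tokens: `[[1, 1, 1]]`.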
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class MistralSdpaAttention(MistralAttention): + """ + Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MistralAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + # 打印注意力模块的输出 + # 准备数据以写入 JSON 文件 + attention_outputs = { + "Query states": query_states.detach().cpu().tolist(), + "Key states": key_states.detach().cpu().tolist(), + "Value states": value_states.detach().cpu().tolist() + } + # 将数据写入 JSON 文件 + with open("a_sdpa_attention_outputs.json", "w") as f: + json.dump(attention_outputs, f, indent=4) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, 
key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+            is_causal=self.is_causal and attention_mask is None and q_len > 1,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.config.tp_size))
+
+        attn_output = self.o_proj(attn_output)
+
+        # Dump the attention module output for debugging
+        attention_result = {
+            "Output weights:": attn_output.detach().cpu().tolist(),
+            # "Attention weights:": attn_weights.detach().cpu().tolist(),
+        }
+        # Write the data to a JSON file
+        with open("a_sdpa_attention_outputs.json", "w") as f:
+            json.dump(attention_result, f, indent=4)
+
+        return attn_output, None, past_key_value
+
+
+MISTRAL_ATTENTION_CLASSES = {
+    "eager": MistralAttention,
+    "flash_attention_2": MistralFlashAttention2,
+    "sdpa": MistralSdpaAttention,
+}
+
+
+class MistralDecoderLayer(nn.Module):
+    def __init__(self, config: CollieConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        config._attn_implementation = "sdpa"
+        self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+        self.config = config
+        self.mlp = MistralMLP(config)
+        self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.idx = layer_idx
+        # Be sure to keep these attribute names unchanged
+        self.use_cache = self.config.model_config.use_cache
+        self.hidden_states = None
+        self.output_attentions = False
+
+    def _forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: 
Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + # output_attentions: Optional[bool] = False, + # use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + # if "padding_mask" in kwargs: + # warnings.warn( + # "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + # ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + # output_attentions=output_attentions, + # use_cache=use_cache, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + # outputs = (hidden_states,) + + # if output_attentions: + # outputs += (self_attn_weights,) + + # if use_cache: + # outputs += (present_key_value,) + + return hidden_states, present_key_value + + def forward(self, inputs: dict): + layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) + + if self.config.checkpointing and self.training: + hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( + self._forward, + inputs["hidden_states"], + inputs.get("attention_mask", None), + inputs.get("position_ids", None), + layer_past, # inputs.get("past_key_values", None), + ) + else: + hidden_states, new_layer_past = self._forward( + inputs["hidden_states"], + inputs.get("attention_mask", None), + inputs.get("position_ids", None), + layer_past + ) # **inputs + inputs["hidden_states"] = hidden_states + + inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) + return inputs + + + # def _forward( + # self, + # hidden_states: torch.Tensor, + # attention_mask: Optional[torch.Tensor] = None, + # position_ids: Optional[torch.LongTensor] = None, + # past_key_value: Optional[Tuple[torch.Tensor]] = None, + # # output_attentions: Optional[bool] = False, + # # use_cache: Optional[bool] = False, + # **kwargs, + # ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + # # if "padding_mask" in kwargs: + # # warnings.warn( + # # "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + # # ) + # """ + # Args: + # hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + # attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + # `(batch, sequence_length)` where padding elements are indicated by 0. + # output_attentions (`bool`, *optional*): + # Whether or not to return the attentions tensors of all attention layers. See `attentions` under + # returned tensors for more detail. + # use_cache (`bool`, *optional*): + # If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + # (see `past_key_values`). + # past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + # """ + + # residual = hidden_states + + # hidden_states = self.input_layernorm(hidden_states) + + # # Self Attention + # hidden_states, self_attn_weights, present_key_value = self.self_attn( + # hidden_states=hidden_states, + # attention_mask=attention_mask, + # position_ids=position_ids, + # past_key_value=past_key_value, + # # output_attentions=output_attentions, + # # use_cache=use_cache, + # **kwargs, + # ) + # hidden_states = residual + hidden_states + + # # Fully Connected + # residual = hidden_states + # hidden_states = self.post_attention_layernorm(hidden_states) + # hidden_states = self.mlp(hidden_states) + # hidden_states = residual + hidden_states + + # # outputs = (hidden_states,) + + # # if output_attentions: + # # outputs += (self_attn_weights,) + + # # if use_cache: + # # outputs += (present_key_value,) + + # return hidden_states, present_key_value + + # def forward(self, inputs: dict): + # layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) + + # if self.config.checkpointing and self.training: + # hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( + # self._forward, + # inputs["hidden_states"], + # inputs.get("attention_mask", None), + # inputs.get("position_ids", None), + # layer_past, # inputs.get("past_key_values", None), + # ) + # else: + # hidden_states, new_layer_past = self._forward( + # inputs["hidden_states"], + # inputs.get("attention_mask", None), + # inputs.get("position_ids", None), + # layer_past + # ) # **inputs + # inputs["hidden_states"] = hidden_states + + # inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) + # return inputs + + # def forward( + # self, + # hidden_states: torch.Tensor, + # attention_mask: Optional[torch.Tensor] = None, + # position_ids: Optional[torch.LongTensor] = None, + # past_key_value: Optional[Tuple[torch.Tensor]] = None, + # output_attentions: Optional[bool] = False, + # use_cache: Optional[bool] = False, + # **kwargs, + # ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + # if "padding_mask" in kwargs: + # warnings.warn( + # "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + # ) + # """ + # Args: + # hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + # attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + # `(batch, sequence_length)` where padding elements are indicated by 0. + # output_attentions (`bool`, *optional*): + # Whether or not to return the attentions tensors of all attention layers. See `attentions` under + # returned tensors for more detail. 
+ # use_cache (`bool`, *optional*): + # If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + # (see `past_key_values`). + # past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + # """ + + # residual = hidden_states + + # hidden_states = self.input_layernorm(hidden_states) + + # # Self Attention + # hidden_states, self_attn_weights, present_key_value = self.self_attn( + # hidden_states=hidden_states, + # attention_mask=attention_mask, + # position_ids=position_ids, + # past_key_value=past_key_value, + # output_attentions=output_attentions, + # use_cache=use_cache, + # **kwargs, + # ) + # hidden_states = residual + hidden_states + + # # Fully Connected + # residual = hidden_states + # hidden_states = self.post_attention_layernorm(hidden_states) + # hidden_states = self.mlp(hidden_states) + # hidden_states = residual + hidden_states + + # outputs = (hidden_states,) + + # if output_attentions: + # outputs += (self_attn_weights,) + + # if use_cache: + # outputs += (present_key_value,) + + # return outputs + + +MISTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MistralConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class MistralPreTrainedModel(PreTrainedModel): + config_class = MistralConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class MistralModel(nn.Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`MistralDecoderLayer`]
+
+    Args:
+        config: MistralConfig
+    """
+
+    def __init__(self, config: CollieConfig):
+        # super().__init__(config)
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        # vocab-parallel embedding replaces the dense nn.Embedding used upstream
+        # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.embed_tokens = tensor_parallel.VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size, params_dtype=torch.float32
+        )
+        self.layers = nn.ModuleList(
+            [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        # force the SDPA attention implementation
+        config._attn_implementation = "sdpa"
+        self._attn_implementation = config._attn_implementation
+        self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        # self.post_init()
+
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        # past_key_values is accepted in the legacy tuple format and converted to a Cache below
+        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        past_key_values_length = 0
+
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # Dump the embedding-layer output (debugging aid)
+        embeddings_output = inputs_embeds.detach().cpu().tolist()
+        data_to_save = {"Embeddings Output": embeddings_output}
+        # Write the output to a JSON file
+        with open('a_embeddings_output.json', 'w') as f:
+            json.dump(data_to_save, f, indent=4)
+
+        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
+            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
+            if is_padding_right:
+                raise ValueError(
+                    "You are attempting to perform batched generation with padding_side='right'"
+                    " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to "
+                    " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
+                )
+
+        if self._attn_implementation == "flash_attention_2":
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self._attn_implementation == "sdpa" and not output_attentions:
+            # output_attentions=True can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + inputs = { + "input_ids": input_ids, + "hidden_states": hidden_states, + "attention_mask": attention_mask, + "position_ids": position_ids, + "past_key_values": past_key_values, + "output_attentions": output_attentions, + "use_cache": use_cache, + } + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + # for decoder_layer in self.layers: + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + # all_hidden_states += (hidden_states,) + all_hidden_states += (inputs["hidden_states"],) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + # hidden_states, + # attention_mask, + # position_ids, + # past_key_values, + # output_attentions, + # use_cache, + inputs, + ) + else: + layer_outputs = decoder_layer( + # hidden_states, + # attention_mask=attention_mask, + # position_ids=position_ids, + # past_key_value=past_key_values, + # output_attentions=output_attentions, + # use_cache=use_cache, + inputs, + ) + inputs.update(layer_outputs) + + # hidden_states = layer_outputs[0] + hidden_states = inputs["hidden_states"] + + if use_cache: + # next_decoder_cache = layer_outputs[2 if output_attentions else 1] + next_decoder_cache = inputs["addition_info"][1 if output_attentions else 0] + + if output_attentions: + # all_self_attns += (layer_outputs[1],) + all_self_attns += (inputs["addition_info"][0],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + # past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + past_key_values=past_key_values, + ) + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. 
+ :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + embed_tokens = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="input_ids", output_keys="hidden_states"), + tensor_parallel.VocabParallelEmbedding, + config.vocab_size, + config.hidden_size, + ) + else: + embed_tokens = LayerSpec( + dict_as_params(input_keys="input_ids", output_keys="hidden_states"), + tensor_parallel.VocabParallelEmbedding, + config.vocab_size, + config.hidden_size, + ) + + layers = [ + LayerSpec(MistralDecoderLayer, config, i) for i in range(config.num_hidden_layers) + ] + norm = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="hidden_states"), + MistralRMSNorm, + hidden_size=config.hidden_size, + eps=config.rms_norm_eps, + ) + + return [ + ("embed_tokens", embed_tokens), + ("layers", layers), + ("norm", norm), + ] + +class MistralForCausalLM(CollieModelForCausalLM): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config:CollieConfig): + super().__init__(config) + self.model = MistralModel(config) + self.vocab_size = config.vocab_size + # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.lm_head = ColumnParallelLinearWithoutBias( + self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False + ) + # Initialize weights and apply final processing + # self.post_init() + # GenerationMixin 需要的额外参数 + self.config.is_decoder = True + if config.model_config.tie_word_embeddings: + self.lm_head.weight = self.embed_tokens.weight + self.main_input_name = "input_ids" + + def clean_cache(self): + self._clean_hidden_states([*self.model.layers, self.lm_head]) + self._set_use_cache(self.model.layers, False) + + def set_cache(self, use_cache): + self._set_use_cache(self.model.layers, use_cache) + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. 
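The three numbered cases above decide how much of `input_ids` is re-fed at each generation step. A minimal standalone sketch of the same slicing arithmetic, with hypothetical toy shapes and a helper name that are not part of this patch, might look like this:

```python
import torch

def trim_input_ids(input_ids: torch.Tensor, attention_mask: torch.Tensor, past_length: int) -> torch.Tensor:
    # Case 1: part of the prompt lives only in the cache (e.g. it was fed as inputs_embeds),
    # so the attention mask is longer than input_ids and only the uncovered tail is kept.
    if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
        return input_ids[:, -(attention_mask.shape[1] - past_length):]
    # Case 2: input_ids holds the full sequence; drop the prefix already covered by the cache.
    if past_length < input_ids.shape[1]:
        return input_ids[:, past_length:]
    # Case 3: everything in input_ids is new (typical single-token decoding step).
    return input_ids

input_ids = torch.arange(10).unsqueeze(0)              # (1, 10): a 10-token prompt
attention_mask = torch.ones(1, 10, dtype=torch.long)   # no padding
print(trim_input_ids(input_ids, attention_mask, past_length=7))  # tensor([[7, 8, 9]])
```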
+ + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. + :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + output = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + else: + output = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + + return [("model", MistralModel.pipeline_layers(config)), ("lm_head", output)] + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + protocol: str = "file", # 指定加载state_dict时使用的协议 + **kwargs, + ): + """ + Load state_dict from ``path``. + The format of pretrained model should be the same as that of + `huggingface`. + :return: state_dict. Note that the state_dict should be processed + properly to match the current rank. 
+        """
+        # Load the configuration
+        if isinstance(config, str):
+            config = CollieConfig.from_pretrained(config)
+        # Initialize the IO driver
+        io_driver = IODriver.from_protocol(protocol)
+        # Check that the path exists
+        if not io_driver.exists(path):
+            raise FileNotFoundError(f"folder {path} not found.")
+        # Initialize containers and bookkeeping variables
+        state_dict = OrderedDict()
+        weights = []
+        parts = None  # holds the pipeline partition boundaries of the model
+        # If process exclusion is enabled, every process shows a progress bar; otherwise only RANK 0 does
+        hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0
+        if dist.is_initialized() and process_exclusion:
+            # With process exclusion enabled, loop dist.get_world_size() times
+            rank_order = range(dist.get_world_size())
+        else:
+            # Without it, a single pass is enough
+            rank_order = range(1)
+        # Load and process the weight files
+        for rank in rank_order:
+            # With process exclusion, only the matching RANK may enter this iteration; without it, every rank may
+            if int(os.environ.get("RANK", "0")) == rank or not process_exclusion:
+                # The pp partition is stored in os.environ["COLLIE_PP_PARTS"], e.g. [0, 17, 35], left-closed right-open
+                if env.is_pipeline:
+                    # stored in json format
+                    parts = env.pipeline_parts
+                if hasattr(config, "num_key_value_heads"):
+                    # llama2 (transformers >= 4.31.0)
+                    num_key_value_heads = config.num_key_value_heads
+                else:
+                    num_key_value_heads = config.num_attention_heads
+                head_dim = config.hidden_size // config.num_attention_heads
+                # If pytorch_model.bin.index.json exists, each pp process can load only the weights it needs
+                if (
+                    io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json"))
+                    and "COLLIE_PP_PARTS" in os.environ.keys()
+                ):
+                    weight_map = json.loads(
+                        io_driver.load(
+                            os.path.join(path, "pytorch_model.bin.index.json"), mode="r"
+                        )
+                    )["weight_map"]
+                    # layers holds the layer indices this rank needs
+                    layers = env.pipeline_layers_idx
+                    # Keep keys shaped like model.layers.0; two conditions: 1. the key carries a layer number;
+                    # 2. that number plus one is in layers (the embedding occupies the very first pipeline layer)
+                    weights.extend(
+                        [
+                            value
+                            for key, value in weight_map.items()
+                            if len(key.split(".")) > 2
+                            and key.split(".")[2].isdigit()
+                            and (int(key.split(".")[2]) + 1) in layers
+                        ]
+                    )
+                    # Deduplicate
+                    weights = list(set(weights))
+                    # Further filtering: layer 0 also needs the embedding, the last layer needs lm_head,
+                    # and the second-to-last layer needs the final norm
+                    if 0 in layers:
+                        weights.append(weight_map["model.embed_tokens.weight"])
+                    if max(parts) - 1 in layers:
+                        weights.append(weight_map["lm_head.weight"])
+                    if max(parts) - 2 in layers:
+                        weights.append(weight_map["model.norm.weight"])
+                else:
+                    # Without pytorch_model.bin.index.json, load every weight file
+                    weights = [
+                        weight
+                        for weight in io_driver.list(path)
+                        if weight.endswith(".bin")
+                    ]
+                with progress(
+                    weights,
+                    desc="Loading state dict",
+                    total=len(weights),
+                    disable=hide_progress,
+                ) as pbar:
+                    for weight in pbar:
+                        part_state_dict = io_driver.load(
+                            os.path.join(path, weight), mode="rb"
+                        )
+                        # for key in list(part_state_dict.keys()):
+                        #     if "attention.wqkv.weight" in key:
+                        #         # qkv_weights = part_state_dict.pop(key)
+                        #         qkv_weights = part_state_dict[key]
+                        #         print(qkv_weights.shape)
+                        #         (wq, wk, wv) = qkv_weights.split(
+                        #             [
+                        #                 config.hidden_size,
+                        #                 config.num_key_value_heads * head_dim,
+                        #                 config.num_key_value_heads * head_dim,
+                        #             ],
+                        #             dim=0,
+                        #         )
+                        #         wq_name = key.replace("wqkv", "wq")
+                        #         wk_name = key.replace("wqkv", "wk")
+                        #         wv_name = key.replace("wqkv", "wv")
+                        #         part_state_dict[wq_name] = wq
+                        #         part_state_dict[wk_name] = wk
+                        #         part_state_dict[wv_name] = wv
+                        state_dict.update(part_state_dict)
+                        del part_state_dict
+                if parts is not None:
+                    # Second round of filtering for pp
+                    layers = env.pipeline_layers_idx
+                    for key in list(state_dict.keys()):
+                        if key.startswith("layers"):
+                            layer = int(key.split(".")[1])
+                            if layer + 1 not in layers:
+                                state_dict.pop(key)
+                        # if key.endswith("tok_embeddings.weight"):
+                        if key.endswith("embed_tokens.weight"):
+                            if 0 not in layers:
+                                state_dict.pop(key)
+                        if key == "norm.weight":
+                            if max(parts) - 2 not in layers:
+                                state_dict.pop(key)
+                        # if key.endswith("output.weight"):
+                        if key.endswith("lm_head.weight"):
+                            if max(parts) - 1 not in layers:
+                                state_dict.pop(key)
+                # Re-split according to the user-configured tp size: column-parallel weights are
+                # chunked along dim 0, row-parallel weights (o_proj / down_proj) along dim 1
+                for key in list(state_dict.keys()):
+                    col_filter = [
+                        # "wq.weight",
+                        # "wk.weight",
+                        # "wv.weight",
+                        # "wqkv.weight",
+                        # "w1.weight",
+                        # "w3.weight",
+                        # "tok_embeddings.weight",
+                        # "output.weight",
+                        "q_proj.weight",
+                        "k_proj.weight",
+                        "v_proj.weight",
+                        #"o_proj.weight",
+                        "lm_head.weight",
+                        "gate_proj.weight",
+                        "up_proj.weight",
+                        #"down_proj.weight",
+                        "embed_tokens.weight",
+                    ]
+                    col_split = any([key.endswith(filter) for filter in col_filter])
+
+                    if col_split:
+                        tensor = (
+                            list(torch.chunk(state_dict[key], config.tp_size, dim=0))[
+                                env.tp_rank
+                            ]
+                            .detach()
+                            .clone()
+                        )
+                        del state_dict[key]
+                        if process_exclusion:
+                            # Reclaim CPU memory (very slow)
+                            gc.collect()
+                        state_dict[key] = tensor
+                    elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"):
+                        tensor = (
+                            list(torch.chunk(state_dict[key], config.tp_size, dim=1))[
+                                env.tp_rank
+                            ]
+                            .detach()
+                            .clone()
+                        )
+                        del state_dict[key]
+                        if process_exclusion:
+                            # Reclaim CPU memory (very slow)
+                            gc.collect()
+                        state_dict[key] = tensor
+            if dist.is_initialized() and process_exclusion:
+                # With process exclusion, ranks that do not load weights in this iteration wait here
+                dist.barrier()
+        return state_dict
+
+    @staticmethod
+    def save_parallel_state_dict(
+        state_dict: dict,
+        path: str,
+        config: CollieConfig,
+        process_exclusion: bool = False,
+        **kwargs,
+    ):
+        ...
+
+    @staticmethod
+    def save_parallel_state_dict(
+        state_dict: dict,
+        path: str,
+        config: CollieConfig,
+        process_exclusion: bool = False,
+        protocol: str = "file",
+    ):
+        """
+        Save state_dict to ``path``.
+        The format of saved state dict should be the same as that of
+        `huggingface`.
+        """
+        io_driver = IODriver.from_protocol(protocol)
+        # gather to tp rank 0
+        if dist.is_initialized() and process_exclusion:
+            # With process exclusion enabled, loop pp_size times
+            rank_order = range(config.pp_size)
+        else:
+            # Without it, a single pass is enough
+            rank_order = range(1)
+        dst = parallel_state.get_tensor_model_parallel_src_rank()
+        with progress(
+            rank_order,
+            desc="Saving model",
+            disable=int(os.environ.get("RANK", "0")) != 0,
+        ) as pbar:
+            for rank in pbar:
+                if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion):
+                    for key in sorted(list(state_dict.keys())):
+                        tensor_list = None
+                        if env.tp_rank == 0:
+                            tensor_list = [
+                                torch.zeros_like(state_dict[key])
+                                .to(state_dict[key].dtype)
+                                .cuda()
+                                for _ in range(config.tp_size)
+                            ]
+                        dist.gather(
+                            state_dict[key].cuda(),
+                            dst=dst,
+                            gather_list=tensor_list,
+                            group=env.tp_group,
+                        )
+                        if env.tp_rank == 0:
+                            col_filter = [
+                                # "wq.weight",
+                                # "wk.weight",
+                                # "wv.weight",
+                                # "wqkv.weight",
+                                # "w1.weight",
+                                # "w3.weight",
+                                # "tok_embeddings.weight",
+                                # "output.weight",
+                                "q_proj.weight",
+                                "k_proj.weight",
+                                "v_proj.weight",
+                                #"o_proj.weight",
+                                "lm_head.weight",
+                                "gate_proj.weight",
+                                "up_proj.weight",
+                                #"down_proj.weight",
+                                "embed_tokens.weight",
+                            ]
+                            col_split = any(
+                                [key.endswith(filter) for filter in col_filter]
+                            )
+
+                            if col_split:
+                                state_dict[key] = concat_tensor(tensor_list, dim=0)
+
+                                if process_exclusion:
+                                    # Reclaim CPU memory (very slow)
+                                    gc.collect()
+
+                            elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"):
+                                state_dict[key] = concat_tensor(tensor_list, dim=1)
+
+                                if process_exclusion:
+                                    # Reclaim CPU memory (very slow)
+                                    gc.collect()
+                    # Does not seem to be needed?
+ # state_dict_keys = state_dict.keys() + # for layer_id in range(config.num_layers): + # qkv_names = [None, None, None] + # for key in state_dict_keys: + # if f"layers.{layer_id}.attention.wq.weight" in key: + # qkv_names[0] = key + # elif f"layers.{layer_id}.attention.wk.weight" in key: + # qkv_names[1] = key + # elif f"layers.{layer_id}.attention.wv.weight" in key: + # qkv_names[2] = key + # qkv_name = qkv_names[0].replace("wq", "wqkv") + # state_dict[qkv_name] = torch.cat( + # [ + # state_dict.pop(qkv_names[0]), + # state_dict.pop(qkv_names[1]), + # state_dict.pop(qkv_names[2]), + # ], + # dim=0 + # ) + + if env.tp_rank == 0: + # Save gathered weights + if env.is_pipeline: + ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" + total_size = 0 + weight_map = {} + for name, weight in state_dict.items(): + weight_size = weight.numel() * dtype_byte_size( + weight.dtype + ) + weight_map[name] = ckpt_name + total_size += weight_size + index_dict = dict( + total_size=total_size, weight_map=weight_map + ) + index_dicts = [None for _ in range(env.pp_size)] + dist.gather_object( + index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group + ) + if env.pp_rank == 0: + total_size = 0 + weight_map = {} + for _index_dict in index_dicts: + total_size += _index_dict["total_size"] + weight_map.update(_index_dict["weight_map"]) + merged_dict = { + "metadata": {"total_size": total_size}, + "weight_map": weight_map, + } + io_driver.save( + json.dumps(merged_dict, indent=2, sort_keys=True) + + "\n", + os.path.join(path, "pytorch_model.bin.index.json"), + ) + + else: + ckpt_name = f"pytorch_model.bin" + ckpt_path = os.path.join(path, ckpt_name) + io_driver.save(state_dict, ckpt_path) + if dist.is_initialized() and process_exclusion: + dist.barrier() + if env.rank == 0: + config.save_pretrained(path, protocol=protocol) + dist.barrier() + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
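To make the pooling rule described above concrete, a small standalone sketch (toy token ids and an assumed `pad_token_id`, not part of the patch) of the index computation used in the forward pass below:

```python
import torch

pad_token_id = 0  # assumption for the toy example; the real value comes from the model config
input_ids = torch.tensor([
    [11, 12, 13,  0,  0],   # right-padded: last real token at index 2
    [21, 22, 23, 24, 25],   # no padding: last real token at index 4
])
# argmax(-1) finds the first pad position; -1 steps back to the last real token.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
# Rows without padding yield -1, which the modulo wraps to the final position (ONNX-friendly).
sequence_lengths = sequence_lengths % input_ids.shape[-1]
print(sequence_lengths)  # tensor([2, 4])

# One logit row is then pooled per sequence:
logits = torch.randn(2, 5, 3)                          # (batch, seq_len, num_labels)
pooled = logits[torch.arange(2), sequence_lengths]     # (batch, num_labels)
```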
+ """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForSequenceClassification(MistralPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = MistralModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + 
loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) From f1a19a9ef405c9a30c5cacf725920b50fa2ff697 Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Fri, 26 Apr 2024 15:41:46 +0800 Subject: [PATCH 02/16] Add files via upload --- collie/models/mistral2/__init__.py | 2 + .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 295 bytes .../configuration_mistraltp.cpython-310.pyc | Bin 0 -> 6283 bytes .../__pycache__/model.cpython-310.pyc | Bin 0 -> 49178 bytes .../__pycache__/modeltp.cpython-310.pyc | Bin 0 -> 52277 bytes .../mistral2/configuration_mistraltp.py | 155 ++ collie/models/mistral2/model.py | 2026 +++++++++++++++ collie/models/mistral2/modelpp.py | 1922 ++++++++++++++ collie/models/mistral2/modeltp.py | 2254 +++++++++++++++++ 9 files changed, 6359 insertions(+) create mode 100644 collie/models/mistral2/__init__.py create mode 100644 collie/models/mistral2/__pycache__/__init__.cpython-310.pyc create mode 100644 collie/models/mistral2/__pycache__/configuration_mistraltp.cpython-310.pyc create mode 100644 collie/models/mistral2/__pycache__/model.cpython-310.pyc create mode 100644 collie/models/mistral2/__pycache__/modeltp.cpython-310.pyc create mode 100644 collie/models/mistral2/configuration_mistraltp.py create mode 100644 collie/models/mistral2/model.py create mode 100644 collie/models/mistral2/modelpp.py create mode 100644 collie/models/mistral2/modeltp.py diff --git a/collie/models/mistral2/__init__.py b/collie/models/mistral2/__init__.py new file mode 100644 index 0000000..9dc3f79 --- /dev/null +++ b/collie/models/mistral2/__init__.py @@ -0,0 +1,2 @@ +from .modeltp import MistralForCausalLM +from .configuration_mistraltp import MistralConfig \ No newline at end of file diff --git a/collie/models/mistral2/__pycache__/__init__.cpython-310.pyc b/collie/models/mistral2/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76a01ca4171928aebb54f37b4541ecbf0bd2731f GIT binary patch literal 295 zcmd1j<>g`kf)fuV(xQO$V-N=!FabFZKwK;XBvKes7;_kM8KW2(L2RZRrd;MIW+0n6 zm_d`}B_mLYCgUw3-^}8YqQo4x{37SX(&EG%A77v-FI3byKQApa-A|Jxiaj?!B{ip{ zpa^6~lz1{&qO>TnBr`uRJ{MvJP?i}eyON=Z1xSI3U(xzSsk!+jsk#~YxvBa&g_W6k z`p)@2KAEoiC8@B?0;HGr)ME}k^pZ=_OaFov{Ui2P^k5ih&js3|=u2pi;`G=BYjBvSiM}3dainiBma(TGk3WBzOvsP<`&8Bdh4a4MB)2lhLT0_(l z1M?q!GQYP_U93s&d-fi$IZ#-O?Ny)oo_o0UCaTt5EITxV<8@Z@hi@OblEayyU-fA= z$pyVHay`BLEeNjdn8_UWC4#FZXk?+Gs9ITlL-kn z4Ab_jE3GYAdKY!flf57H#ke}ItUhs$ zg6z}TH$f(WymE}jKHG1mx^HSFUUm$VBx9Mk`wV^tPTUrkB47>aIf(pV0@t4+#1=CR zm+kTt0Z7I`dbaXk962ux+F{^V*%mhdk``C{vOH5oTam;uCK_bE9vw9t&Q>~B zWYfQ?o(S7}o@wmrzBuG;wl3VDKF6+kZQ>xqzNRo0;{d>0URqdKsAN-V`e!sf1SSpu z4(RB|K#amFyGzLAaRRxav&$t7v_zx9C9n$wJ?Acc4HPAwk-QDM!k57K?m_ARvPmy2 zmE5BX%dfLGk`Tl8TinHT+a`1m=3Khrmar`Do^Hq27k>Psfeqgk$TlDD>25XF$I|sG zU;EY69(>BB!!640(^*}-zPx;**^LJekG2UP>&ZTCUXDS8yv z$)BnAc+Izl_!y~6{2TL!;%yWdwgDO!>^`wbYJ$yf++V58F>wk*i$^ORSU9vkTB$G( zi;(*Qrq>>=bjw_o{S4XJw=Hnq9+yE}iX2+yJjIupEp9$lpPaE6M3 
zj==STthUd^#WyL%=21$jY;Lof`SGNeVmO_D0NU|ld5XN_Kf4|>Oys4X-F)nC$tP{ zp|9IR@#rZzXun+(<;2omD(<%4W0es*rtej90pR;wIRuS>6ejPfPSWv}5}mgxGAAu{ zk zI9VP1f&*Js_<92UTWXEj_S%M}tIh^p3U)ZyTn!01#w86D<|h3tgS$BeIExMy^k$ULdHGA_K$={ ziV0-AY8h3%h?{Fhwpfp?tae}}R^|igwYd&WfpMA?L6eM(9^(6512qw4L3nAHx4eg| zJuApeChVjq?DRT%`a?aWQGI*_zoDoo!lQOwRi2Y-4(aYGJ89FUTks22zmsWWxIen{ z)s=_hEs9pEpS7j8YwQXej7pTHGO=E-w-17r=c;$NZ{Lm&LAWF7wxfu%)AnT0rTPsz zbI*0(-BC#!>!R83;ZU3~w&Kd}IQG#eC~yb%z=1!t8Yh4E?02g>Eml9991w>Zh}$!D zI;8D(5BewAi)W>F)TUEz0y)wtKo&ZqkfWU`$YQ5xX`v>sc1FLzGd>yXj6s$`5ers!zUclwK_hnyCov;7n`ol9e~8LyL662W&Wxtgl*_L3*X7$7{;y}aEp@zr&$m2>*X$PuxH7%tJ$N8iYXSEIzorre|Kh8wrCPO` z{F)waAaKb_ywmf2U$W}Sw4hyWAIJ&}3jFyC5XCFST=r8MDddXz?0qp;8p-G4pTd9g zqvQWRyqr)-%stiArPGp0wxh=Y literal 0 HcmV?d00001 diff --git a/collie/models/mistral2/__pycache__/model.cpython-310.pyc b/collie/models/mistral2/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab53c9573dc702d9ab95ac9870bc99c46c10a54a GIT binary patch literal 49178 zcmd75d7K>AeIM9WU48fToR|RyR|5nI30P`8CU?{Km_1{Ko56BT-H?lI3J0RZcb1<+P#0XUZ7~mu=+C zm=SB_%LBoaf;_S6nMSUh6Td`#urX8~Y7Cc$8zbcr@k`c68)M}$xlYx$G{(#0jfwJv z+^6eX8{5j;8k6NoxzE(MH+GbFG4werZG3V}^G4-R!d!c@B<52lf<8b+~#4uPd zHI9^zG>(>!Hjb5#!EeYp?ig$LIfrf=3-{kNuExsmaQ3e~;5>NSD8JJlF274^;GxEM zl^>QE%=(G)3Gq8wJ}JMClpn$GNd4W7Q{__YU3?*C7oFU~%*}YY3ZFgpUij<{d@k8{+4~T7);?|DZQt{P zQMM6gzkL8<4g_Hwd&dHY^o%_Z$a_Td+-@@0fA*+&reND#ILpQH9M_#AU; zNO1x8$L;%Yf1lhhIyGm(S+wu}iFmn=XYa5dz_SN}w;T3@<)(9&{Z6NO)5Ozv*$?6A zLr(E#{H9TE+22)O>U#RHeF9HU=y&n_3f?$rKLWo;oM(`>LfCf)F`csC1ON9pEs4ps z&GWITM^`3_Pu{rDQnQzf=W4E}s`X-{WjpoK%N%IaQHShA-)_l!#&$e9GA3Ck@^H|eUt)&~{lR9_)F<#)U zg;95&p&#jS#8v2= zs_vqH@GXN>=31)a+DldBaN<2x*Ez?sJ+bU9Eql!ElT{RL{ItY=_FSKT@w{_o*=f!? zr|VVMt5o-yyW)CP+PRwOJ487SQh1dflnB+C0Y}M68wCs?B)^;WKBeHym|- z$(i*BF4P=5x<$bJ=~lg7bAmD%e){nz9#yrgj$$Wd<=`sr??$NQM)S!a-Lq)Ws$vgm zEqm{&Rb79qFTlC?IaS-=@l@;Drte_v49vEgvsJI6^Y0gIl&?~Cr63L0k2}q)etJo@ z&~`E^#&u?;E1hYnuwx>o9g9~n3NTREMWj~IFHj|QmfYVF_~2O-uIp*!*ZZ#Yv`ycY zr?c!E;U|HM6uzhNbw3EF9b3Q>>KO~BZQe3hc@=NR)~wqxJ8oOIOkS;;ZDTEQJGPd@ zb=*s>r4cIAHta<3UffRJHdn2-*|ys8xi}`J)bz`5;PaB{$L(6fPsorugZ4_J>O9Uq zO1iA6CXoq!o%Q4HGQxVC%KSvF>D_nSPc>W36-TxF6#6aa4nN+gUiYn<=QModj2}m@ z^pms84ZJ#)@C$wO%aH3_K^J#X-<3;M5AmkZb!)SZ>yILyihkUAnw?z^{M?sf>ix*u zo$?XoG+Lf>_;Rb^9I0QqQEMJK-8y@=_Sg~6aXt5l)G~hb-0LkJDV2i%S?Ray1IQbt zr5pZWW!~{BdT7~IugVs2hu{nu<3`??G)&VpOv8FzorC8rzC9l+xO4Bpg%bnO6a&GO z{8(%bM3fzWAp^3>vJ+Sst&hd5SUJI#@dv|6;;D1zr(3G=N4Ngu7uLS{#sBr>EJ(2E z!)x{y{!08_>}G7uxDE2kGuPstwU%fbxAE7GRg!4=SS4i}h~Hc}ku3(FAfX_gV$CgL zTB}{HqD8!-*D79khYWLOB6??fD(xrSbOjAl^C}fZD$&nym^K_tkm>~QtyUAw zn7roH<}Z8x5GJ!)6;pl1S#oQ2tZQ-Ka-I5|pSxVMZA^2V%Q0S;+?O!-R4TJv5)d~V zS)HZx0XR0kE|bezBSzL(+1giGr7%ykCIW>YRusRd@pY%+w2cKlYz)s_h~G9=4Gba6 zMs+PD*fbnktMN9Tt|i+Ta0{u4*llAhhH$ZkG{PEfQ|_&{g-T6Kzs%&+gK#cj6r~m& zX9)vqDx)5Pmv3UsNS$Dmw5l98@uk)^-)cD3rk_ym73Ha?8NmfOQwha><`0DR!U@{- zb20%{WTg7(ur~d8wdv|QiYf|YQ7>(EMC0b+B+*&N@Hb>wV=EK=RThMvGSm|Y`%;n( z3qrxERQzm(i{)~i*ZE3?i=KWWjiKfwpFvyAI`w+kUXqt*@Km=Pt9%w;X}NT)U<{>I z=m+o4CmJzF;TMgVtc)0r6pomro#KeGb9UO!fRs$g*h!)p@_wgmEl>kD9&223Y#VFy z%dvDU_VUnv&|9;e^Z4>Z2zB^Hu4#uH}bIIdIZ zD%M&Wvu~ztakMR%+he1#?XlGauCsQconVe{#SPBoImWk;Ul{NbYX!{VY5LnS{OfJ7*H(7puKY?GN1VC#0{ci+p`hIrB_X zRW2>h&0*}1U>f!+vn?0HwyBp*wZtTs8GM-2rtQpCm+SZ?xi*!QPOX?3b(PMwFtmP8 zI!n}rCFh?&2zLZd)<~L3BWWCE2aXqv>>Dc~DSiB@-iGds|FA3JeOXeY@xKBOY1|l> zM>I#QZE{()ja%_G3-<}k2bO1G8;@o^j}=oXD5=Bbo!6pyd7;dE~^JhVMJ8>y$p zrM}5UwGr;e@C_!G9Y(<%F(-{(#>&BHV(D*_b$w;SrK~eMORFqeY7n)InHIA(ztTc7 
zj4T^%_WkBIw3Yaqw>a>U>wV|QfyJkl%fhwbpvd3HYnGx@Wc9tgWlvYn!EE<;3CLQd>Ih_B1Ji}Pu^Fp^dD?V%>)@QAwo`*711aSW;o)Z@08ZB53$ z-<(4=a#@qvE%LWfz4C-Z1G8WbLk68cZYOTBN|P%GIqE$w`sgEEVSo#z0TWl@X=EycNnRo6mvA4}J%mnS&O`{zji%rBZ zw$bOr&xrgmp4%zuO-8keIz6?t1eV7w>glKWWH7T9qZNnr6Ke(ht-4XHHtph~bEBxa z6t|QWdg_##cOQxyZ;+{|c%)ES_%L`eU<<-rdaXrNQgODWz}GFc zz%w-G-@M9;XIiT0T(363+#EvFfZCm{Hj9^>o-~(T$7VcBAU=`S9A+*LKAmhJ*VisP z&EmC|T4b8nz`+))#rJ>kuuR;Y+FEY9XnSYHsbEB)-oP)-94a1c1wx*Khl;@h=`y`z zQ=N_wmVJLu6#I)+HNV^-Y3_mzr?3QXqKXhbssbXLVijaHa&)-{Q4`{fUPm>#&xS>) zpyDsGL-v##E!Y>rW&f>$54enet81zVJ{p6zexrCvwW{_kL_4~+da@y-|BCMNrQTX^ zf=}y7R#Mn7cg6KNx?H{LNQq_cC_XC;qp%H!iUAdLsCXGd1J?M(68lf{P_Z^wl+5&` zyNGFP^T-&7yHT9%H)wly3buVz!A$VPv0|gT=(s)UyxpQj^+~Z)*pH={y|sD1)_`aV zEzl`fQJ1ABS+>VpRT2sq)m5ilePueX`}0#G(CV72W<*ACpnJNSnn5>fEU6kAZay5Q z8_s@3J(=6ZWeLj6ZaR(X^eKRfDqALeaG}N(-&j=NkNI8w031Kx-NXHXC_Ed-PhF|V zj0V{}A!YDqq-Az)~}2qz&~q;WvwK&xb25Kkhy_A>Sfg z3po83(%@RaQugrq2|pK-3+K*0*~6`pm81}ek-A(Q*D)m`1gd~PT>ynz0ZW1Wf6OB8%TyqK44JJ^bi-65Ak~* z->;F_ZsSsY0vA%XplP|F_g8cJ;d-@k$*!LKL%zn6TCtUTx@saK;7bqJTeH==dlDD= z6>vM&B}xhh=Bwy5ki&|UnVd1C5Rg1vZrPh=QITftWKaG0`pl_M!|yBf{|Pu3{?bwX zc^K78arL)w?GI}f9zta-xFUU3Yw({M=x}S0m5FFreJFSV^n1gxYaq3>IQL%qr zm~Sok8PtR$m+CMdWFYo!>iaD3vw;kuI_vpKIF-5POQt6JKa1x|NJ^nFNhb;Y`MbDq z?}r1f3;jK9m10J?WwbkI0Y$FJb9Z&o4u$=r zc<^O>1Bpf67}}%gr!K+q6HUnYm;9s_^!F5xgeV8E6z{{h!0?JaXRShnfC0K_ zssRob(A;1rzn?5+;D*1Ty%Qi<$@-uoz=kGnCstWWlt;j4EGB*wJ;hhPEK~ml5;XYd zY}zHoBOy+CZ3+2>@&W=obf~(6#qYp%P#|2k4i)t?_k@&cN*2zLniuVWTbyqd=Tr-1 zjI6nbB*o@(qXKHP0(s%GQ|YXkU`{(r%M=&0;H@Ix=m>x}dqW}@x$`s_u^(c6C!Om{ zpjiAAh&}QRkS4Gl)X(r?LJN*ZH^>hpj1-Ne!-cM^soCGD;*eTdy!t9`^-P~gqqiR= zWAO#}&Enhh84K>*pTxjlgIC{GhmP4JL(h|CJc#R zx!)E;y5Q^P3ig=2<%L9fP$XkR=p|eI;jsIk3N?Hy|5$4J9Aq@L!>l3Da#8E(=H4F# zEPYD<~ukSU*iWwj5KF6qXD7n_NhJ8iVpPkmf+nU_jCvt3cWdWlF4)#X#nInVrNy ztmNRE$1lDMe3BR60r(c+Tfi@57nMPqg2N%)4dWMI2mlct!xwQa#?>?42xLgOMyet% z#BcI5iJQhp4R4hG42^GYEQZxNB?8Zwos|`OEDFIqgduVeiH0GzY#1Wn6=K|*=%j!v z$gM!A3`mHAq_Wl9#@O*ixfTZP!CNfRq)-Gy(<_C?o0uV>FxG9j@{gq5ur3P&MMAJz zoQc|Kh7%Nc08BU_6~VPgmqS0eRzugqtR%e%`h%8oi}o_LxH6vwvo*L71Oq)!PK(bF z8~L?T@kzXkenR@Y=q@Xvps_w)t=Udr284H;r5;yDjwnRYP`a@4NGNZI9-`P-c0KTh zm?5u%pe0d$>4wZ-GHVqN%%c=DT`$Z$R9qPmo@a*hkc6V2IaS)6Yy`(;aA@Y}|Vt^dciT|ZzdgQMxCLmyRp44p=5mZ1VQJ&h~p`~PsP``)}>L4eJ zJkSuTC#jZ)1PiYKNGd3`Ued8bBt!JccGbAt>q$z7Qx2N8aFO*-l{zM4cdKq6wg#uP+BuX)>A zBg5J^LFgH_-cAJvx=2M z)6J`xuc65R1-**3K`Vk+2fSoE$Mr~5R&m_rd+r80ck#FxU(K!NR|ncTtO+1RG3R3q zl(i@mtbe(+Jb1;r$=H zhB3guXYl<{teLlB-jPokH-19<&c{|Ko{2S$Yq9pk+R^qHM~uY3HHd$l@h==}PqfGF zZT6(S{gwr~Z+z{zy@O#lNby&@Q*4i-{wx91?3{-D33xLuFGSAWQ2@S)cx$HHTPPsg z0qzbeInvu9@dHwWFqW(EgOvU1{iXXJ+<>ehfj!e%OqdX21PKPxsWfwE>{1aD`pZRo zs(AQhkvF)8#@Mj6?uAq2%ijX-=gyf6U=u}PT?ATgc@}szC~&GEhAvYDY*YF=>lA_E z62UU_^FXQi_+0S@=zkyv1JT``t3}Xe#p<%xIt&dlAy&YBL9jgwt^fp@5b=vgu7(ub zp(2Fu>nJsfZ)(*#$u4rQQTa4wrX#5qg^EOBfq^SQxWxGyqKqHFkH`RD#{3vt*%!^p zMU6XgAqQOMam5IxZJ^9md>szCj3thRqf=YOYjHyxpHgx$_Z zWKYFgl4TTnf(h$z+0lXl18{_4MR=X`Xz1A`iybuX+fonCP zssxZ_xdz~Ku(UyO8!iXU9^ry$d?6md;#TvDO~|-JF)JG8Sq~A3^ntlpTE0}T%@$8R z`8bvoDQkZjxvbj`fc~-o1_S^kp`x?K`VP&hUh7050KHk$!gx`XbYi-nIqI7%dl;WRh&F?-Fg`zn zL;}XA(8c(q)=a`R1^XQDVSzN`Ll#I7u)+cX(t)r8UMgUL0NuI;q;$Xng?E|-lDi%j zC=;+iB8me!t3`l$urvc*EKqn&)+W3r`%r+`j4V(K;fDeih`$WSU$Q`%9v0|RcxzZ# zppQX7NS0!BVXTh@+9E8_ux5e4@D$o;avOLaJ4@Wo+Bm>x6SreGfj0(wKp+*F8L&RW zBn_<&L$F6C39L_tN%AIJzvgXco-q}G&?alr`ZYIf44QD_}N<+Y=G9xlfqQyM@`@C(I_V z*=z0rd-D-{q)C~_{?)Bu;1e{yu(G z`qBU-G@15RFrnkOQmfnB+t-c>8+aV)--mBrvw_lD=w)m7x3?qR?ZOKF53qte@Wwk( zlAZR%>MpQ?543mLTgjLSE4bTxaP6J#-FTz8jxjUc(Qd{pUSi$1j 
zL+zcy3NrrfLHy&Zh=1X`+Qs&6dk43+2`jjJ?O_`!Zomqf?%D1X+uPea0#>j%t^PG9 z&EKN)+jRa79pZWQZi?UG)tBi|*Q)u8e~T->5GYS7T6wbalU&4RmjQ7C=%;^;33)TS z1so8C>Hy_0wSeQ=OapUB^J-XgvBWjNY!c-)H&>gjG5Q*DQA8I(G7czFk0^b4X z=8E8oHN)D3#!Fv9JsU+|Dt-vm47DD7vv{dGyD0JM3?rOxlpxmrItuBJQbgEWJFfco zc%uG*&VQiuAL;yOIA_!!^76mXVJ%IaP=Czpm*H^J!IBby!<+Jd>c8;~k)D2)R}ssn z{)D%iv3u%I`ScW>_t5!OI>PF04+xH+hq^)k7KYMK0V|F!rPLNs6e6fszl1ma0x{$P z|3l7^gtCPX^_9v?8Aq6MgCHKlg?kzd4(u zQ)G5jhEA4Fj!vEqDLL(&<<%UV--o};E*tWg*#tPtx8gG6-CXA0IQUG{0*?sJ&@^AS zlA2r1CWg#0vml&nK3>4pK69V36`HzM zylRrZn^$Y&@||NX{=p&kXDr$fZCNAYc&zk35%L1*r@Oik9l+2qNWpg6eZ&LVEa3|xHyWtm>-F!8F9jk(Vwg%-SHT4ct5wA3H1xolAq?sd%NfC6 zwTnzHQ{=tw;6L>J&DiF?FG z3!NoQA9{)?y5Jq*NoFv7Yp#a<-CfME4idn@S+r9uZL>ZyLVVhCkrd|}Fg_<-H% zSomU3Ky1Z5+#6U65f~vq!4!@H3z38U1?$r7jGG{EPl!P3T5%x?aiVCtLG2qv$;#sk z)CZt6A9(+VFMyyz!6E4<9OLGBBj3vLNaiwCodZiMaU5)b3G{g z4Knm0sOQlNOaoLLw!3s%F#3E~46uL@0UofVfD2$j1{@8_+S~DV4zdRm{_zd`l=8OT#iv4MIH10xA+xAQ|NVSZCWHHUb(BtD2R{tIKVt`f z7}yI7LBnn}w?g|ujSKWGaR7RtpCKp*o$?LRvg4qe&A9Hagalc4R_@xgqjpUj zL=s`DA+b7r278PzBh{|miv{K|Qlh9q-o_bZh*vpY4fBfBzv9?b%>U--MByUIeMd?c(8KE$ zE*e4rx)&=vBuQ9nNv%K?N-`L`>bUOO)g@Y(kd)6q`s67pQvzV`TPOz)z z>i4jspibKUuT)h4WTbWIz*l3krGy@OO}Z;sF4o!*dI zJ}I)#K!YkBD`MmnS;P}?;vR##> z4ZG7bk{_~qn(`(YJ+;3hoYWX&S;zir#Hk`W{Y7TxSLsk{+ArJu75WKnVu`>_z$$=LvEE2p;nL3E>0n`M5lr+pSXmH6G@XhiU(f9?30F$o_ zWeG6JnkzduTzyNyTBU^88Q74BMWSf7R(KP^t zq{sh4@lBdfkg?$p&xLmD0;%+IAf`Y|%p%&JPek$jb$EARBTn2l9V^(Y#2u>nGC(8p zm&_#XAyPV^*3%AlE-jGq$;g!kjif>4h%fwe@aImYeDFIU&v?f$d48!m#oi>i*q>Ce z2LUwc07{1aaC_dP#11;@ERC*p0Q~EA_pp*tf!at~_wF8Abg7sg7W@S$hPmwDq)!5P zX9Lz8UogG8qCU7HFxV*G9A9zTEKsZ(bwr!HK0 zZ2H3EPfS-%pFMT{{A1_+G16T9lbN5Ps!N{8HbPiQt1pQOB!ICRZiQMR8piSmg+bSR z1wH+(5CwISah)gX*N~BokQqPBj+Mv5Ec*Wlde#VFSSve%-rilmZyTMlp6FuyB)T}L z-5-Lp9-E#)?XLp7pm%kO<*Z&ZEPg8qW)$*lkTz%&z!ZRGkvBAigF8Ysgv01OpQEBW zp-zAh$;=tRf9Kjs^(ZCI!Cg)>fE^X=2@aWLM-8UV!b8DmYD}G@L&OqV_=6}!AArAq zL7^(fPeZ0#v$4eb!%>@7LImD0w2>~xVi)K?rC0BMBV95>{3x61e#9E;K+L#Rfc_!i z&Gd|rzmrZ|5L>W0p>2SMLcZfG+c#>`p6P2Hi*|s8l!<>+K24@C@@anV42_iq$TwN+ z7Wul&>d7#{qK)EG>a@#?a8{e#+#jC;#grG1I zS3Q`pXJ`7%fX=|c!|BUTpf?qRbh42PtF$jSj1pfY06xVt(eL=drCy$X_q`)2{F4T%%2XNWYxFtf0vmW2+I zO<|I%V70D$MUgqx!4OAb`#wU!EW9+Az!F5?m!5KuhuY{-+I zmM!r!7yE>{nh4jSm${1++@|gW+9(Th#e8$Gy_hA8%o0hf0?r=d&e?SO6+-dmYV$?H zWFtacYOEN1*!&W0h(!cT+nVM6d~RdJp4EszJ*4r6yzj|l*l#f}FmO-o&(3_Lub-Dl zbQ9ztuyet~QL7*X|L%S}V*VGttR`k~;=bq*&6z}9Vg^wkL{_?{ zaL(TTfVSfq^ym~;0Cy|=(@=WT$R>>Ly^#kMK{&Vdx@E519a7GH4OZ%jXBu`b`Xq$g znOa_t25sQeAEy1mz#=nx`CRR~zdOQ-5Ozd#QU#da*~O(6VEh=X{xImJiVFi=0IW)> zTz`ATU92r}*>z!fkcJQ|VvCt8fj?e}VvGo87<%>#9e+~I{+7C!dvi2;P-3u=wF!5u;V!} zQ5EiO;qIScUvk{$Jh*I<8-u+T;MeJM|8k_#@gO?gT0Bm^c;5}Q9OFJRfDk9Q7A!!i&K;| zXW||eFA5;$l3|E}FN-1qd!C%SGs_hTNV1t{j~*%>yEyZZrZ>VWI2-2~fpnjP=x5d zyYK!7j!j8tBV4s7wOW(ILX&%eej%EsKqPt2{Efa!VPBVVf`=NULsFdL+qdfNo4wWP z?_n$E+iSwCB5K2{wxm zATnR{)VE%)OE4=HrHKl?IMY|jov|Kc>qhJiK%6e3$bq5izu2Va=)vmEQgD7^a&T!u;E3| z+1=zq8)+CyaVF}VqI`GJ8-xjy5g|4TIjM39Js#R#Mch3}YCDO&R8|i}UDHni!F5QF zAi;H2utJNW5Kv)38)8Zk7Jjh_NkO=4t?tZMXK(Z*Dm4<+Uw_3$O(^u3=Gb)S1RINf zg^>}wse2mIoI~q_mzOC(wrMw8>pP9mJJi5(T|!6c$&k)tZ|osG7<+MPjdYQh-v_t?(1i0)&2tLO2qyo>;(EtSqi0%ooX$tPOYXD;*uY-Rs%7RwP&O+MI^P#YYANQBxgkm_cNl5LUDd?j>yBCPw zXzVo&5ghCiI+>eBX59w{Qh;h&$U(E0$M1mX_IO1t-vImyxE`cN&n`#^T*G2v-WkI0 zQ1Ck({Eh^_qt381;*5$85awQJqYwwGjR_)Z3q*kv{y->MlDb&=U!)nj5C@uO;-aES zBu;B+=Yt^`K+|h_veeCfFxYxkDn3RUSv*Piv(`7zo`A2B6nh0VI`W4?QR-+4AY)qWkdvziRqs0<6+iDnYD2^^T^P} z6wZ{2A#27>^GD1pucwa!xr>FQG-M!$Qt=cMBX#M*Xs@-p5#`rCdzK5|!)t#$reHd4?06*CJmN 
z5g^rQ4h5#HP}r>y$iX=F1dRi0nFh`hK!fp~P94=qPvam$!YTyn`!yDj0_ywOIFJu>wN_=bSmB4WC+}Ha*NUUODUs^~8QrA0H0lw66BECob~^Agp{T}z zBar`3wv&Nh97jiF#BX0a6Zl!cY2(~4gu5S@X!-T9ZAG6Z8t~b;8lbfRm_GfVgS@RG4@1~%o>&+HLU4G27{Xk80Eq2j;QwI1 z;Fg8su;OcDKrI?;TX4Kh4rasR%{2UiyR}hZ5`hwzcs~c#{kUK!69K00i^%naod(M9 zi^$^_=}U~djYBn{HQ5Sl1>1le%wjv+P4Fcl;3R^kPob-NSAS+gNdOE0#;8Pya;~B=gQeG#zz_JdbWmt~DrqE!k z2m->X9E?tv28pz+KEk0Z7Jw;TfGSI+GqTG^J%E`;Z_YoBtCg=Z-z4uz0XM*Df%E_^ z9Z2b+Y{lt?V3xGjTXfNqflw6(%!qShzv6!K{Q+-Uy%E{GU2ej{wHDb2GFr$`NjuBF zk*@8G;jVuWJ;Ah2$$yvGTQnHiSkd-Sxl-!RK(TnDxSxhz_RoZSu!4Y68nlx%X{o3l zMUg=S$o3#EI5?E<41UC}Wj`z?vTV?^qHzhqWEyk%<>sRABe{_w#z&?>C}uy1CGQF* zf^cDxgiN)-Dc-o+zo=v$X5C(2QYUwU!V=pv=JS)Wl>@!&8)}Up)Gm4>d<|!O4%fM_ zxtj}6Tu`)C_iTr^ncAnRoi8EZ>Lx>oMXy2fkKK!jfHHrGfvz&p@AK*)onNB!J@m`* zDh~%}J*x7xq$RNVDY#F9Iy;+Cd{`oD&Z|$+Q*fGG`!%8RF#X;`r$C4MgLMme2;%Rk zgq^)tT`K3t@=QOi7*op(Jcx5jyK{nLJ>}F-+7=3}n|&V>?zFm8)JN&Zy3lscq-tKE zA2++;xH5eh9rx>FQHuHF6w-9SP}sFEX5h+0tjO;41=(62lA{r{6-5~;x?zUVrF$C9 z#X+L*ittyiKZ2gYBj*G>s4LeW2ycbh^9ST7FxLmeYx0JO)B1XvK(n~v*!`C%=-2@4 z;=VB!A2Wxr`J6g&v6*Y;LFDP}=GbBbSQa*jLqiUb77(aI(WTx^96{k?Rypkn;3AqevE z=_jAQaK7^B6Q==+e0=)7x-$XhUjwm}MJ-{{Xn? z!JCFNM84SGa?8w&um7_juqPbYf3mkaTkty${(75Sjp1q%SA4ovIPvY`k868J6k;c? zw!vpt@Vh(sEe5}PoJnW9Gb}v$xV_iDi#$2F@*SCR`$2o3eK!oi?R0k8_t^Vi0Ig8o z4M~Vd73>4{)C<^8Sl;98${e)ci9_4>;>o0AIeQtKeaP8{lMW8sCE@b#VyMg!9Jane zIQuBxN!!P|!y?Ax&b}bDC9$RL`|SHAw!5R)`oc&|SseHtgcI4qJ)mbEiV()f&vcG` zzEh}cW9=mket8Rq{yB6E&Crv!<=&zIR4i~~a7p?o9$<_>6DH+29#ahFr}ge393lXe z&_Z5lMwmt>^R|5U_BQUr{AfeF5Z~^Rb8OdqH`}Y!{e`c_BL&L0Jo+Y z;#5#3DGUKc;k=05w-hl2f&EbrFX9|CTc3!;?G-pTQW9a*gAK@>0xph+zIO4yT&O$z z?-Z9~LSSe^16CR_6{W?zIPW9iad)+E&I4nRYewu4E@Nqz@J94V0D8}q)bu4uxnZ3k@ewS zA-%B`W-7akd35?wsVnq4W;!Yp>xgC2c}Eut@~-Xl#noq}?&&bE0E`L0FRN`|?f@qT zD2l3}6U-auR^%+&(a_byOHs%k>J0IB%KC5Vhp6Tb{}LO77stYM95NIeCb(Q@0WX23J3inN#HK zMSOZ?^7PS-5UE!GiorHa;jA>XFyb0V60*q}|AQPf=a-4Bkad+0i611gTx&A}gjFEj zAha^T<3Y;M3$0^nffA!5>Aolkn8U}89u+NNkHVCof9N)w^MoE~Jf6rKLPaXZlc+BF zHtgp@ZV|vbqY7n`{2b7Qfss85YFGejNJK#&foJ9*ir`a`76gYg1=ZhOq>YNug&KG& zJVr9g!(;k9)-G(~%*+h-He7BsqcA5=p44`M0q%;9f#QY@8awO(k{$|WB3%-?UyF*aFUk^C`6Zak6 zq}EE>YK)$;4}_611=xoqrdzfm0PU{(p5T-YVV?)7JRE}gA zH5`3dU0U-nhTV}&sTaA{4)=?sKF&2z_yM6-sBP578AWU)Ji)*+#;iFYv)e5+5;4_yIn>#AttsSHB3y&%2jvuqCZa5#7mD>Ze)4${dfQ)az{@ z$smG&r?76j-;GGe4rJT9GM6E2jLVDE#F8D4rtJIGSp4Lx z(Pp_GHU!feWredyFZyt)rqLZ)u1V(xiif6$D-UW?wMC!Q0%ec65Ql0=%o(hvn=pS% zV?6xAa0r%A3ulS1LYV|rk(g4(Hj+4=V8I0&D8Df8fUPB36%svz(Vi~DH%4<9%X}Cf zB9tgRV%AGFWU6VAsWyXjnlS4VyfFdeK1fBLFnl}R6*ts1Yh3hWSlTneyPp9mLe!#2cj76+7(`QsMP^slk}~Jd8kCHDfi~ zw4gvTcm+cbSr9N#!yq4N=(gi9KLJ%0;?9Bu$WCifd?aDk^5+N9JnEzDZ_;boIdPU# zq#o40m3?_C*Cm79$-W7S)WOUr3#K+P+L@!Yp{zK_LiGto{upAQ!74)GE7%c+4L7tm zib#bole9%BD#J$d!!c3L3u6{ceB6u!5ojp)WNO`yCD$Qet|!pG5#jF)vDo@CEss`$ zX%ai$euiV^-AFbK<3Bvd3%hh6HQbTh3u{+6gaOA{V*f9sl|$GIjGa8j4r2&1!ZeLs zVuXNQH>}rSCMdmfY$ML)P1EjS6v*W!@fDeW2^Szb&2|i`vmvMjO)u6tp`abtY9~=@ znOCefUE=|GnN4i-!aS}%sf zL;XCcA?G^f#L%tib3!;7B82D|w3SoY{l>FZE;-c()}8CR0nQ?z`T@lB71l6IXFk6- zw(^dRY6&_4h*5wBm8RWD^5Uq>jBvOr+b zlfeJO9Fbq<+{dFc(lGPX(W!S+HN7wPG#DF-s{&b0cL@$AI7245tVhdW5Vf$P7c8}l z8fy=Bb8Mi9ZkaIg$)hoGEGE%J&}EvKSgwQ*sc9KiId^QmyfY{iqmqgbpQA+Fph%t@rSjB020#405YC=ey{5@mX+( zpJj-D%t+X=yL#2XhTosE4H*Yb%qC^6;_)e3r;xs5W;%DtSI5&`2D-YWPJIcV@hv&! 
z1-sB&po5?*y*^vVmMffh$i@yo?90f3tZ-lBO-J0qjG~kNHZv{R|2MpnmAd*sISW5T*U&s}4qw+tVK(Zk z9*%Ks0}X*wma&|)ad@=5!{csG*c#%YuEY~~X>h7Z91NDSacEu}7>U+V4~LyE@Hjh* zHkoeXD0CXtC2Hb!Or69cchVcg-4ISwkj2Xzv{SrlXCH-?p^HBSTS$E|v~!E5`*a&t z_}xDsnnFEG9K~wd8^hwA1_@zf^lVwpgReIh#=*H83lqGWq%9}p9r@Y1hJ#ma@Z7fz z_1kt}#h{7a&ID=|2me;(Ug#rz64B z-wK|N>Zi>t503%PAhEWG?Z8x5Q{LS`)@yTwa%CH2a@GCZ-Y%yy zlwnPZwWjNZp~~H%5DN~LgKskkr)yVyI^lrwnC!{-{Kp{%w|D*|j?ME9$uhc2T69z| zqri%cAHe}@?~y$J{<`>6ZyEo;+bDiLvXJ^$BoAM~{X3Y4E04Bm zy>V|fiAg6~7|wFTy{yq1Yb!>mASA$XRx{XxR17{BAe{hZ1i-hfA6PATOd9|hn&t1D z0!J~1pi$&ZAqgHuGIM!(ejYLvnD++~B=U~do=!GO#pxF8#>!S7iID(ioxx60!8F_l zjfO8kBLLb1J%a24mdXgiV;&&mSnQ;{lA3GP39??v2}e4|bID56zV+0p>G%7_QB59` zdz7UT_bE7n8@(HdHSVuRPs3rP7$xZS1{jU~;XTPI&vl3lZ!$&&Ic>z_oUzxE*2=p#BH=b$mAZ(z2=2PFT{=|>Y($qp zK2Ra93~0tS)F`q{E5rH)!b=1Ho+>7t3wi(&QRJDeDOD_38Pd5}0-$INs>--yv0e3H zfVNAaUqH`_iGY4ugH(_M;r&YcLww! z$HA>cu`)EzVTBR15WZA^%%|)A&Jb;~bo;3PinKSZU3TEFk&+03&zRiMeQyUft84B* zGUow*kOb3*oh49q`~iIX`q$5+QdNWgA3>Drk9qYnoT&#i&-o>K9N?pW#j7b?`2zuo zsVN*k>0Adf>*}*1_A=-<7|Lw9e#&WHt*KU1{R&=2qad7LfF`$NFv3!~z@%J{*Jy0}H-IYFQ4h@Dm&quAE=t zk4CX*DxHiH3jiUa?I&<1Vgn~ka)f}Q#@zcFqdrCF7@hx{X_9#cLl&2Am1S}Gy1sc1Q9S64Qk9Xb4?oJ|I z+5y4g9FVUO;grQ4ujyLP1LI_jk0xPs7;lKVVOV%d0SJ#AZH2|XeqW^U{1Bd+&s&Dd zVCI|cCiFQAqU^$(sQ(jJqL9_Kl#t;44ZI<90?SN1@lpXL2E0S3MSh2M%Gnn3jzb$Q z1Dn0rBO)YXoZJXkM)#|Tj@D)cVVxl7j5QBus;OZdnqe*gO|zN=f*U$FUU9nzXfPU~ z13w+{4dAPV^GGbl!BOMjtg@T8&IPV=ioY(ICCZ^Wq?8Hbwr+uIHW}N)IICU#FprlOs_qP? z*&TKP+}MxF)gYF$&&eC$R=VE!T<482ZP17;?@s(ah=Z+BavNsi?BP#aHjcf92$ff( zp8@V2*!MyW=iH$tc9F}uxCQhHk2V=z+kHFc0WrO>=cX}6EhY7hL&zfz;)eOS!j<0u zCj@J__87R0Q5-#;c>#9HkrrA`_?9u$(p~LQNuBliN_z|#?G*0!frp7pO$`RE^FWx7 z_LvC1cuL$FwY3J;TcA;hv+!B?F`dI2FJ zSXf*O6~QU5kCb?wvr<5xS^3wABu&1)F`2sA{In67N;#B`1vv*I3g~)$7;s`UDg#k~gX1T#SI(&` zlFD*!0!dHMR|7#8YF%0&xs^sHbd*BVS?rYo$;eZ(UA;zsld-0lNCA#NsQYfjnFD4n zV__qLEzP$E=xn7kNJk>c1+_>@Q;pDj6b`7W#U-yID8;Civ73*mMWM`c0dLF0&nY6N zRzM1t>s&)#4D!131nfdwxejZhp9l)X3pv)+h=N}~V zuba@z?7$V#;lPet$=57MeEAfk{Ix8s6q}>8ux43s0R}gp2hu#a%8#ebT?l7^Vk2e7 zErnVxoCG3WZ!*l|7e|l@jYc^#jxDoz^qC2pZPx3cvT$T~UfwsKH*f8FHGhNmFJ(j2 z_cRnGxSlf3Oqpjy1`A8qm)ZU`Ity_8kqV%qc4*~PhK6Qs1Q!0c+0H*jXN8Uc#D9iY ze~-@3(fN5g1dXd7<<&M`abWv{mD5j5pLzVfl`~H~MF9D?2OXbdJpUXqLd#5ppR?M$ zr}_do{J4voPPGT|S= z_UjSoXxOiBgh5ZkNhWReUl!lCA2Z(mC=W?Fix6F9{b@T`r z#?KrOoMLKY8DRZ|U4ybLW;MH{Rk`#&ETp*!6Z$IL~V zjnD>}xGzT|Rfv%ic8l^cTbmy3O&fR^;FGwd0gwUfqA)&6ShGF?{D4!MFBRYW zr|^1!K01$`VL(99q6iY~jZUmvqAEIBfb|TVHmc7JNA_ApnsX8AB(z^d86&Bc&O2gu zr7HCb0?5`XW-!TO3k>$DL>9?C{FLl#y*bhXi!jgyvCTw2(N+!6c5x)Rg&E4m^K3n##wQZcH#S-ll-ei^DI=2a5y z$rg}dmtlQnCeT!in8?-WGpIFb3aQJgfewW#qSc^yeFgR zcSU?=F_{|Dv(rqH4G z%)qR9T1+AE=ok0^^^wCB44%_e0`6ccL0`jILKp%j6C50E4KoS)cPRKB4t__RL2d;J zwqkIL2w0Hvh-_g1P;zSA4~OffL#ju?i*equMeuIibZ=dNf&kiiZ7E&sqGRiVKAZ_) zPZQA4VB-St>KU>Rg0YX|7&cEa=BjW-Si#8jmybLNw{Tk!HrflIhv10pz2M_z4?X$3a9af7_lbcgoMD^M{#%&%eD4 z5H#JNfz4sd>I+PABPiFVm}O?%y$3}Iu*k^>i%k6|ItmT!x8pdgy%&p|;|V#?MB;=P zAPMr&$qs;nCHnVXbUaI*6>u(jBD@xaF;E5o9-SmY87Ie_1fl=~K=yzWP{n7^zO4>6 z?+W6&?4{Otuqe^V(A=`4Vn2Km`aBD8C1@5UEu0@eE_Z9#(Z0aE41$M7JnaE5wJ-n< z9lpRE;{-7dz11O`C%rHhdgK8vaMi=j zd+FCs$~Did;k%6aKY3#z0vr4BnbX1BCf>s}G&6jYdyN>H`{e)38xKS4xY}7rl2Yv< z;Iebjg3Fm~g4iBrn?Z_@;GXv`%=QF!ix1)aA)ZUW4c|%brpFYOuiio*lZlv<@wbi8 zo_aHLew(7PLn~Z;3;PjRHizLV;vY7~WkckEWls#Gt^;HTVC>G^hLdDz^bm_}$Ld@( z@fA;;C?3N?2dbebNiN?Ji2A@eU{(*G$6@7w^AZOM$aS!Ev$FV^S)c4l9dRGc62i1Q zk)IaJgpbkO*3yk&F=Uj}I8yQq=byhpN8m>v-E1C!zdoaW<-}>d*u(6WjlJ|j3gEc( z6%Y={5YN;Fwb8Tov#0@`qd~3UkO{Ve1U;$~e>)A4)v@)LXi#kdIwC3k$;NWst8Ms9 z{Q|N!wOwOUKF;i4pmPEH<#>RNgPX8ZJv^nYz zuvHI7W|NDg&5Z$QHUoxoOX{bT&(N5QMU 
z8tn?(d<8}E3BXkkpapRFXfNBT&l2qLlpX$&ZLCifjPq)c6@VBv-d`-O#;Vwy*ZVHK ze1#Q7mOdTJ52rI5?=Jqfc)R@A|4@vX@+ZVQ<*5Q2H3b@&wjR`oD4u7BlqOZ{-jd?> zZ$I0y5--Q%%6Yl|M8~&u3HA%2UlL0N`XtfajJrsO0y)SKvE8PMZ59wWQ4ggLpitZS zEK9l(0~n{9=^lTOue}qE?+@#?qPkbrF5ygZ+s}2~XbGbT?_S|MloUeFp_QQ0a(m(J z4>TOe>!3X@@dR>p7lZEOgQQbywjiFn8#m>G?cl^A+NCW~x2x`9u>E`zY%0dvvZ?q0 z{f3xI%_(UOY6&NJOwoHd@J4Y;x=7)cMXPN=i9J z2I`LlAv9Bu?IAECdYrG0_CC-t+{gFsr}GXv570RW$4{Qq=}*(APUn+!{wAFtr6aS3 z2wcCw+h3qFf(n&0w$yuh^%$J;L}=#V{Do6bUFfoMaGpU#o+WG#g)1`NyQunzwD;2w z>-wNtAm;upIM%2>pp0j=j)(w{lj;yeb=b^c6peg38RzwwnT3IgycnP01Wk1;zL&3->9U!iy<6`T9u8de$74>Hk|JmNjmk X(br=ZLVY*G^3Qw%zIpR&<3s-+!t&0O literal 0 HcmV?d00001 diff --git a/collie/models/mistral2/__pycache__/modeltp.cpython-310.pyc b/collie/models/mistral2/__pycache__/modeltp.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7c6a28cecdfc2502d5bbb914f4ff15a9b802990 GIT binary patch literal 52277 zcmd753wRvYeIL3rJNv`}SiC_HBv%wA5)=uNdfKucmMPJeO+m6q+wn^BVu=}m3+w|t zGayA|5zD4rD-R})+dOV#e+6SFvD`FinlDY_=H@!jo5a0MZf??>Ow+V!+M76a>f}1H zBT6#w@Ap45vpWkwlH<1DU0~0iIdk66|NFH1`|}BWe(fE%E&hwEM&eKT(EUl{;jQ?( zr;>?;ohT=4!#3+i*^pneY|3x4oWyUkZZ%TnR3lwZH!|f+BU{cII)1L4lX&??pP4Wc zjY7FUcrzexta`4|SMHOrRDG~fEEgL?<)OxKd04{I^^wMCc~tzF`dDMUJl>cnPsnq& zKH1n+-qn~YPswwxzPmA9o^I?Z?`iBU@0GB8eP5$gE;Vi{-z3j{_5F>T%QrU;ln*p+ zDc{n#wR~&iVEJI$8nx zQSL%Q~6Ee-&KFS@kIHF#=Fb!Zai6j zQo^R{XBubAXB+P+zo+ru@_QTaE5A>|ch|qA@&5Ar#XnsyH=Zg#)p)x6bmN)wGmQ_F zKY;K(_TJ|c<;vR=_CCAxe8Mg{earJ#lI1EwZnF0yWPcEH&c4|`fVd0x!}cxqt-rjFNfVU6m-0=PaQaotC6=836-jCWS#C<5p>23B&gr9U; zl9OwjXA(1ytWT7lxO}#y7S5L*tGS-4)=Q0+?bK(#$fY+oWBJx;g!|dYSG`)RS*`o2 zvumq$$4|YdiRT%^H=E6wq(AhwlaIWw=AA#?TB>>O>6YvI#ghs@k2F2iTD>eGna9pN zLJv}5Ow> z&1o(;C+k($tu59Z6_0^h4nCorv#MHaI<{o(58B@4Ri|?9vgcIX+PX87_D8F>U2(mt z@+x*~0exI+F1h|K+l5xVRz

6~9zfw!f?9Ru)nIc{aXTxxG@oSgqBo=P&{{$6qZe zr|LPXf)=CR=&jo;Cyx7>dTVJ31^T7JM zTbHDB)NWmJ-{Z=5lw&_!Tkufs0EWFV@CQJjP98~Bp`KEzsUG@l5{J&p0K zDo&8rn)kL^)%8bvB0TmEr)vAtPqr@YmmT{(f#0kSh==gr-F1zJAKXEu-azZY9|+y*po7IU%Y|Oi>9BnYYjgoE9w-+D~qo4 zxcMlVv8I|rCA2^9C*3u~^|+Jysan&!<90vOY&F*%)$%i#x7<4XWTX15Z`C}f;Txy? zBxa?bURY}&=}gKW=-FP1u5$rX+(my^&Q(3+o59qrEjX?}f_y6ab?0qEg9$CT4(dCt-Et%h@~e&KShdF*8C^y%6o$2`aN++)(q_|aSM?Z;+kgZ~AYwp-py zd~bDj^|C)$S#rFJURrk5t8zr#BAlWzZWN3u!!%99G^|(EV+dTpH}n=;vhNuVn8ft|T^%YXG-Ab2I5#o2j;O4S(%KC5?ekR5G@K?9KK2^QGVuFQ-&< zOW4$E7poW$ujI8#XYZ0#&PqgS=4P^f%3WK!S<-`=d;yn zViePtoZ8ZP&o5#xt5vblSDaP1R>!H9^exw^FZzAwYqpJDjypM)>#F-Aww_95fd>Ne z<`}Efblwfe#@A(Wd286n8|xGCHq3@qS}+kO{AA(b_awgVkHcvj%X-lmp1GX7W^5Q( zK$eZ(T266bxUe>oZM@w~x3S!oGZTqx#%KcZ63bb{HQJ^;TWxDIhlQ8Nw-1XiWvAMy znz51g3U=DgTrt0=xRLStmj~MEcE--yxvNPd@jkS|!rvA6o7ocy_t8s+fpIeu@5eV@ z0!A!9_eC~F-2>+=R%m9$aaOTt*N0?uN*tb&J~Y>dsuk66E7(zSCd@9ot)_2W`XZ+x zIyKGnqv_2Is(TUOn^?y(P*}OGD%WyywROq28cwz8r<8j^dFnkZ_$*_mZO6mj=qDFe z*Zj2CsuN(bTB}&_mXu@JYmL>Jtl|>!`@=Ei+2Z^A!BuA?+}98Ma`aTI?=qnOMZ1#{S#zEMoBkN3=vAlZze z9!H`V)11TriJVHs&sPW*tkvl+R4P2)^&6RZN=dtddAZ=!>*3s#O1&R%^(>u(ZvkJK zrP;)QQOv3^48fgGv^g9{ShP9hWpm)Bz|8@3h?|4m$GHPAIwjjg8ndF{M@?%1?793% z{K>(Hu+#tda7Y!GIY#7hN)7dcE+$2odB$Ax&6)XizL(5Aor)_K< zID3-tXRs`+%`B#Ru5EE+SvGekMiRR*?cvXJ>a)U};&|?3e#?dBelN8-fYU0=a65sj z=jN7lrT*$cIat(Fczn^&voc$2UaTxC=fWw>*gUow7ZVM$_Qj;1!C_Fr-o&|h5l4h1 zz-)d3$@Lh$89sl4=f-NwtqBn8g#F_B);R#DddjD%P;D0ZAx0nK0bx6f)wMc)375{KWeq4+Mtzvhl`ytW zCvzp5%2M;^5W^jYlQ+_4+DIG6F_Eq0fRVqkzAv1@@p0~Kb>T!pa)BK<>ax|X!y{vx zARK_vNVHAD54LeNxoP1!g{{x>3~X}x$=o2skZEHi7XfeBNo<2@#LL=NJFPdmHnvL} z;~b1~*3RG!w!Y1L_Zu1KEHDr0BpE2#zc9)&MtOFpz*_bxwSj8-`KWVk%y}M>m*9h1DZt6GZ{T5&oMd2O;0w%!_Z@{ z5ln2H>)B(}3yAmK___r+X=BNJKL^r#^^W)wjYyi?7npMs!-NzLSrf=jyxQ z%p~=8Z9J=j$?-NjJ128owlnn|s0KislzYC4O^oNP`j`Z%@5hgCK6?V++q+uL z2K#+VN~^H2Pa{AMkts~NQA7O@o)@@_haYwdn=aFd$uWVfU5U*^o9o)dPTqOKuHH6B zu@AHpSB!RYG%=CD%EpwFFe3`XYHnv_G8xq-`t+gIRbX~*iM^mMm!1f=)lzia5Kd&T z0E@@_YSS*QIG0Nr8FOdz0yjRSmfU+I2=YSd(7d1!vu6e6oS(V3bT-OCr<{#quj~-J zQ%WZ;mi!K}@doH-3COJ%jv0FzHIB}QPBw_TLpI+)CeZ;@v$v^K)I37sYU5vf6?o_ZM&~HEs=Z};Q zw*peu;UlHsbaYwXiJ8ts2-|)zmc_wRRV}SGh=RDFHjpHQl;|R4kFEe=s#FE~fEt~z zfuM#wqvYr&_o=W66?FVFoRG10V+4C*xSYRL;5+9rZ*@*^A#s0274#zKa z&v3s#iqFCEGZ!kdqXA3Qp57B?w2B@5az*cc1d|!+Ad(;^iR6nU4GbU81N-1~L`VyF}X z&{t9}3;=~)1@JkNW3IeTBd+Bj!dANT_%V@s{X`YoM~h=BLm z`|7QQYTbPR9-RV+j&+XY(Eg<=rVMD!g3J?#2CW73mGdoo#~P|1VP&G)sUO{1H}zu( z`wtBN44kvybXNaQnAM9(^;!7+A&u;VaE{|vNX+WT5kAx3LDWDH6#BP%UyuN>d&9A7 z0JJpC`Zk7V)>grMXf69W^n@c`b(Al1plvtxGa*wyACM5L3!a~bQ(0`jXlj7}C-Giw zgK+&8!reRJ0O7)H&l+aRGAxktOg_JCW_ErmX4gly8Ohk(4w3DXEQU}lNVfa}{MrPB zujz7|nAxNv39u%F+zt^JP-+r%B+JWio`a4w1k@}GqOJub8`ObbI#L_ZDkw>J#VAPv z!OpGcvGTeIM2-4tH-Sh!gX*ad(5b+g$*Z5H|7Ym@3pzhVXORxkc6FXkjm|Qi6*wWr zE0AqEn6LRxIck+LFVN{g`2ye*{Qs}`3hrBqNH_T#RgWKwP;NY*4z@sm z`8S!F?)`#Myh$-k)#3Q5Cb%-Iep<6;wu}~cA{j0A1on!=)7A!PfEyNw8`y1DOx4eo z5Bvd$@kfXp2X6QaJR$|)n^+vimE&|RwZSnbVFe+h2??`^r>*~<9Ajq*oZ+7*Ll@|(TK~v4^#N6wG@fjDX!lGRa;2nKRxZATob;)c%3|#vCzA9_2wT86 z_8ASHT<&<`sbw2Dt;X%7W7(FIa?-mJiy*>-Fc0F!^Zn(FopG?Ac+s z=5#)1%3mQ@uqW((i8Fxohzm?}P<%K~ARdr%3iusz`t4zSN1Or131i|{o_8fsE^ySv z0ejRQdp=bj6bfGvvt-gQ2DAU6kR`kRucV_N1FflclpO@HG#VT|+6SY^rAOV;L-l&; z{jIgqoa5LB0qg^+oh?1qvI$iS{0}^!t`xfiDn6ID<_ed4gZtbahg9zXopR6gOi(C+svp=g)4TjYQDES)90ApaJqQ zl5NvYtR>VG`5S`Crn5%rucJ%O~C zCqyMm?wS&00Ehj>n(g#dK+wI40#7&9hDkut_L(18{|d6=EN_~ zU6$=jHm%a3CA4C`E5ZD|rS)M!-{!duNi2q$GqXE`fNvDQI))i60M`0#y^HuYU_bRD zR>(IEk=-4Z{Y~d||9`if9tm?F({jtJv3yE|~9>=5c#5JMg2UQvK}MiZl60gK;J zWrhnSQNTtyJfhuCkDs55hEjc=(SMQIjK+_dCDmG6b)^P=o|t|G2e4Q5a{yVj5R5<6 
z$=5~`>T+e9=1rvy(eW5BR(WYbyv>Tg!UhrxZ$<8Ateb;HbA@OJ~SJ3eKZCW@IQsB0O8 zSJ!v#I90*%6Iz{mj_paFbJrJ$3xyy033Q#r*Db*Ti!=eChdc)otj%lICXp}mnt-{Q zR6T`wLMw$>fOWCvTvETzkk7;Ma}bhhg5KpiK3E0Vtd`bVYwnDtJMVWH?>RdE7o9jv z>8C3KZ_Ke1iFuVeT;k|#oYiqoX^`X;bNMH5C=~kvj^IP1c!Q&5MgG2YYYX!0kmASQ($p?9HkC{ zMb29Bmo?&Vf4i_eYy(uu{er1L&(v=MJ} z0xn`*Gw)%##PVdjzg@&pXFhFi3~dbK2t+yUq4sb)c_oD~^i7bvkFX)T_z81k1o=!M z7E2guk6g154?XDZMp$Bb`U;-l=Ue*wy*-Dmhd)n%>WN{v^~m^ zZ4&^v3E#ox{ks$GT)Vhvtt3BYsNHDG@OE#}vJsv3hZ3K(q=f`>45WFmZR$J$YIJgX z3vwE5W0ZB6q?dS6`q8!t`u?bQ^YVe^TiO6Dy*;%$~9K*tYFKdxI8z1ofEQ(js{;b&~W^oyNlXC!@~Xk05Q%JBre0 z!~VKrTuY+l#OAT~7-~7zPJPU9f5b*x>|IFFHBW&Yk8hcm<2%gD+=&G8`Hk4j978)T zk%gL?15FK*f;?y_^LSlBWKl?V^WDJ$3fT@MCc$JR;2lCifH^3EbP-|T99+C}_KthD zfo}+D&v%X$7K9uDivfn3oxeVMt^}^b`I0?TI{H9~5AcHpZrEA^OL*kT^}@S_&!SPh z_58a)vkv?PIiJF($!cfM3b1`w5> zKS2;yj$I6)!6PN`D7HX>q;u7(b%Jz6jH52oP@m4ES`r8q{RfH2SrFl#!;TA@&&Tm2 zq|z^9^G&QDh&KF^7DIADE^e*WZNiB_vng^^x}G~yItTCGi?Kg_IZ8<=&a&el~z`Ae}%TkY7|u{cASk|Df}4 z=-fi*^K^cn&LW+iXfYbe%B-$xLfXG&T;k&zs`>(b5get37a;WuV40luCGWiY9me<+ z96!Cb3LzXv{Srg|cRC_1lx|eL##$Y6LLg6~z7!MU!3~CdP-dD>c?zut(##pRwdjGP zug)=-DxG^JMF3kVQsdNL(D?<1CqeNL(xCcnJkCt$Q(veD(NV4bfYJYuj?Xmd74+Yd zD-eO7(sCjO1bKuquVq|aKhKk0uNS|dX&}K<7r>{Fd{^jINO6(!hU}eg61=RHiZ0SG zV2U;?!36XB{&IWt$3+a^hd!#22XuAP%R7NxMf{2qz{BM#(GzK%4@oXaPXXr^XR) zih!gw4Mu*!*pIiqY@N?Y)stu1{_O2?NYdh|Djh=H}*3othcy zVetEz3T2-mA$5oY#I>kMtwy(G@q>avhd~LBV8!(09z(0m5%BILRX_K-aczDd7Oe+B>`k45rbrzn{o?Mt^;--+gGGLRN%Es4N**FLGG z*hmt@mZl%DQG%5Kr2$POn(rOe`4TpJ?zZ7J83;B6sj)~dYlh!i(`&6kL>d@8&x}?R zOi~EYtkob{9-J0n8HYP;Ge!##UNf+HoEJ5()C5UNSi-{FUht5S5DQp~)wOf=+Cu4} zCmzKvEp6>>BM)EOfg**RDZw!fx}|W-aSDTHtB((1Mu212^sr64IJ7SZJiCUbO2u$w zPE=01L3{;|ArT4~j8TL1#HWq#2K?@sR}ux1laeH;$SMF@@&pp-$xRf%-`lNDfP)_* z)vZS(+qEnc+-N@47W!*IE}Zb^(AjF;uf)iK%kg- zL+eXYQgM0TibaUt{Tr{ed6PXX*r#Avz%2d|aG(9GHKh4Q5O1tE7sPqKlaD-&(G{X6 z@}5j=jEmIZD7Lo|U~d!A6WAadlkJJ1b<)Q6xU^LKi@mnXn&{hM zdmOmoF7(f>0d6?j-WB172L(4gB)H*0!42srjyMAh<45g@CTSjrH>QAR-L|p2y&E{# zkrfj?_g_%@(QCj%)Gxp{i}p`9hrDBKmpyrfF@8hZcpTv*EL+Pbki)c$-LCdDX46za z^guRU1tZ))#B$$Nm8neHCoWqa*ur0QCmz{~b|y{m8EYv5(3`_|a{+tc0~ z;AMN})W5?frT&P{zo+xZbf)MK9a8@h&idYJXR{4JtpI}C4&PCK!i4{c&Y#k`ht7Y7 z;|~OEkcwu5tp702w}myxT|pA5_xK4hA-e_PB$CIW5wY5WaM@10lp`85!f}k#uK{e4 zqGyYXwS^k9uTfZwB2S=kL%u0S6kl6iQegDxDjX$nS*q90DbN#VMS6N23Va}Z1G3c= za1r>a24XQd#t+SR%MV55QHuT47iTpeem36rqu?)w?6>)uBj6;0{{J29ss94!&+wIx zKLHx2LqT~!xa(4NEdjm$i|jjn@&?k98q7wS4x}c*UIhLHEfBO=j(w4@!l@l7A_QV< z4JwW>rgG|naU&g}gGI3T1@n}quB`_0S&<}P2{@=m51Ukt5HI+wQTDT?_W{r$|AHw? 
z=c)@UlCQ2X;`zobDAa$4M*1V9_QiY0RsR)lX5Os6gdg>nj0MGsRrTNa_|0N&>dVaK zHL*7JSB&xyoww2HU~IdCl@LruH<-$T3Hlibs$=#jH2}Cm$m{A?k=!4k0DFMq5CJ4~ zE~2V_jtV@LdTqr~f6c&Oq9cf)plX7&5swqJ&L08woR~lRwBj_0Q`-JGGmk=-R@VX@ zkPL_mQkfzvbC#J@;rNqV)^dn$5HZv^#QW&`TRMUrmfiq%s079mV)4sN_zIm@>3p5e z4LX$a(J~5v@)G5NfM&l1`BTGuJ*+W6Gy|Fe;LkTbx6B=6fkY$?hC=HwASOePfj&Vv zmC59V0Q0SYlM2xnl2aHMJjR*;_%2DGG%GDU>u#rv` zkS}moVz4H#FQ~|+zYgp(%}`>c1w%pwKhs!R`lV8yiVr=wXDDXg3-X3jVh}yBKrlA^ zjKV|g?IgY~#~4Rg!p*=nHgZk*ZS_Nb##l*iK$6BbL2P~ooL)%S0AvY>@gl7u(i=~w zHvs-Z21FCq3m^{N;&nTj-Ky2~Q+eN@lW7%{lOK-S59@&4-(z8X>`8-$0oJ0m4}!wA7nMCn+G6xl-| z_%;)(%QnCaYD8LFOk{tX3cU{;a!_tq2B0tK2zxur4RtzDON43d@r+;S=x$do#GY3C zLYJ~P-gP~#K-^Qa;47eR=2U?ZDMP4b*!n7Km>Uql!}g%|F!(3w2n4l{zG*l!qZ$I~fhS0*4$02}ZJiLM zE|V~X)XL9ypmPWyXw`T0PM_{8(w*I?JLIX;RDZ0in?n70q}?`nMPwR6_Jmex*H8wD zKpMa}P2X-*?TU#5)WcJ{1aX68Hc5ejyio+@7{Um0`s;f4We`+vJI9j*!2Pfc#!1;!ye}`ep#G`8?%Kl$1nO@>mel~vLg+<96h#ok z3X3M7mi|R^k^00|pu9%x8jLi7>puiPKVsiN?WIp3alsOv=i>u(7!&%rICh~3 zq)K=KPvE@_?9)fFAYp!vr=<4w=_5=YlyG$pVz3nKF~QmdonJ?4VS1H!#m}A!ao7OF zj#kWyUi>iDl+X+02z@hjdgeFh#Gi3~--iOgQL*yIB%swva}t~r(-HnMk|RTeal)ARmFJE) ztT0>p>;gn_`-CY{z$6{;M}s~;1A9Z!xx4;~oN~ddE=Vz8Uj+I$B2(RL&UQGHvvMdy zkE-QST+0UMyTdkr4F-x7_xEC=02toNK+#-w0KGWg0RrQoC8z>^CShRc=kd_BtE;rh zAtj%A_=$(e90}xVUx)I)RZM9N14%I@&NTNGtc7+;jR@%Tp(LKPAXE6H1)1bY7y)?^(8+JYh=7PL z@-9I|K+fB-#=e6$b~a%v9?0UBqJ2f{LP+>KMG4}J$L*S|)v}f8` z?;z40!gpwSMiZTZ$0jxpLomzO7y+?=qu9|3{bathg<3qXDb<}l$G!ioT65;1P6}+DK)xnU zYPx5m+JsG#uI(*OHrCbBIH&aEhlbj$2_WPR?MJwAIJ+Y+}U zH`o&?s>0d@>awBqQ%BDcq%5t%JCC!wS&0{a|9+&<5Dz?l_sw-ykLRFES zggtg<2JESdCAviLzhd+Yd?gg@h;&MVf(MW#IH{opoCHMku<{yx(Io zk=W>=wvN3KYU?*bXzfVV2xU~GBtrA-VTrSEr*UGr-vtk(F954qH_}!J-0(MjeIWOZ zQ9y?Q??nXfd>YsHt$3+acHm z2x|e^#y+eQKXHR*hLf~S#|rePr~zE)vQ(7vLv_iADG2DO)3m3jF=@`|Eowk41dU33 z3`h~kwggkIFdVW7aMK{A8n9*oB_I#TkC;=*({dg{`nH6~ z;Z-5`HoZ(K0jmE{wxdYUr2>xnnZ%1gjP65cLB{Y~P@PjR4tEKAaB=vf+--XIIDgk; zkDfXEx$lTdSAD^q7JpIs_Gmo6{bL6~9BHam}W-dvnONyN+zd#FK<4kB3)tSS=s0MZO4KK=^kK_7*&atO=oP&^S6 zNiX*WVSV&dW&kRAa2FxA1so&=hE2Z%RM{S|2cZNwAQ;IYFp?pu@Ee%S>H#!NJxJ#v zIy();kD@o!yBOm%oVgwa1ufL`Idn(NwilToBzmxm-4DQN!vr0GC;XB;fcXt;%^^T< z8=nABPiZEQH4KtPK+rHCbAojxsYF4UsX(&O>7)1Utk{OJs&7R#1s5sdvz7iHs#$5% zIibCmQ?xUu#r$aWMKc$G-0KE0IWOdKUMu~TC=O*wlop;cA1kOxuXYh+GZa)j& zSD-NJ+ozB%H>{KRy5A4SBcB$V56se01Pkh7%SlCS&{79! zf^azlssc4@p+dz2A7X(Jbh}2HDkuozGp|v}2ip3egeEp$J0{h95R*Fjws{}(%X`GB zK`EuxFRi47tvyZib&s;^DWKgmRKY8Mpy^!FM=-2ixZX@b-H8_IWaJgVJn?yFVP&-i zk#-j`Y!628#Uv!WXNtYxYtQ+< z9Ux06&hP75GrD;e?K8iQP&Z+KA7K;rBadtXV(Y^MYps_7^hd(n1qgFb=ki}0zpr{4 zab;D763@ioV*<}l&G*<4pNHL~lN=e@L&cu$0u9arM~AIqED|k=#f^`r(-M{5A(5A=&ROJ0rx*LIoxHl zNrS$nLr;YS+LHnc&p*?{)kPGGy=lZotQyO?$`Bbf1OYhwl5Hqxk2%*`py3;;)Wd#4RKO7}T+ioN9m9DGV1kij zxj!}kZ9VgRmXsZe>4}<>W6wVmihW>MqRL2%LfQY>{Q7&&qSSLdbAzpcHY3nk8h{Z! 
zd{w;^*c&+-$S|<_tQ1A;1Iuj-B1R}Q?bhn-0#<;s1e3qx0;U}vyPXCpm{o|gM5Y1LT)(EY*q$sRwEuze)B zU7ciIDCdaN0Ox`5^$ah$3=W#w4`fo#BA}nJ{c;szTnNbu)we|7vKUC!CT@v1lIMrC zd8B=4+q8X>_N0CX7gj^4;MmDQEH=QVCz(e$Z){Wy7T7)5M-_a7hL%E2IR!y7+Oy5m zDjCHjpe|3+kDu0N_O%UXi$;}cv0XKAb%D0ll%nzWil2KVw3F-i1#m4*bcC7;Y4jyF z5QS?%dJi1GUt0kWCiKstBVD4BFv1K35=>(dQ)rHvN=eqMmbrc)m{UFDH5YC2c84|cqPg)qLSc_2!EgxPEv_K)_E3SP*+i%kN21y)^bii zg%_V?XOXoA+Kt$s0$(xsE0~Hgs;7bb${LR&S(+-)ZCkRVLN&hyjp|6$#1WaE9q~~u ze!%4V7=rMlB)}dXD%Nm6x(Ff9bKLYVL(er3{1SOukd$jnm|ob-fc*q^A45$@>WT&y zK4P0z&Fd27;vN}RBLMMR=cu|v=w5VkpPGLNQl@8FA*xF9ASYVWeZogCXK}L#HZE*> zI6Gp6ZXX@iy>S>-DoDVSUt>F=6;oD?pQqkObCKK}btgUgu>0UMhj6LDQXSW>pi^#^ z@S_-*Pa^cZnjZ0gmEtr`#!N1+7`9!*iv+=oM)RR$CdkgFD8 z5{QWfUx9`!@Yw~PN05Y`f9m*=(urs0@72(T?tl{H0|Q|yJuSq#B|GHsVdAI6svZ0A zVQq4%ljgV9ZEhPc@jYU9hxJWNSo}V*Gv!$ac z6M|kxBvPP94YNEp)}9=-m~0hPd6wV?pe7iE)%4yjZVX3g{+U>zgzAvIwg{`iEoKcO z2UP?0vW87S?Bfm0H_XPT*a2e1ssqVZ)w8vWvxH--=iFl_-hATt?1?+>y!*tAOg0iN zVx`rZToxMG3s~*ZHU$vLbCxdmbPDIXtP{Kxz+b~O7^K&m?K`FF%=d7XhLVf4Sqq|r zhQ9wk8~SVP58+CnaAaV`Ss+>mlnXSF0 z8ly7Nq0h|sbaH2{hw(~Vx7F}fd}r+e?9hEDNsGZDs7j&vR^Df`4H2i8iW93jz>}ka zjI!MW=^i3^Dx&a+Y6-i56c;M!0BRsEaSoO}SAZd5eq-)3!UFH=>cof@r(?tP9uJqD zh+BGqXdNMx7?efAP;6UsW&L{n6}b;+^gqP0g+2Ou;W zm=a>itRC5C&@>I>z9rU{qa~r%v8Lg`i^xwG4Fa@Vyw!Bv(tI>Ig|6a)rY9(vtOzmu z%T1L>=;saR?FDWDu5(JM%xu?kLTu~{`UFV-8epWt5*_?0H; z1ffE9-C3$GT#mIqOGDxXs8p>S6Q4dUBwg55e(FA3- zXH`Kz9LGBXL2ecK{aA%`9pkx&@L)K>dMdi@!N)R4(3KMCQ?e4ejzPUo^&Ey0$gOj} zbO{7sD*CDK9<1U}mygl0;QGk()rdU0ZLj;JOKpi^fjt0U@U$K|8@u}7f~nLCB&X88*r># z8Qp`N_aHn`!6c%&fiBWo0V@Guf~q~+8I)*G!+^Y=%>4fxCI?81Zve&%H!Hmxbf7U# ztk-KtDEE)U^nV2F3DbiYB|35B>~+~rK}*7^9da4Jsm8<=Rf83+4+ag2&c})icxO=q zq7wq=TF*SguGY&p)6E7q6Pm2w#30B4sgK6TNT7RM>>;7xh)@il3exWKA1cy;I|)K z_Ja6m7`Gq30fY{c$8HZuIQ%f3TX2f_Ee5|s!S8VJJK_u>-LP=wVgHQAa&Z^yh%+jp zCnik!PCO4${qmUb=Ep%A+~pSnQj>Jf`ah9z>5_8O01ZzW4Ks0{!@wU7;R0F()w`#j z2Ztj@>LUc2B~VUnz$x4CP=M%2iE+w}&ivkxvU*a8(-KdpdNEcR38Z9~B{@DHhSQOT zd4X~&<_lsFzhdE+X7E!Um}T_i3;7q^$%UJq5Z>n%Hep(35ezRFzt4IDct3c>c$m8V z2HqEK$9IGbF@%iTkS3^%2|+S5hdbzCFAZ23sAZHiirZ7Y+NzF*Sr9`eo%{uBP zpJ;l?iL}r7_Eb#uFHxkHv)Hs_u@ql1JWO*7PCH`qa)Ql z8jfAsSidP&`gXCmECSA|llXom&zf+`LdzwnjruU_qV4_rDS_voejcFO7Te8R68AnB_Yj|lduErTo7hK1q!#2 zBELJKZX~}R6r#m~iVlAOtcWyNXe0> zR!@65h<1x1Zrj>O+bP&ZNh<~By5$wFn1uw@msG>+-yGomfndQ~n^1j%ub;MXVAlKl zgSu^?4neOD6?^!78pBwQQkfX=I5b$4v}~D zW%S{I^m>pA0`yQWlbmiqc;AH8SP1h&oE{gIHcOKCpa|%XwKK3ZGv<{bcn>uKC~>9Q z{ZJb>oBiHR-C;6U_QQTq3PKer*e5k$@ltLDgSb5`n0_Jy`!Ho!gJ`kpYaY0tdKx&Oe5XI zqQ!IY3NFX>ZbKMCAP6#f)glBjw|@5hpojw55hE{h9~EweYJpK35;vn=K`1@3&~Qf} zJw*sa#KApWW3~PWbJgf*D`1ww9t;?$07hb2aJwO0B^DUwyI$Txf&n%KQ=K?1fo{7SiOLIugbb$H?h2Gj_m>XR zj@!ZcP#-&pD6IkYMW(}SNuZ7yLsW$!9{Zi_#yWQNo};54LGsRJc2RhA;u60 z?RiGC>aa4YKEXiIZy*fwI>uB5CZnRLHnlTAA1#c?xcWH<&>SzraMZUUFMqlcuCzGM zxk8OY-ZZSai(0ztgG@1q8;-l{jmy>L?)5>GF|$X5S2A}xsxa(D^<8w>b9w?vM}9ZM z4l&m}C`$oRNle}N<0N-=#QtE^MLOpaO6jl4x*WLFCYHq#;%pDhF!?jPy$beK5&t3@6hN!7J+0GwHvB zHg3jxDWA*2)ap2Fkq(>IbCj@zf;|9s2;AeYPsu^lPSP*0hYGJ6(FxXI3_eEF#rVE# z37dEzrsVvR(lx`ZfBvk(4xY?;CO?VwuNx(9QrvgB(vAx2pogO2UQ3okepgiDNZ?{Xs!^ zNtk*&M1eH|B-F;*ssp_@r$cnRSMLhzy(b-jBG$HvS&^cu|{JsF}=EIhq zleml^rzCs`a+)P_>KTHbCcDv7Q{9R?oPNwWohSg$T0b7! 
zI9G#4X*p51YXgl?TVyYaR^P(%xR)qy50Ja$PDi01C6;vk!5(ogf5fXf72MHjcb&xQ zAaglJhX|~?%>16EBV|aN+I-xgGsh^zIB^X_a-r_({aNmT7{R~v%cfj6Ay2R5Gte)&VPd*tQSt37X~?5O>gCl)kXTRjo&s=Zc=&-i z=pMl!QKA=#R8cm0o~`jJ2a?%LTm6vs>7|6z>_%WGArdaZy8Oee1o# z`rDXXD-0B8VW2pjHUBQ1GryM3n_o@$nSYxv6#fQ!`m>XUV%`hLHuj--6hH27;cbVG z2LU>*ob{TNJmNl}=w(IQB>_>petQ7Vc`;9%fXJOTPh5bp<57h7gAxb&ky8XsZkU!c zbK_h8?7Qs=X9Nn;lg=1^$3ffMB|cgVoq~^VCxr^QTf*VDr=u8q;M;|ey}|Fk;I|a~ z-sDU-*>Ui()2;PZg*%lklu6^f>P$es~O z;Z4ro++q7o_HE97yqR(=XFqeZk2t$<-PcijRw#@&Ggj`H{pRiB*~gJ4YoF+jiyUuv z4g|3+$t`Q&Vc#ja-4f;26Gw8&+joWWgo=49Hp_ec@DjXJEp<}t-JE{x`aoUZSTw7} z?p}v<_%!B)CLIz9riOo@BnKJzuDdlsJOm1tmiskLAslA44nJQlBu~ibYx#T%Nzz9plSz1ohA^gE|OShbLNK7kBwP`>}Z${7d^s=T~_YW z;d3PX-qpD~6U4YXoWxi3ErEl|`LPTw!yg7oYB~A6(zyud1z|KRXIz7rsC1AC>mMno z3NFB8S~CtJ>8_Ez_NUQA3KoJ|6)~IC6}DTN6t#;R5%UpvXb;=RnoUBLHM*O!)B!!_$=+BV&XzE(XOxUJx6!-rFC|F3vStG(?x(oish>k=u zNOW9IRDfZHIMu&GIqIWy;_a8S=U`ZY!uAzl%QeK(fwgz+1oa`7@&l*^78&QrwsT^P zjS3*2PNs)Z5%m$ISw9p2CB6NyCCQdK!V%=OSN|Zahx*^(3+8~{PF9le~d?p?^3{WN)&w&(3y5d;`OyeuW*h&7}1E zI5gok8!5a@NVbI>NQ|(&$!Fv}1Yf>gTqaR@2t?-u?5GUW(h6*(47T&` zN81C7u)3x5+m(RQE6K@h&qrV_#b}RAV0qL2rq>S}F0he>v#tQe@B)lv=XHfZcz`@f~_?R-0wiJNBV4GnN3AbSjI8h7+`C^xP8x$Rki6BUFZK%Y)X5 z%oCzk7aaF~A+66mb-W|~{Ec+u)6GW@Hbe#|>TYfD=TVW73p*U+G27{JOxro=41Xt2 zrEN}(ZBGisP0VV$_*3w{h|3wd`f$3mmT-ZLP;oWM@8gIH0|;cKcab6gAqx{YQ6QCn zz_Ge4uj4OiFm3ye#w*e|%} zYq*0&w4E8}ReLwiZTFq`T8Rx9!vvoUWI5p%Jk(C&jo4Drzc6_Ql{L-3 zjNZX4W|wJ6qdA0Q9x8E6w~|m#M)8reKB~cw&|K07b|l{vbulo;IWGZ^I@tN-z|;o$JA0Jog1nb=`1|NQ zf*fcOmO3^SD4If%@2_>Hf{;ns0u#!Dm-=u`lna891sflg{J;?i*#Io9`?bCdH83Tk zH_)C1;qM8RJoIZC4yXj%BsfGXb>lZC-0T-c7q0IGb^ z7{(WGKzBz>Q0BQnfTbJOD=@^FjYz3ohv)2T79GP9h~1(pnq4*v4`5ESod7$#2mu1q zOLT;y+DXj@7B;eZ!D>TFiB?g$5yU`$VnbYtRL>VBdttxzll``dN%sP-dc`z{9f50R zz=i|r3XBAUmzaair2>E<=UMVZz^_zQgvwrs7%rj&cLfLw4c9+r&pFiwj-6+9|DR^; zk0IK>VF$By=5zZK>vwLmi-4U5vM_|qWnjCA&KxE3`oNw1$H)nDRHn-xvhaZ*5vCR= zhg- zsDP4FAI>XOKybsa@S#KW;A+5(`J<@z|DHZ>FufbZB(=*ii?rCBzI9Oq zUcbzC3F#>lU5?&8O?FD>TU4vZts1zQqXqKe0&5n2#^Iq+5uTH-kH%~_fjkQ+DCCni zC{QY2(%>_m3&x zq@JQ=(?-^Vl#ZPR>(0is9or}X#Wt45fkYcHA`0IW5oy#N^_hfuP}s7Cy!YF2H}&Sej~b8)rB$me(DKqXY>ku& z1&R13Z+{!DKsz#a`UL~|{uJWh3`8ALx)S3+JBz*mTp({hAG{rky&VqT{&w&dGDA$; z%=7BQ=V5qu^Cpf1OWnwLw?Niy%!4^lSa8eXzGCZp_5p1l#CAOjB)_Xg?cz1lo`7@~ zZhLO0AyqWQT)%DT!kGZSpDyhqBzWUm@bb(=jUsopNV#rJRTesAv< z>8-M-drkKXW0kvOAs4g+q0Jzk?p+D##DfIXRIJ`h-wsN&J^jNllL0~uuRRotnKo;VKD$_c<#w}ZSg#u)bljzXHxcz0~x zNuva93B3>V`NBnM?LOAi{hatqQs+O~l7HrP^Z$eG^4BX1rT>N0;VW@(-UPb0bj>PHxE6-FxpuV3IL=)sY4jez&3etF9eP>_T9M` zm=p!kmXa$mC3=*}{Q0$|C6G681r?;EqTq;0-KoZGX|9Dk5=5O(a-@=iu3)FAU>ojH ze?TaNqM%FzN(rh9NpofpUPT6S!%C;*^~_?cj#Iw0-Y3Z6Vhbj{X6JnC$%p3N(hbd7i~0Q_FSzA%$Rp5FT1U#KX~%-)2APOr1HcQl_ws0^6VMkfC+Wn z^b0zYpVyow=&!HFz#$$S*wHml{YiL)r+y2h1!+J7^9(oNV~t>^FaYr%44;XQ@P2Wa zc)}j1<1h^IVD$b4?qiUL)ZBO+mqR>eB7hqLM@hi0eoE{(_?en(*MOXg#Zp8TI4TM5 zzj^`50OtsEg-4eblqSpm9)NHzG0$`YQW&sOVcwWBaYd}P{@^yi)J|to7g!bX^4E9C zjG7gScmULgY`T9QHe-EAN1~8-@G{n`JRiY51~n4Zxo zgccp#k3xc508HwewWQ(k*1wajC(xEom;~Ei1q^XxeV9v$^g@}&Q0iLn0IyoM<1Roj zsRbH?pRsFlU9@kibGlnOGS^48Y3zDmSL0sHtoL7_jrHZAwd?&W@yCnZkBS|&UR>gu z=}t5N+Dg~6`Wr;sw%;h>@f89Eq51*#Da^%dL7!=0<-Z~Iivif3259fsp9 zwU0tufy{Yp%g zLadPJZ&hpsolhk-2=-?`6PU8`Q_??vKa_xE8p@6Kex3za1h?`nsf6_1=h>t`XO*p0 zmaUo8%%KbRMGsQh16JQ!bt3PWaej_|*~rxIG7qj)tY4XivT>be=zl=Ye@DpK$5Xxt zb8aTg5JT98q@-!cv%%0?GH*^prYsGTQx-1l?%;bz@1Z#%r%$L&{sH31plQ5#(8;TO1E8?yBoMot8*!dQ^A$#tE zXuBXI>W|?QMyT$kltk|xV2-U`z|p4XPr}$5#&9b(^0V9t1sln3i;e7*E=HZBC>gO2 zo`vq0kQ*;k5FLjUMeuOS4YhHG!Dg|!3@Me3G;jp4yXf=M!qUp%iUbJeMR)=P^aRsL ztt5fgCO>9;&v9TQS-~d`5cgOffJuEqXs6&GZ0CSaJkUlw_mo%khF&muV=G}lK(*Cx 
z$Nd`{dBlT07xWXJ{%8VFKg9Cv9EAB;r_n%|4+4YBV=q7w4rlZDHAv*QnQFq$U$uY{ zgz0|_ux1}%%|35ZV9f&aY4-)NCK!I|dvL!ouWZ1T2h0sd7L)3`y{XOJ0>z!eUOa6N z0Q>r1#5ai3?8hVpu#>J7Kh{YRmL0Tr4}R~liwFhtZX#h1{g8!g2hg|T8~G$OPoRA= zP)lqKq9>qxfnFE`M8a$PhNy=D?TMk~o30q6WDt_oS415L+QZOh7`X78zfQKQ9hi)2r zSzI$VWlDpX4nhL&+UOP-P3e;=Pn&AN&8bynqi<8X8|4MihDc?@FMB+9ydHStPrAS} ztMDp?mD8_o3z+V-J#7b=0+vhKIF9=ORzcAOH@o5?wWL~WtLkr2xh;Lct(Sn5`YPV) zDiUN;U!(I<6ei~r!6XwTybhS8t~18VaMlN6SjSO6dHgPW-8$M@JgWW@?;+=q7w}R; zEIdTF)P-BC0Ki0<8}o9y#NA_RvS>j)hdhVruz z>cky807Docn8sO5lzHR}d+p%qigh^O$gBU1_v=Hk1`+y3rUWczW);Ca^$K$_Sb#}~ zkeR;P_h#m}pF$p*(HUB=i%urBKwEw*8blHk>Nl7;!E6ZoDFNs*_NqklGpm6o&00FP z>Sx3Zoe?^`DA!Nx5m%goe%4!so&pDbh!IE?S7Y>z(;?8MEcyy?{1i1D>Plw8y^L9A z7X85tYMrBLW02&+90i)QI4J{w(Ozq;y83`lGt&t=dztf~p1KhpJ~O%PVJ^_xBvb97 zbBNA9I+96W(21PNYCnT-h69jlW!0;QRA4m7P^hJppnzq40WM3s-;^9DJ)YPO;N-{; zF>(Ocoi|9k1hTk9)A^IapeNlymMqKQ0DQm1)%#g=R=&Rg7DpC}2%0|#+W4y`7$Vc4 zl2iQv$_ZBb6$>)ce2+E$N*=dPm?JcqXjyQfV_-f9RffP<7|)t}5fAs|lEVm;4IrOR z36%#6+(q71#`oIzA|l^H;}Om9ACUfuh0;WD3J%!lIDb93rj zFvK&)sTuQ>`j7M;>=C&3hbxdew?n(qvNALw{N0=(KgSvL^K>YQrzNpJPv0-o`87Jf zPKVOj>YvfKi$1Pvf3R}$@wrovzP)nl@h2&%y{(Kkx_FC>HX%lBRrvRjBUrmM;JXk= zYX3VH@)LAOS682*LwH&yv*M8LEXVzmh?gha{p%4X7iw+gR7 zBK$!!rNhBFLtGth<_N`GFEf58+yp#7`m$o1sy>=6$M#temjaSUVCEH+u~4KXG;AB% zs?ip5Yq+>xWE;BeyOs`7bE6vEy&l?@i!396#D69*CwY=m^idx3&(HvjHmyc2GA$ZH zp(MevNCmUC31N}GkJ3Q^1p+}l$shp%E5R@#OnE0)8gB zj%@+h3WEJ-SSTgMk=8g!(G%I{!1S;bhSyN{jOU0)&>)eT8mKVl(S6Y}a}sM+s&$Da zOS#&7)AmJ6DK4*L0(B8uk6ndnLr7g2hK&c4-X^B-T|5ja3VxSm$9! zX+B^L3vI~N;$E~5Bt77Iv>b^3)3HE2p#80IXxIfpgFVDrc6ubNq?8%9JXX=bp#K6^ zYwR->Jh{IBPpBRt=nDvr&u5_|K_e##*dEr_g?Tk$2F6V?TmOI(WS#6XFEY&Imqtx^ zi3mdA>w~WVKiDjIg838ehuM=b28^FL19&$W{1$`Xq2PDe8Kh!Gpx8i#3m_ilVNnQx zWa7-YA1>D&msE_m%I%Vn93d$8t#Ge+J3+ zUoyh`>2%;V9wUo{*YfaCgO9d@3(3gn$BymZS#lpW&?3yAV_gJX6#4Bde3X(q#q-!| z5tRJ}c-HUgkb}41zaVP1jVw2NJ^awm?@4^s{~E8rP6Ff?gn#>Z?IZY&xTX+y$rQkO z?FZgUaogK4@ho{a&;&O&^ovL@loTQ9^8iJHaed(fVCG{1d|2e~f#LHVcx)Sk&6|U~ z&U+bN`JmHYGdWRVhawHGodx+I@DHV|^sdCXJZ-{~#4_tL2+SGzwELk5iEAMl3L#%y zip!-3%McVN*5b*q=n$m>-x%5m5MD+-ejNTm)Y4QRhms@p0|uH?L71gJ(@rlJkvA?@ zbUzjNL8nolfhweqYuW5Qq@s&Fvv~Vk-mr)*hdKW$1(<2g0(F23LFS?6Z6eg17XK|l zTIT#G-e@TD3_BRC>EJaJ>EH(khAG36@`(LqZ#<0Mmb}_|(2p|hA_SrP!0VQ4hDAhr zh+_uYL8?6z6Ox|bb(OdTa}r-%R=GT7_lb~np-PQp*`!POe8VEqGdLFz47sHty z2<*9j|7xpM2MQL*>6@i?Vtbd)zI_WnU?wU0~#sKlA0Am2la z8Bcq`9022hgnS93!d%|ik1t}%y>O!-`ZMmdqySq1jDX2r8P_hwqa99A@a1V*LOp^J zKv#Nf#2TH8bOfvH;F2A@QA|7J*lfZ!T7efiY>ynusIdoj#`PXW;`gwlh{|UZg`q6< z^h1r^Sf_mhxLslNZxnOJLLd-2<0&dEgK6EQ47sPi!a98kj-RF#Uzn(nHb|2MttId# z^SL|{-OJLs!&-w_Uu7H~)Bd=Qqpbo$2QRpM2xOn&{%a+ZSmrdRczLpG2D-CVTvc^Z za64BK{OWnFr9>t3jscGHEwL;F4MRIeBSboLx9k%Flt#xT@gM%hQFM2B=ZkPe~WQiWav$V_BD>5F94C?`Oka3dBluENzb zo)l;Gb|xFrV?`#fs-0T{SUHFLVxfFx{PW?xJ%yoyXw#>Bn^WRKr)#()l4epQ7_=I$7=?LTCLYKK?SD zK8D>z=Omr`;glyrI|66ULNun!jKEWjB6KW4dq`Z7_1?7&N4SNby-)YY1BjJye;AH6 zqA!}A#HKndggb7kMUd#AJYbZJLN=YGf7Hywa!Em~ns9?A4;Ei+zXiOVOp&*m%?$Ss zPiJq*=Z2@Jjt>_SA1pqc_#oUl^IN&)r(Ye;3>On0wARcaIRd^uY+2*xDeWJ%5Oag^ P`DZ?k(1Q7u@#6mn+3oyM literal 0 HcmV?d00001 diff --git a/collie/models/mistral2/configuration_mistraltp.py b/collie/models/mistral2/configuration_mistraltp.py new file mode 100644 index 0000000..ad6691b --- /dev/null +++ b/collie/models/mistral2/configuration_mistraltp.py @@ -0,0 +1,155 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Mistral model configuration""" + +from transformers.configuration_utils import PretrainedConfig +# from transformers.utils import logging +from collie.log.logger import logger + + +# logger = logging.get_logger(__name__) + +MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", + "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json", +} + + +class MistralConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an + Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. + + [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) + [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MistralModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 14336): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to `4096*32`): + The maximum sequence length that this model might ever be used with. Mistral's sliding window attention + allows sequence of up to 4096*32 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention window size. If not specified, will default to `4096`. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + + ```python + >>> from transformers import MistralModel, MistralConfig + + >>> # Initializing a Mistral 7B style configuration + >>> configuration = MistralConfig() + + >>> # Initializing a model from the Mistral 7B style configuration + >>> model = MistralModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mistral" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + max_position_embeddings=4096 * 32, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + sliding_window=4096, + attention_dropout=0.0, + attn_implementation="flash_attention_2", + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + # 调用父类的初始化函数,将一些公共参数传递给父类处理 + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/collie/models/mistral2/model.py b/collie/models/mistral2/model.py new file mode 100644 index 0000000..60d9553 --- /dev/null +++ b/collie/models/mistral2/model.py @@ -0,0 +1,2026 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
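Before the model code, a quick numeric illustration of what the grouped-query-attention defaults documented in `MistralConfig` above imply. This is only a sketch using plain integers copied from the docstring (it does not import the patched modules): it spells out the per-head dimension that `Mistral2Attention.__init__` validates and the key/value replication factor that `repeat_kv()` applies; under tensor parallelism the attention layer further divides the head count by the TP degree at runtime, reading the per-rank count from the column-parallel projection output.

```python
# Head bookkeeping implied by the MistralConfig defaults above
# (hidden_size=4096, num_attention_heads=32, num_key_value_heads=8).
hidden_size = 4096
num_attention_heads = 32      # query heads
num_key_value_heads = 8       # shared key/value heads (GQA)

head_dim = hidden_size // num_attention_heads                       # 128
num_key_value_groups = num_attention_heads // num_key_value_heads   # 4

# Mistral2Attention.__init__ raises a ValueError if this does not hold:
assert head_dim * num_attention_heads == hidden_size

# Each key/value head is shared by 4 query heads; repeat_kv() expands the
# KV tensors by this factor so the eager-attention matmul sees 32 heads
# on both the query and the key/value side.
print(head_dim, num_key_value_groups)  # -> 128 4
```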
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Mistral model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel, dtype_byte_size +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_mistraltp import Mistral2Config + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "Mistral2Config" + +#modified for collie +import torch.distributed as dist +import gc +import json +import os +from collections import OrderedDict +from megatron.core import parallel_state, tensor_parallel +from einops import rearrange +from deepspeed.pipe import LayerSpec, TiedLayerSpec + +from collie.config import CollieConfig +from collie.driver.io import IODriver +from collie.log.logger import logger +from collie.module import ( + ColumnParallelLinearWithoutBias, + ColumnParallelLMHead, + RowParallelLinearWithoutBias, +) +from collie.utils import concat_tensor, dict_as_params, env, progress +from collie.models.base import CollieModelForCausalLM +from collie.models.utils import ( + kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, + kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, +) + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral +class Mistral2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + MistralRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, 
hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + ans = self.weight * hidden_states.to(input_dtype) + # -------------------------------------------------------- + # # 将Tensor转换为列表 + # ans_list = ans.tolist() + # # 指定.json文件的路径 + # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/rms_ans.json' + + # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + # try: + # with open(file_path, 'r', encoding='utf-8') as file: + # results_list = json.load(file) + # except FileNotFoundError: + # results_list = [] + # # 将当前结果添加到列表中 + # results_list.append(ans_list) + # # 将更新后的列表写回.json文件 + # with open(file_path, 'w', encoding='utf-8') as file: + # json.dump(results_list, file, ensure_ascii=False, indent=4) + # file.write('\n') # 在文件末尾添加一个换行符 + # -------------------------------------------------------- + return ans + + +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class Mistral2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +# TODO @Arthur no longer copied from LLama after static cache +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. 
For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Mistral2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + self.up_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.gate_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.down_proj = RowParallelLinearWithoutBias( + self.intermediate_size, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Mistral2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.q_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.k_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.v_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.o_proj = RowParallelLinearWithoutBias( + self.num_heads * self.head_dim, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.rotary_emb = Mistral2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads_tp, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads_tp, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads_tp, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads_tp, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # 
-------------------------------------------------------- + # 将Tensor转换为列表 + ans_list = attn_output.tolist() + # 指定.json文件的路径 + file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/attn_output_1.json' + + # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + try: + with open(file_path, 'r', encoding='utf-8') as file: + results_list = json.load(file) + except FileNotFoundError: + results_list = [] + # 将当前结果添加到列表中 + results_list.append(ans_list) + # 将更新后的列表写回.json文件 + with open(file_path, 'w', encoding='utf-8') as file: + json.dump(results_list, file, ensure_ascii=False, indent=4) + file.write('\n\n\n') # 在文件末尾添加一个换行符 + # -------------------------------------------------------- + + + + return attn_output, attn_weights, past_key_value + + +class Mistral2FlashAttention2(Mistral2Attention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # -------------------------------------------------------- + # 将Tensor转换为列表 + ans_list = attn_output.tolist() + # 指定.json文件的路径 + file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/attn_output_1.json' + + # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + try: + with open(file_path, 'r', encoding='utf-8') as file: + results_list = json.load(file) + except FileNotFoundError: + results_list = [] + # 将当前结果添加到列表中 + results_list.append(ans_list) + # 将更新后的列表写回.json文件 + with open(file_path, 'w', encoding='utf-8') as file: + json.dump(results_list, file, ensure_ascii=False, indent=4) + file.write('\n\n\n') # 在文件末尾添加一个换行符 + # -------------------------------------------------------- + + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
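# In other words (see the alignment note in __init__ above): flash_attn<2.1 builds a
# top-left-aligned causal mask, but with a KV cache the query sits at the bottom-right
# of the (q_len, kv_len) score matrix. The two alignments only agree when q_len == kv_len
# or q_len == 1; for the single-token decode step a bottom-right causal mask masks
# nothing anyway, so `causal` is simply disabled there instead of risking the wrong mask.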
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
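# With left padding the query_length most recent tokens sit in the rightmost columns
# of the (batch_size, kv_seq_len) mask, so keeping only the last query_length columns
# hands unpad_input() exactly the mask entries that belong to the current query tokens.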
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class Mistral2SdpaAttention(Mistral2Attention): + """ + Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MistralAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, 
value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.tp_size)) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +MISTRAL_ATTENTION_CLASSES = { + "eager": Mistral2Attention, + "flash_attention_2": Mistral2FlashAttention2, + "sdpa": Mistral2SdpaAttention, +} + + +class MistralDecoderLayer(nn.Module): + def __init__(self, config: CollieConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + + self.mlp = Mistral2MLP(config) + self.input_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + # -------------------------------------------------------- + # # 将Tensor转换为列表 + # ans_list = [tensor.tolist() for tensor in outputs] + # # 指定.json文件的路径 + # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/decoder_outputs.json' + + # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + # try: + # with open(file_path, 'r', encoding='utf-8') as file: + # results_list = json.load(file) + # except FileNotFoundError: + # results_list = [] + # # 将当前结果添加到列表中 + # results_list.append(ans_list) + # # 将更新后的列表写回.json文件 + # with open(file_path, 'w', encoding='utf-8') as file: + # json.dump(results_list, file, ensure_ascii=False, indent=4) + # file.write('\n') # 在文件末尾添加一个换行符 + # -------------------------------------------------------- + + return outputs + + +MISTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MistralConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class Mistral2PreTrainedModel(PreTrainedModel): + config_class = Mistral2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
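+
+    As a rough illustration of the expected shapes (hypothetical values): for `input_ids` of
+    shape `(2, 7)` with a matching `attention_mask`, the model returns `last_hidden_state` of
+    shape `(2, 7, config.hidden_size)`, and with `use_cache=True` it additionally returns one
+    cached key/value pair per decoder layer in the `Cache`/legacy format described above.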
+""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class Mistral2Model(nn.Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + def __init__(self, config: CollieConfig): + # super().__init__(config) + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + # self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + + # -------------------------------------------------------- + # # 将Tensor转换为列表 + # ans_list = inputs_embeds.tolist() + # # 指定.json文件的路径 + # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/inputs_embeds.json' + + # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + # try: + # with open(file_path, 'r', encoding='utf-8') as file: + # results_list = json.load(file) + # except FileNotFoundError: + # results_list = [] + # # 将当前结果添加到列表中 + # results_list.append(ans_list) + # # 将更新后的列表写回.json文件 + # with open(file_path, 'w', encoding='utf-8') as file: + # json.dump(results_list, file, ensure_ascii=False, indent=4) + # file.write('\n') # 在文件末尾添加一个换行符 + # # -------------------------------------------------------- + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] 
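+            # layer_outputs is a tuple: (hidden_states,), optionally followed by the
+            # attention weights when output_attentions=True and by the updated cache
+            # when use_cache=True, hence the 1/2 index switch below.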
+ + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Mistral2ForCausalLM(CollieModelForCausalLM): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config:CollieConfig): + super().__init__(config) + self.model = Mistral2Model(config) + self.vocab_size = config.vocab_size + # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.lm_head = ColumnParallelLinearWithoutBias( + self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False + ) + # Initialize weights and apply final processing + # self.post_init() + # GenerationMixin 需要的额外参数 + self.config.is_decoder = True + if config.model_config.tie_word_embeddings: + self.lm_head.weight = self.embed_tokens.weight + self.main_input_name = "input_ids" + + def clean_cache(self): + self._clean_hidden_states([*self.model.layers, self.lm_head]) + self._set_use_cache(self.model.layers, False) + + def set_cache(self, use_cache): + self._set_use_cache(self.model.layers, use_cache) + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" 
+ >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
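+            # Worked example (illustrative numbers): with max_cache_length = 8, a
+            # cache_length of 8 and one new token, attention_mask has 9 columns and is
+            # cropped below to its last 8 columns so that it stays aligned with the
+            # rolling cache window.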
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. + :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + output = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + else: + output = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + + return [("model", Mistral2Model.pipeline_layers(config)), ("lm_head", output)] + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + protocol: str = "file", # 指定加载state_dict时使用的协议 + **kwargs, + ): + """ + Load state_dict from ``path``. + The format of pretrained model should be the same as that of + `huggingface`. + :return: state_dict. Note that the state_dict should be processed + properly to match the current rank. 
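+
+        A minimal usage sketch (illustrative only; the path and parallel sizes are
+        placeholders)::
+
+            config = CollieConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
+            config.tp_size = 2
+            state_dict = Mistral2ForCausalLM.load_parallel_state_dict(
+                path="/path/to/hf/checkpoint", config=config, protocol="file"
+            )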
+ """ + # 配置加载 + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + # IO驱动初始化 + io_driver = IODriver.from_protocol(protocol) + # 检查文件路径是否存在 + if not io_driver.exists(path): + raise FileNotFoundError(f"folder {path} not found.") + # 初始化存储和处理变量 + state_dict = OrderedDict() + weights = [] + parts = None # 变量用于存储模型分割的部分信息 + # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 + hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 + rank_order = range(dist.get_world_size()) + else: + # 不开启只进行一次循环 + rank_order = range(1) + # 权重文件加载和处理 + for rank in rank_order: + # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 + if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: + # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 + if env.is_pipeline: + # 保存的是 json 格式 + parts = env.pipeline_parts + if hasattr(config, "num_key_value_heads"): + # llama2 (transformers >= 4.31.0) + num_key_value_heads = config.num_key_value_heads + else: + num_key_value_heads = config.num_attention_heads + head_dim = config.hidden_size // config.num_attention_heads + # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 + if ( + io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) + and "COLLIE_PP_PARTS" in os.environ.keys() + ): + weight_map = json.loads( + io_driver.load( + os.path.join(path, "pytorch_model.bin.index.json"), mode="r" + ) + )["weight_map"] + # layers 表示自己需要的层 + layers = env.pipeline_layers_idx + # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) + weights.extend( + [ + value + for key, value in weight_map.items() + if len(key.split(".")) > 2 + and key.split(".")[2].isdigit() + and (int(key.split(".")[2]) + 1) in layers + ] + ) + # 去重 + weights = list(set(weights)) + # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm + if 0 in layers: + weights.append(weight_map["model.tok_embeddings.weight"]) + if max(parts) - 1 in layers: + weights.append(weight_map["output.weight"]) + if max(parts) - 2 in layers: + weights.append(weight_map["model.norm.weight"]) + else: + # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 + weights = [ + weight + for weight in io_driver.list(path) + if weight.endswith(".bin") + ] + with progress( + weights, + desc="Loading state dict", + total=len(weights), + disable=hide_progress, + ) as pbar: + for weight in pbar: + part_state_dict = io_driver.load( + os.path.join(path, weight), mode="rb" + ) + # for key in list(part_state_dict.keys()): + # if "attention.wqkv.weight" in key: + # # qkv_weights = part_state_dict.pop(key) + # qkv_weights = part_state_dict[key] + # print(qkv_weights.shape) + # (wq, wk, wv) = qkv_weights.split( + # [ + # config.hidden_size, + # config.num_key_value_heads * head_dim, + # config.num_key_value_heads * head_dim, + # ], + # dim=0, + # ) + # wq_name = key.replace("wqkv", "wq") + # wk_name = key.replace("wqkv", "wk") + # wv_name = key.replace("wqkv", "wv") + # part_state_dict[wq_name] = wq + # part_state_dict[wk_name] = wk + # part_state_dict[wv_name] = wv + state_dict.update(part_state_dict) + del part_state_dict + if parts is not None: + # 这一步是 pp 的复筛 + layers = env.pipeline_layers_idx + for key in list(state_dict.keys()): + if key.startswith("layers"): + layer = int(key.split(".")[1]) + if layer + 1 not in layers: + state_dict.pop(key) + # if key.endswith("tok_embeddings.weight"): + if 
key.endswith("embed_tokens.weight"): + if 0 not in layers: + state_dict.pop(key) + if key == "norm.weight": + if max(parts) - 2 not in layers: + state_dict.pop(key) + # if key.endswith("output.weight"): + if key.endswith("lm_head.weight"): + if max(parts) - 1 not in layers: + state_dict.pop(key) + # 根据用户配置的新的 tp size 进行分割 + for key in list(state_dict.keys()): + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + "embed_tokens.weight", + ] + col_split = any([key.endswith(filter) for filter in col_filter]) + + if col_split: + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + if dist.is_initialized() and process_exclusion: + # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 + dist.barrier() + return state_dict + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + protocol: str = "file", + ): + """ + Save state_dict to ``path``. + The format of saved state dict should be the same as that of + `huggingface`. + """ + io_driver = IODriver.from_protocol(protocol) + # gather to tp rank 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 pp_size 次循环 + rank_order = range(config.pp_size) + else: + # 不开启只进行一次循环 + rank_order = range(1) + dst = parallel_state.get_tensor_model_parallel_src_rank() + with progress( + rank_order, + desc="Saving model", + disable=int(os.environ.get("RANK", "0")) != 0, + ) as pbar: + for rank in pbar: + if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): + for key in sorted(list(state_dict.keys())): + tensor_list = None + if env.tp_rank == 0: + tensor_list = [ + torch.zeros_like(state_dict[key]) + .to(state_dict[key].dtype) + .cuda() + for _ in range(config.tp_size) + ] + dist.gather( + state_dict[key].cuda(), + dst=dst, + gather_list=tensor_list, + group=env.tp_group, + ) + if env.tp_rank == 0: + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + "embed_tokens.weight", + ] + col_split = any( + [key.endswith(filter) for filter in col_filter] + ) + + if col_split: + state_dict[key] = concat_tensor(tensor_list, dim=0) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + state_dict[key] = concat_tensor(tensor_list, dim=1) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + # 似乎不需要? 
+ # state_dict_keys = state_dict.keys() + # for layer_id in range(config.num_layers): + # qkv_names = [None, None, None] + # for key in state_dict_keys: + # if f"layers.{layer_id}.attention.wq.weight" in key: + # qkv_names[0] = key + # elif f"layers.{layer_id}.attention.wk.weight" in key: + # qkv_names[1] = key + # elif f"layers.{layer_id}.attention.wv.weight" in key: + # qkv_names[2] = key + # qkv_name = qkv_names[0].replace("wq", "wqkv") + # state_dict[qkv_name] = torch.cat( + # [ + # state_dict.pop(qkv_names[0]), + # state_dict.pop(qkv_names[1]), + # state_dict.pop(qkv_names[2]), + # ], + # dim=0 + # ) + + if env.tp_rank == 0: + # Save gathered weights + if env.is_pipeline: + ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" + total_size = 0 + weight_map = {} + for name, weight in state_dict.items(): + weight_size = weight.numel() * dtype_byte_size( + weight.dtype + ) + weight_map[name] = ckpt_name + total_size += weight_size + index_dict = dict( + total_size=total_size, weight_map=weight_map + ) + index_dicts = [None for _ in range(env.pp_size)] + dist.gather_object( + index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group + ) + if env.pp_rank == 0: + total_size = 0 + weight_map = {} + for _index_dict in index_dicts: + total_size += _index_dict["total_size"] + weight_map.update(_index_dict["weight_map"]) + merged_dict = { + "metadata": {"total_size": total_size}, + "weight_map": weight_map, + } + io_driver.save( + json.dumps(merged_dict, indent=2, sort_keys=True) + + "\n", + os.path.join(path, "pytorch_model.bin.index.json"), + ) + + else: + ckpt_name = f"pytorch_model.bin" + ckpt_path = os.path.join(path, ckpt_name) + io_driver.save(state_dict, ckpt_path) + if dist.is_initialized() and process_exclusion: + dist.barrier() + if env.rank == 0: + config.save_pretrained(path, protocol=protocol) + dist.barrier() + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
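+
+    As a concrete (hypothetical) illustration: with `pad_token_id = 2` and
+    `input_ids = [[5, 8, 2, 2]]`, the first pad token sits at index 2, so the hidden state at
+    index 1 (the last non-padding token) is the one passed to the score head.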
+ """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForSequenceClassification(Mistral2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Mistral2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + 
loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/collie/models/mistral2/modelpp.py b/collie/models/mistral2/modelpp.py new file mode 100644 index 0000000..1180a10 --- /dev/null +++ b/collie/models/mistral2/modelpp.py @@ -0,0 +1,1922 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Mistral model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel, dtype_byte_size +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_mistraltp import Mistral2Config + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "Mistral2Config" + +#modified for collie +import torch.distributed as dist +import gc +import json +import os +from collections import OrderedDict +from megatron.core import parallel_state, tensor_parallel +from einops import rearrange +from deepspeed.pipe import LayerSpec, TiedLayerSpec + +from collie.config import CollieConfig +from collie.driver.io import IODriver +from collie.log.logger import logger +from collie.module import ( + 
ColumnParallelLinearWithoutBias, + ColumnParallelLMHead, + RowParallelLinearWithoutBias, +) +from collie.utils import concat_tensor, dict_as_params, env, progress +from collie.models.base import CollieModelForCausalLM +from collie.models.utils import ( + kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, + kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, +) + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral +class Mistral2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + MistralRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class Mistral2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
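+        # For intuition (small hypothetical example): with dim = 8 and base = 10000,
+        # inv_freq holds the four frequencies
+        # [1.0, 10000 ** -0.25, 10000 ** -0.5, 10000 ** -0.75]; the cache built below
+        # stores cos/sin of position * inv_freq (duplicated along the last axis) for
+        # every position up to max_position_embeddings.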
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +# TODO @Arthur no longer copied from LLama after static cache +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
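+
+    Worked sketch (illustrative): for head_dim = 4, a query row q = [q0, q1, q2, q3] becomes
+    q * cos + rotate_half(q) * sin with rotate_half(q) = [-q2, -q3, q0, q1], i.e. the pairs
+    (q0, q2) and (q1, q3) are each rotated by their position-dependent angle.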
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Mistral2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + self.up_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.gate_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.down_proj = RowParallelLinearWithoutBias( + self.intermediate_size, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Mistral2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.q_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.k_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.v_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.o_proj = RowParallelLinearWithoutBias( + self.num_heads * self.head_dim, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.rotary_emb = Mistral2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads_tp, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads_tp, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads_tp, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads_tp, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class Mistral2FlashAttention2(Mistral2Attention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
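For intuition, here is a minimal, self-contained sketch (an illustrative example, not part of the patch) of how a right-padded attention mask becomes the cumulative sequence lengths consumed by the varlen flash-attention kernels; it mirrors the `_get_unpad_data` helper that `_upad_input` calls below.

import torch
import torch.nn.functional as F

# Two sequences of true lengths 3 and 2, right-padded to length 4.
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)            # tensor([3, 2])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
print(indices)     # tensor([0, 1, 2, 4, 5]) -> flat positions of the non-pad tokens
print(cu_seqlens)  # tensor([0, 3, 5], dtype=torch.int32) -> packed-sequence boundaries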
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class Mistral2SdpaAttention(Mistral2Attention): + """ + Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MistralAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
+ is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.tp_size)) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +MISTRAL_ATTENTION_CLASSES = { + "eager": Mistral2Attention, + "flash_attention_2": Mistral2FlashAttention2, + "sdpa": Mistral2SdpaAttention, +} + + +class MistralDecoderLayer(nn.Module): + def __init__(self, config: CollieConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + + self.mlp = Mistral2MLP(config) + self.input_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +MISTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. 
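Because the attention classes above infer their tensor-parallel split at runtime from the sharded q/k/v projection outputs, a small bookkeeping sketch may help (Mistral-7B-like sizes with a hypothetical `tp_size`; no distributed setup needed): each rank owns `num_heads / tp_size` query heads and feeds `hidden_size / tp_size` features into the row-parallel `o_proj`.

hidden_size = 4096
num_attention_heads = 32
num_key_value_heads = 8                                  # grouped-query attention
head_dim = hidden_size // num_attention_heads            # 128
tp_size = 4                                              # hypothetical tensor-parallel world size

# Column-parallel q/k/v projections shard the head dimension across ranks.
num_heads_tp = num_attention_heads // tp_size            # 8 query heads per rank
num_kv_heads_tp = num_key_value_heads // tp_size         # 2 kv heads per rank
num_key_value_groups = num_heads_tp // num_kv_heads_tp   # 4, same ratio as the global one

# The attention output each rank hands to the row-parallel o_proj.
local_out_features = num_heads_tp * head_dim             # 1024 == hidden_size // tp_size
assert local_out_features == hidden_size // tp_size
print(num_heads_tp, num_kv_heads_tp, num_key_value_groups, local_out_features)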
+ + Parameters: + config ([`MistralConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class Mistral2PreTrainedModel(PreTrainedModel): + config_class = Mistral2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. 
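A short round-trip sketch (illustrative only, with toy shapes) of the two cache formats described above, using the same `DynamicCache` helpers that `Mistral2Model.forward` relies on:

import torch
from transformers.cache_utils import DynamicCache

num_layers, batch, heads, seq, head_dim = 2, 1, 4, 3, 8
# Legacy format: one (key, value) tuple per layer.
legacy = tuple(
    (torch.zeros(batch, heads, seq, head_dim), torch.zeros(batch, heads, seq, head_dim))
    for _ in range(num_layers)
)
cache = DynamicCache.from_legacy_cache(legacy)   # -> Cache object
print(cache.get_seq_length())                    # 3 tokens already cached
round_trip = cache.to_legacy_cache()             # back to the tuple-of-tuples format
print(len(round_trip), round_trip[0][0].shape)   # 2 torch.Size([1, 4, 3, 8])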
+ + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class Mistral2Model(nn.Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + def __init__(self, config: CollieConfig): + # super().__init__(config) + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + # self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if 
input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
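For reference, a hand-rolled sketch of the additive 4D causal mask (illustrative only; the model itself builds it via the `_prepare_4d_causal_attention_mask*` helpers from `transformers.modeling_attn_mask_utils`):

import torch

batch, seq = 1, 4
min_value = torch.finfo(torch.float32).min
# Future positions get a large negative bias, everything else stays 0.
causal = torch.triu(torch.full((seq, seq), min_value), diagonal=1)
mask_4d = causal[None, None, :, :].expand(batch, 1, seq, seq)
print(mask_4d[0, 0])
# tensor([[ 0.0000e+00, -3.4028e+38, -3.4028e+38, -3.4028e+38],
#         [ 0.0000e+00,  0.0000e+00, -3.4028e+38, -3.4028e+38],
#         [ 0.0000e+00,  0.0000e+00,  0.0000e+00, -3.4028e+38],
#         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]])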
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Mistral2ForCausalLM(CollieModelForCausalLM): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config:CollieConfig): + super().__init__(config) + self.model = Mistral2Model(config) + self.vocab_size = config.vocab_size + # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.lm_head = ColumnParallelLinearWithoutBias( + self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False + ) + # Initialize weights and apply final processing + # self.post_init() + # GenerationMixin 需要的额外参数 + self.config.is_decoder = True + if config.model_config.tie_word_embeddings: + self.lm_head.weight = self.embed_tokens.weight + self.main_input_name = "input_ids" + + def clean_cache(self): + self._clean_hidden_states([*self.model.layers, self.lm_head]) + self._set_use_cache(self.model.layers, False) + + def set_cache(self, use_cache): + self._set_use_cache(self.model.layers, use_cache) + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = 
None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = 
past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. + :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + output = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + else: + output = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + + return [("model", Mistral2Model.pipeline_layers(config)), ("lm_head", output)] + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + protocol: str = "file", # 指定加载state_dict时使用的协议 + **kwargs, + ): + """ + Load state_dict from ``path``. + The format of pretrained model should be the same as that of + `huggingface`. + :return: state_dict. Note that the state_dict should be processed + properly to match the current rank. 
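To make the tensor-parallel splitting rule used below concrete, a standalone sketch (toy shapes, plain tensors instead of a real checkpoint): column-parallel weights such as `q_proj`/`gate_proj` are sharded along dim 0 of the `nn.Linear` weight, while row-parallel weights such as `o_proj`/`down_proj` are sharded along dim 1.

import torch

tp_size, tp_rank = 2, 0
hidden_size, intermediate_size = 8, 32

# Column-parallel weight (e.g. gate_proj): split on dim 0.
gate_proj = torch.randn(intermediate_size, hidden_size)        # full weight: (32, 8)
gate_shard = torch.chunk(gate_proj, tp_size, dim=0)[tp_rank]   # this rank keeps (16, 8)

# Row-parallel weight (e.g. down_proj): split on dim 1.
down_proj = torch.randn(hidden_size, intermediate_size)        # full weight: (8, 32)
down_shard = torch.chunk(down_proj, tp_size, dim=1)[tp_rank]   # this rank keeps (8, 16)

print(gate_shard.shape, down_shard.shape)  # torch.Size([16, 8]) torch.Size([8, 16])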
+ """ + # 配置加载 + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + # IO驱动初始化 + io_driver = IODriver.from_protocol(protocol) + # 检查文件路径是否存在 + if not io_driver.exists(path): + raise FileNotFoundError(f"folder {path} not found.") + # 初始化存储和处理变量 + state_dict = OrderedDict() + weights = [] + parts = None # 变量用于存储模型分割的部分信息 + # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 + hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 + rank_order = range(dist.get_world_size()) + else: + # 不开启只进行一次循环 + rank_order = range(1) + # 权重文件加载和处理 + for rank in rank_order: + # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 + if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: + # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 + if env.is_pipeline: + # 保存的是 json 格式 + parts = env.pipeline_parts + if hasattr(config, "num_key_value_heads"): + # llama2 (transformers >= 4.31.0) + num_key_value_heads = config.num_key_value_heads + else: + num_key_value_heads = config.num_attention_heads + head_dim = config.hidden_size // config.num_attention_heads + # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 + if ( + io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) + and "COLLIE_PP_PARTS" in os.environ.keys() + ): + weight_map = json.loads( + io_driver.load( + os.path.join(path, "pytorch_model.bin.index.json"), mode="r" + ) + )["weight_map"] + # layers 表示自己需要的层 + layers = env.pipeline_layers_idx + # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) + weights.extend( + [ + value + for key, value in weight_map.items() + if len(key.split(".")) > 2 + and key.split(".")[2].isdigit() + and (int(key.split(".")[2]) + 1) in layers + ] + ) + # 去重 + weights = list(set(weights)) + # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm + if 0 in layers: + weights.append(weight_map["model.tok_embeddings.weight"]) + if max(parts) - 1 in layers: + weights.append(weight_map["output.weight"]) + if max(parts) - 2 in layers: + weights.append(weight_map["model.norm.weight"]) + else: + # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 + weights = [ + weight + for weight in io_driver.list(path) + if weight.endswith(".bin") + ] + with progress( + weights, + desc="Loading state dict", + total=len(weights), + disable=hide_progress, + ) as pbar: + for weight in pbar: + part_state_dict = io_driver.load( + os.path.join(path, weight), mode="rb" + ) + # for key in list(part_state_dict.keys()): + # if "attention.wqkv.weight" in key: + # # qkv_weights = part_state_dict.pop(key) + # qkv_weights = part_state_dict[key] + # print(qkv_weights.shape) + # (wq, wk, wv) = qkv_weights.split( + # [ + # config.hidden_size, + # config.num_key_value_heads * head_dim, + # config.num_key_value_heads * head_dim, + # ], + # dim=0, + # ) + # wq_name = key.replace("wqkv", "wq") + # wk_name = key.replace("wqkv", "wk") + # wv_name = key.replace("wqkv", "wv") + # part_state_dict[wq_name] = wq + # part_state_dict[wk_name] = wk + # part_state_dict[wv_name] = wv + state_dict.update(part_state_dict) + del part_state_dict + if parts is not None: + # 这一步是 pp 的复筛 + layers = env.pipeline_layers_idx + for key in list(state_dict.keys()): + if key.startswith("layers"): + layer = int(key.split(".")[1]) + if layer + 1 not in layers: + state_dict.pop(key) + # if key.endswith("tok_embeddings.weight"): + if 
key.endswith("embed_tokens.weight"): + if 0 not in layers: + state_dict.pop(key) + if key == "norm.weight": + if max(parts) - 2 not in layers: + state_dict.pop(key) + # if key.endswith("output.weight"): + if key.endswith("lm_head.weight"): + if max(parts) - 1 not in layers: + state_dict.pop(key) + # 根据用户配置的新的 tp size 进行分割 + for key in list(state_dict.keys()): + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + "embed_tokens.weight", + ] + col_split = any([key.endswith(filter) for filter in col_filter]) + + if col_split: + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + if dist.is_initialized() and process_exclusion: + # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 + dist.barrier() + return state_dict + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + protocol: str = "file", + ): + """ + Save state_dict to ``path``. + The format of saved state dict should be the same as that of + `huggingface`. + """ + io_driver = IODriver.from_protocol(protocol) + # gather to tp rank 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 pp_size 次循环 + rank_order = range(config.pp_size) + else: + # 不开启只进行一次循环 + rank_order = range(1) + dst = parallel_state.get_tensor_model_parallel_src_rank() + with progress( + rank_order, + desc="Saving model", + disable=int(os.environ.get("RANK", "0")) != 0, + ) as pbar: + for rank in pbar: + if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): + for key in sorted(list(state_dict.keys())): + tensor_list = None + if env.tp_rank == 0: + tensor_list = [ + torch.zeros_like(state_dict[key]) + .to(state_dict[key].dtype) + .cuda() + for _ in range(config.tp_size) + ] + dist.gather( + state_dict[key].cuda(), + dst=dst, + gather_list=tensor_list, + group=env.tp_group, + ) + if env.tp_rank == 0: + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + "embed_tokens.weight", + ] + col_split = any( + [key.endswith(filter) for filter in col_filter] + ) + + if col_split: + state_dict[key] = concat_tensor(tensor_list, dim=0) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + state_dict[key] = concat_tensor(tensor_list, dim=1) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + # 似乎不需要? 
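The sharded-checkpoint index written below follows the Hugging Face `pytorch_model.bin.index.json` layout; a minimal sketch (toy tensors and a hypothetical shard name) of how `total_size` and `weight_map` are assembled:

import json
import torch
from transformers.modeling_utils import dtype_byte_size

# Toy stand-in for one pipeline stage's gathered weights.
state_dict = {
    "model.layers.0.self_attn.q_proj.weight": torch.zeros(8, 8, dtype=torch.float16),
    "model.layers.0.mlp.gate_proj.weight": torch.zeros(16, 8, dtype=torch.float16),
}
ckpt_name = "pytorch_model-00001-of-00002.bin"
total_size, weight_map = 0, {}
for name, weight in state_dict.items():
    weight_map[name] = ckpt_name
    total_size += weight.numel() * dtype_byte_size(weight.dtype)
index = {"metadata": {"total_size": total_size}, "weight_map": weight_map}
print(json.dumps(index, indent=2, sort_keys=True))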
+ # state_dict_keys = state_dict.keys() + # for layer_id in range(config.num_layers): + # qkv_names = [None, None, None] + # for key in state_dict_keys: + # if f"layers.{layer_id}.attention.wq.weight" in key: + # qkv_names[0] = key + # elif f"layers.{layer_id}.attention.wk.weight" in key: + # qkv_names[1] = key + # elif f"layers.{layer_id}.attention.wv.weight" in key: + # qkv_names[2] = key + # qkv_name = qkv_names[0].replace("wq", "wqkv") + # state_dict[qkv_name] = torch.cat( + # [ + # state_dict.pop(qkv_names[0]), + # state_dict.pop(qkv_names[1]), + # state_dict.pop(qkv_names[2]), + # ], + # dim=0 + # ) + + if env.tp_rank == 0: + # Save gathered weights + if env.is_pipeline: + ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" + total_size = 0 + weight_map = {} + for name, weight in state_dict.items(): + weight_size = weight.numel() * dtype_byte_size( + weight.dtype + ) + weight_map[name] = ckpt_name + total_size += weight_size + index_dict = dict( + total_size=total_size, weight_map=weight_map + ) + index_dicts = [None for _ in range(env.pp_size)] + dist.gather_object( + index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group + ) + if env.pp_rank == 0: + total_size = 0 + weight_map = {} + for _index_dict in index_dicts: + total_size += _index_dict["total_size"] + weight_map.update(_index_dict["weight_map"]) + merged_dict = { + "metadata": {"total_size": total_size}, + "weight_map": weight_map, + } + io_driver.save( + json.dumps(merged_dict, indent=2, sort_keys=True) + + "\n", + os.path.join(path, "pytorch_model.bin.index.json"), + ) + + else: + ckpt_name = f"pytorch_model.bin" + ckpt_path = os.path.join(path, ckpt_name) + io_driver.save(state_dict, ckpt_path) + if dist.is_initialized() and process_exclusion: + dist.barrier() + if env.rank == 0: + config.save_pretrained(path, protocol=protocol) + dist.barrier() + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
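A quick numeric sketch (toy `pad_token_id` and inputs) of the ONNX-friendly trick used below to locate the last non-padding token of each row:

import torch

pad_token_id = 0
input_ids = torch.tensor([[5, 6, 7, 0, 0],    # 3 real tokens -> last index 2
                          [8, 9, 1, 2, 3]])   # no padding    -> last index 4 (via the modulo)
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]
print(sequence_lengths)  # tensor([2, 4])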
+ """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForSequenceClassification(Mistral2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Mistral2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + 
loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/collie/models/mistral2/modeltp.py b/collie/models/mistral2/modeltp.py new file mode 100644 index 0000000..e91037f --- /dev/null +++ b/collie/models/mistral2/modeltp.py @@ -0,0 +1,2254 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Mistral model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel, dtype_byte_size +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_mistraltp import MistralConfig + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MistralConfig" + +#modified for collie +import torch.distributed as dist +import gc +import json +import os +from collections import OrderedDict +from megatron.core import parallel_state, tensor_parallel +from einops import rearrange +from deepspeed.pipe import LayerSpec, TiedLayerSpec + +from collie.config import CollieConfig +from collie.driver.io import IODriver +from collie.log.logger import logger +from collie.module import ( + 
ColumnParallelLinearWithoutBias, + ColumnParallelLMHead, + RowParallelLinearWithoutBias, +) +from collie.utils import concat_tensor, dict_as_params, env, progress +from collie.models.base import CollieModelForCausalLM +from collie.models.utils import ( + kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, + kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, +) + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral +class MistralRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + MistralRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + ans = self.weight * hidden_states.to(input_dtype) + + # # 打印层标准化的输出 + hidden_states_output = ans.detach().cpu().tolist() + data_to_save = {"Layer Norm Output": hidden_states_output} + # 将输出写入 JSON 文件 + with open('a_rms_output.json', 'w') as f: + json.dump(data_to_save, f, indent=4) + + return self.weight * hidden_states.to(input_dtype) + + +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class MistralRotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
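+        # Pre-compute the cos/sin tables for every position up to `max_position_embeddings`;
+        # `forward` rebuilds the cache lazily if a longer sequence is ever seen.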
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +# TODO @Arthur no longer copied from LLama after static cache +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
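+
+    Example (illustrative shape check only, using the `MistralRotaryEmbedding` defined above with arbitrary sizes):
+
+        >>> rotary = MistralRotaryEmbedding(dim=128, max_position_embeddings=4096)
+        >>> q = torch.randn(1, 32, 16, 128)   # [batch, heads, seq_len, head_dim]
+        >>> k = torch.randn(1, 8, 16, 128)    # [batch, kv_heads, seq_len, head_dim]
+        >>> cos, sin = rotary(k, seq_len=16)  # each of shape [seq_len, head_dim]
+        >>> position_ids = torch.arange(16).unsqueeze(0)
+        >>> q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
+        >>> q_rot.shape, k_rot.shape
+        (torch.Size([1, 32, 16, 128]), torch.Size([1, 8, 16, 128]))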
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class MistralMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + self.up_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.gate_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.down_proj = RowParallelLinearWithoutBias( + self.intermediate_size, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + # 打印MLP层输出 + mlp_output = output.detach().cpu().tolist() + data_to_save = {"MLP Output": mlp_output} + # 将输出写入 JSON 文件 + with open('a_mlp_output.json', 'w') as f: + json.dump(data_to_save, f, indent=4) + + return output + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class MistralAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.q_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.k_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.v_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + # aaaa + self.o_proj = RowParallelLinearWithoutBias( + self.num_heads * self.head_dim, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.rotary_emb = MistralRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, # 输入维度 [bsz, q_len, hidden_size] + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) # [bsz, q_len, num_heads * head_dim] + key_states = self.k_proj(hidden_states) # [bsz, q_len, num_key_value_heads * head_dim] + value_states = self.v_proj(hidden_states) # [bsz, q_len, num_key_value_heads * head_dim] + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_heads, head_dim] + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_key_value_heads, head_dim] + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_key_value_heads, head_dim] + ) + + query_states = query_states.transpose(1, 2) # [bsz, num_heads, q_len, head_dim] + key_states = key_states.transpose(1, 2) # [bsz, num_key_value_heads, q_len, head_dim] + value_states = value_states.transpose(1, 2) # [bsz, num_key_value_heads, q_len, head_dim] + + # 打印注意力模块的输出 + # 准备数据以写入 JSON 文件 + attention_outputs = { + "Query states": query_states.detach().cpu().tolist(), + "Key states": key_states.detach().cpu().tolist(), + "Value states": value_states.detach().cpu().tolist() + } + # 将数据写入 JSON 文件 + with open("a_attention_outputs.json", "w") as f: + json.dump(attention_outputs, f, indent=4) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)) + + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # 打印注意力模块的输出 + attention_result = { + "Output weights:": attn_output.detach().cpu().tolist(), + # "Attention weights:": attn_weights.detach().cpu().tolist(), + } + # 将数据写入 JSON 文件 + with open("a_attention_outputs.json", "w") as f: + json.dump(attention_result, f, indent=4) + + return attn_output, attn_weights, past_key_value + + +class MistralFlashAttention2(MistralAttention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
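+        # The flag is consulted in `_flash_attention_forward` below: with flash-attn < 2.1 the causal flag is
+        # dropped for single-token (q_len == 1) decoding, where a top-left aligned mask would be wrong.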
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + # 打印注意力模块的输出 + # 准备数据以写入 JSON 文件 + attention_outputs = { + "Query states": query_states.detach().cpu().tolist(), + "Key states": key_states.detach().cpu().tolist(), + "Value states": value_states.detach().cpu().tolist() + } + # 将数据写入 JSON 文件 + with open("a_flash_attention_outputs.json", "w") as f: + json.dump(attention_outputs, f, indent=4) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # 打印注意力模块的输出 + attention_result = { + "Output weights:": attn_output.detach().cpu().tolist(), + # "Attention weights:": attn_weights.detach().cpu().tolist(), + } + # 将数据写入 JSON 文件 + with open("a_flash_attention_outputs.json", "w") as f: + json.dump(attention_result, f, indent=4) + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
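+            # Disabling `causal` for q_len == 1 is equivalent to full attention over the cached keys, so
+            # correctness is preserved while avoiding the mis-aligned top-left mask of flash-attn < 2.1.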
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
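+            # General case (1 < query_length < kv_seq_len): keep only the mask entries for the most recent
+            # `query_length` positions before unpadding the query.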
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class MistralSdpaAttention(MistralAttention): + """ + Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MistralAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + # 打印注意力模块的输出 + # 准备数据以写入 JSON 文件 + attention_outputs = { + "Query states": query_states.detach().cpu().tolist(), + "Key states": key_states.detach().cpu().tolist(), + "Value states": value_states.detach().cpu().tolist() + } + # 将数据写入 JSON 文件 + with open("a_sdpa_attention_outputs.json", "w") as f: + json.dump(attention_outputs, f, indent=4) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, 
key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.config.tp_size)) + + attn_output = self.o_proj(attn_output) + + # 打印注意力模块的输出 + attention_result = { + "Output weights:": attn_output.detach().cpu().tolist(), + # "Attention weights:": attn_weights.detach().cpu().tolist(), + } + # 将数据写入 JSON 文件 + with open("a_sdpa_attention_outputs.json", "w") as f: + json.dump(attention_result, f, indent=4) + + return attn_output, None, past_key_value + + +MISTRAL_ATTENTION_CLASSES = { + "eager": MistralAttention, + "flash_attention_2": MistralFlashAttention2, + "sdpa": MistralSdpaAttention, +} + + +class MistralDecoderLayer(nn.Module): + def __init__(self, config: CollieConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + config._attn_implementation = "sdpa" + self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.config = config + self.mlp = MistralMLP(config) + self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.idx = layer_idx + # 务必保持变量名一致 + self.use_cache = self.config.model_config.use_cache + self.hidden_states = None + self.output_attentions = False + +class MistralDecoderLayer(nn.Module): + def __init__(self, config: CollieConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + config._attn_implementation = "sdpa" + self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.config = config + self.mlp = MistralMLP(config) + self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.idx = layer_idx + # 务必保持变量名一致 + self.use_cache = self.config.model_config.use_cache + self.hidden_states = None + self.output_attentions = False + + def _forward( + self, + hidden_states: torch.Tensor, + attention_mask: 
Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + # output_attentions: Optional[bool] = False, + # use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + # if "padding_mask" in kwargs: + # warnings.warn( + # "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + # ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + # output_attentions=output_attentions, + # use_cache=use_cache, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + # outputs = (hidden_states,) + + # if output_attentions: + # outputs += (self_attn_weights,) + + # if use_cache: + # outputs += (present_key_value,) + + return hidden_states, present_key_value + + def forward(self, inputs: dict): + layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) + + if self.config.checkpointing and self.training: + hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( + self._forward, + inputs["hidden_states"], + inputs.get("attention_mask", None), + inputs.get("position_ids", None), + layer_past, # inputs.get("past_key_values", None), + ) + else: + hidden_states, new_layer_past = self._forward( + inputs["hidden_states"], + inputs.get("attention_mask", None), + inputs.get("position_ids", None), + layer_past + ) # **inputs + inputs["hidden_states"] = hidden_states + + inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) + return inputs + + + # def _forward( + # self, + # hidden_states: torch.Tensor, + # attention_mask: Optional[torch.Tensor] = None, + # position_ids: Optional[torch.LongTensor] = None, + # past_key_value: Optional[Tuple[torch.Tensor]] = None, + # # output_attentions: Optional[bool] = False, + # # use_cache: Optional[bool] = False, + # **kwargs, + # ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + # # if "padding_mask" in kwargs: + # # warnings.warn( + # # "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + # # ) + # """ + # Args: + # hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + # attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + # `(batch, sequence_length)` where padding elements are indicated by 0. + # output_attentions (`bool`, *optional*): + # Whether or not to return the attentions tensors of all attention layers. See `attentions` under + # returned tensors for more detail. + # use_cache (`bool`, *optional*): + # If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + # (see `past_key_values`). + # past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + # """ + + # residual = hidden_states + + # hidden_states = self.input_layernorm(hidden_states) + + # # Self Attention + # hidden_states, self_attn_weights, present_key_value = self.self_attn( + # hidden_states=hidden_states, + # attention_mask=attention_mask, + # position_ids=position_ids, + # past_key_value=past_key_value, + # # output_attentions=output_attentions, + # # use_cache=use_cache, + # **kwargs, + # ) + # hidden_states = residual + hidden_states + + # # Fully Connected + # residual = hidden_states + # hidden_states = self.post_attention_layernorm(hidden_states) + # hidden_states = self.mlp(hidden_states) + # hidden_states = residual + hidden_states + + # # outputs = (hidden_states,) + + # # if output_attentions: + # # outputs += (self_attn_weights,) + + # # if use_cache: + # # outputs += (present_key_value,) + + # return hidden_states, present_key_value + + # def forward(self, inputs: dict): + # layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) + + # if self.config.checkpointing and self.training: + # hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( + # self._forward, + # inputs["hidden_states"], + # inputs.get("attention_mask", None), + # inputs.get("position_ids", None), + # layer_past, # inputs.get("past_key_values", None), + # ) + # else: + # hidden_states, new_layer_past = self._forward( + # inputs["hidden_states"], + # inputs.get("attention_mask", None), + # inputs.get("position_ids", None), + # layer_past + # ) # **inputs + # inputs["hidden_states"] = hidden_states + + # inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) + # return inputs + + # def forward( + # self, + # hidden_states: torch.Tensor, + # attention_mask: Optional[torch.Tensor] = None, + # position_ids: Optional[torch.LongTensor] = None, + # past_key_value: Optional[Tuple[torch.Tensor]] = None, + # output_attentions: Optional[bool] = False, + # use_cache: Optional[bool] = False, + # **kwargs, + # ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + # if "padding_mask" in kwargs: + # warnings.warn( + # "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + # ) + # """ + # Args: + # hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + # attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + # `(batch, sequence_length)` where padding elements are indicated by 0. + # output_attentions (`bool`, *optional*): + # Whether or not to return the attentions tensors of all attention layers. See `attentions` under + # returned tensors for more detail. 
+ # use_cache (`bool`, *optional*): + # If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + # (see `past_key_values`). + # past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + # """ + + # residual = hidden_states + + # hidden_states = self.input_layernorm(hidden_states) + + # # Self Attention + # hidden_states, self_attn_weights, present_key_value = self.self_attn( + # hidden_states=hidden_states, + # attention_mask=attention_mask, + # position_ids=position_ids, + # past_key_value=past_key_value, + # output_attentions=output_attentions, + # use_cache=use_cache, + # **kwargs, + # ) + # hidden_states = residual + hidden_states + + # # Fully Connected + # residual = hidden_states + # hidden_states = self.post_attention_layernorm(hidden_states) + # hidden_states = self.mlp(hidden_states) + # hidden_states = residual + hidden_states + + # outputs = (hidden_states,) + + # if output_attentions: + # outputs += (self_attn_weights,) + + # if use_cache: + # outputs += (present_key_value,) + + # return outputs + + +MISTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MistralConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class MistralPreTrainedModel(PreTrainedModel): + config_class = MistralConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class MistralModel(nn.Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + def __init__(self, config: CollieConfig): + # super().__init__(config) + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + # aaaa + # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.embed_tokens = tensor_parallel.VocabParallelEmbedding( + config.vocab_size, config.hidden_size, params_dtype=torch.float32 + ) + self.layers = nn.ModuleList( + [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + config._attn_implementation = "sdpa" + self._attn_implementation = config._attn_implementation + self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + # Initialize weights and apply final processing + # self.post_init() + + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + # aaaa + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + # 打印嵌入层输出 + embeddings_output = inputs_embeds.detach().cpu().tolist() + data_to_save = {"Embeddings Output": embeddings_output} + # 将输出写入 JSON 文件 + with open('a_embeddings_output.json', 'w') as f: + json.dump(data_to_save, f, indent=4) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
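+            # Returns either a 4D additive mask of shape (batch, 1, q_len, kv_len) or None, in which case
+            # scaled_dot_product_attention falls back to its built-in causal masking.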
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + inputs = { + "input_ids": input_ids, + "hidden_states": hidden_states, + "attention_mask": attention_mask, + "position_ids": position_ids, + "past_key_values": past_key_values, + "output_attentions": output_attentions, + "use_cache": use_cache, + } + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + # for decoder_layer in self.layers: + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + # all_hidden_states += (hidden_states,) + all_hidden_states += (inputs["hidden_states"],) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + # hidden_states, + # attention_mask, + # position_ids, + # past_key_values, + # output_attentions, + # use_cache, + inputs, + ) + else: + layer_outputs = decoder_layer( + # hidden_states, + # attention_mask=attention_mask, + # position_ids=position_ids, + # past_key_value=past_key_values, + # output_attentions=output_attentions, + # use_cache=use_cache, + inputs, + ) + inputs.update(layer_outputs) + + # hidden_states = layer_outputs[0] + hidden_states = inputs["hidden_states"] + + if use_cache: + # next_decoder_cache = layer_outputs[2 if output_attentions else 1] + next_decoder_cache = inputs["addition_info"][1 if output_attentions else 0] + + if output_attentions: + # all_self_attns += (layer_outputs[1],) + all_self_attns += (inputs["addition_info"][0],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + # past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + past_key_values=past_key_values, + ) + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. 
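+        The returned ``LayerSpec``/``TiedLayerSpec`` entries are lazy building blocks for the pipeline engine;
+        when ``tie_word_embeddings`` is set, ``embed_tokens`` is registered as a tied layer so its weight can be
+        shared with the LM head stage.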
+ :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + embed_tokens = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="input_ids", output_keys="hidden_states"), + tensor_parallel.VocabParallelEmbedding, + config.vocab_size, + config.hidden_size, + ) + else: + embed_tokens = LayerSpec( + dict_as_params(input_keys="input_ids", output_keys="hidden_states"), + tensor_parallel.VocabParallelEmbedding, + config.vocab_size, + config.hidden_size, + ) + + layers = [ + LayerSpec(MistralDecoderLayer, config, i) for i in range(config.num_hidden_layers) + ] + norm = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="hidden_states"), + MistralRMSNorm, + hidden_size=config.hidden_size, + eps=config.rms_norm_eps, + ) + + return [ + ("embed_tokens", embed_tokens), + ("layers", layers), + ("norm", norm), + ] + +class MistralForCausalLM(CollieModelForCausalLM): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config:CollieConfig): + super().__init__(config) + self.model = MistralModel(config) + self.vocab_size = config.vocab_size + # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.lm_head = ColumnParallelLinearWithoutBias( + self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False + ) + # Initialize weights and apply final processing + # self.post_init() + # GenerationMixin 需要的额外参数 + self.config.is_decoder = True + if config.model_config.tie_word_embeddings: + self.lm_head.weight = self.embed_tokens.weight + self.main_input_name = "input_ids" + + def clean_cache(self): + self._clean_hidden_states([*self.model.layers, self.lm_head]) + self._set_use_cache(self.model.layers, False) + + def set_cache(self, use_cache): + self._set_use_cache(self.model.layers, use_cache) + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. 
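+            # (Worked example with hypothetical shapes: if past_length == 10 and input_ids has 12
+            #  columns, case 2 keeps input_ids[:, 10:], i.e. the 2 newest tokens; if the attention_mask
+            #  instead has 13 columns, case 1 keeps the last 13 - 10 = 3 tokens.)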
+ + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. + :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + output = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + else: + output = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + + return [("model", MistralModel.pipeline_layers(config)), ("lm_head", output)] + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + protocol: str = "file", # 指定加载state_dict时使用的协议 + **kwargs, + ): + """ + Load state_dict from ``path``. + The format of pretrained model should be the same as that of + `huggingface`. + :return: state_dict. Note that the state_dict should be processed + properly to match the current rank. 
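+
+        Example (a minimal sketch; assumes a HuggingFace-format Mistral checkpoint saved
+        locally under ``/path/to/mistral-7b``)::
+
+            config = CollieConfig.from_pretrained("/path/to/mistral-7b")
+            state_dict = MistralForCausalLM.load_parallel_state_dict(
+                path="/path/to/mistral-7b", config=config
+            )
+            # the returned tensors are already sliced to match this process's
+            # tensor-parallel / pipeline-parallel rank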
+ """ + # 配置加载 + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + # IO驱动初始化 + io_driver = IODriver.from_protocol(protocol) + # 检查文件路径是否存在 + if not io_driver.exists(path): + raise FileNotFoundError(f"folder {path} not found.") + # 初始化存储和处理变量 + state_dict = OrderedDict() + weights = [] + parts = None # 变量用于存储模型分割的部分信息 + # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 + hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 + rank_order = range(dist.get_world_size()) + else: + # 不开启只进行一次循环 + rank_order = range(1) + # 权重文件加载和处理 + for rank in rank_order: + # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 + if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: + # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 + if env.is_pipeline: + # 保存的是 json 格式 + parts = env.pipeline_parts + if hasattr(config, "num_key_value_heads"): + # llama2 (transformers >= 4.31.0) + num_key_value_heads = config.num_key_value_heads + else: + num_key_value_heads = config.num_attention_heads + head_dim = config.hidden_size // config.num_attention_heads + # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 + if ( + io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) + and "COLLIE_PP_PARTS" in os.environ.keys() + ): + weight_map = json.loads( + io_driver.load( + os.path.join(path, "pytorch_model.bin.index.json"), mode="r" + ) + )["weight_map"] + # layers 表示自己需要的层 + layers = env.pipeline_layers_idx + # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) + weights.extend( + [ + value + for key, value in weight_map.items() + if len(key.split(".")) > 2 + and key.split(".")[2].isdigit() + and (int(key.split(".")[2]) + 1) in layers + ] + ) + # 去重 + weights = list(set(weights)) + # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm + if 0 in layers: + weights.append(weight_map["model.embed_tokens.weight"]) + if max(parts) - 1 in layers: + weights.append(weight_map["lm_head.weight"]) + if max(parts) - 2 in layers: + weights.append(weight_map["model.norm.weight"]) + else: + # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 + weights = [ + weight + for weight in io_driver.list(path) + if weight.endswith(".bin") + ] + with progress( + weights, + desc="Loading state dict", + total=len(weights), + disable=hide_progress, + ) as pbar: + for weight in pbar: + part_state_dict = io_driver.load( + os.path.join(path, weight), mode="rb" + ) + # for key in list(part_state_dict.keys()): + # if "attention.wqkv.weight" in key: + # # qkv_weights = part_state_dict.pop(key) + # qkv_weights = part_state_dict[key] + # print(qkv_weights.shape) + # (wq, wk, wv) = qkv_weights.split( + # [ + # config.hidden_size, + # config.num_key_value_heads * head_dim, + # config.num_key_value_heads * head_dim, + # ], + # dim=0, + # ) + # wq_name = key.replace("wqkv", "wq") + # wk_name = key.replace("wqkv", "wk") + # wv_name = key.replace("wqkv", "wv") + # part_state_dict[wq_name] = wq + # part_state_dict[wk_name] = wk + # part_state_dict[wv_name] = wv + state_dict.update(part_state_dict) + del part_state_dict + if parts is not None: + # 这一步是 pp 的复筛 + layers = env.pipeline_layers_idx + for key in list(state_dict.keys()): + if key.startswith("layers"): + layer = int(key.split(".")[1]) + if layer + 1 not in layers: + state_dict.pop(key) + # if key.endswith("tok_embeddings.weight"): + if 
key.endswith("embed_tokens.weight"): + if 0 not in layers: + state_dict.pop(key) + if key == "norm.weight": + if max(parts) - 2 not in layers: + state_dict.pop(key) + # if key.endswith("output.weight"): + if key.endswith("lm_head.weight"): + if max(parts) - 1 not in layers: + state_dict.pop(key) + # 根据用户配置的新的 tp size 进行分割 + for key in list(state_dict.keys()): + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + #"o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + #"down_proj.weight", + "embed_tokens.weight", + ] + col_split = any([key.endswith(filter) for filter in col_filter]) + + if col_split: + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + if dist.is_initialized() and process_exclusion: + # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 + dist.barrier() + return state_dict + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + protocol: str = "file", + ): + """ + Save state_dict to ``path``. + The format of saved state dict should be the same as that of + `huggingface`. + """ + io_driver = IODriver.from_protocol(protocol) + # gather to tp rank 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 pp_size 次循环 + rank_order = range(config.pp_size) + else: + # 不开启只进行一次循环 + rank_order = range(1) + dst = parallel_state.get_tensor_model_parallel_src_rank() + with progress( + rank_order, + desc="Saving model", + disable=int(os.environ.get("RANK", "0")) != 0, + ) as pbar: + for rank in pbar: + if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): + for key in sorted(list(state_dict.keys())): + tensor_list = None + if env.tp_rank == 0: + tensor_list = [ + torch.zeros_like(state_dict[key]) + .to(state_dict[key].dtype) + .cuda() + for _ in range(config.tp_size) + ] + dist.gather( + state_dict[key].cuda(), + dst=dst, + gather_list=tensor_list, + group=env.tp_group, + ) + if env.tp_rank == 0: + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + #"o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + #"down_proj.weight", + "embed_tokens.weight", + ] + col_split = any( + [key.endswith(filter) for filter in col_filter] + ) + + if col_split: + state_dict[key] = concat_tensor(tensor_list, dim=0) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + state_dict[key] = concat_tensor(tensor_list, dim=1) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + # 似乎不需要? 
+ # state_dict_keys = state_dict.keys() + # for layer_id in range(config.num_layers): + # qkv_names = [None, None, None] + # for key in state_dict_keys: + # if f"layers.{layer_id}.attention.wq.weight" in key: + # qkv_names[0] = key + # elif f"layers.{layer_id}.attention.wk.weight" in key: + # qkv_names[1] = key + # elif f"layers.{layer_id}.attention.wv.weight" in key: + # qkv_names[2] = key + # qkv_name = qkv_names[0].replace("wq", "wqkv") + # state_dict[qkv_name] = torch.cat( + # [ + # state_dict.pop(qkv_names[0]), + # state_dict.pop(qkv_names[1]), + # state_dict.pop(qkv_names[2]), + # ], + # dim=0 + # ) + + if env.tp_rank == 0: + # Save gathered weights + if env.is_pipeline: + ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" + total_size = 0 + weight_map = {} + for name, weight in state_dict.items(): + weight_size = weight.numel() * dtype_byte_size( + weight.dtype + ) + weight_map[name] = ckpt_name + total_size += weight_size + index_dict = dict( + total_size=total_size, weight_map=weight_map + ) + index_dicts = [None for _ in range(env.pp_size)] + dist.gather_object( + index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group + ) + if env.pp_rank == 0: + total_size = 0 + weight_map = {} + for _index_dict in index_dicts: + total_size += _index_dict["total_size"] + weight_map.update(_index_dict["weight_map"]) + merged_dict = { + "metadata": {"total_size": total_size}, + "weight_map": weight_map, + } + io_driver.save( + json.dumps(merged_dict, indent=2, sort_keys=True) + + "\n", + os.path.join(path, "pytorch_model.bin.index.json"), + ) + + else: + ckpt_name = f"pytorch_model.bin" + ckpt_path = os.path.join(path, ckpt_name) + io_driver.save(state_dict, ckpt_path) + if dist.is_initialized() and process_exclusion: + dist.barrier() + if env.rank == 0: + config.save_pretrained(path, protocol=protocol) + dist.barrier() + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
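+
+    (Illustrative sketch: with `pad_token_id = 0` and an input row `[5, 7, 9, 0, 0]`, the last
+    non-padding position is index 2, so the logits at position 2 are the ones pooled for that row.)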
+ """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForSequenceClassification(MistralPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = MistralModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + 
loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) From f61b34e08080f43d5e11e1deec1c913f154f5dca Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:06:52 +0800 Subject: [PATCH 03/16] Add Raw Model --- collie/models/mistral/__init__.py | 82 + .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 1210 bytes .../configuration_mistral.cpython-310.pyc | Bin 0 -> 6270 bytes .../modeling_mistral.cpython-310.pyc | Bin 0 -> 41165 bytes .../models/mistral/configuration_mistral.py | 152 ++ .../mistral/convert_mistral_weights_to_hf.py | 276 +++ .../models/mistral/modeling_flax_mistral.py | 741 +++++++++ collie/models/mistral/modeling_mistral.py | 1473 +++++++++++++++++ 8 files changed, 2724 insertions(+) create mode 100644 collie/models/mistral/__init__.py create mode 100644 collie/models/mistral/__pycache__/__init__.cpython-310.pyc create mode 100644 collie/models/mistral/__pycache__/configuration_mistral.cpython-310.pyc create mode 100644 collie/models/mistral/__pycache__/modeling_mistral.cpython-310.pyc create mode 100644 collie/models/mistral/configuration_mistral.py create mode 100644 collie/models/mistral/convert_mistral_weights_to_hf.py create mode 100644 collie/models/mistral/modeling_flax_mistral.py create mode 100644 collie/models/mistral/modeling_mistral.py diff --git a/collie/models/mistral/__init__.py b/collie/models/mistral/__init__.py new file mode 100644 index 0000000..c5fa66e --- /dev/null +++ b/collie/models/mistral/__init__.py @@ -0,0 +1,82 @@ +# Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_flax_available, is_torch_available + + +_import_structure = { + "configuration_mistral": ["MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP", "MistralConfig"], +} + + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_mistral"] = [ + "MistralForCausalLM", + "MistralModel", + "MistralPreTrainedModel", + "MistralForSequenceClassification", + ] + +try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_flax_mistral"] = [ + "FlaxMistralForCausalLM", + "FlaxMistralModel", + "FlaxMistralPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_mistral import MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP, MistralConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_mistral import ( + MistralForCausalLM, + MistralForSequenceClassification, + MistralModel, + MistralPreTrainedModel, + ) + + try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_flax_mistral import ( + FlaxMistralForCausalLM, + FlaxMistralModel, + FlaxMistralPreTrainedModel, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/collie/models/mistral/__pycache__/__init__.cpython-310.pyc b/collie/models/mistral/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0eeae894122b29f98b82399a2066890e840baa6a GIT binary patch literal 1210 zcmZuw%Wm306dgaX&BMGBQqo1*O&4Aiy6Xm2MFbL5ASj7i;mydw1DR^ZCNrifSte3< zRsW&8{*t#{r-38r5B$rrZ=YTtncRuluQ~iWS$qv5;{G_zR@nfm`Jh2+}%0+QLzQ z{ER?gUV|y{jh_lKeX{`sKPC8YxA@I1zLoIP4S*E(!_W9xV$rmlrWrS*NDaB>XiH?majGA9OOuP$%;wSwK<9#+Msd zWgG?*Je@O7N<)fy#5@we>UYo1Pn$hBJZ+0-chEkBt&>5gdjy-Ot>f;;HtaWtvE5(t zTZxFUSY830khQ!y_eifF=T?|7hDf}(%7zS`GY<#IU*a#HI?vG6Tx<@th{rje;Bn$D zE>bCggXyD7M~nA5VvA3m>otoT_)`D*2C}_&Wv!e{j4n5IFU~Y@R3;BO6Q>_II+K3~ zX3LP3At%Bhwji8ff*=$xEi0tN_#vO6v9li=(PAbXGgT1dvKVc^Cn2MV@!CAXgfmUb z=rW{aFeI-8F`b1hg5q7w$I+Z2X3FB)2+Pa#KVDc_778*HWhjXdYka|_$H1e)V{A@I zc;S%*k}f5*G|hhjb|Bki_%o_AM8gQZz6>daDQX!|i&6c*N_;(8Oaot$MAt z`i?_FFxoCdhyxr!Seq?ktFc`0ot!WqijcFbs-^1>YRXUyMSsYtx}rYhG}Zh!*Uh4$ Rs+(AKgJX!Es%BX))xXwfO11z1 literal 0 HcmV?d00001 diff --git a/collie/models/mistral/__pycache__/configuration_mistral.cpython-310.pyc b/collie/models/mistral/__pycache__/configuration_mistral.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0731f2e2d91d302df8b2d492eb9e55ec9352b625 GIT binary patch literal 6270 zcmbtYOK%(36`r9eilimKV>fl$+$v2gx)v$fPU8l(8^xAw0hZ#2kODM};b`WPy!Jfk zJ2O(GGC+X@NH^`O%Pdl$n{0}1`U?v5N6fD1!tf&N0&P+BJ9maNBuzbxQsUt}?mg$+ z^Z3rW*PolKYWUn;eY<_`Elv9e-3)( zUR!h`X@rvk7qNE-{}aiv$xg_!(Q+~qtT7jgSKh$y5%?A zgs&+Ysf5M%KU&;gsxLPr4}52vH(YRRBm(PC1K-g}pcT)nspe z;;RwP7TI9*VOxlR8@qiC+|OnxWsJU7M>b}5lKv$(&~zkTVW(Qq-JtxlsUQ< zC3Z)IT_pg#%*Y7?wj&&e?F#Nd6bpna98-=?Oms3shP9Gzsp^B6W4;`IG~(fURY`s9 z?ia967G5VVDa30Ba2%2SdaOsL))KXoFi8`Z`CCt6XT*uuwp|Miix%k-P`6!b`;X)^271 zl1U$^O6Dsk}jJHvFrEl?=KTJHqv!E<}R1P5qvwg&W+4X=>k(m z!c#%IDNQe+(1I(uX93q_zGM<_h9R5>I~iRB@c*Si#)>7ZD7d+4by@yGDTiP`nr_ed zuCZ-85kI1C*T!r!To3Ya&bX#(s%Q?negx-m`Lj!j 
z3Ewgo)!|wFy2dWExB2;bSj=B-x1uypJ@u4!f(yH$=p8iN20*uf2^+;@p}g9)495m_ z%t!hO0&1V-y2pC^$bqKz)?06cPBD%@@zLYgVfJ(&HaBFRkr<486V>T82)}ohiRXp= zCd7Yac;I6WUIobRyG4iv8kGgwiaZUUQZ0${>Z@Xo*mRygkjO$qs>mX$-w~ZFcR^4Y zSPRsDXAZ1+c%;P52dmHN*8;@`2z=B4cyI)8Iip*1CX76yOPqHUx9QTaiGt;k~ zj1A!4Pcy#Wdj4=zw6IZg>%G6viws- zrXEI&$sSZ~ z@16aVjj4WfUjsS|l=i|833v|Q7w~n-7zW^S%fpcejhhq}0MIhc}^w0MQp$!+?g{+?`<}6;eySL3v%d)88KzB|j%Em{LCnhQfl% zpxCmYcrbVb{|7~|HGtVs5BRh|?u9Qy#euwwUVQNt(mV=s^&q@>qK zOprVf80*_Wi}D*M`%I8~7M+kCFlS@s`V3CMdzzgT^dc2#lT0ra!xNpf)+va_!n|hj z6`6E~#HAo15Ku$}yyn}>!w=n2_kZ)tZpTFYNn;<_o^^zoFFKcs-4a}!AKYJjx@mQZm z0&Lq>Ft*Vh+YcQ!)GBhq7?t7jWgNrJslEwAivz?jY8;Tqx&l9eR01zyCrqM%r&uA1 z&s_NUUWh!fzDN<<#RAPF?|Z8lB5AW|92`&Y?*Y{Ok($>)g@Ua{vSFteTKgQUb(_KW zGxz}pgxl0j9#MHNA7F1+*#p||%3x{(a1{}MTZcyOQSg5DLXev*aM zc`>FNF%EYNSMQ*!+~S;U}wF9n;1&OzbeMtQ&}3DCASprV^Q+@8`%g0Yo2>FJpN||h*s8N z*RTmht$KvD++Im3Gko$yMKSIuA}(MaB|Ze(`Emaaf)IX|l^w=V-LJzAOxfsdEC&;| z-1a}{nc>+2P-+2QD)M5wMh|6(0Skp!u)r)WPt!J>!L&^u!?-O>0TVZ;jB7)|ZzcF0 z4t__RA*bRD2XOMJJw`ZrZbU$1IQyNR@Wbi4eZ`}CPq!$`DrZ;NAji+NW)_*y9ZxmU5FW2JF zp;?rmwtWyfMcDWXidUOB*!aME7D}8>i;*$dx~&)36M~$6U$O!_!Nv|N@R|Fo0HMj= z3hayy?kp5-g9b?=DOm_K_%I$Jfc&x%6f_Yu_a$EF(E%)L@n*b4E zm;~6*w!r@n0p?>_cyrQ(oWZNuAmBj*Ihf3+GYG$B@a)D|J9h`ZE;;>RDg-m30EUE# zPjnYw{E}KGI0zT_I37YRP4#0iETUGR+~SK`7-y-kb+XG9Zx3K)vz$eW~BI)I>j5z20Sr{K$uU}&j+2P0P&V=l<=8~VKGmV_N)&K>xa6X8n$ z!K(BHJnw~c1}JB|zQN@=H>&ZqgY(-jo;g!J1!4!zLBDq}D3T;RLMM;B2r%gQCA=>K zG?5homKl(5HWoj9a(+r@+{3r<_GISgWH;Hf+#<1dDau&XiMdFJEPLcikfg|WU zEx2x+0dPm&16w6*m>vl`ATGGwDk?{vR7kD(rIDwx8lA>Ivh8gq#QGsjc_YFk( zqg@PlXiw1vOqEc+UkOKcAdtKM;2K9EM-oNZO z^8*yGp`^u29cY*Vx3=bfm_NV(0xoYUwDI)}G|D)DFH#C-H3ngYY4^$<=9L@%Wdpq?Ep8;qkqeQbhfWWg!Gz7Xmuv( zCyD(uivJiJ>LOC)Q>9_bVg2kO-kGhaUkK9~k>>R+-tzkvYn zn#O)jm^#Tb$RkmL*H#^uc*F;u8;yMM#Qtex=YHxqbDGd;^d^wlL=f)@4Sd@w8ObuR zInL*HbqzyzCh*)2ALH3 zz@H2)iY{F~`^x1mgQ8PRA^?c+>_i4+pmxm-nI#03{QO7IquAu|6jB9#7)Q0&M7YDo zdN6{p1XzcVU4_4R!+I&7P4jrnEW)fSZ;cF&?9D$^ER5`(ni?skK2sS>eFotfb5*eM k?~LTE=geVQo4z|@Srg`k@EEB+4Byiq!FkF2!->lO3(fVAQ~&?~ literal 0 HcmV?d00001 diff --git a/collie/models/mistral/configuration_mistral.py b/collie/models/mistral/configuration_mistral.py new file mode 100644 index 0000000..20ffba5 --- /dev/null +++ b/collie/models/mistral/configuration_mistral.py @@ -0,0 +1,152 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Mistral model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", + "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json", +} + + +class MistralConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MistralModel`]. 
It is used to instantiate an + Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. + + [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) + [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MistralModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 14336): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to `4096*32`): + The maximum sequence length that this model might ever be used with. Mistral's sliding window attention + allows sequence of up to 4096*32 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention window size. If not specified, will default to `4096`. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
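+
+    (With the defaults above the model uses grouped-query attention: the 32 query heads share
+    8 key/value heads, i.e. each key/value head serves 4 query heads.)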
+ + ```python + >>> from transformers import MistralModel, MistralConfig + + >>> # Initializing a Mistral 7B style configuration + >>> configuration = MistralConfig() + + >>> # Initializing a model from the Mistral 7B style configuration + >>> model = MistralModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mistral" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + max_position_embeddings=4096 * 32, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + sliding_window=4096, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/collie/models/mistral/convert_mistral_weights_to_hf.py b/collie/models/mistral/convert_mistral_weights_to_hf.py new file mode 100644 index 0000000..4ba6236 --- /dev/null +++ b/collie/models/mistral/convert_mistral_weights_to_hf.py @@ -0,0 +1,276 @@ +# Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import gc +import json +import os +import shutil +import warnings + +import torch + +from transformers import ( + LlamaTokenizer, + MistralConfig, + MistralForCausalLM, +) + + +try: + from transformers import LlamaTokenizerFast + + tokenizer_class = LlamaTokenizerFast +except ImportError as e: + warnings.warn(e) + warnings.warn( + "The converted tokenizer will be the `slow` tokenizer. 
To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" + ) + tokenizer_class = LlamaTokenizer + +""" +Sample usage: + +``` +python src/transformers/models/mistral/convert_mistral_weights_to_hf.py \ + --input_dir /path/to/downloaded/mistral/weights --model_size 7B --output_dir /output/path +``` + +Thereafter, models can be loaded via: + +```py +from transformers import MistralForCausalLM, LlamaTokenizer + +model = MistralForCausalLM.from_pretrained("/output/path") +tokenizer = LlamaTokenizer.from_pretrained("/output/path") +``` + +Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions +come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). +""" + +NUM_SHARDS = {"7B": 1} + + +def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): + return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) + + +def read_json(path): + with open(path, "r") as f: + return json.load(f) + + +def write_json(text, path): + with open(path, "w") as f: + json.dump(text, f) + + +def write_model(model_path, input_base_path, model_size, tokenizer_path=None, safe_serialization=True): + # for backward compatibility, before you needed the repo to be called `my_repo/model_size` + if not os.path.isfile(os.path.join(input_base_path, "params.json")): + input_base_path = os.path.join(input_base_path, model_size) + + os.makedirs(model_path, exist_ok=True) + tmp_model_path = os.path.join(model_path, "tmp") + os.makedirs(tmp_model_path, exist_ok=True) + + params = read_json(os.path.join(input_base_path, "params.json")) + num_shards = NUM_SHARDS[model_size] + + # For some reason this is a string in the params.json + sliding_window = int(params["sliding_window"]) + n_layers = params["n_layers"] + n_heads = params["n_heads"] + n_heads_per_shard = n_heads // num_shards + dim = params["dim"] + dims_per_head = dim // n_heads + base = params.get("rope_theta", 10000.0) + inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) + max_position_embeddings = 4096 * 8 + + if tokenizer_path is not None: + tokenizer = tokenizer_class(tokenizer_path) + tokenizer.save_pretrained(model_path) + vocab_size = tokenizer.vocab_size if tokenizer_path is not None else 32000 + + if "n_kv_heads" in params: + num_key_value_heads = params["n_kv_heads"] # for GQA / MQA + num_local_key_value_heads = num_key_value_heads // num_shards + key_value_dim = dims_per_head * num_local_key_value_heads + else: # compatibility with other checkpoints + num_key_value_heads = n_heads + num_local_key_value_heads = n_heads_per_shard + key_value_dim = dim + + # permute for sliced rotary + def permute(w, n_heads=n_heads, dim1=dim, dim2=dim): + return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) + + print(f"Fetching all parameters from the checkpoint at {input_base_path}.") + # Load weights + loaded = [ + torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu") + for i in range(num_shards) + ] + param_count = 0 + index_dict = {"weight_map": {}} + for layer_i in range(n_layers): + filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" + + # Sharded + # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share + # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is + # 
redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned. + + state_dict = { + f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ + f"layers.{layer_i}.attention_norm.weight" + ].clone(), + f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ + f"layers.{layer_i}.ffn_norm.weight" + ].clone(), + } + state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( + torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) + for i in range(num_shards) + ], + dim=0, + ).reshape(dim, dim) + ) + state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( + torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( + num_local_key_value_heads, dims_per_head, dim + ) + for i in range(num_shards) + ], + dim=0, + ).reshape(key_value_dim, dim), + num_key_value_heads, + key_value_dim, + dim, + ) + state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(num_local_key_value_heads, dims_per_head, dim) + for i in range(num_shards) + ], + dim=0, + ).reshape(key_value_dim, dim) + + state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 + ) + state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 + ) + state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 + ) + state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 + ) + + state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq + for k, v in state_dict.items(): + index_dict["weight_map"][k] = filename + param_count += v.numel() + torch.save(state_dict, os.path.join(tmp_model_path, filename)) + + filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" + state_dict = { + "model.norm.weight": loaded[0]["norm.weight"], + "model.embed_tokens.weight": torch.cat([loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1), + "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), + } + + for k, v in state_dict.items(): + index_dict["weight_map"][k] = filename + param_count += v.numel() + torch.save(state_dict, os.path.join(tmp_model_path, filename)) + + # Write configs + index_dict["metadata"] = {"total_size": param_count * 2} + write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) + config = MistralConfig( + hidden_size=dim, + intermediate_size=params["hidden_dim"], + num_attention_heads=params["n_heads"], + num_hidden_layers=params["n_layers"], + rms_norm_eps=params["norm_eps"], + num_key_value_heads=num_key_value_heads, + vocab_size=vocab_size, + rope_theta=base, + max_position_embeddings=max_position_embeddings, + sliding_window=sliding_window, + ) + config.save_pretrained(tmp_model_path) + + # Make space so we can load the model properly now. 
+ del state_dict + del loaded + gc.collect() + + print("Loading the checkpoint in a Mistral model.") + model = MistralForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True) + # Avoid saving this as part of the config. + del model.config._name_or_path + model.config.torch_dtype = torch.float16 + print("Saving in the Transformers format.") + model.save_pretrained(model_path, safe_serialization=safe_serialization) + shutil.rmtree(tmp_model_path) + + +def write_tokenizer(tokenizer_path, input_tokenizer_path): + # Initialize the tokenizer based on the `spm` model + print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") + tokenizer = tokenizer_class(input_tokenizer_path) + tokenizer.save_pretrained(tokenizer_path) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_dir", + help="Location of Mistral weights, which contains tokenizer.model and model folders", + ) + parser.add_argument( + "--model_size", + choices=["7B", "tokenizer_only"], + help="'f' models correspond to the finetuned versions, and are specific to the Mistral2 official release. For more details on Mistral2, checkout the original repo: https://huggingface.co/meta-mistral", + ) + parser.add_argument( + "--output_dir", + help="Location to write HF model and tokenizer", + ) + parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.") + args = parser.parse_args() + spm_path = os.path.join(args.input_dir, "tokenizer.model") + if args.model_size != "tokenizer_only": + write_model( + model_path=args.output_dir, + input_base_path=args.input_dir, + model_size=args.model_size, + safe_serialization=args.safe_serialization, + tokenizer_path=spm_path, + ) + else: + write_tokenizer(args.output_dir, spm_path) + + +if __name__ == "__main__": + main() diff --git a/collie/models/mistral/modeling_flax_mistral.py b/collie/models/mistral/modeling_flax_mistral.py new file mode 100644 index 0000000..0a837f4 --- /dev/null +++ b/collie/models/mistral/modeling_flax_mistral.py @@ -0,0 +1,741 @@ +# coding=utf-8 +# Copyright 2024 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Flax Mistral model.""" +from typing import Optional, Tuple + +import flax.linen as nn +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from transformers.modeling_flax_outputs import ( + FlaxBaseModelOutput, + FlaxBaseModelOutputWithPast, + FlaxCausalLMOutput, + FlaxCausalLMOutputWithCrossAttentions, +) +from transformers.modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, logging +from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward +from .configuration_mistral import MistralConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MistralConfig" +_REAL_CHECKPOINT_FOR_DOC = "mistralai/Mistral-7B-v0.1" +_CHECKPOINT_FOR_DOC = "ksmcg/Mistral-tiny" + +MISTRAL_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a Flax Linen + [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a + regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`MistralConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16`, or + `jax.numpy.bfloat16`. + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`): + Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast + auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaRMSNorm with Llama->Mistral +class FlaxMistralRMSNorm(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.epsilon = self.config.rms_norm_eps + self.weight = self.param("weight", lambda _, shape: jnp.ones(shape), self.config.hidden_size) + + def __call__(self, hidden_states): + variance = jnp.asarray(hidden_states, dtype=jnp.float32) + variance = jnp.power(variance, 2) + variance = variance.mean(-1, keepdims=True) + # use `jax.numpy.sqrt` as `jax.lax.rsqrt` does not match `torch.rsqrt` + hidden_states = hidden_states / jnp.sqrt(variance + self.epsilon) + + return self.weight * jnp.asarray(hidden_states, dtype=self.dtype) + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaRotaryEmbedding with Llama->Mistral +class FlaxMistralRotaryEmbedding(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + head_dim = self.config.hidden_size // self.config.num_attention_heads + self.sincos = create_sinusoidal_positions(self.config.max_position_embeddings, head_dim) + + def __call__(self, key, query, position_ids): + sincos = self.sincos[position_ids] + sin_pos, cos_pos = jnp.split(sincos, 2, axis=-1) + + key = apply_rotary_pos_emb(key, sin_pos, cos_pos) + query = apply_rotary_pos_emb(query, sin_pos, cos_pos) + + key = jnp.asarray(key, dtype=self.dtype) + query = jnp.asarray(query, dtype=self.dtype) + + return key, query + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaMLP with Llama->Mistral +class FlaxMistralMLP(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + embed_dim = self.config.hidden_size + inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * embed_dim + + kernel_init = jax.nn.initializers.normal(self.config.initializer_range) + self.act = ACT2FN[self.config.hidden_act] + + self.gate_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init) + self.down_proj = nn.Dense(embed_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init) + self.up_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init) + + def __call__(self, hidden_states): + up_proj_states = self.up_proj(hidden_states) + gate_states = self.act(self.gate_proj(hidden_states)) + + hidden_states = self.down_proj(up_proj_states * gate_states) + return hidden_states + + +# Copied from transformers.models.llama.modeling_flax_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(tensor, sin_pos, cos_pos): + return (tensor * cos_pos) + (rotate_half(tensor) * sin_pos) + + +# Copied from transformers.models.llama.modeling_flax_llama.create_sinusoidal_positions +def create_sinusoidal_positions(num_pos, dim): + inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim)) + freqs = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32") + + emb = np.concatenate((freqs, freqs), axis=-1) + out = np.concatenate((np.sin(emb)[:, None, :], np.cos(emb)[:, None, :]), axis=-1) + return jnp.array(out[:, :, :num_pos]) + + +# Copied from transformers.models.llama.modeling_flax_llama.rotate_half +def rotate_half(tensor): + """Rotates half the hidden dims of the input.""" + rotate_half_tensor = jnp.concatenate( + (-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1 + ) + return rotate_half_tensor + + +class FlaxMistralAttention(nn.Module): + config: MistralConfig + 
dtype: jnp.dtype = jnp.float32 + + def setup(self): + config = self.config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.attention_softmax_in_fp32 = self.dtype is not jnp.float32 + self.rope_theta = config.rope_theta + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + self.q_proj = nn.Dense(self.num_heads * self.head_dim, use_bias=False, dtype=self.dtype) + self.k_proj = nn.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, dtype=self.dtype) + self.v_proj = nn.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, dtype=self.dtype) + self.o_proj = nn.Dense(self.hidden_size, use_bias=False, dtype=self.dtype) + casual_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool") + self.causal_mask = jnp.triu(casual_mask, k=-config.sliding_window) + self.rotary_emb = FlaxMistralRotaryEmbedding(config, dtype=self.dtype) + + def _split_heads(self, hidden_states, num_heads): + return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.hidden_size,)) + + @nn.compact + # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoSelfAttention._concatenate_to_cache + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slighly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. 
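+            # (For example, with max_length=6, cur_index=3 and one newly written token, the pad
+            # mask computed below is [1, 1, 1, 1, 0, 0]: only cache slots up to and including the
+            # slot just filled are attendable, while the still-empty tail of the cache stays masked.)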
+ pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + deterministic: bool = True, + output_attentions: bool = False, + init_cache: bool = False, + ) -> Tuple[jnp.ndarray, jnp.ndarray]: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = self._split_heads(query_states, self.num_heads) + key_states = self._split_heads(key_states, self.num_key_value_heads) + value_states = self._split_heads(value_states, self.num_key_value_heads) + + key_states, query_states = self.rotary_emb(key_states, query_states, position_ids) + query_length, key_length = query_states.shape[1], key_states.shape[1] + if self.has_variable("cache", "cached_key"): + mask_shift = self.variables["cache"]["cache_index"] + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_mask = lax.dynamic_slice( + self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) + ) + else: + causal_mask = self.causal_mask[:, :, :query_length, :key_length] + + batch_size = hidden_states.shape[0] + causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) + attention_mask = combine_masks(attention_mask, causal_mask) + + if self.has_variable("cache", "cached_key") or init_cache: + key_states, value_states, attention_mask = self._concatenate_to_cache( + key_states, value_states, query_states, attention_mask + ) + key_states = jnp.repeat(key_states, self.num_key_value_groups, axis=2) + value_states = jnp.repeat(value_states, self.num_key_value_groups, axis=2) + + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + + # usual dot product attention + attention_dtype = jnp.float32 if self.attention_softmax_in_fp32 else self.dtype + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + deterministic=deterministic, + dropout_rate=self.config.attention_dropout, + dtype=attention_dtype, + ) + + if self.attention_softmax_in_fp32: + attn_weights = attn_weights.astype(self.dtype) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = self._merge_heads(attn_output) + attn_output = self.o_proj(attn_output) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaDecoderLayer with Llama->Mistral +class FlaxMistralDecoderLayer(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.input_layernorm = FlaxMistralRMSNorm(self.config, dtype=self.dtype) + self.self_attn = FlaxMistralAttention(self.config, dtype=self.dtype) + self.post_attention_layernorm = FlaxMistralRMSNorm(self.config, dtype=self.dtype) + self.mlp = FlaxMistralMLP(self.config, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask=None, + position_ids=None, + deterministic: 
bool = True, + init_cache: bool = False, + output_attentions: bool = False, + ): + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + outputs = self.self_attn( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + ) + # residual connection + attn_output = outputs[0] + hidden_states = residual + attn_output + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + # residual connection + hidden_states = residual + hidden_states + + return (hidden_states,) + outputs[1:] + + +# Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel with GPTNeo->Mistral, GPT_NEO->MISTRAL, transformer->model +class FlaxMistralPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = MistralConfig + base_model_prefix = "model" + module_class: nn.Module = None + + def __init__( + self, + config: MistralConfig, + input_shape: Tuple = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + attention_mask = jnp.ones_like(input_ids) + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape) + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + def init_cache(self, batch_size, max_length): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. 
+ """ + # init input variables to retrieve cache + input_ids = jnp.ones((batch_size, max_length)) + attention_mask = jnp.ones_like(input_ids) + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + init_variables = self.module.init( + jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def __call__( + self, + input_ids, + attention_mask=None, + position_ids=None, + params: dict = None, + past_key_values: dict = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + batch_size, sequence_length = input_ids.shape + + if position_ids is None: + if past_key_values is not None: + raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.") + + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + if attention_mask is None: + attention_mask = jnp.ones((batch_size, sequence_length)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed down to ensure cache is used. 
It has to be made sure that cache is marked as mutable so that it can be changed by FlaxMistralAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + jnp.array(position_ids, dtype="i4"), + not train, + False, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + mutable=mutable, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past_key_values = outputs + outputs["past_key_values"] = unfreeze(past_key_values["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past_key_values = outputs + outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] + + return outputs + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaLayerCollection with Llama->Mistral +class FlaxMistralLayerCollection(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.blocks = [ + FlaxMistralDecoderLayer(self.config, dtype=self.dtype, name=str(i)) + for i in range(self.config.num_hidden_layers) + ] + + def __call__( + self, + hidden_states, + attention_mask=None, + position_ids=None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = False, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for block in self.blocks: + if output_hidden_states: + all_hidden_states += (hidden_states,) + layer_outputs = block( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + # this contains possible `None` values - `FlaxMistralModule` will filter them out + outputs = (hidden_states, all_hidden_states, all_attentions) + + return outputs + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaModule with Llama->Mistral +class FlaxMistralModule(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.hidden_size = self.config.hidden_size + embedding_init = jax.nn.initializers.normal(stddev=self.config.initializer_range) + self.embed_tokens = nn.Embed( + self.config.vocab_size, + self.hidden_size, + embedding_init=embedding_init, + dtype=self.dtype, + ) + self.layers = FlaxMistralLayerCollection(self.config, dtype=self.dtype) + self.norm = FlaxMistralRMSNorm(self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask=None, + position_ids=None, + deterministic=True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + input_embeds = self.embed_tokens(input_ids.astype("i4")) + + outputs = self.layers( + input_embeds, + position_ids=position_ids, + attention_mask=attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + all_hidden_states = outputs[1] + (hidden_states,) + 
outputs = (hidden_states, all_hidden_states) + outputs[2:] + else: + outputs = (hidden_states,) + outputs[1:] + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=outputs[1], + attentions=outputs[-1], + ) + + +@add_start_docstrings( + "The bare Mistral Model transformer outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class FlaxMistralModel(FlaxMistralPreTrainedModel): + module_class = FlaxMistralModule + + +append_call_sample_docstring( + FlaxMistralModel, + _CHECKPOINT_FOR_DOC, + FlaxBaseModelOutputWithPast, + _CONFIG_FOR_DOC, + real_checkpoint=_REAL_CHECKPOINT_FOR_DOC, +) + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaForCausalLMModule with Llama->Mistral +class FlaxMistralForCausalLMModule(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.model = FlaxMistralModule(self.config, dtype=self.dtype) + self.lm_head = nn.Dense( + self.config.vocab_size, + use_bias=False, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + + def __call__( + self, + input_ids, + attention_mask=None, + position_ids=None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + outputs = self.model( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + lm_logits = self.lm_head(hidden_states) + + if not return_dict: + return (lm_logits,) + outputs[1:] + + return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions) + + +@add_start_docstrings( + """ + The Mistral Model transformer with a language modeling head (linear layer) on top. + """, + MISTRAL_START_DOCSTRING, +) + +# Copied from transformers.models.gptj.modeling_flax_gptj.FlaxGPTJForCausalLM with GPTJ->Mistral +class FlaxMistralForCausalLM(FlaxMistralPreTrainedModel): + module_class = FlaxMistralForCausalLMModule + + def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): + # initializing the cache + batch_size, seq_length = input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since Mistral uses a causal mask, those positions are masked anyways. 
+ # Thus we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if attention_mask is not None: + position_ids = attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "attention_mask": extended_attention_mask, + "position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1 + return model_kwargs + + +append_call_sample_docstring( + FlaxMistralForCausalLM, + _CHECKPOINT_FOR_DOC, + FlaxCausalLMOutputWithCrossAttentions, + _CONFIG_FOR_DOC, + real_checkpoint=_REAL_CHECKPOINT_FOR_DOC, +) diff --git a/collie/models/mistral/modeling_mistral.py b/collie/models/mistral/modeling_mistral.py new file mode 100644 index 0000000..03fad65 --- /dev/null +++ b/collie/models/mistral/modeling_mistral.py @@ -0,0 +1,1473 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch Mistral model."""
+import inspect
+import math
+import warnings
+from typing import List, Optional, Tuple, Union
+import json
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_mistral import MistralConfig
+
+
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+
+    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "MistralConfig"
+
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral
+class MistralRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6, dtype=torch.bfloat16):
+        """
+        MistralRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        # Cast a local copy of the weight instead of re-wrapping `self.weight` in a new
+        # `nn.Parameter` on every forward pass, which would silently replace the registered parameter.
+        weight = self.weight.to(input_dtype)
+        ans = weight * hidden_states.to(input_dtype)
+
+        # Dump the layer-norm output for debugging.
+        hidden_states_output = ans.detach().cpu().tolist()
+        data_to_save = {"Layer Norm Output": hidden_states_output}
+        # Write the output to a JSON file.
+        with open('rms_output.json', 'w') as f:
+            json.dump(data_to_save, f, indent=4)
+        return ans
+
+
+# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral
+# TODO @Arthur no longer copied from LLama after static cache
+class MistralRotaryEmbedding(nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
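+        # (`inv_freq` has shape [dim // 2]; the cache built below stores `cos_cached` and
+        # `sin_cached` tables of shape [max_position_embeddings, dim], which forward() slices to
+        # the requested sequence length and casts to the input dtype.)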
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +# TODO @Arthur no longer copied from LLama after static cache +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class MistralMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False, dtype = torch.bfloat16) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False,dtype = torch.bfloat16) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False,dtype = torch.bfloat16) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + x = x.to(dtype=torch.bfloat16) + output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + # 打印MLP层输出 + mlp_output = output.detach().cpu().tolist() + data_to_save = {"MLP Output": mlp_output} + # 将输出写入 JSON 文件 + with open('mlp_output.json', 'w') as f: + json.dump(data_to_save, f, indent=4) + + return output + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class MistralAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: MistralConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False,dtype = torch.bfloat16) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False,dtype = torch.bfloat16) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False,dtype = torch.bfloat16) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False,dtype = torch.bfloat16) + + self.rotary_emb = MistralRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + # 打印注意力模块的输出 + # 准备数据以写入 JSON 文件 + attention_outputs = { + "Query states": query_states.detach().cpu().tolist(), + "Key states": key_states.detach().cpu().tolist(), + "Value states": value_states.detach().cpu().tolist() + } + # 将数据写入 JSON 文件 + with open("attention_outputs.json", "w") as f: + json.dump(attention_outputs, f, indent=4) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # 打印注意力模块的输出 + attention_result = { + "Output weights:": attn_output.detach().cpu().tolist(), + # "Attention weights:": attn_weights.detach().cpu().tolist(), + } + # 将数据写入 JSON 文件 + with open("attention_outputs.json", "w") as f: + json.dump(attention_result, f, indent=4) + + return attn_output, attn_weights, past_key_value + + +class MistralFlashAttention2(MistralAttention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
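+        # (Concretely, for q_len=2 and kv_len=4, bottom-right alignment lets the two queries see
+        # the first 3 and all 4 keys respectively, while a top-left-aligned mask would only let
+        # them see the first 1 and 2 keys, which is wrong whenever a cached prefix is present.)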
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states.to(torch.bfloat16)) + key_states = self.k_proj(hidden_states.to(torch.bfloat16)) + value_states = self.v_proj(hidden_states.to(torch.bfloat16)) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + # 打印注意力模块的输出 + # 准备数据以写入 JSON 文件 + attention_outputs = { + "Query states": query_states.detach().cpu().tolist(), + "Key states": key_states.detach().cpu().tolist(), + "Value states": value_states.detach().cpu().tolist() + } + # 将数据写入 JSON 文件 + with open("flash_attention_outputs.json", "w") as f: + json.dump(attention_outputs, f, indent=4) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # 打印注意力模块的输出 + attention_result = { + "Output weights:": attn_output.detach().cpu().tolist(), + # "Attention weights:": attn_weights.detach().cpu().tolist(), + } + # 将数据写入 JSON 文件 + with open("flash_attention_outputs.json", "w") as f: + json.dump(attention_result, f, indent=4) + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
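+            # (Disabling `causal` for single-token queries is safe because the lone query is the
+            # most recent position and may attend to every cached key; the check only works around
+            # the top-left-aligned mask produced by flash_attn < 2.1 during decoding.)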
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
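+            # (For example, a left-padded mask [0, 0, 1, 1, 1] with query_length=2 is sliced to
+            # [1, 1], keeping only the mask entries aligned with the query tokens.)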
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+
+
+# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral
+# TODO @Arthur no longer copied from LLama after static cache
+class MistralSdpaAttention(MistralAttention):
+    """
+    Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `MistralAttention` as the weights of the module stay untouched. The only changes are on the forward pass, to adapt
+    to the SDPA API.
+    """
+
+    # Adapted from MistralAttention.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states.to(torch.bfloat16))
+        key_states = self.k_proj(hidden_states.to(torch.bfloat16))
+        value_states = self.v_proj(hidden_states.to(torch.bfloat16))
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        # Dump the attention module's intermediate tensors for inspection
+        # Gather the data to be written to a JSON file
+        attention_outputs = {
+            "Query states": query_states.detach().cpu().tolist(),
+            "Key states": key_states.detach().cpu().tolist(),
+            "Value states": value_states.detach().cpu().tolist()
+        }
+        # Write the data to a JSON file
+        with open("sdpa_attention_outputs.json", "w") as f:
+            json.dump(attention_outputs, f, indent=4)
+
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+            is_causal=self.is_causal and attention_mask is None and q_len > 1,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        # Dump the attention module's output for inspection
+        attention_result = {
+            "Output weights:": attn_output.detach().cpu().tolist(),
+            # "Attention weights:": attn_weights.detach().cpu().tolist(),
+        }
+        # Write the data to a JSON file (this overwrites the Q/K/V dump above)
+        with open("sdpa_attention_outputs.json", "w") as f:
+            json.dump(attention_result, f, indent=4)
+
+        return attn_output, None, past_key_value
+
+
+MISTRAL_ATTENTION_CLASSES = {
+    "eager": MistralAttention,
+    "flash_attention_2": MistralFlashAttention2,
+    "sdpa": MistralSdpaAttention,
+}
+
+
+class MistralDecoderLayer(nn.Module):
+    def __init__(self, config: MistralConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        # NOTE: force the SDPA attention implementation for every decoder layer,
+        # overriding whatever implementation the config originally requested.
+        config._attn_implementation = "sdpa"
+        self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+
+        self.mlp = MistralMLP(config)
+        self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps, dtype=torch.bfloat16)
+        self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps, dtype=torch.bfloat16)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead."
+            )
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +MISTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MistralConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class MistralPreTrainedModel(PreTrainedModel): + config_class = MistralConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. 
+ + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class MistralModel(MistralPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + def __init__(self, config: MistralConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx, dtype = torch.bfloat16) + self.layers = nn.ModuleList( + [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + config._attn_implementation = "sdpa" + self._attn_implementation = config._attn_implementation + self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps,dtype=torch.bfloat16) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+                )
+                use_cache = False
+
+        past_key_values_length = 0
+
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # Dump the embedding layer output for inspection
+        embeddings_output = inputs_embeds.detach().cpu().tolist()
+        data_to_save = {"Embeddings Output": embeddings_output}
+        # Write the output to a JSON file
+        with open('embeddings_output.json', 'w') as f:
+            json.dump(data_to_save, f, indent=4)
+
+        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
+            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
+            if is_padding_right:
+                raise ValueError(
+                    "You are attempting to perform batched generation with padding_side='right',"
+                    " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to "
+                    " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
+                )
+
+        if self._attn_implementation == "flash_attention_2":
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self._attn_implementation == "sdpa" and not output_attentions:
+            # output_attentions=True can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
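+            # NOTE: for inputs without padding this helper may return `None`; in that case
+            # MistralSdpaAttention relies on `is_causal=True` inside scaled_dot_product_attention
+            # instead of an explicitly materialized 4D mask.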
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class MistralForCausalLM(MistralPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = MistralModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False,dtype = torch.bfloat16) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for 
computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = hidden_states.to(dtype=torch.bfloat16) + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. 
when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForSequenceClassification(MistralPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = MistralModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False,dtype = torch.bfloat16) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == 
"single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) From b2ad2cfa73d1bc0d3cb442a50cca6ba3fcfd3a7f Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:36:09 +0800 Subject: [PATCH 04/16] Add safetensors --- collie/driver/io/file.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/collie/driver/io/file.py b/collie/driver/io/file.py index 0196de4..18427fb 100644 --- a/collie/driver/io/file.py +++ b/collie/driver/io/file.py @@ -4,13 +4,17 @@ import io import torch import shutil +from safetensors.torch import save_file, load_file class FileIODriver(IODriver): @staticmethod def load(path: str, mode: str): assert os.path.exists(path), f"File {path} does not exist." if 'b' in mode.lower(): - return torch.load(path, map_location=torch.device('cpu')) + if path.endswith(".safetensors"): + return load_file(path, device='cpu') + else: + return torch.load(path, map_location=torch.device('cpu')) else: with open(path, 'r') as f: return f.read() From b47c0fb4a4dfdee278598179309a90643b068ee8 Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:39:30 +0800 Subject: [PATCH 05/16] Delete collie/models/mistral/modeling_mistral.py --- collie/models/mistral/modeling_mistral.py | 1473 --------------------- 1 file changed, 1473 deletions(-) delete mode 100644 collie/models/mistral/modeling_mistral.py diff --git a/collie/models/mistral/modeling_mistral.py b/collie/models/mistral/modeling_mistral.py deleted file mode 100644 index 03fad65..0000000 --- a/collie/models/mistral/modeling_mistral.py +++ /dev/null @@ -1,1473 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" PyTorch Mistral model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union -import json - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_mistral import MistralConfig - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "MistralConfig" - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral -class MistralRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6, dtype=torch.bfloat16): - """ - MistralRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - self.weight = nn.Parameter(self.weight.to(input_dtype)) - ans = self.weight * hidden_states.to(input_dtype) - - # 打印层标准化的输出 - hidden_states_output = ans.detach().cpu().tolist() - data_to_save = {"Layer Norm Output": hidden_states_output} - # 将输出写入 JSON 文件 - with open('rms_output.json', 'w') as f: - json.dump(data_to_save, f, indent=4) - return self.weight * hidden_states.to(input_dtype) - - -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class MistralRotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -# TODO @Arthur no longer copied from LLama after static cache -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class MistralMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False, dtype = torch.bfloat16) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False,dtype = torch.bfloat16) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False,dtype = torch.bfloat16) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - x = x.to(dtype=torch.bfloat16) - output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - # 打印MLP层输出 - mlp_output = output.detach().cpu().tolist() - data_to_save = {"MLP Output": mlp_output} - # 将输出写入 JSON 文件 - with open('mlp_output.json', 'w') as f: - json.dump(data_to_save, f, indent=4) - - return output - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class MistralAttention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: MistralConfig, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False,dtype = torch.bfloat16) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False,dtype = torch.bfloat16) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False,dtype = torch.bfloat16) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False,dtype = torch.bfloat16) - - self.rotary_emb = MistralRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # 打印注意力模块的输出 - attention_result = { - "Output weights:": attn_output.detach().cpu().tolist(), - # "Attention weights:": attn_weights.detach().cpu().tolist(), - } - # 将数据写入 JSON 文件 - with open("attention_outputs.json", "w") as f: - json.dump(attention_result, f, indent=4) - - return attn_output, attn_weights, past_key_value - - -class MistralFlashAttention2(MistralAttention): - """ - Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states.to(torch.bfloat16)) - key_states = self.k_proj(hidden_states.to(torch.bfloat16)) - value_states = self.v_proj(hidden_states.to(torch.bfloat16)) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." 
- ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("flash_attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # 打印注意力模块的输出 - attention_result = { - "Output weights:": attn_output.detach().cpu().tolist(), - # "Attention weights:": attn_weights.detach().cpu().tolist(), - } - # 将数据写入 JSON 文件 - with open("flash_attention_outputs.json", "w") as f: - json.dump(attention_result, f, indent=4) - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
- causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class MistralSdpaAttention(MistralAttention): - """ - Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from MistralAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states.to(torch.bfloat16)) - key_states = self.k_proj(hidden_states.to(torch.bfloat16)) - value_states = self.v_proj(hidden_states.to(torch.bfloat16)) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("sdpa_attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, 
-# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral
-# TODO @Arthur no longer copied from LLama after static cache
-class MistralSdpaAttention(MistralAttention):
-    """
-    Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
-    `MistralAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt
-    to the SDPA API.
-    """
-
-    # Adapted from MistralAttention.forward
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        if output_attentions:
-            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
-            logger.warning_once(
-                "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
-                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-            )
-            return super().forward(
-                hidden_states=hidden_states,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_value=past_key_value,
-                output_attentions=output_attentions,
-                use_cache=use_cache,
-            )
-
-        bsz, q_len, _ = hidden_states.size()
-
-        query_states = self.q_proj(hidden_states.to(torch.bfloat16))
-        key_states = self.k_proj(hidden_states.to(torch.bfloat16))
-        value_states = self.v_proj(hidden_states.to(torch.bfloat16))
-
-        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
-        # Dump the projected query/key/value states of the attention module for debugging
-        # Prepare the data to be written to a JSON file
-        attention_outputs = {
-            "Query states": query_states.detach().cpu().tolist(),
-            "Key states": key_states.detach().cpu().tolist(),
-            "Value states": value_states.detach().cpu().tolist()
-        }
-        # Write the data to a JSON file
-        with open("sdpa_attention_outputs.json", "w") as f:
-            json.dump(attention_outputs, f, indent=4)
-
-
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
-
-        if past_key_value is not None:
-            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
-            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-        if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-                raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-                )
-
-        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
-        # Reference: https://github.com/pytorch/pytorch/issues/112577.
-        if query_states.device.type == "cuda" and attention_mask is not None:
-            query_states = query_states.contiguous()
-            key_states = key_states.contiguous()
-            value_states = value_states.contiguous()
-
-        attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=attention_mask,
-            dropout_p=self.attention_dropout if self.training else 0.0,
-            # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
-            is_causal=self.is_causal and attention_mask is None and q_len > 1,
-        )
-
-        attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
-
-        attn_output = self.o_proj(attn_output)
-
-        # Dump the attention module output for debugging
-        attention_result = {
-            "Output weights:": attn_output.detach().cpu().tolist(),
-            # "Attention weights:": attn_weights.detach().cpu().tolist(),
-        }
-        # Write the data to a separate JSON file so the query/key/value dump above is not overwritten
-        with open("sdpa_attention_result.json", "w") as f:
-            json.dump(attention_result, f, indent=4)
-
-        return attn_output, None, past_key_value
-
-
-MISTRAL_ATTENTION_CLASSES = {
-    "eager": MistralAttention,
-    "flash_attention_2": MistralFlashAttention2,
-    "sdpa": MistralSdpaAttention,
-}
-
-
-class MistralDecoderLayer(nn.Module):
-    def __init__(self, config: MistralConfig, layer_idx: int):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        # Force the SDPA attention implementation for this layer
-        config._attn_implementation = "sdpa"
-        self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
-
-        self.mlp = MistralMLP(config)
-        self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps, dtype=torch.bfloat16)
-        self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps, dtype=torch.bfloat16)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = False,
-        **kwargs,
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
-        if "padding_mask" in kwargs:
-            warnings.warn(
-                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead."
-            )
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
-                `(batch, sequence_length)` where padding elements are indicated by 0.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            use_cache (`bool`, *optional*):
-                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
-                (see `past_key_values`).
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -MISTRAL_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`MistralConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class MistralPreTrainedModel(PreTrainedModel): - config_class = MistralConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["MistralDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -MISTRAL_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. 
- - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class MistralModel(MistralPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`MistralDecoderLayer`] - - Args: - config: MistralConfig - """ - - def __init__(self, config: MistralConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx, dtype = torch.bfloat16) - self.layers = nn.ModuleList( - [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - config._attn_implementation = "sdpa" - self._attn_implementation = config._attn_implementation - self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps,dtype=torch.bfloat16) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
-                )
-                use_cache = False
-
-        past_key_values_length = 0
-
-        if use_cache:
-            use_legacy_cache = not isinstance(past_key_values, Cache)
-            if use_legacy_cache:
-                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            past_key_values_length = past_key_values.get_usable_length(seq_length)
-
-        if position_ids is None:
-            device = input_ids.device if input_ids is not None else inputs_embeds.device
-            position_ids = torch.arange(
-                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
-            )
-            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
-        else:
-            position_ids = position_ids.view(-1, seq_length).long()
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-
-        # Dump the embedding layer output for debugging
-        embeddings_output = inputs_embeds.detach().cpu().tolist()
-        data_to_save = {"Embeddings Output": embeddings_output}
-        # Write the output to a JSON file
-        with open('embeddings_output.json', 'w') as f:
-            json.dump(data_to_save, f, indent=4)
-
-        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
-            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
-            if is_padding_right:
-                raise ValueError(
-                    "You are attempting to perform batched generation with padding_side='right'"
-                    " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to "
-                    " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
-                )
-
-        if self._attn_implementation == "flash_attention_2":
-            # 2d mask is passed through the layers
-            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
-        elif self._attn_implementation == "sdpa" and not output_attentions:
-            # output_attentions=True cannot be supported when using SDPA, and we fall back on
-            # the manual implementation that requires a 4D causal mask in all cases.
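-            # Roughly: the helper below expands the 2D padding mask into an additive
-            # (batch_size, 1, q_len, kv_len) causal mask (or returns None when SDPA can rely on its
-            # own is_causal fast path), so every decoder layer receives the same ready-made mask.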
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class MistralForCausalLM(MistralPreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = MistralModel(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False,dtype = torch.bfloat16) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for 
computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MistralForCausalLM - - >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - hidden_states = hidden_states.to(dtype=torch.bfloat16) - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Ensure tensors are on the same device - shift_labels = shift_labels.to(shift_logits.device) - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. 
when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The Mistral Model transformer with a sequence classification head on top (linear layer). - - [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - MISTRAL_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL -class MistralForSequenceClassification(MistralPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = MistralModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False,dtype = torch.bfloat16) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == 
"single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) From cb21a96886c1d63faae2e44663a06597879925da Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:39:39 +0800 Subject: [PATCH 06/16] Delete collie/models/mistral/modeling_flax_mistral.py --- .../models/mistral/modeling_flax_mistral.py | 741 ------------------ 1 file changed, 741 deletions(-) delete mode 100644 collie/models/mistral/modeling_flax_mistral.py diff --git a/collie/models/mistral/modeling_flax_mistral.py b/collie/models/mistral/modeling_flax_mistral.py deleted file mode 100644 index 0a837f4..0000000 --- a/collie/models/mistral/modeling_flax_mistral.py +++ /dev/null @@ -1,741 +0,0 @@ -# coding=utf-8 -# Copyright 2024 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Flax Mistral model.""" -from typing import Optional, Tuple - -import flax.linen as nn -import jax -import jax.numpy as jnp -import numpy as np -from flax.core.frozen_dict import FrozenDict, freeze, unfreeze -from flax.linen import combine_masks, make_causal_mask -from flax.linen.attention import dot_product_attention_weights -from flax.traverse_util import flatten_dict, unflatten_dict -from jax import lax - -from transformers.modeling_flax_outputs import ( - FlaxBaseModelOutput, - FlaxBaseModelOutputWithPast, - FlaxCausalLMOutput, - FlaxCausalLMOutputWithCrossAttentions, -) -from transformers.modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, logging -from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward -from .configuration_mistral import MistralConfig - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "MistralConfig" -_REAL_CHECKPOINT_FOR_DOC = "mistralai/Mistral-7B-v0.1" -_CHECKPOINT_FOR_DOC = "ksmcg/Mistral-tiny" - -MISTRAL_START_DOCSTRING = r""" - - This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a Flax Linen - [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a - regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. 
- - Finally, this model supports inherent JAX features such as: - - - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) - - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) - - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) - - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) - - Parameters: - config ([`MistralConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. - dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): - The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16`, or - `jax.numpy.bfloat16`. - - This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If - specified all the computation will be performed with the given `dtype`. - - **Note that this only specifies the dtype of the computation and does not influence the dtype of model - parameters.** - - If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and - [`~FlaxPreTrainedModel.to_bf16`]. -""" - -MISTRAL_INPUTS_DOCSTRING = r""" - Args: - input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`): - Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast - auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. 
- output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaRMSNorm with Llama->Mistral -class FlaxMistralRMSNorm(nn.Module): - config: MistralConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self): - self.epsilon = self.config.rms_norm_eps - self.weight = self.param("weight", lambda _, shape: jnp.ones(shape), self.config.hidden_size) - - def __call__(self, hidden_states): - variance = jnp.asarray(hidden_states, dtype=jnp.float32) - variance = jnp.power(variance, 2) - variance = variance.mean(-1, keepdims=True) - # use `jax.numpy.sqrt` as `jax.lax.rsqrt` does not match `torch.rsqrt` - hidden_states = hidden_states / jnp.sqrt(variance + self.epsilon) - - return self.weight * jnp.asarray(hidden_states, dtype=self.dtype) - - -# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaRotaryEmbedding with Llama->Mistral -class FlaxMistralRotaryEmbedding(nn.Module): - config: MistralConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self): - head_dim = self.config.hidden_size // self.config.num_attention_heads - self.sincos = create_sinusoidal_positions(self.config.max_position_embeddings, head_dim) - - def __call__(self, key, query, position_ids): - sincos = self.sincos[position_ids] - sin_pos, cos_pos = jnp.split(sincos, 2, axis=-1) - - key = apply_rotary_pos_emb(key, sin_pos, cos_pos) - query = apply_rotary_pos_emb(query, sin_pos, cos_pos) - - key = jnp.asarray(key, dtype=self.dtype) - query = jnp.asarray(query, dtype=self.dtype) - - return key, query - - -# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaMLP with Llama->Mistral -class FlaxMistralMLP(nn.Module): - config: MistralConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self): - embed_dim = self.config.hidden_size - inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * embed_dim - - kernel_init = jax.nn.initializers.normal(self.config.initializer_range) - self.act = ACT2FN[self.config.hidden_act] - - self.gate_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init) - self.down_proj = nn.Dense(embed_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init) - self.up_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init) - - def __call__(self, hidden_states): - up_proj_states = self.up_proj(hidden_states) - gate_states = self.act(self.gate_proj(hidden_states)) - - hidden_states = self.down_proj(up_proj_states * gate_states) - return hidden_states - - -# Copied from transformers.models.llama.modeling_flax_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(tensor, sin_pos, cos_pos): - return (tensor * cos_pos) + (rotate_half(tensor) * sin_pos) - - -# Copied from transformers.models.llama.modeling_flax_llama.create_sinusoidal_positions -def create_sinusoidal_positions(num_pos, dim): - inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim)) - freqs = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32") - - emb = np.concatenate((freqs, freqs), axis=-1) - out = np.concatenate((np.sin(emb)[:, None, :], np.cos(emb)[:, None, :]), axis=-1) - return jnp.array(out[:, :, :num_pos]) - - -# Copied from transformers.models.llama.modeling_flax_llama.rotate_half -def rotate_half(tensor): 
- """Rotates half the hidden dims of the input.""" - rotate_half_tensor = jnp.concatenate( - (-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1 - ) - return rotate_half_tensor - - -class FlaxMistralAttention(nn.Module): - config: MistralConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self): - config = self.config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.attention_softmax_in_fp32 = self.dtype is not jnp.float32 - self.rope_theta = config.rope_theta - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." - ) - self.q_proj = nn.Dense(self.num_heads * self.head_dim, use_bias=False, dtype=self.dtype) - self.k_proj = nn.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, dtype=self.dtype) - self.v_proj = nn.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, dtype=self.dtype) - self.o_proj = nn.Dense(self.hidden_size, use_bias=False, dtype=self.dtype) - casual_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool") - self.causal_mask = jnp.triu(casual_mask, k=-config.sliding_window) - self.rotary_emb = FlaxMistralRotaryEmbedding(config, dtype=self.dtype) - - def _split_heads(self, hidden_states, num_heads): - return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim)) - - def _merge_heads(self, hidden_states): - return hidden_states.reshape(hidden_states.shape[:2] + (self.hidden_size,)) - - @nn.compact - # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoSelfAttention._concatenate_to_cache - def _concatenate_to_cache(self, key, value, query, attention_mask): - """ - This function takes projected key, value states from a single input token and concatenates the states to cached - states from previous steps. This function is slighly adapted from the official Flax repository: - https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 - """ - # detect if we're initializing by absence of existing cache data. 
- is_initialized = self.has_variable("cache", "cached_key") - cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) - cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) - cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) - - if is_initialized: - *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape - # update key, value caches with our new 1d spatial slices - cur_index = cache_index.value - indices = (0,) * len(batch_dims) + (cur_index, 0, 0) - key = lax.dynamic_update_slice(cached_key.value, key, indices) - value = lax.dynamic_update_slice(cached_value.value, value, indices) - cached_key.value = key - cached_value.value = value - num_updated_cache_vectors = query.shape[1] - cache_index.value = cache_index.value + num_updated_cache_vectors - # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. - pad_mask = jnp.broadcast_to( - jnp.arange(max_length) < cur_index + num_updated_cache_vectors, - tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), - ) - attention_mask = combine_masks(pad_mask, attention_mask) - return key, value, attention_mask - - def __call__( - self, - hidden_states: jnp.ndarray, - attention_mask: Optional[jnp.ndarray] = None, - position_ids: Optional[jnp.ndarray] = None, - deterministic: bool = True, - output_attentions: bool = False, - init_cache: bool = False, - ) -> Tuple[jnp.ndarray, jnp.ndarray]: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = self._split_heads(query_states, self.num_heads) - key_states = self._split_heads(key_states, self.num_key_value_heads) - value_states = self._split_heads(value_states, self.num_key_value_heads) - - key_states, query_states = self.rotary_emb(key_states, query_states, position_ids) - query_length, key_length = query_states.shape[1], key_states.shape[1] - if self.has_variable("cache", "cached_key"): - mask_shift = self.variables["cache"]["cache_index"] - max_decoder_length = self.variables["cache"]["cached_key"].shape[1] - causal_mask = lax.dynamic_slice( - self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) - ) - else: - causal_mask = self.causal_mask[:, :, :query_length, :key_length] - - batch_size = hidden_states.shape[0] - causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) - attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) - attention_mask = combine_masks(attention_mask, causal_mask) - - if self.has_variable("cache", "cached_key") or init_cache: - key_states, value_states, attention_mask = self._concatenate_to_cache( - key_states, value_states, query_states, attention_mask - ) - key_states = jnp.repeat(key_states, self.num_key_value_groups, axis=2) - value_states = jnp.repeat(value_states, self.num_key_value_groups, axis=2) - - attention_bias = lax.select( - attention_mask > 0, - jnp.full(attention_mask.shape, 0.0).astype(self.dtype), - jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), - ) - - # usual dot product attention - attention_dtype = jnp.float32 if self.attention_softmax_in_fp32 else self.dtype - attn_weights = dot_product_attention_weights( - query_states, - key_states, - bias=attention_bias, - 
deterministic=deterministic, - dropout_rate=self.config.attention_dropout, - dtype=attention_dtype, - ) - - if self.attention_softmax_in_fp32: - attn_weights = attn_weights.astype(self.dtype) - - attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) - attn_output = self._merge_heads(attn_output) - attn_output = self.o_proj(attn_output) - - outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) - return outputs - - -# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaDecoderLayer with Llama->Mistral -class FlaxMistralDecoderLayer(nn.Module): - config: MistralConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self): - self.input_layernorm = FlaxMistralRMSNorm(self.config, dtype=self.dtype) - self.self_attn = FlaxMistralAttention(self.config, dtype=self.dtype) - self.post_attention_layernorm = FlaxMistralRMSNorm(self.config, dtype=self.dtype) - self.mlp = FlaxMistralMLP(self.config, dtype=self.dtype) - - def __call__( - self, - hidden_states, - attention_mask=None, - position_ids=None, - deterministic: bool = True, - init_cache: bool = False, - output_attentions: bool = False, - ): - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - outputs = self.self_attn( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - deterministic=deterministic, - init_cache=init_cache, - output_attentions=output_attentions, - ) - # residual connection - attn_output = outputs[0] - hidden_states = residual + attn_output - - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - # residual connection - hidden_states = residual + hidden_states - - return (hidden_states,) + outputs[1:] - - -# Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel with GPTNeo->Mistral, GPT_NEO->MISTRAL, transformer->model -class FlaxMistralPreTrainedModel(FlaxPreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - - config_class = MistralConfig - base_model_prefix = "model" - module_class: nn.Module = None - - def __init__( - self, - config: MistralConfig, - input_shape: Tuple = (1, 1), - seed: int = 0, - dtype: jnp.dtype = jnp.float32, - _do_init: bool = True, - **kwargs, - ): - module = self.module_class(config=config, dtype=dtype, **kwargs) - super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) - - def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: - # init input tensors - input_ids = jnp.zeros(input_shape, dtype="i4") - attention_mask = jnp.ones_like(input_ids) - position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape) - params_rng, dropout_rng = jax.random.split(rng) - rngs = {"params": params_rng, "dropout": dropout_rng} - - random_params = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)["params"] - - if params is not None: - random_params = flatten_dict(unfreeze(random_params)) - params = flatten_dict(unfreeze(params)) - for missing_key in self._missing_keys: - params[missing_key] = random_params[missing_key] - self._missing_keys = set() - return freeze(unflatten_dict(params)) - else: - return random_params - - def init_cache(self, batch_size, max_length): - r""" - Args: - batch_size (`int`): - batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. - max_length (`int`): - maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized - cache. - """ - # init input variables to retrieve cache - input_ids = jnp.ones((batch_size, max_length)) - attention_mask = jnp.ones_like(input_ids) - position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) - - init_variables = self.module.init( - jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True - ) - return unfreeze(init_variables["cache"]) - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def __call__( - self, - input_ids, - attention_mask=None, - position_ids=None, - params: dict = None, - past_key_values: dict = None, - dropout_rng: jax.random.PRNGKey = None, - train: bool = False, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - - batch_size, sequence_length = input_ids.shape - - if position_ids is None: - if past_key_values is not None: - raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.") - - position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) - - if attention_mask is None: - attention_mask = jnp.ones((batch_size, sequence_length)) - - # Handle any PRNG if needed - rngs = {} - if dropout_rng is not None: - rngs["dropout"] = dropout_rng - - inputs = {"params": params or self.params} - - # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed down to ensure cache is used. 
It has to be made sure that cache is marked as mutable so that it can be changed by FlaxMistralAttention module - if past_key_values: - inputs["cache"] = past_key_values - mutable = ["cache"] - else: - mutable = False - - outputs = self.module.apply( - inputs, - jnp.array(input_ids, dtype="i4"), - jnp.array(attention_mask, dtype="i4"), - jnp.array(position_ids, dtype="i4"), - not train, - False, - output_attentions, - output_hidden_states, - return_dict, - rngs=rngs, - mutable=mutable, - ) - - # add updated cache to model output - if past_key_values is not None and return_dict: - outputs, past_key_values = outputs - outputs["past_key_values"] = unfreeze(past_key_values["cache"]) - return outputs - elif past_key_values is not None and not return_dict: - outputs, past_key_values = outputs - outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] - - return outputs - - -# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaLayerCollection with Llama->Mistral -class FlaxMistralLayerCollection(nn.Module): - config: MistralConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self): - self.blocks = [ - FlaxMistralDecoderLayer(self.config, dtype=self.dtype, name=str(i)) - for i in range(self.config.num_hidden_layers) - ] - - def __call__( - self, - hidden_states, - attention_mask=None, - position_ids=None, - deterministic: bool = True, - init_cache: bool = False, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = False, - ): - all_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - - for block in self.blocks: - if output_hidden_states: - all_hidden_states += (hidden_states,) - layer_outputs = block( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - deterministic=deterministic, - init_cache=init_cache, - output_attentions=output_attentions, - ) - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions += (layer_outputs[1],) - - # this contains possible `None` values - `FlaxMistralModule` will filter them out - outputs = (hidden_states, all_hidden_states, all_attentions) - - return outputs - - -# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaModule with Llama->Mistral -class FlaxMistralModule(nn.Module): - config: MistralConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self): - self.hidden_size = self.config.hidden_size - embedding_init = jax.nn.initializers.normal(stddev=self.config.initializer_range) - self.embed_tokens = nn.Embed( - self.config.vocab_size, - self.hidden_size, - embedding_init=embedding_init, - dtype=self.dtype, - ) - self.layers = FlaxMistralLayerCollection(self.config, dtype=self.dtype) - self.norm = FlaxMistralRMSNorm(self.config, dtype=self.dtype) - - def __call__( - self, - input_ids, - attention_mask=None, - position_ids=None, - deterministic=True, - init_cache: bool = False, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, - ): - input_embeds = self.embed_tokens(input_ids.astype("i4")) - - outputs = self.layers( - input_embeds, - position_ids=position_ids, - attention_mask=attention_mask, - deterministic=deterministic, - init_cache=init_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - hidden_states = self.norm(hidden_states) - - if output_hidden_states: - all_hidden_states = outputs[1] + (hidden_states,) - 
outputs = (hidden_states, all_hidden_states) + outputs[2:] - else: - outputs = (hidden_states,) + outputs[1:] - - if not return_dict: - return tuple(v for v in outputs if v is not None) - - return FlaxBaseModelOutput( - last_hidden_state=hidden_states, - hidden_states=outputs[1], - attentions=outputs[-1], - ) - - -@add_start_docstrings( - "The bare Mistral Model transformer outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class FlaxMistralModel(FlaxMistralPreTrainedModel): - module_class = FlaxMistralModule - - -append_call_sample_docstring( - FlaxMistralModel, - _CHECKPOINT_FOR_DOC, - FlaxBaseModelOutputWithPast, - _CONFIG_FOR_DOC, - real_checkpoint=_REAL_CHECKPOINT_FOR_DOC, -) - - -# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaForCausalLMModule with Llama->Mistral -class FlaxMistralForCausalLMModule(nn.Module): - config: MistralConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self): - self.model = FlaxMistralModule(self.config, dtype=self.dtype) - self.lm_head = nn.Dense( - self.config.vocab_size, - use_bias=False, - dtype=self.dtype, - kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), - ) - - def __call__( - self, - input_ids, - attention_mask=None, - position_ids=None, - deterministic: bool = True, - init_cache: bool = False, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, - ): - outputs = self.model( - input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - deterministic=deterministic, - init_cache=init_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - lm_logits = self.lm_head(hidden_states) - - if not return_dict: - return (lm_logits,) + outputs[1:] - - return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions) - - -@add_start_docstrings( - """ - The Mistral Model transformer with a language modeling head (linear layer) on top. - """, - MISTRAL_START_DOCSTRING, -) - -# Copied from transformers.models.gptj.modeling_flax_gptj.FlaxGPTJForCausalLM with GPTJ->Mistral -class FlaxMistralForCausalLM(FlaxMistralPreTrainedModel): - module_class = FlaxMistralForCausalLMModule - - def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): - # initializing the cache - batch_size, seq_length = input_ids.shape - - past_key_values = self.init_cache(batch_size, max_length) - # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. - # But since Mistral uses a causal mask, those positions are masked anyways. 
- # Thus we can create a single static attention_mask here, which is more efficient for compilation - extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") - if attention_mask is not None: - position_ids = attention_mask.cumsum(axis=-1) - 1 - extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0)) - else: - position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) - - return { - "past_key_values": past_key_values, - "attention_mask": extended_attention_mask, - "position_ids": position_ids, - } - - def update_inputs_for_generation(self, model_outputs, model_kwargs): - model_kwargs["past_key_values"] = model_outputs.past_key_values - model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1 - return model_kwargs - - -append_call_sample_docstring( - FlaxMistralForCausalLM, - _CHECKPOINT_FOR_DOC, - FlaxCausalLMOutputWithCrossAttentions, - _CONFIG_FOR_DOC, - real_checkpoint=_REAL_CHECKPOINT_FOR_DOC, -) From 8696802c76fad80b0873a53e2943e42021402d8e Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:39:49 +0800 Subject: [PATCH 07/16] Delete collie/models/mistral/convert_mistral_weights_to_hf.py --- .../mistral/convert_mistral_weights_to_hf.py | 276 ------------------ 1 file changed, 276 deletions(-) delete mode 100644 collie/models/mistral/convert_mistral_weights_to_hf.py diff --git a/collie/models/mistral/convert_mistral_weights_to_hf.py b/collie/models/mistral/convert_mistral_weights_to_hf.py deleted file mode 100644 index 4ba6236..0000000 --- a/collie/models/mistral/convert_mistral_weights_to_hf.py +++ /dev/null @@ -1,276 +0,0 @@ -# Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import gc -import json -import os -import shutil -import warnings - -import torch - -from transformers import ( - LlamaTokenizer, - MistralConfig, - MistralForCausalLM, -) - - -try: - from transformers import LlamaTokenizerFast - - tokenizer_class = LlamaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. 
To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - tokenizer_class = LlamaTokenizer - -""" -Sample usage: - -``` -python src/transformers/models/mistral/convert_mistral_weights_to_hf.py \ - --input_dir /path/to/downloaded/mistral/weights --model_size 7B --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import MistralForCausalLM, LlamaTokenizer - -model = MistralForCausalLM.from_pretrained("/output/path") -tokenizer = LlamaTokenizer.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). -""" - -NUM_SHARDS = {"7B": 1} - - -def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): - return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) - - -def read_json(path): - with open(path, "r") as f: - return json.load(f) - - -def write_json(text, path): - with open(path, "w") as f: - json.dump(text, f) - - -def write_model(model_path, input_base_path, model_size, tokenizer_path=None, safe_serialization=True): - # for backward compatibility, before you needed the repo to be called `my_repo/model_size` - if not os.path.isfile(os.path.join(input_base_path, "params.json")): - input_base_path = os.path.join(input_base_path, model_size) - - os.makedirs(model_path, exist_ok=True) - tmp_model_path = os.path.join(model_path, "tmp") - os.makedirs(tmp_model_path, exist_ok=True) - - params = read_json(os.path.join(input_base_path, "params.json")) - num_shards = NUM_SHARDS[model_size] - - # For some reason this is a string in the params.json - sliding_window = int(params["sliding_window"]) - n_layers = params["n_layers"] - n_heads = params["n_heads"] - n_heads_per_shard = n_heads // num_shards - dim = params["dim"] - dims_per_head = dim // n_heads - base = params.get("rope_theta", 10000.0) - inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) - max_position_embeddings = 4096 * 8 - - if tokenizer_path is not None: - tokenizer = tokenizer_class(tokenizer_path) - tokenizer.save_pretrained(model_path) - vocab_size = tokenizer.vocab_size if tokenizer_path is not None else 32000 - - if "n_kv_heads" in params: - num_key_value_heads = params["n_kv_heads"] # for GQA / MQA - num_local_key_value_heads = num_key_value_heads // num_shards - key_value_dim = dims_per_head * num_local_key_value_heads - else: # compatibility with other checkpoints - num_key_value_heads = n_heads - num_local_key_value_heads = n_heads_per_shard - key_value_dim = dim - - # permute for sliced rotary - def permute(w, n_heads=n_heads, dim1=dim, dim2=dim): - return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) - - print(f"Fetching all parameters from the checkpoint at {input_base_path}.") - # Load weights - loaded = [ - torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu") - for i in range(num_shards) - ] - param_count = 0 - index_dict = {"weight_map": {}} - for layer_i in range(n_layers): - filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" - - # Sharded - # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share - # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is - # 
redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned. - - state_dict = { - f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ - f"layers.{layer_i}.attention_norm.weight" - ].clone(), - f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ - f"layers.{layer_i}.ffn_norm.weight" - ].clone(), - } - state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) - for i in range(num_shards) - ], - dim=0, - ).reshape(dim, dim) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( - num_local_key_value_heads, dims_per_head, dim - ) - for i in range(num_shards) - ], - dim=0, - ).reshape(key_value_dim, dim), - num_key_value_heads, - key_value_dim, - dim, - ) - state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(num_local_key_value_heads, dims_per_head, dim) - for i in range(num_shards) - ], - dim=0, - ).reshape(key_value_dim, dim) - - state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 - ) - state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 - ) - - state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq - for k, v in state_dict.items(): - index_dict["weight_map"][k] = filename - param_count += v.numel() - torch.save(state_dict, os.path.join(tmp_model_path, filename)) - - filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" - state_dict = { - "model.norm.weight": loaded[0]["norm.weight"], - "model.embed_tokens.weight": torch.cat([loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1), - "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), - } - - for k, v in state_dict.items(): - index_dict["weight_map"][k] = filename - param_count += v.numel() - torch.save(state_dict, os.path.join(tmp_model_path, filename)) - - # Write configs - index_dict["metadata"] = {"total_size": param_count * 2} - write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) - config = MistralConfig( - hidden_size=dim, - intermediate_size=params["hidden_dim"], - num_attention_heads=params["n_heads"], - num_hidden_layers=params["n_layers"], - rms_norm_eps=params["norm_eps"], - num_key_value_heads=num_key_value_heads, - vocab_size=vocab_size, - rope_theta=base, - max_position_embeddings=max_position_embeddings, - sliding_window=sliding_window, - ) - config.save_pretrained(tmp_model_path) - - # Make space so we can load the model properly now. 
- del state_dict - del loaded - gc.collect() - - print("Loading the checkpoint in a Mistral model.") - model = MistralForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True) - # Avoid saving this as part of the config. - del model.config._name_or_path - model.config.torch_dtype = torch.float16 - print("Saving in the Transformers format.") - model.save_pretrained(model_path, safe_serialization=safe_serialization) - shutil.rmtree(tmp_model_path) - - -def write_tokenizer(tokenizer_path, input_tokenizer_path): - # Initialize the tokenizer based on the `spm` model - print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") - tokenizer = tokenizer_class(input_tokenizer_path) - tokenizer.save_pretrained(tokenizer_path) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - help="Location of Mistral weights, which contains tokenizer.model and model folders", - ) - parser.add_argument( - "--model_size", - choices=["7B", "tokenizer_only"], - help="'f' models correspond to the finetuned versions, and are specific to the Mistral2 official release. For more details on Mistral2, checkout the original repo: https://huggingface.co/meta-mistral", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.") - args = parser.parse_args() - spm_path = os.path.join(args.input_dir, "tokenizer.model") - if args.model_size != "tokenizer_only": - write_model( - model_path=args.output_dir, - input_base_path=args.input_dir, - model_size=args.model_size, - safe_serialization=args.safe_serialization, - tokenizer_path=spm_path, - ) - else: - write_tokenizer(args.output_dir, spm_path) - - -if __name__ == "__main__": - main() From ea0c9b4c742f51d183afdbb63d6279f79f22c899 Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:39:58 +0800 Subject: [PATCH 08/16] Delete collie/models/mistral/configuration_mistral.py --- .../models/mistral/configuration_mistral.py | 152 ------------------ 1 file changed, 152 deletions(-) delete mode 100644 collie/models/mistral/configuration_mistral.py diff --git a/collie/models/mistral/configuration_mistral.py b/collie/models/mistral/configuration_mistral.py deleted file mode 100644 index 20ffba5..0000000 --- a/collie/models/mistral/configuration_mistral.py +++ /dev/null @@ -1,152 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Mistral model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", - "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json", -} - - -class MistralConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an - Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. - - [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`MistralModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 14336): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 8): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to `4096*32`): - The maximum sequence length that this model might ever be used with. Mistral's sliding window attention - allows sequence of up to 4096*32 tokens. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*): - The id of the padding token. - bos_token_id (`int`, *optional*, defaults to 1): - The id of the "beginning-of-sequence" token. 
- eos_token_id (`int`, *optional*, defaults to 2): - The id of the "end-of-sequence" token. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention window size. If not specified, will default to `4096`. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - - ```python - >>> from transformers import MistralModel, MistralConfig - - >>> # Initializing a Mistral 7B style configuration - >>> configuration = MistralConfig() - - >>> # Initializing a model from the Mistral 7B style configuration - >>> model = MistralModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "mistral" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - sliding_window=4096, - attention_dropout=0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) From 3c51c3b7c4d5443d8397da7baeda70b397d2fa5c Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:40:47 +0800 Subject: [PATCH 09/16] Delete collie/models/mistral/__init__.py --- collie/models/mistral/__init__.py | 82 ------------------------------- 1 file changed, 82 deletions(-) delete mode 100644 collie/models/mistral/__init__.py diff --git a/collie/models/mistral/__init__.py b/collie/models/mistral/__init__.py deleted file mode 100644 index c5fa66e..0000000 --- a/collie/models/mistral/__init__.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -from typing import TYPE_CHECKING - -from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_flax_available, is_torch_available - - -_import_structure = { - "configuration_mistral": ["MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP", "MistralConfig"], -} - - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_mistral"] = [ - "MistralForCausalLM", - "MistralModel", - "MistralPreTrainedModel", - "MistralForSequenceClassification", - ] - -try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_flax_mistral"] = [ - "FlaxMistralForCausalLM", - "FlaxMistralModel", - "FlaxMistralPreTrainedModel", - ] - - -if TYPE_CHECKING: - from .configuration_mistral import MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP, MistralConfig - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_mistral import ( - MistralForCausalLM, - MistralForSequenceClassification, - MistralModel, - MistralPreTrainedModel, - ) - - try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_flax_mistral import ( - FlaxMistralForCausalLM, - FlaxMistralModel, - FlaxMistralPreTrainedModel, - ) - - -else: - import sys - - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) From 96a36285bfd01d430531fbeb47898862a4a02284 Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:41:07 +0800 Subject: [PATCH 10/16] Delete collie/models/mistral/__pycache__ directory --- .../__pycache__/__init__.cpython-310.pyc | Bin 1210 -> 0 bytes .../configuration_mistral.cpython-310.pyc | Bin 6270 -> 0 bytes .../modeling_mistral.cpython-310.pyc | Bin 41165 -> 0 bytes 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 collie/models/mistral/__pycache__/__init__.cpython-310.pyc delete mode 100644 collie/models/mistral/__pycache__/configuration_mistral.cpython-310.pyc delete mode 100644 collie/models/mistral/__pycache__/modeling_mistral.cpython-310.pyc diff --git a/collie/models/mistral/__pycache__/__init__.cpython-310.pyc b/collie/models/mistral/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 0eeae894122b29f98b82399a2066890e840baa6a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1210 zcmZuw%Wm306dgaX&BMGBQqo1*O&4Aiy6Xm2MFbL5ASj7i;mydw1DR^ZCNrifSte3< zRsW&8{*t#{r-38r5B$rrZ=YTtncRuluQ~iWS$qv5;{G_zR@nfm`Jh2+}%0+QLzQ z{ER?gUV|y{jh_lKeX{`sKPC8YxA@I1zLoIP4S*E(!_W9xV$rmlrWrS*NDaB>XiH?majGA9OOuP$%;wSwK<9#+Msd zWgG?*Je@O7N<)fy#5@we>UYo1Pn$hBJZ+0-chEkBt&>5gdjy-Ot>f;;HtaWtvE5(t zTZxFUSY830khQ!y_eifF=T?|7hDf}(%7zS`GY<#IU*a#HI?vG6Tx<@th{rje;Bn$D zE>bCggXyD7M~nA5VvA3m>otoT_)`D*2C}_&Wv!e{j4n5IFU~Y@R3;BO6Q>_II+K3~ zX3LP3At%Bhwji8ff*=$xEi0tN_#vO6v9li=(PAbXGgT1dvKVc^Cn2MV@!CAXgfmUb z=rW{aFeI-8F`b1hg5q7w$I+Z2X3FB)2+Pa#KVDc_778*HWhjXdYka|_$H1e)V{A@I zc;S%*k}f5*G|hhjb|Bki_%o_AM8gQZz6>daDQX!|i&6c*N_;(8Oaot$MAt z`i?_FFxoCdhyxr!Seq?ktFc`0ot!WqijcFbs-^1>YRXUyMSsYtx}rYhG}Zh!*Uh4$ Rs+(AKgJX!Es%BX))xXwfO11z1 diff --git a/collie/models/mistral/__pycache__/configuration_mistral.cpython-310.pyc 
b/collie/models/mistral/__pycache__/configuration_mistral.cpython-310.pyc
deleted file mode 100644
index 0731f2e2d91d302df8b2d492eb9e55ec9352b625..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

[GIT binary patch payloads for the deleted collie/models/mistral/__pycache__/*.cpython-310.pyc files omitted]
z3Ewgo)!|wFy2dWExB2;bSj=B-x1uypJ@u4!f(yH$=p8iN20*uf2^+;@p}g9)495m_ z%t!hO0&1V-y2pC^$bqKz)?06cPBD%@@zLYgVfJ(&HaBFRkr<486V>T82)}ohiRXp= zCd7Yac;I6WUIobRyG4iv8kGgwiaZUUQZ0${>Z@Xo*mRygkjO$qs>mX$-w~ZFcR^4Y zSPRsDXAZ1+c%;P52dmHN*8;@`2z=B4cyI)8Iip*1CX76yOPqHUx9QTaiGt;k~ zj1A!4Pcy#Wdj4=zw6IZg>%G6viws- zrXEI&$sSZ~ z@16aVjj4WfUjsS|l=i|833v|Q7w~n-7zW^S%fpcejhhq}0MIhc}^w0MQp$!+?g{+?`<}6;eySL3v%d)88KzB|j%Em{LCnhQfl% zpxCmYcrbVb{|7~|HGtVs5BRh|?u9Qy#euwwUVQNt(mV=s^&q@>qK zOprVf80*_Wi}D*M`%I8~7M+kCFlS@s`V3CMdzzgT^dc2#lT0ra!xNpf)+va_!n|hj z6`6E~#HAo15Ku$}yyn}>!w=n2_kZ)tZpTFYNn;<_o^^zoFFKcs-4a}!AKYJjx@mQZm z0&Lq>Ft*Vh+YcQ!)GBhq7?t7jWgNrJslEwAivz?jY8;Tqx&l9eR01zyCrqM%r&uA1 z&s_NUUWh!fzDN<<#RAPF?|Z8lB5AW|92`&Y?*Y{Ok($>)g@Ua{vSFteTKgQUb(_KW zGxz}pgxl0j9#MHNA7F1+*#p||%3x{(a1{}MTZcyOQSg5DLXev*aM zc`>FNF%EYNSMQ*!+~S;U}wF9n;1&OzbeMtQ&}3DCASprV^Q+@8`%g0Yo2>FJpN||h*s8N z*RTmht$KvD++Im3Gko$yMKSIuA}(MaB|Ze(`Emaaf)IX|l^w=V-LJzAOxfsdEC&;| z-1a}{nc>+2P-+2QD)M5wMh|6(0Skp!u)r)WPt!J>!L&^u!?-O>0TVZ;jB7)|ZzcF0 z4t__RA*bRD2XOMJJw`ZrZbU$1IQyNR@Wbi4eZ`}CPq!$`DrZ;NAji+NW)_*y9ZxmU5FW2JF zp;?rmwtWyfMcDWXidUOB*!aME7D}8>i;*$dx~&)36M~$6U$O!_!Nv|N@R|Fo0HMj= z3hayy?kp5-g9b?=DOm_K_%I$Jfc&x%6f_Yu_a$EF(E%)L@n*b4E zm;~6*w!r@n0p?>_cyrQ(oWZNuAmBj*Ihf3+GYG$B@a)D|J9h`ZE;;>RDg-m30EUE# zPjnYw{E}KGI0zT_I37YRP4#0iETUGR+~SK`7-y-kb+XG9Zx3K)vz$eW~BI)I>j5z20Sr{K$uU}&j+2P0P&V=l<=8~VKGmV_N)&K>xa6X8n$ z!K(BHJnw~c1}JB|zQN@=H>&ZqgY(-jo;g!J1!4!zLBDq}D3T;RLMM;B2r%gQCA=>K zG?5homKl(5HWoj9a(+r@+{3r<_GISgWH;Hf+#<1dDau&XiMdFJEPLcikfg|WU zEx2x+0dPm&16w6*m>vl`ATGGwDk?{vR7kD(rIDwx8lA>Ivh8gq#QGsjc_YFk( zqg@PlXiw1vOqEc+UkOKcAdtKM;2K9EM-oNZO z^8*yGp`^u29cY*Vx3=bfm_NV(0xoYUwDI)}G|D)DFH#C-H3ngYY4^$<=9L@%Wdpq?Ep8;qkqeQbhfWWg!Gz7Xmuv( zCyD(uivJiJ>LOC)Q>9_bVg2kO-kGhaUkK9~k>>R+-tzkvYn zn#O)jm^#Tb$RkmL*H#^uc*F;u8;yMM#Qtex=YHxqbDGd;^d^wlL=f)@4Sd@w8ObuR zInL*HbqzyzCh*)2ALH3 zz@H2)iY{F~`^x1mgQ8PRA^?c+>_i4+pmxm-nI#03{QO7IquAu|6jB9#7)Q0&M7YDo zdN6{p1XzcVU4_4R!+I&7P4jrnEW)fSZ;cF&?9D$^ER5`(ni?skK2sS>eFotfb5*eM k?~LTE=geVQo4z|@Srg`k@EEB+4Byiq!FkF2!->lO3(fVAQ~&?~ From f644007ce5f4ec2875c55922d2634c7db6ab965d Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:41:22 +0800 Subject: [PATCH 11/16] Delete collie/models/mistral2 directory --- collie/models/mistral2/__init__.py | 2 - .../__pycache__/__init__.cpython-310.pyc | Bin 295 -> 0 bytes .../configuration_mistraltp.cpython-310.pyc | Bin 6283 -> 0 bytes .../__pycache__/model.cpython-310.pyc | Bin 49178 -> 0 bytes .../__pycache__/modeltp.cpython-310.pyc | Bin 52277 -> 0 bytes .../mistral2/configuration_mistraltp.py | 155 -- collie/models/mistral2/model.py | 2026 --------------- collie/models/mistral2/modelpp.py | 1922 -------------- collie/models/mistral2/modeltp.py | 2254 ----------------- 9 files changed, 6359 deletions(-) delete mode 100644 collie/models/mistral2/__init__.py delete mode 100644 collie/models/mistral2/__pycache__/__init__.cpython-310.pyc delete mode 100644 collie/models/mistral2/__pycache__/configuration_mistraltp.cpython-310.pyc delete mode 100644 collie/models/mistral2/__pycache__/model.cpython-310.pyc delete mode 100644 collie/models/mistral2/__pycache__/modeltp.cpython-310.pyc delete mode 100644 collie/models/mistral2/configuration_mistraltp.py delete mode 100644 collie/models/mistral2/model.py delete mode 100644 collie/models/mistral2/modelpp.py delete mode 100644 collie/models/mistral2/modeltp.py diff --git a/collie/models/mistral2/__init__.py b/collie/models/mistral2/__init__.py deleted file mode 100644 index 9dc3f79..0000000 --- 
a/collie/models/mistral2/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .modeltp import MistralForCausalLM -from .configuration_mistraltp import MistralConfig \ No newline at end of file diff --git a/collie/models/mistral2/__pycache__/__init__.cpython-310.pyc b/collie/models/mistral2/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 76a01ca4171928aebb54f37b4541ecbf0bd2731f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 295 zcmd1j<>g`kf)fuV(xQO$V-N=!FabFZKwK;XBvKes7;_kM8KW2(L2RZRrd;MIW+0n6 zm_d`}B_mLYCgUw3-^}8YqQo4x{37SX(&EG%A77v-FI3byKQApa-A|Jxiaj?!B{ip{ zpa^6~lz1{&qO>TnBr`uRJ{MvJP?i}eyON=Z1xSI3U(xzSsk!+jsk#~YxvBa&g_W6k z`p)@2KAEoiC8@B?0;HGr)ME}k^pZ=_OaFov{Ui2P^k5ih&js3|=u2pi;`G=BYjBvSiM}3dainiBma(TGk3WBzOvsP<`&8Bdh4a4MB)2lhLT0_(l z1M?q!GQYP_U93s&d-fi$IZ#-O?Ny)oo_o0UCaTt5EITxV<8@Z@hi@OblEayyU-fA= z$pyVHay`BLEeNjdn8_UWC4#FZXk?+Gs9ITlL-kn z4Ab_jE3GYAdKY!flf57H#ke}ItUhs$ zg6z}TH$f(WymE}jKHG1mx^HSFUUm$VBx9Mk`wV^tPTUrkB47>aIf(pV0@t4+#1=CR zm+kTt0Z7I`dbaXk962ux+F{^V*%mhdk``C{vOH5oTam;uCK_bE9vw9t&Q>~B zWYfQ?o(S7}o@wmrzBuG;wl3VDKF6+kZQ>xqzNRo0;{d>0URqdKsAN-V`e!sf1SSpu z4(RB|K#amFyGzLAaRRxav&$t7v_zx9C9n$wJ?Acc4HPAwk-QDM!k57K?m_ARvPmy2 zmE5BX%dfLGk`Tl8TinHT+a`1m=3Khrmar`Do^Hq27k>Psfeqgk$TlDD>25XF$I|sG zU;EY69(>BB!!640(^*}-zPx;**^LJekG2UP>&ZTCUXDS8yv z$)BnAc+Izl_!y~6{2TL!;%yWdwgDO!>^`wbYJ$yf++V58F>wk*i$^ORSU9vkTB$G( zi;(*Qrq>>=bjw_o{S4XJw=Hnq9+yE}iX2+yJjIupEp9$lpPaE6M3 zj==STthUd^#WyL%=21$jY;Lof`SGNeVmO_D0NU|ld5XN_Kf4|>Oys4X-F)nC$tP{ zp|9IR@#rZzXun+(<;2omD(<%4W0es*rtej90pR;wIRuS>6ejPfPSWv}5}mgxGAAu{ zk zI9VP1f&*Js_<92UTWXEj_S%M}tIh^p3U)ZyTn!01#w86D<|h3tgS$BeIExMy^k$ULdHGA_K$={ ziV0-AY8h3%h?{Fhwpfp?tae}}R^|igwYd&WfpMA?L6eM(9^(6512qw4L3nAHx4eg| zJuApeChVjq?DRT%`a?aWQGI*_zoDoo!lQOwRi2Y-4(aYGJ89FUTks22zmsWWxIen{ z)s=_hEs9pEpS7j8YwQXej7pTHGO=E-w-17r=c;$NZ{Lm&LAWF7wxfu%)AnT0rTPsz zbI*0(-BC#!>!R83;ZU3~w&Kd}IQG#eC~yb%z=1!t8Yh4E?02g>Eml9991w>Zh}$!D zI;8D(5BewAi)W>F)TUEz0y)wtKo&ZqkfWU`$YQ5xX`v>sc1FLzGd>yXj6s$`5ers!zUclwK_hnyCov;7n`ol9e~8LyL662W&Wxtgl*_L3*X7$7{;y}aEp@zr&$m2>*X$PuxH7%tJ$N8iYXSEIzorre|Kh8wrCPO` z{F)waAaKb_ywmf2U$W}Sw4hyWAIJ&}3jFyC5XCFST=r8MDddXz?0qp;8p-G4pTd9g zqvQWRyqr)-%stiArPGp0wxh=Y diff --git a/collie/models/mistral2/__pycache__/model.cpython-310.pyc b/collie/models/mistral2/__pycache__/model.cpython-310.pyc deleted file mode 100644 index ab53c9573dc702d9ab95ac9870bc99c46c10a54a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 49178 zcmd75d7K>AeIM9WU48fToR|RyR|5nI30P`8CU?{Km_1{Ko56BT-H?lI3J0RZcb1<+P#0XUZ7~mu=+C zm=SB_%LBoaf;_S6nMSUh6Td`#urX8~Y7Cc$8zbcr@k`c68)M}$xlYx$G{(#0jfwJv z+^6eX8{5j;8k6NoxzE(MH+GbFG4werZG3V}^G4-R!d!c@B<52lf<8b+~#4uPd zHI9^zG>(>!Hjb5#!EeYp?ig$LIfrf=3-{kNuExsmaQ3e~;5>NSD8JJlF274^;GxEM zl^>QE%=(G)3Gq8wJ}JMClpn$GNd4W7Q{__YU3?*C7oFU~%*}YY3ZFgpUij<{d@k8{+4~T7);?|DZQt{P zQMM6gzkL8<4g_Hwd&dHY^o%_Z$a_Td+-@@0fA*+&reND#ILpQH9M_#AU; zNO1x8$L;%Yf1lhhIyGm(S+wu}iFmn=XYa5dz_SN}w;T3@<)(9&{Z6NO)5Ozv*$?6A zLr(E#{H9TE+22)O>U#RHeF9HU=y&n_3f?$rKLWo;oM(`>LfCf)F`csC1ON9pEs4ps z&GWITM^`3_Pu{rDQnQzf=W4E}s`X-{WjpoK%N%IaQHShA-)_l!#&$e9GA3Ck@^H|eUt)&~{lR9_)F<#)U zg;95&p&#jS#8v2= zs_vqH@GXN>=31)a+DldBaN<2x*Ez?sJ+bU9Eql!ElT{RL{ItY=_FSKT@w{_o*=f!? 
diff --git a/collie/models/mistral2/__pycache__/modeltp.cpython-310.pyc b/collie/models/mistral2/__pycache__/modeltp.cpython-310.pyc
deleted file mode 100644
index f7c6a28cecdfc2502d5bbb914f4ff15a9b802990..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

diff --git a/collie/models/mistral2/configuration_mistraltp.py b/collie/models/mistral2/configuration_mistraltp.py
deleted file mode 100644
index ad6691b..0000000
--- a/collie/models/mistral2/configuration_mistraltp.py
+++ /dev/null
@@ -1,155 +0,0 @@
-# coding=utf-8
-# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Mistral model configuration""" - -from transformers.configuration_utils import PretrainedConfig -# from transformers.utils import logging -from collie.log.logger import logger - - -# logger = logging.get_logger(__name__) - -MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", - "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json", -} - - -class MistralConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an - Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. - - [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`MistralModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 14336): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 8): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to `4096*32`): - The maximum sequence length that this model might ever be used with. Mistral's sliding window attention - allows sequence of up to 4096*32 tokens. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
- rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*): - The id of the padding token. - bos_token_id (`int`, *optional*, defaults to 1): - The id of the "beginning-of-sequence" token. - eos_token_id (`int`, *optional*, defaults to 2): - The id of the "end-of-sequence" token. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention window size. If not specified, will default to `4096`. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - - ```python - >>> from transformers import MistralModel, MistralConfig - - >>> # Initializing a Mistral 7B style configuration - >>> configuration = MistralConfig() - - >>> # Initializing a model from the Mistral 7B style configuration - >>> model = MistralModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "mistral" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - sliding_window=4096, - attention_dropout=0.0, - attn_implementation="flash_attention_2", - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - # 调用父类的初始化函数,将一些公共参数传递给父类处理 - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/collie/models/mistral2/model.py b/collie/models/mistral2/model.py deleted file mode 100644 index 60d9553..0000000 --- a/collie/models/mistral2/model.py +++ /dev/null @@ -1,2026 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Mistral model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel, dtype_byte_size -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_mistraltp import Mistral2Config - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "Mistral2Config" - -#modified for collie -import torch.distributed as dist -import gc -import json -import os -from collections import OrderedDict -from megatron.core import parallel_state, tensor_parallel -from einops import rearrange -from deepspeed.pipe import LayerSpec, TiedLayerSpec - -from collie.config import CollieConfig -from collie.driver.io import IODriver -from collie.log.logger import logger -from collie.module import ( - ColumnParallelLinearWithoutBias, - ColumnParallelLMHead, - RowParallelLinearWithoutBias, -) -from collie.utils import concat_tensor, dict_as_params, env, progress -from collie.models.base import CollieModelForCausalLM -from collie.models.utils import ( - kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, - kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, -) - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral -class Mistral2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - MistralRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, 
hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - ans = self.weight * hidden_states.to(input_dtype) - # -------------------------------------------------------- - # # 将Tensor转换为列表 - # ans_list = ans.tolist() - # # 指定.json文件的路径 - # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/rms_ans.json' - - # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - # try: - # with open(file_path, 'r', encoding='utf-8') as file: - # results_list = json.load(file) - # except FileNotFoundError: - # results_list = [] - # # 将当前结果添加到列表中 - # results_list.append(ans_list) - # # 将更新后的列表写回.json文件 - # with open(file_path, 'w', encoding='utf-8') as file: - # json.dump(results_list, file, ensure_ascii=False, indent=4) - # file.write('\n') # 在文件末尾添加一个换行符 - # -------------------------------------------------------- - return ans - - -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class Mistral2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -# TODO @Arthur no longer copied from LLama after static cache -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. 
For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. - """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class Mistral2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - - self.up_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.gate_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.down_proj = RowParallelLinearWithoutBias( - self.intermediate_size, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Mistral2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." 
- ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." - ) - # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.q_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.k_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.v_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.o_proj = RowParallelLinearWithoutBias( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.rotary_emb = Mistral2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads_tp, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads_tp, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads_tp, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads_tp, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # 
-------------------------------------------------------- - # 将Tensor转换为列表 - ans_list = attn_output.tolist() - # 指定.json文件的路径 - file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/attn_output_1.json' - - # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - try: - with open(file_path, 'r', encoding='utf-8') as file: - results_list = json.load(file) - except FileNotFoundError: - results_list = [] - # 将当前结果添加到列表中 - results_list.append(ans_list) - # 将更新后的列表写回.json文件 - with open(file_path, 'w', encoding='utf-8') as file: - json.dump(results_list, file, ensure_ascii=False, indent=4) - file.write('\n\n\n') # 在文件末尾添加一个换行符 - # -------------------------------------------------------- - - - - return attn_output, attn_weights, past_key_value - - -class Mistral2FlashAttention2(Mistral2Attention): - """ - Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." 
- ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # -------------------------------------------------------- - # 将Tensor转换为列表 - ans_list = attn_output.tolist() - # 指定.json文件的路径 - file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/attn_output_1.json' - - # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - try: - with open(file_path, 'r', encoding='utf-8') as file: - results_list = json.load(file) - except FileNotFoundError: - results_list = [] - # 将当前结果添加到列表中 - results_list.append(ans_list) - # 将更新后的列表写回.json文件 - with open(file_path, 'w', encoding='utf-8') as file: - json.dump(results_list, file, ensure_ascii=False, indent=4) - file.write('\n\n\n') # 在文件末尾添加一个换行符 - # -------------------------------------------------------- - - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
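For reference, the unpad-then-repad flow described in the `_flash_attention_forward` docstring above relies on the bookkeeping returned by `_get_unpad_data`. The sketch below is illustrative only: the helper body is copied from this module, while the sample mask, variable names, and printed values are made up for the example.

```python
import torch
import torch.nn.functional as F

def _get_unpad_data(attention_mask):
    # Copied from this module: per-sequence lengths, flat indices of real tokens,
    # and cumulative sequence boundaries for the varlen flash-attention kernel.
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return indices, cu_seqlens, max_seqlen_in_batch

# Illustrative batch: two sequences of lengths 3 and 2, padded to length 4.
mask = torch.tensor([[1, 1, 1, 0],
                     [1, 1, 0, 0]])
indices, cu_seqlens, max_seqlen = _get_unpad_data(mask)
print(indices)     # tensor([0, 1, 2, 4, 5]) -> positions of the 5 real tokens in the flattened batch
print(cu_seqlens)  # tensor([0, 3, 5], dtype=torch.int32) -> offsets of each packed sequence
print(max_seqlen)  # 3
```

`indices` is what `index_first_axis` / `pad_input` use to pack and later re-pad the hidden states, and `cu_seqlens` / `max_seqlen` are what `flash_attn_varlen_func` consumes for the packed batch.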
- causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class Mistral2SdpaAttention(Mistral2Attention): - """ - Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from MistralAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, 
value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.tp_size)) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -MISTRAL_ATTENTION_CLASSES = { - "eager": Mistral2Attention, - "flash_attention_2": Mistral2FlashAttention2, - "sdpa": Mistral2SdpaAttention, -} - - -class MistralDecoderLayer(nn.Module): - def __init__(self, config: CollieConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - - self.mlp = Mistral2MLP(config) - self.input_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). 
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - # -------------------------------------------------------- - # # 将Tensor转换为列表 - # ans_list = [tensor.tolist() for tensor in outputs] - # # 指定.json文件的路径 - # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/decoder_outputs.json' - - # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - # try: - # with open(file_path, 'r', encoding='utf-8') as file: - # results_list = json.load(file) - # except FileNotFoundError: - # results_list = [] - # # 将当前结果添加到列表中 - # results_list.append(ans_list) - # # 将更新后的列表写回.json文件 - # with open(file_path, 'w', encoding='utf-8') as file: - # json.dump(results_list, file, ensure_ascii=False, indent=4) - # file.write('\n') # 在文件末尾添加一个换行符 - # -------------------------------------------------------- - - return outputs - - -MISTRAL_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`MistralConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class Mistral2PreTrainedModel(PreTrainedModel): - config_class = Mistral2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["MistralDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -MISTRAL_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
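As a minimal sketch of the cache contract described above (once a cache exists, only the newly generated token needs to be passed, and the model hands back the cache in the same format it was given), here is an illustrative prefill-then-decode loop. It uses the standard `transformers` API; the checkpoint name is simply reused from the usage example further below, and greedy argmax decoding is an arbitrary choice for the sketch.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint reused from the example docstring below; any causal LM with KV caching works.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")

input_ids = tokenizer("The quick brown fox", return_tensors="pt").input_ids

# Prefill: run the full prompt once and keep the returned cache.
out = model(input_ids=input_ids, use_cache=True)
past_key_values = out.past_key_values
next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)

# Decode: with a cache present, only the last (new) token is fed back in.
for _ in range(5):
    out = model(input_ids=next_token, past_key_values=past_key_values, use_cache=True)
    past_key_values = out.past_key_values          # same format that was passed in
    next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)

print(tokenizer.decode(next_token[0]))
```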
-""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class Mistral2Model(nn.Module): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] - - Args: - config: MistralConfig - """ - - def __init__(self, config: CollieConfig): - # super().__init__(config) - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - # self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - - # -------------------------------------------------------- - # # 将Tensor转换为列表 - # ans_list = inputs_embeds.tolist() - # # 指定.json文件的路径 - # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/inputs_embeds.json' - - # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - # try: - # with open(file_path, 'r', encoding='utf-8') as file: - # results_list = json.load(file) - # except FileNotFoundError: - # results_list = [] - # # 将当前结果添加到列表中 - # results_list.append(ans_list) - # # 将更新后的列表写回.json文件 - # with open(file_path, 'w', encoding='utf-8') as file: - # json.dump(results_list, file, ensure_ascii=False, indent=4) - # file.write('\n') # 在文件末尾添加一个换行符 - # # -------------------------------------------------------- - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] 
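The `_prepare_4d_causal_attention_mask(..., sliding_window=self.config.sliding_window)` call above combines the causal constraint with Mistral's sliding-window limit. The boolean sketch below only illustrates that masking pattern (the real helper also folds in padding and emits an additive float mask); the window size is chosen arbitrarily.

```python
import torch

def sliding_window_causal_mask(seq_len: int, window: int) -> torch.Tensor:
    """True where query position i may attend to key position j."""
    i = torch.arange(seq_len).unsqueeze(1)  # query positions, column vector
    j = torch.arange(seq_len).unsqueeze(0)  # key positions, row vector
    causal = j <= i                         # never attend to the future
    in_window = (i - j) < window            # never look further back than `window` tokens
    return causal & in_window

mask = sliding_window_causal_mask(seq_len=6, window=3)
# mask[5] is [False, False, False, True, True, True]: token 5 only sees tokens 3..5.
```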
- - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Mistral2ForCausalLM(CollieModelForCausalLM): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config:CollieConfig): - super().__init__(config) - self.model = Mistral2Model(config) - self.vocab_size = config.vocab_size - # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.lm_head = ColumnParallelLinearWithoutBias( - self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False - ) - # Initialize weights and apply final processing - # self.post_init() - # GenerationMixin 需要的额外参数 - self.config.is_decoder = True - if config.model_config.tie_word_embeddings: - self.lm_head.weight = self.embed_tokens.weight - self.main_input_name = "input_ids" - - def clean_cache(self): - self._clean_hidden_states([*self.model.layers, self.lm_head]) - self._set_use_cache(self.model.layers, False) - - def set_cache(self, use_cache): - self._set_use_cache(self.model.layers, use_cache) - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MistralForCausalLM - - >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" 
- >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Ensure tensors are on the same device - shift_labels = shift_labels.to(shift_logits.device) - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
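For the label handling in `forward` above: the logits at position `t` are scored against the token at position `t + 1`, which is why both tensors are shifted by one before the flattened cross-entropy. A tiny standalone sketch of that alignment (all sizes and the random tensors are arbitrary):

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size, batch, seq = 10, 2, 5
logits = torch.randn(batch, seq, vocab_size)
labels = torch.randint(0, vocab_size, (batch, seq))

# Position t's logits predict the token at position t + 1.
shift_logits = logits[..., :-1, :].contiguous()   # (batch, seq - 1, vocab)
shift_labels = labels[..., 1:].contiguous()       # (batch, seq - 1)

loss = CrossEntropyLoss()(
    shift_logits.view(-1, vocab_size),            # flatten to (batch * (seq - 1), vocab)
    shift_labels.view(-1),
)
print(loss)
```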
- if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - @classmethod - def pipeline_layers(cls, config: CollieConfig): - """ - Get layers of pipeline. - :return: list - """ - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - - if config.tie_word_embeddings: - output = TiedLayerSpec( - "embed_tokens", - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - else: - output = LayerSpec( - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - - return [("model", Mistral2Model.pipeline_layers(config)), ("lm_head", output)] - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - protocol: str = "file", # 指定加载state_dict时使用的协议 - **kwargs, - ): - """ - Load state_dict from ``path``. - The format of pretrained model should be the same as that of - `huggingface`. - :return: state_dict. Note that the state_dict should be processed - properly to match the current rank. 
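As background for the splitting loop below: under the Megatron-style convention used by the column/row parallel layers, column-parallel weights are sharded along the output dimension (dim 0) and row-parallel weights along the input dimension (dim 1). A standalone sketch, with plain variables standing in for `config.tp_size` and `env.tp_rank`:

```python
import torch

tp_size, tp_rank = 2, 0          # stand-ins for config.tp_size / env.tp_rank
full_weight = torch.randn(8, 4)  # e.g. a projection weight of shape (out_features, in_features)

# Column-parallel style (q/k/v_proj, gate/up_proj, embed_tokens, lm_head):
# shard the output dimension, i.e. chunk along dim 0.
col_shard = torch.chunk(full_weight, tp_size, dim=0)[tp_rank]   # shape (4, 4)

# Row-parallel style (o_proj, down_proj): shard the input dimension, dim 1.
row_shard = torch.chunk(full_weight, tp_size, dim=1)[tp_rank]   # shape (8, 2)

# Concatenating all ranks' shards along the same dim recovers the full weight,
# which is the round trip the save path relies on.
assert torch.equal(torch.cat(torch.chunk(full_weight, tp_size, dim=0), dim=0), full_weight)
```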
- """ - # 配置加载 - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - # IO驱动初始化 - io_driver = IODriver.from_protocol(protocol) - # 检查文件路径是否存在 - if not io_driver.exists(path): - raise FileNotFoundError(f"folder {path} not found.") - # 初始化存储和处理变量 - state_dict = OrderedDict() - weights = [] - parts = None # 变量用于存储模型分割的部分信息 - # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 - hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 - rank_order = range(dist.get_world_size()) - else: - # 不开启只进行一次循环 - rank_order = range(1) - # 权重文件加载和处理 - for rank in rank_order: - # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 - if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: - # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 - if env.is_pipeline: - # 保存的是 json 格式 - parts = env.pipeline_parts - if hasattr(config, "num_key_value_heads"): - # llama2 (transformers >= 4.31.0) - num_key_value_heads = config.num_key_value_heads - else: - num_key_value_heads = config.num_attention_heads - head_dim = config.hidden_size // config.num_attention_heads - # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 - if ( - io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) - and "COLLIE_PP_PARTS" in os.environ.keys() - ): - weight_map = json.loads( - io_driver.load( - os.path.join(path, "pytorch_model.bin.index.json"), mode="r" - ) - )["weight_map"] - # layers 表示自己需要的层 - layers = env.pipeline_layers_idx - # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) - weights.extend( - [ - value - for key, value in weight_map.items() - if len(key.split(".")) > 2 - and key.split(".")[2].isdigit() - and (int(key.split(".")[2]) + 1) in layers - ] - ) - # 去重 - weights = list(set(weights)) - # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm - if 0 in layers: - weights.append(weight_map["model.tok_embeddings.weight"]) - if max(parts) - 1 in layers: - weights.append(weight_map["output.weight"]) - if max(parts) - 2 in layers: - weights.append(weight_map["model.norm.weight"]) - else: - # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 - weights = [ - weight - for weight in io_driver.list(path) - if weight.endswith(".bin") - ] - with progress( - weights, - desc="Loading state dict", - total=len(weights), - disable=hide_progress, - ) as pbar: - for weight in pbar: - part_state_dict = io_driver.load( - os.path.join(path, weight), mode="rb" - ) - # for key in list(part_state_dict.keys()): - # if "attention.wqkv.weight" in key: - # # qkv_weights = part_state_dict.pop(key) - # qkv_weights = part_state_dict[key] - # print(qkv_weights.shape) - # (wq, wk, wv) = qkv_weights.split( - # [ - # config.hidden_size, - # config.num_key_value_heads * head_dim, - # config.num_key_value_heads * head_dim, - # ], - # dim=0, - # ) - # wq_name = key.replace("wqkv", "wq") - # wk_name = key.replace("wqkv", "wk") - # wv_name = key.replace("wqkv", "wv") - # part_state_dict[wq_name] = wq - # part_state_dict[wk_name] = wk - # part_state_dict[wv_name] = wv - state_dict.update(part_state_dict) - del part_state_dict - if parts is not None: - # 这一步是 pp 的复筛 - layers = env.pipeline_layers_idx - for key in list(state_dict.keys()): - if key.startswith("layers"): - layer = int(key.split(".")[1]) - if layer + 1 not in layers: - state_dict.pop(key) - # if key.endswith("tok_embeddings.weight"): - if 
key.endswith("embed_tokens.weight"): - if 0 not in layers: - state_dict.pop(key) - if key == "norm.weight": - if max(parts) - 2 not in layers: - state_dict.pop(key) - # if key.endswith("output.weight"): - if key.endswith("lm_head.weight"): - if max(parts) - 1 not in layers: - state_dict.pop(key) - # 根据用户配置的新的 tp size 进行分割 - for key in list(state_dict.keys()): - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - "o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - "down_proj.weight", - "embed_tokens.weight", - ] - col_split = any([key.endswith(filter) for filter in col_filter]) - - if col_split: - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - if dist.is_initialized() and process_exclusion: - # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 - dist.barrier() - return state_dict - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - protocol: str = "file", - ): - """ - Save state_dict to ``path``. - The format of saved state dict should be the same as that of - `huggingface`. - """ - io_driver = IODriver.from_protocol(protocol) - # gather to tp rank 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 pp_size 次循环 - rank_order = range(config.pp_size) - else: - # 不开启只进行一次循环 - rank_order = range(1) - dst = parallel_state.get_tensor_model_parallel_src_rank() - with progress( - rank_order, - desc="Saving model", - disable=int(os.environ.get("RANK", "0")) != 0, - ) as pbar: - for rank in pbar: - if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): - for key in sorted(list(state_dict.keys())): - tensor_list = None - if env.tp_rank == 0: - tensor_list = [ - torch.zeros_like(state_dict[key]) - .to(state_dict[key].dtype) - .cuda() - for _ in range(config.tp_size) - ] - dist.gather( - state_dict[key].cuda(), - dst=dst, - gather_list=tensor_list, - group=env.tp_group, - ) - if env.tp_rank == 0: - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - "o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - "down_proj.weight", - "embed_tokens.weight", - ] - col_split = any( - [key.endswith(filter) for filter in col_filter] - ) - - if col_split: - state_dict[key] = concat_tensor(tensor_list, dim=0) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - state_dict[key] = concat_tensor(tensor_list, dim=1) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - # 似乎不需要? 
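The saving path below reverses that sharding by gathering every rank's shard onto the tensor-parallel source rank and concatenating. A single-process sketch of the gather-then-concat pattern, using the `gloo` backend on CPU (the real code uses the TP group, CUDA tensors, and `concat_tensor`):

```python
import os
import torch
import torch.distributed as dist

# Single-process "group" purely to show the collective call pattern.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

shard = torch.randn(4, 4)                                        # this rank's slice of a weight
gather_list = [torch.zeros_like(shard) for _ in range(dist.get_world_size())]
dist.gather(shard, gather_list=gather_list, dst=0)               # only the dst rank fills gather_list

full_weight = torch.cat(gather_list, dim=0)                      # column-parallel: concat along dim 0
dist.destroy_process_group()
```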
- # state_dict_keys = state_dict.keys() - # for layer_id in range(config.num_layers): - # qkv_names = [None, None, None] - # for key in state_dict_keys: - # if f"layers.{layer_id}.attention.wq.weight" in key: - # qkv_names[0] = key - # elif f"layers.{layer_id}.attention.wk.weight" in key: - # qkv_names[1] = key - # elif f"layers.{layer_id}.attention.wv.weight" in key: - # qkv_names[2] = key - # qkv_name = qkv_names[0].replace("wq", "wqkv") - # state_dict[qkv_name] = torch.cat( - # [ - # state_dict.pop(qkv_names[0]), - # state_dict.pop(qkv_names[1]), - # state_dict.pop(qkv_names[2]), - # ], - # dim=0 - # ) - - if env.tp_rank == 0: - # Save gathered weights - if env.is_pipeline: - ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" - total_size = 0 - weight_map = {} - for name, weight in state_dict.items(): - weight_size = weight.numel() * dtype_byte_size( - weight.dtype - ) - weight_map[name] = ckpt_name - total_size += weight_size - index_dict = dict( - total_size=total_size, weight_map=weight_map - ) - index_dicts = [None for _ in range(env.pp_size)] - dist.gather_object( - index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group - ) - if env.pp_rank == 0: - total_size = 0 - weight_map = {} - for _index_dict in index_dicts: - total_size += _index_dict["total_size"] - weight_map.update(_index_dict["weight_map"]) - merged_dict = { - "metadata": {"total_size": total_size}, - "weight_map": weight_map, - } - io_driver.save( - json.dumps(merged_dict, indent=2, sort_keys=True) - + "\n", - os.path.join(path, "pytorch_model.bin.index.json"), - ) - - else: - ckpt_name = f"pytorch_model.bin" - ckpt_path = os.path.join(path, ckpt_name) - io_driver.save(state_dict, ckpt_path) - if dist.is_initialized() and process_exclusion: - dist.barrier() - if env.rank == 0: - config.save_pretrained(path, protocol=protocol) - dist.barrier() - - -@add_start_docstrings( - """ - The Mistral Model transformer with a sequence classification head on top (linear layer). - - [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
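A small sketch of the last-token pooling just described, using the same ONNX-friendly indexing trick as the classification forward pass below (`pad_token_id`, the toy batch, and the random logits are arbitrary):

```python
import torch

pad_token_id = 0
input_ids = torch.tensor([
    [5, 8, 3, 0, 0],   # two padding tokens -> last real token at index 2
    [7, 2, 9, 4, 6],   # no padding         -> last real token at index 4
])
logits = torch.randn(2, 5, 3)  # (batch, seq_len, num_labels)

# Index of the first pad token minus one; the modulo maps "no pad found"
# (argmax 0 - 1 = -1) onto the last position without reverse indexing.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]

pooled_logits = logits[torch.arange(input_ids.shape[0]), sequence_lengths]  # (batch, num_labels)
```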
- """, - MISTRAL_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL -class MistralForSequenceClassification(Mistral2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Mistral2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - 
loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/collie/models/mistral2/modelpp.py b/collie/models/mistral2/modelpp.py deleted file mode 100644 index 1180a10..0000000 --- a/collie/models/mistral2/modelpp.py +++ /dev/null @@ -1,1922 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Mistral model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel, dtype_byte_size -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_mistraltp import Mistral2Config - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "Mistral2Config" - -#modified for collie -import torch.distributed as dist -import gc -import json -import os -from collections import OrderedDict -from megatron.core import parallel_state, tensor_parallel -from einops import rearrange -from deepspeed.pipe import LayerSpec, TiedLayerSpec - -from collie.config import CollieConfig -from collie.driver.io import IODriver -from collie.log.logger import logger -from collie.module import ( - 
ColumnParallelLinearWithoutBias, - ColumnParallelLMHead, - RowParallelLinearWithoutBias, -) -from collie.utils import concat_tensor, dict_as_params, env, progress -from collie.models.base import CollieModelForCausalLM -from collie.models.utils import ( - kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, - kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, -) - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral -class Mistral2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - MistralRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class Mistral2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -# TODO @Arthur no longer copied from LLama after static cache -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
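A tiny end-to-end sketch of the rotary embedding machinery defined above (`head_dim`, `seq_len`, and the random tensors are arbitrary; shapes follow the `[batch, heads, seq_len, head_dim]` convention, i.e. `unsqueeze_dim=1`):

```python
import torch

head_dim, seq_len, base = 8, 4, 10000.0

# Inverse frequencies and the cos/sin cache, mirroring Mistral2RotaryEmbedding above.
inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))  # (head_dim / 2,)
t = torch.arange(seq_len).float()
freqs = torch.outer(t, inv_freq)                      # (seq_len, head_dim / 2)
emb = torch.cat((freqs, freqs), dim=-1)               # (seq_len, head_dim)
cos, sin = emb.cos(), emb.sin()

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

# q, k: (batch, num_heads, seq_len, head_dim); position_ids: (batch, seq_len)
q = torch.randn(1, 2, seq_len, head_dim)
k = torch.randn(1, 2, seq_len, head_dim)
position_ids = torch.arange(seq_len).unsqueeze(0)

cos_p = cos[position_ids].unsqueeze(1)                # (batch, 1, seq_len, head_dim)
sin_p = sin[position_ids].unsqueeze(1)
q_embed = (q * cos_p) + (rotate_half(q) * sin_p)
k_embed = (k * cos_p) + (rotate_half(k) * sin_p)

# RoPE only rotates each 2-D pair of dimensions, so per-head norms are preserved.
assert torch.allclose(q_embed.norm(dim=-1), q.norm(dim=-1), atol=1e-4)
```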
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class Mistral2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - - self.up_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.gate_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.down_proj = RowParallelLinearWithoutBias( - self.intermediate_size, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Mistral2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.q_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.k_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.v_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.o_proj = RowParallelLinearWithoutBias( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.rotary_emb = Mistral2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads_tp, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads_tp, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads_tp, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads_tp, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class Mistral2FlashAttention2(Mistral2Attention): - """ - Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
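For reference, the `rearrange` calls above split the fused projection output into heads using `head_dim` alone, which is what lets the same code run under tensor parallelism where only a slice of the heads exists on each rank. A minimal sketch of the equivalence with the commented-out `view` path, using hypothetical toy sizes:

```python
import torch
from einops import rearrange

bsz, q_len, num_heads, head_dim = 2, 5, 4, 8       # hypothetical toy sizes
x = torch.randn(bsz, q_len, num_heads * head_dim)  # fused projection output

# einops infers the local head count from head_dim alone
split_einops = rearrange(x, "b n (h d) -> b n h d", d=head_dim)
# the view() path needs num_heads known up front
split_view = x.view(bsz, q_len, num_heads, head_dim)

assert torch.equal(split_einops, split_view)
```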
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. 
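The cache-slicing branch above drops everything but the most recent `sliding_window - 1` key/value positions before the new token is appended. A small sketch of that negative-index slicing, with a hypothetical window size and a toy cache tensor:

```python
import torch

sliding_window = 4
# toy KV cache of shape (batch, num_kv_heads, seq_len, head_dim)
past_key = torch.arange(8).view(1, 1, 8, 1)

slicing_tokens = 1 - sliding_window            # -3
kept = past_key[:, :, slicing_tokens:, :]      # keep the last sliding_window - 1 positions

assert kept.shape[-2] == sliding_window - 1
print(kept.flatten().tolist())                 # [5, 6, 7]
```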
- - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class Mistral2SdpaAttention(Mistral2Attention): - """ - Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from MistralAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
- is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.tp_size)) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -MISTRAL_ATTENTION_CLASSES = { - "eager": Mistral2Attention, - "flash_attention_2": Mistral2FlashAttention2, - "sdpa": Mistral2SdpaAttention, -} - - -class MistralDecoderLayer(nn.Module): - def __init__(self, config: CollieConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - - self.mlp = Mistral2MLP(config) - self.input_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -MISTRAL_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. 
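The decoder layer above follows the usual pre-norm residual layout: normalize, run the sublayer, then add the untouched input back, once for attention and once for the MLP. A stripped-down sketch of that control flow; the sublayers here are hypothetical stand-ins, not the real attention or gated MLP:

```python
import torch
from torch import nn

class ToyPreNormBlock(nn.Module):
    def __init__(self, hidden_size: int):
        super().__init__()
        self.input_norm = nn.LayerNorm(hidden_size)       # stand-in for the RMSNorm layers
        self.post_attention_norm = nn.LayerNorm(hidden_size)
        self.attn = nn.Linear(hidden_size, hidden_size)   # stand-in for self-attention
        self.mlp = nn.Linear(hidden_size, hidden_size)    # stand-in for the gated MLP

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        residual = hidden_states
        hidden_states = residual + self.attn(self.input_norm(hidden_states))
        residual = hidden_states
        hidden_states = residual + self.mlp(self.post_attention_norm(hidden_states))
        return hidden_states

out = ToyPreNormBlock(16)(torch.randn(2, 5, 16))
assert out.shape == (2, 5, 16)
```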
- - Parameters: - config ([`MistralConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class Mistral2PreTrainedModel(PreTrainedModel): - config_class = Mistral2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["MistralDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -MISTRAL_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. 
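As the docstring notes, both a `Cache` instance and the legacy tuple format are accepted; the model converts between them with `DynamicCache.from_legacy_cache` and `to_legacy_cache`, which are used later in this file. A small round-trip sketch with hypothetical shapes:

```python
import torch
from transformers.cache_utils import DynamicCache

# legacy format: a tuple over layers of (key, value) tensors,
# each of shape (batch_size, num_heads, sequence_length, head_dim)
legacy = tuple(
    (torch.randn(1, 8, 4, 64), torch.randn(1, 8, 4, 64)) for _ in range(2)
)

cache = DynamicCache.from_legacy_cache(legacy)
assert cache.get_seq_length() == 4

roundtrip = cache.to_legacy_cache()
assert torch.equal(roundtrip[0][0], legacy[0][0])
```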
- - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class Mistral2Model(nn.Module): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] - - Args: - config: MistralConfig - """ - - def __init__(self, config: CollieConfig): - # super().__init__(config) - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - # self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if 
input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
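When `position_ids` is not supplied, the forward pass above builds it as a range offset by the number of already-cached tokens, so decode steps keep their absolute positions. A minimal sketch with hypothetical lengths:

```python
import torch

past_key_values_length = 6   # tokens already held in the KV cache (hypothetical)
seq_length = 2               # new tokens in this forward pass (hypothetical)

position_ids = torch.arange(
    past_key_values_length, seq_length + past_key_values_length, dtype=torch.long
).unsqueeze(0)

print(position_ids)          # tensor([[6, 7]])
```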
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Mistral2ForCausalLM(CollieModelForCausalLM): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config:CollieConfig): - super().__init__(config) - self.model = Mistral2Model(config) - self.vocab_size = config.vocab_size - # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.lm_head = ColumnParallelLinearWithoutBias( - self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False - ) - # Initialize weights and apply final processing - # self.post_init() - # GenerationMixin 需要的额外参数 - self.config.is_decoder = True - if config.model_config.tie_word_embeddings: - self.lm_head.weight = self.embed_tokens.weight - self.main_input_name = "input_ids" - - def clean_cache(self): - self._clean_hidden_states([*self.model.layers, self.lm_head]) - self._set_use_cache(self.model.layers, False) - - def set_cache(self, use_cache): - self._set_use_cache(self.model.layers, use_cache) - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = 
None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MistralForCausalLM - - >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Ensure tensors are on the same device - shift_labels = shift_labels.to(shift_logits.device) - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = 
past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - @classmethod - def pipeline_layers(cls, config: CollieConfig): - """ - Get layers of pipeline. - :return: list - """ - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - - if config.tie_word_embeddings: - output = TiedLayerSpec( - "embed_tokens", - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - else: - output = LayerSpec( - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - - return [("model", Mistral2Model.pipeline_layers(config)), ("lm_head", output)] - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - protocol: str = "file", # 指定加载state_dict时使用的协议 - **kwargs, - ): - """ - Load state_dict from ``path``. - The format of pretrained model should be the same as that of - `huggingface`. - :return: state_dict. Note that the state_dict should be processed - properly to match the current rank. 
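The generation helper above derives `position_ids` from a cumulative sum over the attention mask, which keeps positions contiguous for left-padded rows. A small sketch with a hypothetical left-padded batch:

```python
import torch

# hypothetical left-padded batch: 0 marks padding in the attention mask
attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])

position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)

print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
```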
- """ - # 配置加载 - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - # IO驱动初始化 - io_driver = IODriver.from_protocol(protocol) - # 检查文件路径是否存在 - if not io_driver.exists(path): - raise FileNotFoundError(f"folder {path} not found.") - # 初始化存储和处理变量 - state_dict = OrderedDict() - weights = [] - parts = None # 变量用于存储模型分割的部分信息 - # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 - hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 - rank_order = range(dist.get_world_size()) - else: - # 不开启只进行一次循环 - rank_order = range(1) - # 权重文件加载和处理 - for rank in rank_order: - # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 - if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: - # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 - if env.is_pipeline: - # 保存的是 json 格式 - parts = env.pipeline_parts - if hasattr(config, "num_key_value_heads"): - # llama2 (transformers >= 4.31.0) - num_key_value_heads = config.num_key_value_heads - else: - num_key_value_heads = config.num_attention_heads - head_dim = config.hidden_size // config.num_attention_heads - # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 - if ( - io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) - and "COLLIE_PP_PARTS" in os.environ.keys() - ): - weight_map = json.loads( - io_driver.load( - os.path.join(path, "pytorch_model.bin.index.json"), mode="r" - ) - )["weight_map"] - # layers 表示自己需要的层 - layers = env.pipeline_layers_idx - # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) - weights.extend( - [ - value - for key, value in weight_map.items() - if len(key.split(".")) > 2 - and key.split(".")[2].isdigit() - and (int(key.split(".")[2]) + 1) in layers - ] - ) - # 去重 - weights = list(set(weights)) - # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm - if 0 in layers: - weights.append(weight_map["model.tok_embeddings.weight"]) - if max(parts) - 1 in layers: - weights.append(weight_map["output.weight"]) - if max(parts) - 2 in layers: - weights.append(weight_map["model.norm.weight"]) - else: - # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 - weights = [ - weight - for weight in io_driver.list(path) - if weight.endswith(".bin") - ] - with progress( - weights, - desc="Loading state dict", - total=len(weights), - disable=hide_progress, - ) as pbar: - for weight in pbar: - part_state_dict = io_driver.load( - os.path.join(path, weight), mode="rb" - ) - # for key in list(part_state_dict.keys()): - # if "attention.wqkv.weight" in key: - # # qkv_weights = part_state_dict.pop(key) - # qkv_weights = part_state_dict[key] - # print(qkv_weights.shape) - # (wq, wk, wv) = qkv_weights.split( - # [ - # config.hidden_size, - # config.num_key_value_heads * head_dim, - # config.num_key_value_heads * head_dim, - # ], - # dim=0, - # ) - # wq_name = key.replace("wqkv", "wq") - # wk_name = key.replace("wqkv", "wk") - # wv_name = key.replace("wqkv", "wv") - # part_state_dict[wq_name] = wq - # part_state_dict[wk_name] = wk - # part_state_dict[wv_name] = wv - state_dict.update(part_state_dict) - del part_state_dict - if parts is not None: - # 这一步是 pp 的复筛 - layers = env.pipeline_layers_idx - for key in list(state_dict.keys()): - if key.startswith("layers"): - layer = int(key.split(".")[1]) - if layer + 1 not in layers: - state_dict.pop(key) - # if key.endswith("tok_embeddings.weight"): - if 
key.endswith("embed_tokens.weight"): - if 0 not in layers: - state_dict.pop(key) - if key == "norm.weight": - if max(parts) - 2 not in layers: - state_dict.pop(key) - # if key.endswith("output.weight"): - if key.endswith("lm_head.weight"): - if max(parts) - 1 not in layers: - state_dict.pop(key) - # 根据用户配置的新的 tp size 进行分割 - for key in list(state_dict.keys()): - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - "o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - "down_proj.weight", - "embed_tokens.weight", - ] - col_split = any([key.endswith(filter) for filter in col_filter]) - - if col_split: - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - if dist.is_initialized() and process_exclusion: - # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 - dist.barrier() - return state_dict - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - protocol: str = "file", - ): - """ - Save state_dict to ``path``. - The format of saved state dict should be the same as that of - `huggingface`. - """ - io_driver = IODriver.from_protocol(protocol) - # gather to tp rank 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 pp_size 次循环 - rank_order = range(config.pp_size) - else: - # 不开启只进行一次循环 - rank_order = range(1) - dst = parallel_state.get_tensor_model_parallel_src_rank() - with progress( - rank_order, - desc="Saving model", - disable=int(os.environ.get("RANK", "0")) != 0, - ) as pbar: - for rank in pbar: - if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): - for key in sorted(list(state_dict.keys())): - tensor_list = None - if env.tp_rank == 0: - tensor_list = [ - torch.zeros_like(state_dict[key]) - .to(state_dict[key].dtype) - .cuda() - for _ in range(config.tp_size) - ] - dist.gather( - state_dict[key].cuda(), - dst=dst, - gather_list=tensor_list, - group=env.tp_group, - ) - if env.tp_rank == 0: - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - "o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - "down_proj.weight", - "embed_tokens.weight", - ] - col_split = any( - [key.endswith(filter) for filter in col_filter] - ) - - if col_split: - state_dict[key] = concat_tensor(tensor_list, dim=0) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - state_dict[key] = concat_tensor(tensor_list, dim=1) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - # 似乎不需要? 
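The tensor-parallel resharding above chunks column-parallel weights along dim 0 and is meant to chunk the row-parallel `o_proj.weight` and `down_proj.weight` along dim 1 (note that the column filter list as written also matches those two keys, so they would take the dim-0 branch). A minimal round-trip sketch with hypothetical sizes and plain tensors in place of a real state dict:

```python
import torch

tp_size = 2
hidden_size, out_features = 8, 16                 # hypothetical sizes

q_proj = torch.randn(out_features, hidden_size)   # column-parallel: shard output rows (dim 0)
o_proj = torch.randn(hidden_size, out_features)   # row-parallel: shard input columns (dim 1)

q_shards = list(torch.chunk(q_proj, tp_size, dim=0))   # each (out_features // tp_size, hidden_size)
o_shards = list(torch.chunk(o_proj, tp_size, dim=1))   # each (hidden_size, out_features // tp_size)

# saving undoes the split by concatenating along the same dimension
assert torch.equal(torch.cat(q_shards, dim=0), q_proj)
assert torch.equal(torch.cat(o_shards, dim=1), o_proj)
```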
- # state_dict_keys = state_dict.keys() - # for layer_id in range(config.num_layers): - # qkv_names = [None, None, None] - # for key in state_dict_keys: - # if f"layers.{layer_id}.attention.wq.weight" in key: - # qkv_names[0] = key - # elif f"layers.{layer_id}.attention.wk.weight" in key: - # qkv_names[1] = key - # elif f"layers.{layer_id}.attention.wv.weight" in key: - # qkv_names[2] = key - # qkv_name = qkv_names[0].replace("wq", "wqkv") - # state_dict[qkv_name] = torch.cat( - # [ - # state_dict.pop(qkv_names[0]), - # state_dict.pop(qkv_names[1]), - # state_dict.pop(qkv_names[2]), - # ], - # dim=0 - # ) - - if env.tp_rank == 0: - # Save gathered weights - if env.is_pipeline: - ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" - total_size = 0 - weight_map = {} - for name, weight in state_dict.items(): - weight_size = weight.numel() * dtype_byte_size( - weight.dtype - ) - weight_map[name] = ckpt_name - total_size += weight_size - index_dict = dict( - total_size=total_size, weight_map=weight_map - ) - index_dicts = [None for _ in range(env.pp_size)] - dist.gather_object( - index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group - ) - if env.pp_rank == 0: - total_size = 0 - weight_map = {} - for _index_dict in index_dicts: - total_size += _index_dict["total_size"] - weight_map.update(_index_dict["weight_map"]) - merged_dict = { - "metadata": {"total_size": total_size}, - "weight_map": weight_map, - } - io_driver.save( - json.dumps(merged_dict, indent=2, sort_keys=True) - + "\n", - os.path.join(path, "pytorch_model.bin.index.json"), - ) - - else: - ckpt_name = f"pytorch_model.bin" - ckpt_path = os.path.join(path, ckpt_name) - io_driver.save(state_dict, ckpt_path) - if dist.is_initialized() and process_exclusion: - dist.barrier() - if env.rank == 0: - config.save_pretrained(path, protocol=protocol) - dist.barrier() - - -@add_start_docstrings( - """ - The Mistral Model transformer with a sequence classification head on top (linear layer). - - [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
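The classification head described above pools the hidden state at each row's last non-padding token. A small sketch of the index arithmetic used for that, with hypothetical token ids and `pad_token_id = 0`:

```python
import torch

pad_token_id = 0
input_ids = torch.tensor([[5, 6, 7, 0, 0],    # right-padded row
                          [3, 4, 8, 9, 2]])   # row with no padding

# index of the first pad token minus one; the modulo keeps the
# no-padding row in range instead of producing -1
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]

print(sequence_lengths)   # tensor([2, 4])
```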
- """, - MISTRAL_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL -class MistralForSequenceClassification(Mistral2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Mistral2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - 
loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/collie/models/mistral2/modeltp.py b/collie/models/mistral2/modeltp.py deleted file mode 100644 index e91037f..0000000 --- a/collie/models/mistral2/modeltp.py +++ /dev/null @@ -1,2254 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Mistral model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel, dtype_byte_size -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_mistraltp import MistralConfig - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "MistralConfig" - -#modified for collie -import torch.distributed as dist -import gc -import json -import os -from collections import OrderedDict -from megatron.core import parallel_state, tensor_parallel -from einops import rearrange -from deepspeed.pipe import LayerSpec, TiedLayerSpec - -from collie.config import CollieConfig -from collie.driver.io import IODriver -from collie.log.logger import logger -from collie.module import ( - 
ColumnParallelLinearWithoutBias, - ColumnParallelLMHead, - RowParallelLinearWithoutBias, -) -from collie.utils import concat_tensor, dict_as_params, env, progress -from collie.models.base import CollieModelForCausalLM -from collie.models.utils import ( - kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, - kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, -) - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral -class MistralRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - MistralRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - ans = self.weight * hidden_states.to(input_dtype) - - # # 打印层标准化的输出 - hidden_states_output = ans.detach().cpu().tolist() - data_to_save = {"Layer Norm Output": hidden_states_output} - # 将输出写入 JSON 文件 - with open('a_rms_output.json', 'w') as f: - json.dump(data_to_save, f, indent=4) - - return self.weight * hidden_states.to(input_dtype) - - -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class MistralRotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
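For reference, the RMS normalization above reduces to scaling each feature vector by the reciprocal root-mean-square of its entries and then applying the learned weight. A minimal functional sketch with hypothetical sizes (without the JSON debug dump):

```python
import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # compute the statistics in float32, then cast back, as the module above does
    variance = x.float().pow(2).mean(-1, keepdim=True)
    return weight * (x.float() * torch.rsqrt(variance + eps)).to(x.dtype)

x = torch.randn(2, 3, 8, dtype=torch.float16)
weight = torch.ones(8, dtype=torch.float16)
out = rms_norm(x, weight)
assert out.shape == x.shape and out.dtype == x.dtype
```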
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -# TODO @Arthur no longer copied from LLama after static cache -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
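# [Editorial sketch, not part of the original diff] How the rotary cos/sin cache built
# above is consumed by apply_rotary_pos_emb: gather cos/sin at each token's position
# and mix them into q (and k) with rotate_half. All sizes here are toy assumptions.
import torch

def rotate_half_demo(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

dim, seq_len, base = 8, 6, 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))   # (dim/2,)
t = torch.arange(seq_len).float()
emb = torch.cat([torch.outer(t, inv_freq)] * 2, dim=-1)              # (seq_len, dim)
cos, sin = emb.cos(), emb.sin()

q = torch.randn(1, 2, seq_len, dim)                                  # (bsz, heads, seq, head_dim)
position_ids = torch.arange(seq_len).unsqueeze(0)                    # (bsz, seq)
cos_p = cos[position_ids].unsqueeze(1)                               # unsqueeze_dim=1: broadcast over heads
sin_p = sin[position_ids].unsqueeze(1)
q_rot = (q * cos_p) + (rotate_half_demo(q) * sin_p)                  # rotated queries, same shape as q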
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class MistralMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - - self.up_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.gate_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.down_proj = RowParallelLinearWithoutBias( - self.intermediate_size, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - # 打印MLP层输出 - mlp_output = output.detach().cpu().tolist() - data_to_save = {"MLP Output": mlp_output} - # 将输出写入 JSON 文件 - with open('a_mlp_output.json', 'w') as f: - json.dump(data_to_save, f, indent=4) - - return output - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class MistralAttention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.q_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.k_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.v_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - # aaaa - self.o_proj = RowParallelLinearWithoutBias( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.rotary_emb = MistralRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, # 输入维度 [bsz, q_len, hidden_size] - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) # [bsz, q_len, num_heads * head_dim] - key_states = self.k_proj(hidden_states) # [bsz, q_len, num_key_value_heads * head_dim] - value_states = self.v_proj(hidden_states) # [bsz, q_len, num_key_value_heads * head_dim] - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_heads, head_dim] - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_key_value_heads, head_dim] - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_key_value_heads, head_dim] - ) - - query_states = query_states.transpose(1, 2) # [bsz, num_heads, q_len, head_dim] - key_states = key_states.transpose(1, 2) # [bsz, num_key_value_heads, q_len, head_dim] - value_states = value_states.transpose(1, 2) # [bsz, num_key_value_heads, q_len, head_dim] - - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("a_attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
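# [Editorial sketch, not part of the original diff] What the Column/RowParallel
# projections above amount to, emulated on a single process with plain tensors:
# a column-parallel linear shards its *output* features across tp_size ranks (so each
# rank holds num_heads / tp_size heads, matching the shape checks below), and the
# row-parallel o_proj shards its *input* features, so summing the per-rank partial
# outputs (the all-reduce) recovers the full projection. tp_size=2 and the sizes
# below are assumptions; this is not the Megatron API itself.
import torch
import torch.nn.functional as F

tp_size, hidden, heads, head_dim = 2, 16, 4, 4
x = torch.randn(1, 3, hidden)
w_q = torch.randn(heads * head_dim, hidden)     # full q_proj weight (out_features, in_features)
w_o = torch.randn(hidden, heads * head_dim)     # full o_proj weight

full = F.linear(F.linear(x, w_q), w_o)          # unsharded reference

shard = (heads // tp_size) * head_dim           # per-rank slice: num_heads / tp_size heads
partial_sum = torch.zeros_like(full)
for rank in range(tp_size):
    w_q_rank = w_q[rank * shard : (rank + 1) * shard]      # column-parallel: split output rows
    w_o_rank = w_o[:, rank * shard : (rank + 1) * shard]   # row-parallel: split input columns
    q_rank = F.linear(x, w_q_rank)                          # gather_output=False: stays sharded
    partial_sum += F.linear(q_rank, w_o_rank)               # summing partials plays the all-reduce

assert torch.allclose(full, partial_sum, atol=1e-4)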
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)) - - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # 打印注意力模块的输出 - attention_result = { - "Output weights:": attn_output.detach().cpu().tolist(), - # "Attention weights:": attn_weights.detach().cpu().tolist(), - } - # 将数据写入 JSON 文件 - with open("a_attention_outputs.json", "w") as f: - json.dump(attention_result, f, indent=4) - - return attn_output, attn_weights, past_key_value - - -class MistralFlashAttention2(MistralAttention): - """ - Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
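# [Editorial sketch, not part of the original diff] The eager attention math used in
# the forward pass above, reduced to a single head: scores = QK^T / sqrt(head_dim),
# add the additive causal mask, softmax in float32, then weight the values.
# Shapes below are toy assumptions.
import math
import torch

bsz, q_len, head_dim = 1, 4, 8
q = torch.randn(bsz, q_len, head_dim)
k = torch.randn(bsz, q_len, head_dim)
v = torch.randn(bsz, q_len, head_dim)

scores = q @ k.transpose(-2, -1) / math.sqrt(head_dim)                 # (bsz, q_len, q_len)
causal = torch.triu(torch.full((q_len, q_len), float("-inf")), diagonal=1)
weights = torch.softmax((scores + causal).float(), dim=-1).to(q.dtype)  # upcast softmax to fp32
attn_out = weights @ v                                                  # (bsz, q_len, head_dim)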
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("a_flash_attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." 
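# [Editorial sketch, not part of the original diff] The sliding-window cache trimming
# performed above, in isolation: once the cache has grown past `sliding_window`, only
# the last (sliding_window - 1) cached positions are kept, so cache plus the incoming
# token never exceeds the window. Sizes below are assumptions.
import torch

sliding_window = 4
past_key = torch.randn(1, 2, 10, 8)                      # (bsz, kv_heads, cached_len, head_dim)

slicing_tokens = 1 - sliding_window                      # == -(sliding_window - 1)
trimmed_key = past_key[:, :, slicing_tokens:, :].contiguous()
assert trimmed_key.shape[-2] == sliding_window - 1       # matches the shape check in the code above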
- ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # 打印注意力模块的输出 - attention_result = { - "Output weights:": attn_output.detach().cpu().tolist(), - # "Attention weights:": attn_weights.detach().cpu().tolist(), - } - # 将数据写入 JSON 文件 - with open("a_flash_attention_outputs.json", "w") as f: - json.dump(attention_result, f, indent=4) - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
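# [Editorial sketch, not part of the original diff] What the `_get_unpad_data` helper
# defined earlier in this file feeds the varlen flash-attention path described above:
# per-sequence lengths from the padding mask, flat indices of the real tokens, and
# cumulative sequence boundaries (cu_seqlens). The mask below is a toy assumption.
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])                       # 0 marks padding
seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)                # tensor([3, 5])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
# cu_seqlens == tensor([0, 3, 8]): sequence i occupies rows cu_seqlens[i]:cu_seqlens[i+1]
# of the unpadded (total_tokens, heads, head_dim) tensors passed to the varlen kernel.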
- causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
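# [Editorial sketch, not part of the original diff] The grouped-query expansion that
# `repeat_kv` performs before the attention calls above: each of the
# num_key_value_heads is repeated num_heads // num_key_value_heads times so K/V line
# up with the query heads. Head counts below are toy assumptions.
import torch

batch, kv_heads, n_rep, slen, head_dim = 1, 2, 4, 5, 8   # 2 kv heads serving 8 query heads
kv = torch.randn(batch, kv_heads, slen, head_dim)
expanded = kv[:, :, None, :, :].expand(batch, kv_heads, n_rep, slen, head_dim)
expanded = expanded.reshape(batch, kv_heads * n_rep, slen, head_dim)
assert expanded.shape == (1, 8, 5, 8)
assert torch.equal(expanded[:, 0], expanded[:, 1])        # repeats of the same kv head share values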
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class MistralSdpaAttention(MistralAttention): - """ - Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from MistralAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("a_sdpa_attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, 
key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.config.tp_size)) - - attn_output = self.o_proj(attn_output) - - # 打印注意力模块的输出 - attention_result = { - "Output weights:": attn_output.detach().cpu().tolist(), - # "Attention weights:": attn_weights.detach().cpu().tolist(), - } - # 将数据写入 JSON 文件 - with open("a_sdpa_attention_outputs.json", "w") as f: - json.dump(attention_result, f, indent=4) - - return attn_output, None, past_key_value - - -MISTRAL_ATTENTION_CLASSES = { - "eager": MistralAttention, - "flash_attention_2": MistralFlashAttention2, - "sdpa": MistralSdpaAttention, -} - - -class MistralDecoderLayer(nn.Module): - def __init__(self, config: CollieConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - config._attn_implementation = "sdpa" - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.config = config - self.mlp = MistralMLP(config) - self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.idx = layer_idx - # 务必保持变量名一致 - self.use_cache = self.config.model_config.use_cache - self.hidden_states = None - self.output_attentions = False - -class MistralDecoderLayer(nn.Module): - def __init__(self, config: CollieConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - config._attn_implementation = "sdpa" - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.config = config - self.mlp = MistralMLP(config) - self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.idx = layer_idx - # 务必保持变量名一致 - self.use_cache = self.config.model_config.use_cache - self.hidden_states = None - self.output_attentions = False - - def _forward( - self, - hidden_states: torch.Tensor, - attention_mask: 
Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - # output_attentions: Optional[bool] = False, - # use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - # if "padding_mask" in kwargs: - # warnings.warn( - # "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - # ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - # output_attentions=output_attentions, - # use_cache=use_cache, - **kwargs, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - # outputs = (hidden_states,) - - # if output_attentions: - # outputs += (self_attn_weights,) - - # if use_cache: - # outputs += (present_key_value,) - - return hidden_states, present_key_value - - def forward(self, inputs: dict): - layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) - - if self.config.checkpointing and self.training: - hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( - self._forward, - inputs["hidden_states"], - inputs.get("attention_mask", None), - inputs.get("position_ids", None), - layer_past, # inputs.get("past_key_values", None), - ) - else: - hidden_states, new_layer_past = self._forward( - inputs["hidden_states"], - inputs.get("attention_mask", None), - inputs.get("position_ids", None), - layer_past - ) # **inputs - inputs["hidden_states"] = hidden_states - - inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) - return inputs - - - # def _forward( - # self, - # hidden_states: torch.Tensor, - # attention_mask: Optional[torch.Tensor] = None, - # position_ids: Optional[torch.LongTensor] = None, - # past_key_value: Optional[Tuple[torch.Tensor]] = None, - # # output_attentions: Optional[bool] = False, - # # use_cache: Optional[bool] = False, - # **kwargs, - # ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - # # if "padding_mask" in kwargs: - # # warnings.warn( - # # "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - # # ) - # """ - # Args: - # hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - # attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - # `(batch, sequence_length)` where padding elements are indicated by 0. - # output_attentions (`bool`, *optional*): - # Whether or not to return the attentions tensors of all attention layers. See `attentions` under - # returned tensors for more detail. - # use_cache (`bool`, *optional*): - # If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - # (see `past_key_values`). - # past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - # """ - - # residual = hidden_states - - # hidden_states = self.input_layernorm(hidden_states) - - # # Self Attention - # hidden_states, self_attn_weights, present_key_value = self.self_attn( - # hidden_states=hidden_states, - # attention_mask=attention_mask, - # position_ids=position_ids, - # past_key_value=past_key_value, - # # output_attentions=output_attentions, - # # use_cache=use_cache, - # **kwargs, - # ) - # hidden_states = residual + hidden_states - - # # Fully Connected - # residual = hidden_states - # hidden_states = self.post_attention_layernorm(hidden_states) - # hidden_states = self.mlp(hidden_states) - # hidden_states = residual + hidden_states - - # # outputs = (hidden_states,) - - # # if output_attentions: - # # outputs += (self_attn_weights,) - - # # if use_cache: - # # outputs += (present_key_value,) - - # return hidden_states, present_key_value - - # def forward(self, inputs: dict): - # layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) - - # if self.config.checkpointing and self.training: - # hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( - # self._forward, - # inputs["hidden_states"], - # inputs.get("attention_mask", None), - # inputs.get("position_ids", None), - # layer_past, # inputs.get("past_key_values", None), - # ) - # else: - # hidden_states, new_layer_past = self._forward( - # inputs["hidden_states"], - # inputs.get("attention_mask", None), - # inputs.get("position_ids", None), - # layer_past - # ) # **inputs - # inputs["hidden_states"] = hidden_states - - # inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) - # return inputs - - # def forward( - # self, - # hidden_states: torch.Tensor, - # attention_mask: Optional[torch.Tensor] = None, - # position_ids: Optional[torch.LongTensor] = None, - # past_key_value: Optional[Tuple[torch.Tensor]] = None, - # output_attentions: Optional[bool] = False, - # use_cache: Optional[bool] = False, - # **kwargs, - # ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - # if "padding_mask" in kwargs: - # warnings.warn( - # "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - # ) - # """ - # Args: - # hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - # attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - # `(batch, sequence_length)` where padding elements are indicated by 0. - # output_attentions (`bool`, *optional*): - # Whether or not to return the attentions tensors of all attention layers. See `attentions` under - # returned tensors for more detail. 
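# [Editorial sketch, not part of the original diff] The pre-norm residual wiring of
# `MistralDecoderLayer._forward` above, with the attention and MLP calls replaced by
# stand-in callables so only the data flow is shown.
import torch

def decoder_block(hidden_states, norm1, attn, norm2, mlp):
    residual = hidden_states
    hidden_states = norm1(hidden_states)          # input_layernorm
    hidden_states = attn(hidden_states)           # self-attention (mask/cache omitted here)
    hidden_states = residual + hidden_states      # first residual connection

    residual = hidden_states
    hidden_states = norm2(hidden_states)          # post_attention_layernorm
    hidden_states = mlp(hidden_states)            # gated MLP
    return residual + hidden_states               # second residual connection

# e.g. decoder_block(torch.randn(1, 4, 8), *(lambda x: x,) * 4) keeps the input shape.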
- # use_cache (`bool`, *optional*): - # If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - # (see `past_key_values`). - # past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - # """ - - # residual = hidden_states - - # hidden_states = self.input_layernorm(hidden_states) - - # # Self Attention - # hidden_states, self_attn_weights, present_key_value = self.self_attn( - # hidden_states=hidden_states, - # attention_mask=attention_mask, - # position_ids=position_ids, - # past_key_value=past_key_value, - # output_attentions=output_attentions, - # use_cache=use_cache, - # **kwargs, - # ) - # hidden_states = residual + hidden_states - - # # Fully Connected - # residual = hidden_states - # hidden_states = self.post_attention_layernorm(hidden_states) - # hidden_states = self.mlp(hidden_states) - # hidden_states = residual + hidden_states - - # outputs = (hidden_states,) - - # if output_attentions: - # outputs += (self_attn_weights,) - - # if use_cache: - # outputs += (present_key_value,) - - # return outputs - - -MISTRAL_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`MistralConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class MistralPreTrainedModel(PreTrainedModel): - config_class = MistralConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["MistralDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -MISTRAL_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. 
- - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class MistralModel(nn.Module): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`MistralDecoderLayer`] - - Args: - config: MistralConfig - """ - - def __init__(self, config: CollieConfig): - # super().__init__(config) - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - # aaaa - # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.embed_tokens = tensor_parallel.VocabParallelEmbedding( - config.vocab_size, config.hidden_size, params_dtype=torch.float32 - ) - self.layers = nn.ModuleList( - [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - config._attn_implementation = "sdpa" - self._attn_implementation = config._attn_implementation - self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.gradient_checkpointing = False - # Initialize weights and apply final processing - # self.post_init() - - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - # aaaa - past_key_values: Optional[Tuple[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - # 打印嵌入层输出 - embeddings_output = inputs_embeds.detach().cpu().tolist() - data_to_save = {"Embeddings Output": embeddings_output} - # 将输出写入 JSON 文件 - with open('a_embeddings_output.json', 'w') as f: - json.dump(data_to_save, f, indent=4) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
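# [Editorial sketch, not part of the original diff] How the default position_ids are
# built above when a KV cache is present: positions continue from the number of
# already-cached tokens instead of restarting at zero. Lengths are toy assumptions.
import torch

past_key_values_length, seq_length = 6, 2                # 6 cached tokens, 2 new ones
position_ids = torch.arange(
    past_key_values_length, seq_length + past_key_values_length, dtype=torch.long
).unsqueeze(0).view(-1, seq_length)
# position_ids == tensor([[6, 7]]): the new tokens are embedded at absolute positions 6 and 7.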
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - inputs = { - "input_ids": input_ids, - "hidden_states": hidden_states, - "attention_mask": attention_mask, - "position_ids": position_ids, - "past_key_values": past_key_values, - "output_attentions": output_attentions, - "use_cache": use_cache, - } - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - # for decoder_layer in self.layers: - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - # all_hidden_states += (hidden_states,) - all_hidden_states += (inputs["hidden_states"],) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - # hidden_states, - # attention_mask, - # position_ids, - # past_key_values, - # output_attentions, - # use_cache, - inputs, - ) - else: - layer_outputs = decoder_layer( - # hidden_states, - # attention_mask=attention_mask, - # position_ids=position_ids, - # past_key_value=past_key_values, - # output_attentions=output_attentions, - # use_cache=use_cache, - inputs, - ) - inputs.update(layer_outputs) - - # hidden_states = layer_outputs[0] - hidden_states = inputs["hidden_states"] - - if use_cache: - # next_decoder_cache = layer_outputs[2 if output_attentions else 1] - next_decoder_cache = inputs["addition_info"][1 if output_attentions else 0] - - if output_attentions: - # all_self_attns += (layer_outputs[1],) - all_self_attns += (inputs["addition_info"][0],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - # past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - past_key_values=past_key_values, - ) - - @classmethod - def pipeline_layers(cls, config: CollieConfig): - """ - Get layers of pipeline. 
- :return: list - """ - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - - if config.tie_word_embeddings: - embed_tokens = TiedLayerSpec( - "embed_tokens", - dict_as_params(input_keys="input_ids", output_keys="hidden_states"), - tensor_parallel.VocabParallelEmbedding, - config.vocab_size, - config.hidden_size, - ) - else: - embed_tokens = LayerSpec( - dict_as_params(input_keys="input_ids", output_keys="hidden_states"), - tensor_parallel.VocabParallelEmbedding, - config.vocab_size, - config.hidden_size, - ) - - layers = [ - LayerSpec(MistralDecoderLayer, config, i) for i in range(config.num_hidden_layers) - ] - norm = LayerSpec( - dict_as_params(input_keys="hidden_states", output_keys="hidden_states"), - MistralRMSNorm, - hidden_size=config.hidden_size, - eps=config.rms_norm_eps, - ) - - return [ - ("embed_tokens", embed_tokens), - ("layers", layers), - ("norm", norm), - ] - -class MistralForCausalLM(CollieModelForCausalLM): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config:CollieConfig): - super().__init__(config) - self.model = MistralModel(config) - self.vocab_size = config.vocab_size - # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.lm_head = ColumnParallelLinearWithoutBias( - self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False - ) - # Initialize weights and apply final processing - # self.post_init() - # GenerationMixin 需要的额外参数 - self.config.is_decoder = True - if config.model_config.tie_word_embeddings: - self.lm_head.weight = self.embed_tokens.weight - self.main_input_name = "input_ids" - - def clean_cache(self): - self._clean_hidden_states([*self.model.layers, self.lm_head]) - self._set_use_cache(self.model.layers, False) - - def set_cache(self, use_cache): - self._set_use_cache(self.model.layers, use_cache) - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
- - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MistralForCausalLM - - >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Ensure tensors are on the same device - shift_labels = shift_labels.to(shift_logits.device) - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. 
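# [Editorial sketch, not part of the original diff] The shift-by-one language-model
# loss computed in `forward` above: position t's logits are scored against token t+1,
# so the last logit and the first label are dropped. The vocab size is an assumption.
import torch
from torch.nn import CrossEntropyLoss

vocab_size, seq_len = 11, 5
logits = torch.randn(1, seq_len, vocab_size)
labels = torch.randint(0, vocab_size, (1, seq_len))

shift_logits = logits[..., :-1, :].contiguous().view(-1, vocab_size)   # predictions for t = 0..seq_len-2
shift_labels = labels[..., 1:].contiguous().view(-1)                   # targets     for t = 1..seq_len-1
loss = CrossEntropyLoss()(shift_logits, shift_labels)
# Labels set to -100 would be ignored (the loss's default ignore_index), which is how
# padding/prompt tokens are masked out, as the docstring above notes.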
- - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - @classmethod - def pipeline_layers(cls, config: CollieConfig): - """ - Get layers of pipeline. - :return: list - """ - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - - if config.tie_word_embeddings: - output = TiedLayerSpec( - "embed_tokens", - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - else: - output = LayerSpec( - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - - return [("model", MistralModel.pipeline_layers(config)), ("lm_head", output)] - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - protocol: str = "file", # 指定加载state_dict时使用的协议 - **kwargs, - ): - """ - Load state_dict from ``path``. - The format of pretrained model should be the same as that of - `huggingface`. - :return: state_dict. Note that the state_dict should be processed - properly to match the current rank. 
- """ - # 配置加载 - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - # IO驱动初始化 - io_driver = IODriver.from_protocol(protocol) - # 检查文件路径是否存在 - if not io_driver.exists(path): - raise FileNotFoundError(f"folder {path} not found.") - # 初始化存储和处理变量 - state_dict = OrderedDict() - weights = [] - parts = None # 变量用于存储模型分割的部分信息 - # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 - hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 - rank_order = range(dist.get_world_size()) - else: - # 不开启只进行一次循环 - rank_order = range(1) - # 权重文件加载和处理 - for rank in rank_order: - # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 - if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: - # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 - if env.is_pipeline: - # 保存的是 json 格式 - parts = env.pipeline_parts - if hasattr(config, "num_key_value_heads"): - # llama2 (transformers >= 4.31.0) - num_key_value_heads = config.num_key_value_heads - else: - num_key_value_heads = config.num_attention_heads - head_dim = config.hidden_size // config.num_attention_heads - # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 - if ( - io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) - and "COLLIE_PP_PARTS" in os.environ.keys() - ): - weight_map = json.loads( - io_driver.load( - os.path.join(path, "pytorch_model.bin.index.json"), mode="r" - ) - )["weight_map"] - # layers 表示自己需要的层 - layers = env.pipeline_layers_idx - # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) - weights.extend( - [ - value - for key, value in weight_map.items() - if len(key.split(".")) > 2 - and key.split(".")[2].isdigit() - and (int(key.split(".")[2]) + 1) in layers - ] - ) - # 去重 - weights = list(set(weights)) - # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm - if 0 in layers: - weights.append(weight_map["model.embed_tokens.weight"]) - if max(parts) - 1 in layers: - weights.append(weight_map["lm_head.weight"]) - if max(parts) - 2 in layers: - weights.append(weight_map["model.norm.weight"]) - else: - # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 - weights = [ - weight - for weight in io_driver.list(path) - if weight.endswith(".bin") - ] - with progress( - weights, - desc="Loading state dict", - total=len(weights), - disable=hide_progress, - ) as pbar: - for weight in pbar: - part_state_dict = io_driver.load( - os.path.join(path, weight), mode="rb" - ) - # for key in list(part_state_dict.keys()): - # if "attention.wqkv.weight" in key: - # # qkv_weights = part_state_dict.pop(key) - # qkv_weights = part_state_dict[key] - # print(qkv_weights.shape) - # (wq, wk, wv) = qkv_weights.split( - # [ - # config.hidden_size, - # config.num_key_value_heads * head_dim, - # config.num_key_value_heads * head_dim, - # ], - # dim=0, - # ) - # wq_name = key.replace("wqkv", "wq") - # wk_name = key.replace("wqkv", "wk") - # wv_name = key.replace("wqkv", "wv") - # part_state_dict[wq_name] = wq - # part_state_dict[wk_name] = wk - # part_state_dict[wv_name] = wv - state_dict.update(part_state_dict) - del part_state_dict - if parts is not None: - # 这一步是 pp 的复筛 - layers = env.pipeline_layers_idx - for key in list(state_dict.keys()): - if key.startswith("layers"): - layer = int(key.split(".")[1]) - if layer + 1 not in layers: - state_dict.pop(key) - # if key.endswith("tok_embeddings.weight"): - if 
key.endswith("embed_tokens.weight"): - if 0 not in layers: - state_dict.pop(key) - if key == "norm.weight": - if max(parts) - 2 not in layers: - state_dict.pop(key) - # if key.endswith("output.weight"): - if key.endswith("lm_head.weight"): - if max(parts) - 1 not in layers: - state_dict.pop(key) - # 根据用户配置的新的 tp size 进行分割 - for key in list(state_dict.keys()): - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - #"o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - #"down_proj.weight", - "embed_tokens.weight", - ] - col_split = any([key.endswith(filter) for filter in col_filter]) - - if col_split: - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - if dist.is_initialized() and process_exclusion: - # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 - dist.barrier() - return state_dict - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - protocol: str = "file", - ): - """ - Save state_dict to ``path``. - The format of saved state dict should be the same as that of - `huggingface`. - """ - io_driver = IODriver.from_protocol(protocol) - # gather to tp rank 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 pp_size 次循环 - rank_order = range(config.pp_size) - else: - # 不开启只进行一次循环 - rank_order = range(1) - dst = parallel_state.get_tensor_model_parallel_src_rank() - with progress( - rank_order, - desc="Saving model", - disable=int(os.environ.get("RANK", "0")) != 0, - ) as pbar: - for rank in pbar: - if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): - for key in sorted(list(state_dict.keys())): - tensor_list = None - if env.tp_rank == 0: - tensor_list = [ - torch.zeros_like(state_dict[key]) - .to(state_dict[key].dtype) - .cuda() - for _ in range(config.tp_size) - ] - dist.gather( - state_dict[key].cuda(), - dst=dst, - gather_list=tensor_list, - group=env.tp_group, - ) - if env.tp_rank == 0: - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - #"o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - #"down_proj.weight", - "embed_tokens.weight", - ] - col_split = any( - [key.endswith(filter) for filter in col_filter] - ) - - if col_split: - state_dict[key] = concat_tensor(tensor_list, dim=0) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - state_dict[key] = concat_tensor(tensor_list, dim=1) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - # 似乎不需要? 
- # state_dict_keys = state_dict.keys() - # for layer_id in range(config.num_layers): - # qkv_names = [None, None, None] - # for key in state_dict_keys: - # if f"layers.{layer_id}.attention.wq.weight" in key: - # qkv_names[0] = key - # elif f"layers.{layer_id}.attention.wk.weight" in key: - # qkv_names[1] = key - # elif f"layers.{layer_id}.attention.wv.weight" in key: - # qkv_names[2] = key - # qkv_name = qkv_names[0].replace("wq", "wqkv") - # state_dict[qkv_name] = torch.cat( - # [ - # state_dict.pop(qkv_names[0]), - # state_dict.pop(qkv_names[1]), - # state_dict.pop(qkv_names[2]), - # ], - # dim=0 - # ) - - if env.tp_rank == 0: - # Save gathered weights - if env.is_pipeline: - ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" - total_size = 0 - weight_map = {} - for name, weight in state_dict.items(): - weight_size = weight.numel() * dtype_byte_size( - weight.dtype - ) - weight_map[name] = ckpt_name - total_size += weight_size - index_dict = dict( - total_size=total_size, weight_map=weight_map - ) - index_dicts = [None for _ in range(env.pp_size)] - dist.gather_object( - index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group - ) - if env.pp_rank == 0: - total_size = 0 - weight_map = {} - for _index_dict in index_dicts: - total_size += _index_dict["total_size"] - weight_map.update(_index_dict["weight_map"]) - merged_dict = { - "metadata": {"total_size": total_size}, - "weight_map": weight_map, - } - io_driver.save( - json.dumps(merged_dict, indent=2, sort_keys=True) - + "\n", - os.path.join(path, "pytorch_model.bin.index.json"), - ) - - else: - ckpt_name = f"pytorch_model.bin" - ckpt_path = os.path.join(path, ckpt_name) - io_driver.save(state_dict, ckpt_path) - if dist.is_initialized() and process_exclusion: - dist.barrier() - if env.rank == 0: - config.save_pretrained(path, protocol=protocol) - dist.barrier() - - -@add_start_docstrings( - """ - The Mistral Model transformer with a sequence classification head on top (linear layer). - - [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - MISTRAL_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL -class MistralForSequenceClassification(MistralPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = MistralModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - 
loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) From 7dd75f01f7583fb92559c900b26d1c3f0424469f Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:42:44 +0800 Subject: [PATCH 12/16] Add mistral --- collie/models/mistral/__init__.py | 2 + collie/models/mistral/model.py | 1919 +++++++++++++++++++++++++++++ 2 files changed, 1921 insertions(+) create mode 100644 collie/models/mistral/__init__.py create mode 100644 collie/models/mistral/model.py diff --git a/collie/models/mistral/__init__.py b/collie/models/mistral/__init__.py new file mode 100644 index 0000000..e998c29 --- /dev/null +++ b/collie/models/mistral/__init__.py @@ -0,0 +1,2 @@ +from .model import MistralForCausalLM +from .configuration_mistral import MistralConfig \ No newline at end of file diff --git a/collie/models/mistral/model.py b/collie/models/mistral/model.py new file mode 100644 index 0000000..a85d0c7 --- /dev/null +++ b/collie/models/mistral/model.py @@ -0,0 +1,1919 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
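+#
+# Illustrative usage sketch (a minimal, commented example; not taken from the
+# upstream Mistral sources). It shows how the `MistralForCausalLM` defined in
+# this file is expected to be driven through `CollieConfig`. The checkpoint name
+# matches the docstring example elsewhere in this patch, while the parallel
+# sizes are placeholder assumptions and `from_pretrained` is assumed to be the
+# loader inherited from `CollieModelForCausalLM`.
+#
+#     from collie.config import CollieConfig
+#     from collie.models.mistral import MistralForCausalLM
+#
+#     config = CollieConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
+#     config.tp_size = 2  # tensor-parallel degree (placeholder)
+#     config.pp_size = 2  # pipeline-parallel degree (placeholder)
+#     model = MistralForCausalLM.from_pretrained(
+#         "mistralai/Mistral-7B-v0.1", config=config
+#     )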
+""" PyTorch Mistral model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel, dtype_byte_size +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_mistral import MistralConfig + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MistralConfig" + +#modified for collie +import torch.distributed as dist +import gc +import json +import os +from collections import OrderedDict +from megatron.core import parallel_state, tensor_parallel +from einops import rearrange +from deepspeed.pipe import LayerSpec, TiedLayerSpec + +from collie.config import CollieConfig +from collie.driver.io import IODriver +from collie.log.logger import logger +from collie.module import ( + ColumnParallelLinearWithoutBias, + ColumnParallelLMHead, + RowParallelLinearWithoutBias, +) +from collie.utils import concat_tensor, dict_as_params, env, progress +from collie.models.base import CollieModelForCausalLM +from collie.models.utils import ( + kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, + kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, +) + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral +class MistralRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + MistralRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + ans = self.weight * hidden_states.to(input_dtype) + + return self.weight * hidden_states.to(input_dtype) + + +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class MistralRotaryEmbedding(nn.Module): + def __init__(self, 
dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +# TODO @Arthur no longer copied from LLama after static cache +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class MistralMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.up_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.gate_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.down_proj = RowParallelLinearWithoutBias( + self.intermediate_size, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return output + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class MistralAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+            )
+        self.q_proj = ColumnParallelLinearWithoutBias(
+            self.hidden_size,
+            self.num_heads * self.head_dim,
+            bias=False,
+            gather_output=False,
+            init_method=lambda x: x,
+        )
+        self.k_proj = ColumnParallelLinearWithoutBias(
+            self.hidden_size,
+            self.num_key_value_heads * self.head_dim,
+            bias=False,
+            gather_output=False,
+            init_method=lambda x: x,
+        )
+        self.v_proj = ColumnParallelLinearWithoutBias(
+            self.hidden_size,
+            self.num_key_value_heads * self.head_dim,
+            bias=False,
+            gather_output=False,
+            init_method=lambda x: x,
+        )
+        self.o_proj = RowParallelLinearWithoutBias(
+            self.num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            init_method=lambda x: x,
+        )
+
+        self.rotary_emb = MistralRotaryEmbedding(
+            self.head_dim,
+            max_position_embeddings=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,  # input shape: [bsz, q_len, hidden_size]
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)  # [bsz, q_len, num_heads * head_dim]
+        key_states = self.k_proj(hidden_states)  # [bsz, q_len, num_key_value_heads * head_dim]
+        value_states = self.v_proj(hidden_states)  # [bsz, q_len, num_key_value_heads * head_dim]
+
+        query_states, key_states, value_states = (
+            rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim),  # [bsz, q_len, num_heads, head_dim]
+            rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim),  # [bsz, q_len, num_key_value_heads, head_dim]
+            rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim),  # [bsz, q_len, num_key_value_heads, head_dim]
+        )
+
+        query_states = query_states.transpose(1, 2)  # [bsz, num_heads, q_len, head_dim]
+        key_states = key_states.transpose(1, 2)  # [bsz, num_key_value_heads, q_len, head_dim]
+        value_states = value_states.transpose(1, 2)  # [bsz, num_key_value_heads, q_len, head_dim]
+
+        if self.config.pp_size > 1:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)) + + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + return attn_output, attn_weights, past_key_value + + +class MistralFlashAttention2(MistralAttention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class MistralSdpaAttention(MistralAttention): + """ + Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MistralAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
+            is_causal=self.is_causal and attention_mask is None and q_len > 1,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.config.tp_size))
+
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, None, past_key_value
+
+
+MISTRAL_ATTENTION_CLASSES = {
+    "eager": MistralAttention,
+    "flash_attention_2": MistralFlashAttention2,
+    "sdpa": MistralSdpaAttention,
+}
+
+
+class MistralDecoderLayer(nn.Module):
+    def __init__(self, config: CollieConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        config._attn_implementation = "sdpa"
+        self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+        self.config = config
+        self.mlp = MistralMLP(config)
+        self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.idx = layer_idx
+        # be sure to keep these variable names consistent
+        self.use_cache = self.config.model_config.use_cache
+        self.hidden_states = None
+        self.output_attentions = False
+
+    def _forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        # output_attentions: Optional[bool] = False,
+        # use_cache: Optional[bool] = False,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        # if "padding_mask" in kwargs:
+        #     warnings.warn(
+        #         "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+        #     )
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + # output_attentions=output_attentions, + # use_cache=use_cache, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states, present_key_value + + def forward(self, inputs: dict): + layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) + + if self.config.checkpointing and self.training: + hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( + self._forward, + inputs["hidden_states"], + inputs.get("attention_mask", None), + inputs.get("position_ids", None), + layer_past, # inputs.get("past_key_values", None), + ) + else: + hidden_states, new_layer_past = self._forward( + inputs["hidden_states"], + inputs.get("attention_mask", None), + inputs.get("position_ids", None), + layer_past + ) # **inputs + inputs["hidden_states"] = hidden_states + + inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) + return inputs + + +MISTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MistralConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class MistralPreTrainedModel(PreTrainedModel): + config_class = MistralConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. 
See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class MistralModel(nn.Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + def __init__(self, config: CollieConfig): + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = tensor_parallel.VocabParallelEmbedding( + config.vocab_size, config.hidden_size, params_dtype=torch.float32 + ) + self.layers = nn.ModuleList( + [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + config._attn_implementation = "sdpa" + self._attn_implementation = config._attn_implementation + self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + # aaaa + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + inputs = { + "input_ids": input_ids, + "hidden_states": hidden_states, + "attention_mask": attention_mask, + "position_ids": position_ids, + "past_key_values": past_key_values, + "output_attentions": output_attentions, + "use_cache": use_cache, + } + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + # for decoder_layer in self.layers: + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + # all_hidden_states += (hidden_states,) + all_hidden_states += (inputs["hidden_states"],) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + inputs, + ) + else: + layer_outputs = decoder_layer( + inputs, + ) + inputs.update(layer_outputs) + + # hidden_states = layer_outputs[0] + hidden_states = inputs["hidden_states"] + + if use_cache: + # next_decoder_cache = layer_outputs[2 if output_attentions else 1] + next_decoder_cache = inputs["addition_info"][1 if output_attentions else 0] + + if output_attentions: + # all_self_attns += (layer_outputs[1],) + all_self_attns += (inputs["addition_info"][0],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if 
use_legacy_cache else next_decoder_cache
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            past_key_values=next_cache,
+        )
+
+    @classmethod
+    def pipeline_layers(cls, config: CollieConfig):
+        """
+        Get layers of pipeline.
+        :return: list
+        """
+        if isinstance(config, str):
+            config = CollieConfig.from_pretrained(config)
+
+        if config.tie_word_embeddings:
+            embed_tokens = TiedLayerSpec(
+                "embed_tokens",
+                dict_as_params(input_keys="input_ids", output_keys="hidden_states"),
+                tensor_parallel.VocabParallelEmbedding,
+                config.vocab_size,
+                config.hidden_size,
+            )
+        else:
+            embed_tokens = LayerSpec(
+                dict_as_params(input_keys="input_ids", output_keys="hidden_states"),
+                tensor_parallel.VocabParallelEmbedding,
+                config.vocab_size,
+                config.hidden_size,
+            )
+
+        layers = [
+            LayerSpec(MistralDecoderLayer, config, i) for i in range(config.num_hidden_layers)
+        ]
+        norm = LayerSpec(
+            dict_as_params(input_keys="hidden_states", output_keys="hidden_states"),
+            MistralRMSNorm,
+            hidden_size=config.hidden_size,
+            eps=config.rms_norm_eps,
+        )
+
+        return [
+            ("embed_tokens", embed_tokens),
+            ("layers", layers),
+            ("norm", norm),
+        ]
+
+class MistralForCausalLM(CollieModelForCausalLM):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: CollieConfig):
+        super().__init__(config)
+        self.model = MistralModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = ColumnParallelLinearWithoutBias(
+            self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False
+        )
+        # Initialize weights and apply final processing
+        # Extra attributes required by GenerationMixin
+        self.config.is_decoder = True
+        if config.model_config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+        self.main_input_name = "input_ids"
+
+    def clean_cache(self):
+        self._clean_hidden_states([*self.model.layers, self.lm_head])
+        self._set_use_cache(self.model.layers, False)
+
+    def set_cache(self, use_cache):
+        self._set_use_cache(self.model.layers, use_cache)
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring).
Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. 
+ elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. + :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + output = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + else: + output = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + + return [("model", MistralModel.pipeline_layers(config)), ("lm_head", output)] + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + protocol: str = "file", # 指定加载state_dict时使用的协议 + **kwargs, + ): + """ + Load state_dict from ``path``. + The format of pretrained model should be the same as that of + `huggingface`. + :return: state_dict. Note that the state_dict should be processed + properly to match the current rank. 
+        """
+        # Load the config
+        if isinstance(config, str):
+            config = CollieConfig.from_pretrained(config)
+        # Initialize the IO driver
+        io_driver = IODriver.from_protocol(protocol)
+        # Check that the path exists
+        if not io_driver.exists(path):
+            raise FileNotFoundError(f"folder {path} not found.")
+        # Initialize storage and bookkeeping variables
+        state_dict = OrderedDict()
+        weights = []
+        parts = None  # stores the pipeline partition of the model
+        # If process exclusion is enabled, every process shows a progress bar; otherwise only RANK 0 does
+        hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0
+        if dist.is_initialized() and process_exclusion:
+            # With process exclusion enabled, loop dist.get_world_size() times
+            rank_order = range(dist.get_world_size())
+        else:
+            # Otherwise loop only once
+            rank_order = range(1)
+        # Load and process the weight files
+        for rank in rank_order:
+            # With process exclusion, only the matching RANK enters this iteration; without it, every process does
+            if int(os.environ.get("RANK", "0")) == rank or not process_exclusion:
+                # The pipeline partition is stored in os.environ["COLLIE_PP_PARTS"], e.g. [0, 17, 35], half-open intervals
+                if env.is_pipeline:
+                    # stored as json
+                    parts = env.pipeline_parts
+                if hasattr(config, "num_key_value_heads"):
+                    # llama2 (transformers >= 4.31.0)
+                    num_key_value_heads = config.num_key_value_heads
+                else:
+                    num_key_value_heads = config.num_attention_heads
+                head_dim = config.hidden_size // config.num_attention_heads
+                # If pytorch_model.bin.index.json exists, each pp process can load only the weights it needs
+                if (
+                    io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json"))
+                    and "COLLIE_PP_PARTS" in os.environ.keys()
+                ):
+                    weight_map = json.loads(
+                        io_driver.load(
+                            os.path.join(path, "pytorch_model.bin.index.json"), mode="r"
+                        )
+                    )["weight_map"]
+                    # layers: the layers this rank needs
+                    layers = env.pipeline_layers_idx
+                    # Select keys like model.layers.0. Two conditions: 1. the key contains a layer number;
+                    # 2. that number plus one is in layers (the embedding occupies the first pipeline layer)
+                    weights.extend(
+                        [
+                            value
+                            for key, value in weight_map.items()
+                            if len(key.split(".")) > 2
+                            and key.split(".")[2].isdigit()
+                            and (int(key.split(".")[2]) + 1) in layers
+                        ]
+                    )
+                    # Deduplicate
+                    weights = list(set(weights))
+                    # Further filtering: layer 0 needs the embedding, the last layer needs lm_head,
+                    # and the second-to-last layer needs norm
+                    if 0 in layers:
+                        weights.append(weight_map["model.embed_tokens.weight"])
+                    if max(parts) - 1 in layers:
+                        weights.append(weight_map["lm_head.weight"])
+                    if max(parts) - 2 in layers:
+                        weights.append(weight_map["model.norm.weight"])
+                else:
+                    # Without pytorch_model.bin.index.json, load every weight file
+                    weights = [
+                        weight
+                        for weight in io_driver.list(path)
+                        if weight.endswith(".bin")
+                    ]
+                with progress(
+                    weights,
+                    desc="Loading state dict",
+                    total=len(weights),
+                    disable=hide_progress,
+                ) as pbar:
+                    for weight in pbar:
+                        part_state_dict = io_driver.load(
+                            os.path.join(path, weight), mode="rb"
+                        )
+                        state_dict.update(part_state_dict)
+                        del part_state_dict
+                if parts is not None:
+                    # Second filtering pass for pipeline parallelism
+                    layers = env.pipeline_layers_idx
+                    for key in list(state_dict.keys()):
+                        if key.startswith("layers"):
+                            layer = int(key.split(".")[1])
+                            if layer + 1 not in layers:
+                                state_dict.pop(key)
+                        # if key.endswith("tok_embeddings.weight"):
+                        if key.endswith("embed_tokens.weight"):
+                            if 0 not in layers:
+                                state_dict.pop(key)
+                        if key == "norm.weight":
+                            if max(parts) - 2 not in layers:
+                                state_dict.pop(key)
+                        # if key.endswith("output.weight"):
+                        if key.endswith("lm_head.weight"):
+                            if max(parts) - 1 not in layers:
+                                state_dict.pop(key)
+                # Split the weights according to the user-configured tp size
+                for key in list(state_dict.keys()):
+                    col_filter = [
+                        "q_proj.weight",
+                        "k_proj.weight",
+                        "v_proj.weight",
+                        "lm_head.weight",
+                        "gate_proj.weight",
+                        "up_proj.weight",
+                        "embed_tokens.weight",
+                    ]
+                    col_split = any([key.endswith(filter) for filter in col_filter])
+
+                    if col_split:
+                        tensor = (
+                            list(torch.chunk(state_dict[key], config.tp_size, dim=0))[
+                                env.tp_rank
+                            ]
+                            .detach()
+                            .clone()
+                        )
+                        del state_dict[key]
+                        if process_exclusion:
+                            # Reclaim CPU memory (very slow)
+                            gc.collect()
+                        state_dict[key] = tensor
+                    elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"):
+                        tensor = (
+                            list(torch.chunk(state_dict[key], config.tp_size, dim=1))[
+                                env.tp_rank
+                            ]
+                            .detach()
+                            .clone()
+                        )
+                        del state_dict[key]
+                        if process_exclusion:
+                            # Reclaim CPU memory (very slow)
+                            gc.collect()
+                        state_dict[key] = tensor
+            if dist.is_initialized() and process_exclusion:
+                # With process exclusion, processes that do not load weights in this iteration wait here
+                dist.barrier()
+        return state_dict
+
+    @staticmethod
+    def save_parallel_state_dict(
+        state_dict: dict,
+        path: str,
+        config: CollieConfig,
+        process_exclusion: bool = False,
+        **kwargs,
+    ):
+        ...
+
+    @staticmethod
+    def save_parallel_state_dict(
+        state_dict: dict,
+        path: str,
+        config: CollieConfig,
+        process_exclusion: bool = False,
+        protocol: str = "file",
+    ):
+        """
+        Save state_dict to ``path``.
+        The format of saved state dict should be the same as that of
+        `huggingface`.
+        """
+        io_driver = IODriver.from_protocol(protocol)
+        # gather to tp rank 0
+        if dist.is_initialized() and process_exclusion:
+            # With process exclusion enabled, loop pp_size times
+            rank_order = range(config.pp_size)
+        else:
+            # Otherwise loop only once
+            rank_order = range(1)
+        dst = parallel_state.get_tensor_model_parallel_src_rank()
+        with progress(
+            rank_order,
+            desc="Saving model",
+            disable=int(os.environ.get("RANK", "0")) != 0,
+        ) as pbar:
+            for rank in pbar:
+                if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion):
+                    for key in sorted(list(state_dict.keys())):
+                        tensor_list = None
+                        if env.tp_rank == 0:
+                            tensor_list = [
+                                torch.zeros_like(state_dict[key])
+                                .to(state_dict[key].dtype)
+                                .cuda()
+                                for _ in range(config.tp_size)
+                            ]
+                        dist.gather(
+                            state_dict[key].cuda(),
+                            dst=dst,
+                            gather_list=tensor_list,
+                            group=env.tp_group,
+                        )
+                        if env.tp_rank == 0:
+                            col_filter = [
+                                "q_proj.weight",
+                                "k_proj.weight",
+                                "v_proj.weight",
+                                "lm_head.weight",
+                                "gate_proj.weight",
+                                "up_proj.weight",
+                                "embed_tokens.weight",
+                            ]
+                            col_split = any(
+                                [key.endswith(filter) for filter in col_filter]
+                            )
+
+                            if col_split:
+                                state_dict[key] = concat_tensor(tensor_list, dim=0)
+
+                                if process_exclusion:
+                                    # Reclaim CPU memory (very slow)
+                                    gc.collect()
+
+                            elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"):
+                                state_dict[key] = concat_tensor(tensor_list, dim=1)
+
+                                if process_exclusion:
+                                    # Reclaim CPU memory (very slow)
+                                    gc.collect()
+
+                    if env.tp_rank == 0:
+                        # Save gathered weights
+                        if env.is_pipeline:
+                            ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin"
+                            total_size = 0
+                            weight_map = {}
+                            for name, weight in state_dict.items():
+                                weight_size = weight.numel() * dtype_byte_size(
+                                    weight.dtype
+                                )
+                                weight_map[name] = ckpt_name
+                                total_size += weight_size
+                            index_dict = dict(
+                                total_size=total_size, weight_map=weight_map
+                            )
+                            index_dicts = [None for _ in range(env.pp_size)]
+                            dist.gather_object(
+                                index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group
+                            )
+                            if env.pp_rank == 0:
+                                total_size = 0
+                                weight_map = {}
+                                for _index_dict in index_dicts:
+                                    total_size += _index_dict["total_size"]
+                                    weight_map.update(_index_dict["weight_map"])
+                                merged_dict = {
+                                    "metadata": {"total_size": total_size},
+                                    "weight_map": weight_map,
+                                }
+                                io_driver.save(
+                                    json.dumps(merged_dict, indent=2, sort_keys=True)
+                                    + "\n",
+                                    os.path.join(path, "pytorch_model.bin.index.json"),
+                                )
+
+                        else:
+                            ckpt_name = "pytorch_model.bin"
+                        
ckpt_path = os.path.join(path, ckpt_name) + io_driver.save(state_dict, ckpt_path) + if dist.is_initialized() and process_exclusion: + dist.barrier() + if env.rank == 0: + config.save_pretrained(path, protocol=protocol) + dist.barrier() + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForSequenceClassification(MistralPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = MistralModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) From 507ca245b785e2588a2bb776c26ad2523557236e Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:43:23 +0800 Subject: [PATCH 13/16] Delete tests/models/mistral2 directory --- tests/models/mistral2/__init__.py | 2 - .../__pycache__/__init__.cpython-310.pyc | Bin 295 -> 0 bytes .../configuration_mistraltp.cpython-310.pyc | Bin 6283 -> 0 bytes .../__pycache__/model.cpython-310.pyc | Bin 49178 -> 0 bytes .../__pycache__/modeltp.cpython-310.pyc | Bin 52277 -> 0 bytes .../mistral2/configuration_mistraltp.py | 155 -- tests/models/mistral2/model.py | 2026 --------------- tests/models/mistral2/modelpp.py | 1922 -------------- tests/models/mistral2/modeltp.py | 2254 ----------------- 9 files changed, 6359 deletions(-) delete mode 100644 tests/models/mistral2/__init__.py delete mode 100644 tests/models/mistral2/__pycache__/__init__.cpython-310.pyc delete mode 100644 tests/models/mistral2/__pycache__/configuration_mistraltp.cpython-310.pyc 
delete mode 100644 tests/models/mistral2/__pycache__/model.cpython-310.pyc delete mode 100644 tests/models/mistral2/__pycache__/modeltp.cpython-310.pyc delete mode 100644 tests/models/mistral2/configuration_mistraltp.py delete mode 100644 tests/models/mistral2/model.py delete mode 100644 tests/models/mistral2/modelpp.py delete mode 100644 tests/models/mistral2/modeltp.py diff --git a/tests/models/mistral2/__init__.py b/tests/models/mistral2/__init__.py deleted file mode 100644 index 9dc3f79..0000000 --- a/tests/models/mistral2/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .modeltp import MistralForCausalLM -from .configuration_mistraltp import MistralConfig \ No newline at end of file diff --git a/tests/models/mistral2/__pycache__/__init__.cpython-310.pyc b/tests/models/mistral2/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 76a01ca4171928aebb54f37b4541ecbf0bd2731f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 295 zcmd1j<>g`kf)fuV(xQO$V-N=!FabFZKwK;XBvKes7;_kM8KW2(L2RZRrd;MIW+0n6 zm_d`}B_mLYCgUw3-^}8YqQo4x{37SX(&EG%A77v-FI3byKQApa-A|Jxiaj?!B{ip{ zpa^6~lz1{&qO>TnBr`uRJ{MvJP?i}eyON=Z1xSI3U(xzSsk!+jsk#~YxvBa&g_W6k z`p)@2KAEoiC8@B?0;HGr)ME}k^pZ=_OaFov{Ui2P^k5ih&js3|=u2pi;`G=BYjBvSiM}3dainiBma(TGk3WBzOvsP<`&8Bdh4a4MB)2lhLT0_(l z1M?q!GQYP_U93s&d-fi$IZ#-O?Ny)oo_o0UCaTt5EITxV<8@Z@hi@OblEayyU-fA= z$pyVHay`BLEeNjdn8_UWC4#FZXk?+Gs9ITlL-kn z4Ab_jE3GYAdKY!flf57H#ke}ItUhs$ zg6z}TH$f(WymE}jKHG1mx^HSFUUm$VBx9Mk`wV^tPTUrkB47>aIf(pV0@t4+#1=CR zm+kTt0Z7I`dbaXk962ux+F{^V*%mhdk``C{vOH5oTam;uCK_bE9vw9t&Q>~B zWYfQ?o(S7}o@wmrzBuG;wl3VDKF6+kZQ>xqzNRo0;{d>0URqdKsAN-V`e!sf1SSpu z4(RB|K#amFyGzLAaRRxav&$t7v_zx9C9n$wJ?Acc4HPAwk-QDM!k57K?m_ARvPmy2 zmE5BX%dfLGk`Tl8TinHT+a`1m=3Khrmar`Do^Hq27k>Psfeqgk$TlDD>25XF$I|sG zU;EY69(>BB!!640(^*}-zPx;**^LJekG2UP>&ZTCUXDS8yv z$)BnAc+Izl_!y~6{2TL!;%yWdwgDO!>^`wbYJ$yf++V58F>wk*i$^ORSU9vkTB$G( zi;(*Qrq>>=bjw_o{S4XJw=Hnq9+yE}iX2+yJjIupEp9$lpPaE6M3 zj==STthUd^#WyL%=21$jY;Lof`SGNeVmO_D0NU|ld5XN_Kf4|>Oys4X-F)nC$tP{ zp|9IR@#rZzXun+(<;2omD(<%4W0es*rtej90pR;wIRuS>6ejPfPSWv}5}mgxGAAu{ zk zI9VP1f&*Js_<92UTWXEj_S%M}tIh^p3U)ZyTn!01#w86D<|h3tgS$BeIExMy^k$ULdHGA_K$={ ziV0-AY8h3%h?{Fhwpfp?tae}}R^|igwYd&WfpMA?L6eM(9^(6512qw4L3nAHx4eg| zJuApeChVjq?DRT%`a?aWQGI*_zoDoo!lQOwRi2Y-4(aYGJ89FUTks22zmsWWxIen{ z)s=_hEs9pEpS7j8YwQXej7pTHGO=E-w-17r=c;$NZ{Lm&LAWF7wxfu%)AnT0rTPsz zbI*0(-BC#!>!R83;ZU3~w&Kd}IQG#eC~yb%z=1!t8Yh4E?02g>Eml9991w>Zh}$!D zI;8D(5BewAi)W>F)TUEz0y)wtKo&ZqkfWU`$YQ5xX`v>sc1FLzGd>yXj6s$`5ers!zUclwK_hnyCov;7n`ol9e~8LyL662W&Wxtgl*_L3*X7$7{;y}aEp@zr&$m2>*X$PuxH7%tJ$N8iYXSEIzorre|Kh8wrCPO` z{F)waAaKb_ywmf2U$W}Sw4hyWAIJ&}3jFyC5XCFST=r8MDddXz?0qp;8p-G4pTd9g zqvQWRyqr)-%stiArPGp0wxh=Y diff --git a/tests/models/mistral2/__pycache__/model.cpython-310.pyc b/tests/models/mistral2/__pycache__/model.cpython-310.pyc deleted file mode 100644 index ab53c9573dc702d9ab95ac9870bc99c46c10a54a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 49178 zcmd75d7K>AeIM9WU48fToR|RyR|5nI30P`8CU?{Km_1{Ko56BT-H?lI3J0RZcb1<+P#0XUZ7~mu=+C zm=SB_%LBoaf;_S6nMSUh6Td`#urX8~Y7Cc$8zbcr@k`c68)M}$xlYx$G{(#0jfwJv z+^6eX8{5j;8k6NoxzE(MH+GbFG4werZG3V}^G4-R!d!c@B<52lf<8b+~#4uPd zHI9^zG>(>!Hjb5#!EeYp?ig$LIfrf=3-{kNuExsmaQ3e~;5>NSD8JJlF274^;GxEM zl^>QE%=(G)3Gq8wJ}JMClpn$GNd4W7Q{__YU3?*C7oFU~%*}YY3ZFgpUij<{d@k8{+4~T7);?|DZQt{P zQMM6gzkL8<4g_Hwd&dHY^o%_Z$a_Td+-@@0fA*+&reND#ILpQH9M_#AU; zNO1x8$L;%Yf1lhhIyGm(S+wu}iFmn=XYa5dz_SN}w;T3@<)(9&{Z6NO)5Ozv*$?6A zLr(E#{H9TE+22)O>U#RHeF9HU=y&n_3f?$rKLWo;oM(`>LfCf)F`csC1ON9pEs4ps 
z&GWITM^`3_Pu{rDQnQzf=W4E}s`X-{WjpoK%N%IaQHShA-)_l!#&$e9GA3Ck@^H|eUt)&~{lR9_)F<#)U zg;95&p&#jS#8v2= zs_vqH@GXN>=31)a+DldBaN<2x*Ez?sJ+bU9Eql!ElT{RL{ItY=_FSKT@w{_o*=f!? zr|VVMt5o-yyW)CP+PRwOJ487SQh1dflnB+C0Y}M68wCs?B)^;WKBeHym|- z$(i*BF4P=5x<$bJ=~lg7bAmD%e){nz9#yrgj$$Wd<=`sr??$NQM)S!a-Lq)Ws$vgm zEqm{&Rb79qFTlC?IaS-=@l@;Drte_v49vEgvsJI6^Y0gIl&?~Cr63L0k2}q)etJo@ z&~`E^#&u?;E1hYnuwx>o9g9~n3NTREMWj~IFHj|QmfYVF_~2O-uIp*!*ZZ#Yv`ycY zr?c!E;U|HM6uzhNbw3EF9b3Q>>KO~BZQe3hc@=NR)~wqxJ8oOIOkS;;ZDTEQJGPd@ zb=*s>r4cIAHta<3UffRJHdn2-*|ys8xi}`J)bz`5;PaB{$L(6fPsorugZ4_J>O9Uq zO1iA6CXoq!o%Q4HGQxVC%KSvF>D_nSPc>W36-TxF6#6aa4nN+gUiYn<=QModj2}m@ z^pms84ZJ#)@C$wO%aH3_K^J#X-<3;M5AmkZb!)SZ>yILyihkUAnw?z^{M?sf>ix*u zo$?XoG+Lf>_;Rb^9I0QqQEMJK-8y@=_Sg~6aXt5l)G~hb-0LkJDV2i%S?Ray1IQbt zr5pZWW!~{BdT7~IugVs2hu{nu<3`??G)&VpOv8FzorC8rzC9l+xO4Bpg%bnO6a&GO z{8(%bM3fzWAp^3>vJ+Sst&hd5SUJI#@dv|6;;D1zr(3G=N4Ngu7uLS{#sBr>EJ(2E z!)x{y{!08_>}G7uxDE2kGuPstwU%fbxAE7GRg!4=SS4i}h~Hc}ku3(FAfX_gV$CgL zTB}{HqD8!-*D79khYWLOB6??fD(xrSbOjAl^C}fZD$&nym^K_tkm>~QtyUAw zn7roH<}Z8x5GJ!)6;pl1S#oQ2tZQ-Ka-I5|pSxVMZA^2V%Q0S;+?O!-R4TJv5)d~V zS)HZx0XR0kE|bezBSzL(+1giGr7%ykCIW>YRusRd@pY%+w2cKlYz)s_h~G9=4Gba6 zMs+PD*fbnktMN9Tt|i+Ta0{u4*llAhhH$ZkG{PEfQ|_&{g-T6Kzs%&+gK#cj6r~m& zX9)vqDx)5Pmv3UsNS$Dmw5l98@uk)^-)cD3rk_ym73Ha?8NmfOQwha><`0DR!U@{- zb20%{WTg7(ur~d8wdv|QiYf|YQ7>(EMC0b+B+*&N@Hb>wV=EK=RThMvGSm|Y`%;n( z3qrxERQzm(i{)~i*ZE3?i=KWWjiKfwpFvyAI`w+kUXqt*@Km=Pt9%w;X}NT)U<{>I z=m+o4CmJzF;TMgVtc)0r6pomro#KeGb9UO!fRs$g*h!)p@_wgmEl>kD9&223Y#VFy z%dvDU_VUnv&|9;e^Z4>Z2zB^Hu4#uH}bIIdIZ zD%M&Wvu~ztakMR%+he1#?XlGauCsQconVe{#SPBoImWk;Ul{NbYX!{VY5LnS{OfJ7*H(7puKY?GN1VC#0{ci+p`hIrB_X zRW2>h&0*}1U>f!+vn?0HwyBp*wZtTs8GM-2rtQpCm+SZ?xi*!QPOX?3b(PMwFtmP8 zI!n}rCFh?&2zLZd)<~L3BWWCE2aXqv>>Dc~DSiB@-iGds|FA3JeOXeY@xKBOY1|l> zM>I#QZE{()ja%_G3-<}k2bO1G8;@o^j}=oXD5=Bbo!6pyd7;dE~^JhVMJ8>y$p zrM}5UwGr;e@C_!G9Y(<%F(-{(#>&BHV(D*_b$w;SrK~eMORFqeY7n)InHIA(ztTc7 zj4T^%_WkBIw3Yaqw>a>U>wV|QfyJkl%fhwbpvd3HYnGx@Wc9tgWlvYn!EE<;3CLQd>Ih_B1Ji}Pu^Fp^dD?V%>)@QAwo`*711aSW;o)Z@08ZB53$ z-<(4=a#@qvE%LWfz4C-Z1G8WbLk68cZYOTBN|P%GIqE$w`sgEEVSo#z0TWl@X=EycNnRo6mvA4}J%mnS&O`{zji%rBZ zw$bOr&xrgmp4%zuO-8keIz6?t1eV7w>glKWWH7T9qZNnr6Ke(ht-4XHHtph~bEBxa z6t|QWdg_##cOQxyZ;+{|c%)ES_%L`eU<<-rdaXrNQgODWz}GFc zz%w-G-@M9;XIiT0T(363+#EvFfZCm{Hj9^>o-~(T$7VcBAU=`S9A+*LKAmhJ*VisP z&EmC|T4b8nz`+))#rJ>kuuR;Y+FEY9XnSYHsbEB)-oP)-94a1c1wx*Khl;@h=`y`z zQ=N_wmVJLu6#I)+HNV^-Y3_mzr?3QXqKXhbssbXLVijaHa&)-{Q4`{fUPm>#&xS>) zpyDsGL-v##E!Y>rW&f>$54enet81zVJ{p6zexrCvwW{_kL_4~+da@y-|BCMNrQTX^ zf=}y7R#Mn7cg6KNx?H{LNQq_cC_XC;qp%H!iUAdLsCXGd1J?M(68lf{P_Z^wl+5&` zyNGFP^T-&7yHT9%H)wly3buVz!A$VPv0|gT=(s)UyxpQj^+~Z)*pH={y|sD1)_`aV zEzl`fQJ1ABS+>VpRT2sq)m5ilePueX`}0#G(CV72W<*ACpnJNSnn5>fEU6kAZay5Q z8_s@3J(=6ZWeLj6ZaR(X^eKRfDqALeaG}N(-&j=NkNI8w031Kx-NXHXC_Ed-PhF|V zj0V{}A!YDqq-Az)~}2qz&~q;WvwK&xb25Kkhy_A>Sfg z3po83(%@RaQugrq2|pK-3+K*0*~6`pm81}ek-A(Q*D)m`1gd~PT>ynz0ZW1Wf6OB8%TyqK44JJ^bi-65Ak~* z->;F_ZsSsY0vA%XplP|F_g8cJ;d-@k$*!LKL%zn6TCtUTx@saK;7bqJTeH==dlDD= z6>vM&B}xhh=Bwy5ki&|UnVd1C5Rg1vZrPh=QITftWKaG0`pl_M!|yBf{|Pu3{?bwX zc^K78arL)w?GI}f9zta-xFUU3Yw({M=x}S0m5FFreJFSV^n1gxYaq3>IQL%qr zm~Sok8PtR$m+CMdWFYo!>iaD3vw;kuI_vpKIF-5POQt6JKa1x|NJ^nFNhb;Y`MbDq z?}r1f3;jK9m10J?WwbkI0Y$FJb9Z&o4u$=r zc<^O>1Bpf67}}%gr!K+q6HUnYm;9s_^!F5xgeV8E6z{{h!0?JaXRShnfC0K_ zssRob(A;1rzn?5+;D*1Ty%Qi<$@-uoz=kGnCstWWlt;j4EGB*wJ;hhPEK~ml5;XYd zY}zHoBOy+CZ3+2>@&W=obf~(6#qYp%P#|2k4i)t?_k@&cN*2zLniuVWTbyqd=Tr-1 zjI6nbB*o@(qXKHP0(s%GQ|YXkU`{(r%M=&0;H@Ix=m>x}dqW}@x$`s_u^(c6C!Om{ zpjiAAh&}QRkS4Gl)X(r?LJN*ZH^>hpj1-Ne!-cM^soCGD;*eTdy!t9`^-P~gqqiR= 
zWAO#}&Enhh84K>*pTxjlgIC{GhmP4JL(h|CJc#R zx!)E;y5Q^P3ig=2<%L9fP$XkR=p|eI;jsIk3N?Hy|5$4J9Aq@L!>l3Da#8E(=H4F# zEPYD<~ukSU*iWwj5KF6qXD7n_NhJ8iVpPkmf+nU_jCvt3cWdWlF4)#X#nInVrNy ztmNRE$1lDMe3BR60r(c+Tfi@57nMPqg2N%)4dWMI2mlct!xwQa#?>?42xLgOMyet% z#BcI5iJQhp4R4hG42^GYEQZxNB?8Zwos|`OEDFIqgduVeiH0GzY#1Wn6=K|*=%j!v z$gM!A3`mHAq_Wl9#@O*ixfTZP!CNfRq)-Gy(<_C?o0uV>FxG9j@{gq5ur3P&MMAJz zoQc|Kh7%Nc08BU_6~VPgmqS0eRzugqtR%e%`h%8oi}o_LxH6vwvo*L71Oq)!PK(bF z8~L?T@kzXkenR@Y=q@Xvps_w)t=Udr284H;r5;yDjwnRYP`a@4NGNZI9-`P-c0KTh zm?5u%pe0d$>4wZ-GHVqN%%c=DT`$Z$R9qPmo@a*hkc6V2IaS)6Yy`(;aA@Y}|Vt^dciT|ZzdgQMxCLmyRp44p=5mZ1VQJ&h~p`~PsP``)}>L4eJ zJkSuTC#jZ)1PiYKNGd3`Ued8bBt!JccGbAt>q$z7Qx2N8aFO*-l{zM4cdKq6wg#uP+BuX)>A zBg5J^LFgH_-cAJvx=2M z)6J`xuc65R1-**3K`Vk+2fSoE$Mr~5R&m_rd+r80ck#FxU(K!NR|ncTtO+1RG3R3q zl(i@mtbe(+Jb1;r$=H zhB3guXYl<{teLlB-jPokH-19<&c{|Ko{2S$Yq9pk+R^qHM~uY3HHd$l@h==}PqfGF zZT6(S{gwr~Z+z{zy@O#lNby&@Q*4i-{wx91?3{-D33xLuFGSAWQ2@S)cx$HHTPPsg z0qzbeInvu9@dHwWFqW(EgOvU1{iXXJ+<>ehfj!e%OqdX21PKPxsWfwE>{1aD`pZRo zs(AQhkvF)8#@Mj6?uAq2%ijX-=gyf6U=u}PT?ATgc@}szC~&GEhAvYDY*YF=>lA_E z62UU_^FXQi_+0S@=zkyv1JT``t3}Xe#p<%xIt&dlAy&YBL9jgwt^fp@5b=vgu7(ub zp(2Fu>nJsfZ)(*#$u4rQQTa4wrX#5qg^EOBfq^SQxWxGyqKqHFkH`RD#{3vt*%!^p zMU6XgAqQOMam5IxZJ^9md>szCj3thRqf=YOYjHyxpHgx$_Z zWKYFgl4TTnf(h$z+0lXl18{_4MR=X`Xz1A`iybuX+fonCP zssxZ_xdz~Ku(UyO8!iXU9^ry$d?6md;#TvDO~|-JF)JG8Sq~A3^ntlpTE0}T%@$8R z`8bvoDQkZjxvbj`fc~-o1_S^kp`x?K`VP&hUh7050KHk$!gx`XbYi-nIqI7%dl;WRh&F?-Fg`zn zL;}XA(8c(q)=a`R1^XQDVSzN`Ll#I7u)+cX(t)r8UMgUL0NuI;q;$Xng?E|-lDi%j zC=;+iB8me!t3`l$urvc*EKqn&)+W3r`%r+`j4V(K;fDeih`$WSU$Q`%9v0|RcxzZ# zppQX7NS0!BVXTh@+9E8_ux5e4@D$o;avOLaJ4@Wo+Bm>x6SreGfj0(wKp+*F8L&RW zBn_<&L$F6C39L_tN%AIJzvgXco-q}G&?alr`ZYIf44QD_}N<+Y=G9xlfqQyM@`@C(I_V z*=z0rd-D-{q)C~_{?)Bu;1e{yu(G z`qBU-G@15RFrnkOQmfnB+t-c>8+aV)--mBrvw_lD=w)m7x3?qR?ZOKF53qte@Wwk( zlAZR%>MpQ?543mLTgjLSE4bTxaP6J#-FTz8jxjUc(Qd{pUSi$1j zL+zcy3NrrfLHy&Zh=1X`+Qs&6dk43+2`jjJ?O_`!Zomqf?%D1X+uPea0#>j%t^PG9 z&EKN)+jRa79pZWQZi?UG)tBi|*Q)u8e~T->5GYS7T6wbalU&4RmjQ7C=%;^;33)TS z1so8C>Hy_0wSeQ=OapUB^J-XgvBWjNY!c-)H&>gjG5Q*DQA8I(G7czFk0^b4X z=8E8oHN)D3#!Fv9JsU+|Dt-vm47DD7vv{dGyD0JM3?rOxlpxmrItuBJQbgEWJFfco zc%uG*&VQiuAL;yOIA_!!^76mXVJ%IaP=Czpm*H^J!IBby!<+Jd>c8;~k)D2)R}ssn z{)D%iv3u%I`ScW>_t5!OI>PF04+xH+hq^)k7KYMK0V|F!rPLNs6e6fszl1ma0x{$P z|3l7^gtCPX^_9v?8Aq6MgCHKlg?kzd4(u zQ)G5jhEA4Fj!vEqDLL(&<<%UV--o};E*tWg*#tPtx8gG6-CXA0IQUG{0*?sJ&@^AS zlA2r1CWg#0vml&nK3>4pK69V36`HzM zylRrZn^$Y&@||NX{=p&kXDr$fZCNAYc&zk35%L1*r@Oik9l+2qNWpg6eZ&LVEa3|xHyWtm>-F!8F9jk(Vwg%-SHT4ct5wA3H1xolAq?sd%NfC6 zwTnzHQ{=tw;6L>J&DiF?FG z3!NoQA9{)?y5Jq*NoFv7Yp#a<-CfME4idn@S+r9uZL>ZyLVVhCkrd|}Fg_<-H% zSomU3Ky1Z5+#6U65f~vq!4!@H3z38U1?$r7jGG{EPl!P3T5%x?aiVCtLG2qv$;#sk z)CZt6A9(+VFMyyz!6E4<9OLGBBj3vLNaiwCodZiMaU5)b3G{g z4Knm0sOQlNOaoLLw!3s%F#3E~46uL@0UofVfD2$j1{@8_+S~DV4zdRm{_zd`l=8OT#iv4MIH10xA+xAQ|NVSZCWHHUb(BtD2R{tIKVt`f z7}yI7LBnn}w?g|ujSKWGaR7RtpCKp*o$?LRvg4qe&A9Hagalc4R_@xgqjpUj zL=s`DA+b7r278PzBh{|miv{K|Qlh9q-o_bZh*vpY4fBfBzv9?b%>U--MByUIeMd?c(8KE$ zE*e4rx)&=vBuQ9nNv%K?N-`L`>bUOO)g@Y(kd)6q`s67pQvzV`TPOz)z z>i4jspibKUuT)h4WTbWIz*l3krGy@OO}Z;sF4o!*dI zJ}I)#K!YkBD`MmnS;P}?;vR##> z4ZG7bk{_~qn(`(YJ+;3hoYWX&S;zir#Hk`W{Y7TxSLsk{+ArJu75WKnVu`>_z$$=LvEE2p;nL3E>0n`M5lr+pSXmH6G@XhiU(f9?30F$o_ zWeG6JnkzduTzyNyTBU^88Q74BMWSf7R(KP^t zq{sh4@lBdfkg?$p&xLmD0;%+IAf`Y|%p%&JPek$jb$EARBTn2l9V^(Y#2u>nGC(8p zm&_#XAyPV^*3%AlE-jGq$;g!kjif>4h%fwe@aImYeDFIU&v?f$d48!m#oi>i*q>Ce z2LUwc07{1aaC_dP#11;@ERC*p0Q~EA_pp*tf!at~_wF8Abg7sg7W@S$hPmwDq)!5P zX9Lz8UogG8qCU7HFxV*G9A9zTEKsZ(bwr!HK0 zZ2H3EPfS-%pFMT{{A1_+G16T9lbN5Ps!N{8HbPiQt1pQOB!ICRZiQMR8piSmg+bSR 
z1wH+(5CwISah)gX*N~BokQqPBj+Mv5Ec*Wlde#VFSSve%-rilmZyTMlp6FuyB)T}L z-5-Lp9-E#)?XLp7pm%kO<*Z&ZEPg8qW)$*lkTz%&z!ZRGkvBAigF8Ysgv01OpQEBW zp-zAh$;=tRf9Kjs^(ZCI!Cg)>fE^X=2@aWLM-8UV!b8DmYD}G@L&OqV_=6}!AArAq zL7^(fPeZ0#v$4eb!%>@7LImD0w2>~xVi)K?rC0BMBV95>{3x61e#9E;K+L#Rfc_!i z&Gd|rzmrZ|5L>W0p>2SMLcZfG+c#>`p6P2Hi*|s8l!<>+K24@C@@anV42_iq$TwN+ z7Wul&>d7#{qK)EG>a@#?a8{e#+#jC;#grG1I zS3Q`pXJ`7%fX=|c!|BUTpf?qRbh42PtF$jSj1pfY06xVt(eL=drCy$X_q`)2{F4T%%2XNWYxFtf0vmW2+I zO<|I%V70D$MUgqx!4OAb`#wU!EW9+Az!F5?m!5KuhuY{-+I zmM!r!7yE>{nh4jSm${1++@|gW+9(Th#e8$Gy_hA8%o0hf0?r=d&e?SO6+-dmYV$?H zWFtacYOEN1*!&W0h(!cT+nVM6d~RdJp4EszJ*4r6yzj|l*l#f}FmO-o&(3_Lub-Dl zbQ9ztuyet~QL7*X|L%S}V*VGttR`k~;=bq*&6z}9Vg^wkL{_?{ zaL(TTfVSfq^ym~;0Cy|=(@=WT$R>>Ly^#kMK{&Vdx@E519a7GH4OZ%jXBu`b`Xq$g znOa_t25sQeAEy1mz#=nx`CRR~zdOQ-5Ozd#QU#da*~O(6VEh=X{xImJiVFi=0IW)> zTz`ATU92r}*>z!fkcJQ|VvCt8fj?e}VvGo87<%>#9e+~I{+7C!dvi2;P-3u=wF!5u;V!} zQ5EiO;qIScUvk{$Jh*I<8-u+T;MeJM|8k_#@gO?gT0Bm^c;5}Q9OFJRfDk9Q7A!!i&K;| zXW||eFA5;$l3|E}FN-1qd!C%SGs_hTNV1t{j~*%>yEyZZrZ>VWI2-2~fpnjP=x5d zyYK!7j!j8tBV4s7wOW(ILX&%eej%EsKqPt2{Efa!VPBVVf`=NULsFdL+qdfNo4wWP z?_n$E+iSwCB5K2{wxm zATnR{)VE%)OE4=HrHKl?IMY|jov|Kc>qhJiK%6e3$bq5izu2Va=)vmEQgD7^a&T!u;E3| z+1=zq8)+CyaVF}VqI`GJ8-xjy5g|4TIjM39Js#R#Mch3}YCDO&R8|i}UDHni!F5QF zAi;H2utJNW5Kv)38)8Zk7Jjh_NkO=4t?tZMXK(Z*Dm4<+Uw_3$O(^u3=Gb)S1RINf zg^>}wse2mIoI~q_mzOC(wrMw8>pP9mJJi5(T|!6c$&k)tZ|osG7<+MPjdYQh-v_t?(1i0)&2tLO2qyo>;(EtSqi0%ooX$tPOYXD;*uY-Rs%7RwP&O+MI^P#YYANQBxgkm_cNl5LUDd?j>yBCPw zXzVo&5ghCiI+>eBX59w{Qh;h&$U(E0$M1mX_IO1t-vImyxE`cN&n`#^T*G2v-WkI0 zQ1Ck({Eh^_qt381;*5$85awQJqYwwGjR_)Z3q*kv{y->MlDb&=U!)nj5C@uO;-aES zBu;B+=Yt^`K+|h_veeCfFxYxkDn3RUSv*Piv(`7zo`A2B6nh0VI`W4?QR-+4AY)qWkdvziRqs0<6+iDnYD2^^T^P} z6wZ{2A#27>^GD1pucwa!xr>FQG-M!$Qt=cMBX#M*Xs@-p5#`rCdzK5|!)t#$reHd4?06*CJmN z5g^rQ4h5#HP}r>y$iX=F1dRi0nFh`hK!fp~P94=qPvam$!YTyn`!yDj0_ywOIFJu>wN_=bSmB4WC+}Ha*NUUODUs^~8QrA0H0lw66BECob~^Agp{T}z zBar`3wv&Nh97jiF#BX0a6Zl!cY2(~4gu5S@X!-T9ZAG6Z8t~b;8lbfRm_GfVgS@RG4@1~%o>&+HLU4G27{Xk80Eq2j;QwI1 z;Fg8su;OcDKrI?;TX4Kh4rasR%{2UiyR}hZ5`hwzcs~c#{kUK!69K00i^%naod(M9 zi^$^_=}U~djYBn{HQ5Sl1>1le%wjv+P4Fcl;3R^kPob-NSAS+gNdOE0#;8Pya;~B=gQeG#zz_JdbWmt~DrqE!k z2m->X9E?tv28pz+KEk0Z7Jw;TfGSI+GqTG^J%E`;Z_YoBtCg=Z-z4uz0XM*Df%E_^ z9Z2b+Y{lt?V3xGjTXfNqflw6(%!qShzv6!K{Q+-Uy%E{GU2ej{wHDb2GFr$`NjuBF zk*@8G;jVuWJ;Ah2$$yvGTQnHiSkd-Sxl-!RK(TnDxSxhz_RoZSu!4Y68nlx%X{o3l zMUg=S$o3#EI5?E<41UC}Wj`z?vTV?^qHzhqWEyk%<>sRABe{_w#z&?>C}uy1CGQF* zf^cDxgiN)-Dc-o+zo=v$X5C(2QYUwU!V=pv=JS)Wl>@!&8)}Up)Gm4>d<|!O4%fM_ zxtj}6Tu`)C_iTr^ncAnRoi8EZ>Lx>oMXy2fkKK!jfHHrGfvz&p@AK*)onNB!J@m`* zDh~%}J*x7xq$RNVDY#F9Iy;+Cd{`oD&Z|$+Q*fGG`!%8RF#X;`r$C4MgLMme2;%Rk zgq^)tT`K3t@=QOi7*op(Jcx5jyK{nLJ>}F-+7=3}n|&V>?zFm8)JN&Zy3lscq-tKE zA2++;xH5eh9rx>FQHuHF6w-9SP}sFEX5h+0tjO;41=(62lA{r{6-5~;x?zUVrF$C9 z#X+L*ittyiKZ2gYBj*G>s4LeW2ycbh^9ST7FxLmeYx0JO)B1XvK(n~v*!`C%=-2@4 z;=VB!A2Wxr`J6g&v6*Y;LFDP}=GbBbSQa*jLqiUb77(aI(WTx^96{k?Rypkn;3AqevE z=_jAQaK7^B6Q==+e0=)7x-$XhUjwm}MJ-{{Xn? z!JCFNM84SGa?8w&um7_juqPbYf3mkaTkty${(75Sjp1q%SA4ovIPvY`k868J6k;c? 
zw!vpt@Vh(sEe5}PoJnW9Gb}v$xV_iDi#$2F@*SCR`$2o3eK!oi?R0k8_t^Vi0Ig8o z4M~Vd73>4{)C<^8Sl;98${e)ci9_4>;>o0AIeQtKeaP8{lMW8sCE@b#VyMg!9Jane zIQuBxN!!P|!y?Ax&b}bDC9$RL`|SHAw!5R)`oc&|SseHtgcI4qJ)mbEiV()f&vcG` zzEh}cW9=mket8Rq{yB6E&Crv!<=&zIR4i~~a7p?o9$<_>6DH+29#ahFr}ge393lXe z&_Z5lMwmt>^R|5U_BQUr{AfeF5Z~^Rb8OdqH`}Y!{e`c_BL&L0Jo+Y z;#5#3DGUKc;k=05w-hl2f&EbrFX9|CTc3!;?G-pTQW9a*gAK@>0xph+zIO4yT&O$z z?-Z9~LSSe^16CR_6{W?zIPW9iad)+E&I4nRYewu4E@Nqz@J94V0D8}q)bu4uxnZ3k@ewS zA-%B`W-7akd35?wsVnq4W;!Yp>xgC2c}Eut@~-Xl#noq}?&&bE0E`L0FRN`|?f@qT zD2l3}6U-auR^%+&(a_byOHs%k>J0IB%KC5Vhp6Tb{}LO77stYM95NIeCb(Q@0WX23J3inN#HK zMSOZ?^7PS-5UE!GiorHa;jA>XFyb0V60*q}|AQPf=a-4Bkad+0i611gTx&A}gjFEj zAha^T<3Y;M3$0^nffA!5>Aolkn8U}89u+NNkHVCof9N)w^MoE~Jf6rKLPaXZlc+BF zHtgp@ZV|vbqY7n`{2b7Qfss85YFGejNJK#&foJ9*ir`a`76gYg1=ZhOq>YNug&KG& zJVr9g!(;k9)-G(~%*+h-He7BsqcA5=p44`M0q%;9f#QY@8awO(k{$|WB3%-?UyF*aFUk^C`6Zak6 zq}EE>YK)$;4}_611=xoqrdzfm0PU{(p5T-YVV?)7JRE}gA zH5`3dU0U-nhTV}&sTaA{4)=?sKF&2z_yM6-sBP578AWU)Ji)*+#;iFYv)e5+5;4_yIn>#AttsSHB3y&%2jvuqCZa5#7mD>Ze)4${dfQ)az{@ z$smG&r?76j-;GGe4rJT9GM6E2jLVDE#F8D4rtJIGSp4Lx z(Pp_GHU!feWredyFZyt)rqLZ)u1V(xiif6$D-UW?wMC!Q0%ec65Ql0=%o(hvn=pS% zV?6xAa0r%A3ulS1LYV|rk(g4(Hj+4=V8I0&D8Df8fUPB36%svz(Vi~DH%4<9%X}Cf zB9tgRV%AGFWU6VAsWyXjnlS4VyfFdeK1fBLFnl}R6*ts1Yh3hWSlTneyPp9mLe!#2cj76+7(`QsMP^slk}~Jd8kCHDfi~ zw4gvTcm+cbSr9N#!yq4N=(gi9KLJ%0;?9Bu$WCifd?aDk^5+N9JnEzDZ_;boIdPU# zq#o40m3?_C*Cm79$-W7S)WOUr3#K+P+L@!Yp{zK_LiGto{upAQ!74)GE7%c+4L7tm zib#bole9%BD#J$d!!c3L3u6{ceB6u!5ojp)WNO`yCD$Qet|!pG5#jF)vDo@CEss`$ zX%ai$euiV^-AFbK<3Bvd3%hh6HQbTh3u{+6gaOA{V*f9sl|$GIjGa8j4r2&1!ZeLs zVuXNQH>}rSCMdmfY$ML)P1EjS6v*W!@fDeW2^Szb&2|i`vmvMjO)u6tp`abtY9~=@ znOCefUE=|GnN4i-!aS}%sf zL;XCcA?G^f#L%tib3!;7B82D|w3SoY{l>FZE;-c()}8CR0nQ?z`T@lB71l6IXFk6- zw(^dRY6&_4h*5wBm8RWD^5Uq>jBvOr+b zlfeJO9Fbq<+{dFc(lGPX(W!S+HN7wPG#DF-s{&b0cL@$AI7245tVhdW5Vf$P7c8}l z8fy=Bb8Mi9ZkaIg$)hoGEGE%J&}EvKSgwQ*sc9KiId^QmyfY{iqmqgbpQA+Fph%t@rSjB020#405YC=ey{5@mX+( zpJj-D%t+X=yL#2XhTosE4H*Yb%qC^6;_)e3r;xs5W;%DtSI5&`2D-YWPJIcV@hv&! z1-sB&po5?*y*^vVmMffh$i@yo?90f3tZ-lBO-J0qjG~kNHZv{R|2MpnmAd*sISW5T*U&s}4qw+tVK(Zk z9*%Ks0}X*wma&|)ad@=5!{csG*c#%YuEY~~X>h7Z91NDSacEu}7>U+V4~LyE@Hjh* zHkoeXD0CXtC2Hb!Or69cchVcg-4ISwkj2Xzv{SrlXCH-?p^HBSTS$E|v~!E5`*a&t z_}xDsnnFEG9K~wd8^hwA1_@zf^lVwpgReIh#=*H83lqGWq%9}p9r@Y1hJ#ma@Z7fz z_1kt}#h{7a&ID=|2me;(Ug#rz64B z-wK|N>Zi>t503%PAhEWG?Z8x5Q{LS`)@yTwa%CH2a@GCZ-Y%yy zlwnPZwWjNZp~~H%5DN~LgKskkr)yVyI^lrwnC!{-{Kp{%w|D*|j?ME9$uhc2T69z| zqri%cAHe}@?~y$J{<`>6ZyEo;+bDiLvXJ^$BoAM~{X3Y4E04Bm zy>V|fiAg6~7|wFTy{yq1Yb!>mASA$XRx{XxR17{BAe{hZ1i-hfA6PATOd9|hn&t1D z0!J~1pi$&ZAqgHuGIM!(ejYLvnD++~B=U~do=!GO#pxF8#>!S7iID(ioxx60!8F_l zjfO8kBLLb1J%a24mdXgiV;&&mSnQ;{lA3GP39??v2}e4|bID56zV+0p>G%7_QB59` zdz7UT_bE7n8@(HdHSVuRPs3rP7$xZS1{jU~;XTPI&vl3lZ!$&&Ic>z_oUzxE*2=p#BH=b$mAZ(z2=2PFT{=|>Y($qp zK2Ra93~0tS)F`q{E5rH)!b=1Ho+>7t3wi(&QRJDeDOD_38Pd5}0-$INs>--yv0e3H zfVNAaUqH`_iGY4ugH(_M;r&YcLww! z$HA>cu`)EzVTBR15WZA^%%|)A&Jb;~bo;3PinKSZU3TEFk&+03&zRiMeQyUft84B* zGUow*kOb3*oh49q`~iIX`q$5+QdNWgA3>Drk9qYnoT&#i&-o>K9N?pW#j7b?`2zuo zsVN*k>0Adf>*}*1_A=-<7|Lw9e#&WHt*KU1{R&=2qad7LfF`$NFv3!~z@%J{*Jy0}H-IYFQ4h@Dm&quAE=t zk4CX*DxHiH3jiUa?I&<1Vgn~ka)f}Q#@zcFqdrCF7@hx{X_9#cLl&2Am1S}Gy1sc1Q9S64Qk9Xb4?oJ|I z+5y4g9FVUO;grQ4ujyLP1LI_jk0xPs7;lKVVOV%d0SJ#AZH2|XeqW^U{1Bd+&s&Dd zVCI|cCiFQAqU^$(sQ(jJqL9_Kl#t;44ZI<90?SN1@lpXL2E0S3MSh2M%Gnn3jzb$Q z1Dn0rBO)YXoZJXkM)#|Tj@D)cVVxl7j5QBus;OZdnqe*gO|zN=f*U$FUU9nzXfPU~ z13w+{4dAPV^GGbl!BOMjtg@T8&IPV=ioY(ICCZ^Wq?8Hbwr+uIHW}N)IICU#FprlOs_qP? 
z*&TKP+}MxF)gYF$&&eC$R=VE!T<482ZP17;?@s(ah=Z+BavNsi?BP#aHjcf92$ff( zp8@V2*!MyW=iH$tc9F}uxCQhHk2V=z+kHFc0WrO>=cX}6EhY7hL&zfz;)eOS!j<0u zCj@J__87R0Q5-#;c>#9HkrrA`_?9u$(p~LQNuBliN_z|#?G*0!frp7pO$`RE^FWx7 z_LvC1cuL$FwY3J;TcA;hv+!B?F`dI2FJ zSXf*O6~QU5kCb?wvr<5xS^3wABu&1)F`2sA{In67N;#B`1vv*I3g~)$7;s`UDg#k~gX1T#SI(&` zlFD*!0!dHMR|7#8YF%0&xs^sHbd*BVS?rYo$;eZ(UA;zsld-0lNCA#NsQYfjnFD4n zV__qLEzP$E=xn7kNJk>c1+_>@Q;pDj6b`7W#U-yID8;Civ73*mMWM`c0dLF0&nY6N zRzM1t>s&)#4D!131nfdwxejZhp9l)X3pv)+h=N}~V zuba@z?7$V#;lPet$=57MeEAfk{Ix8s6q}>8ux43s0R}gp2hu#a%8#ebT?l7^Vk2e7 zErnVxoCG3WZ!*l|7e|l@jYc^#jxDoz^qC2pZPx3cvT$T~UfwsKH*f8FHGhNmFJ(j2 z_cRnGxSlf3Oqpjy1`A8qm)ZU`Ity_8kqV%qc4*~PhK6Qs1Q!0c+0H*jXN8Uc#D9iY ze~-@3(fN5g1dXd7<<&M`abWv{mD5j5pLzVfl`~H~MF9D?2OXbdJpUXqLd#5ppR?M$ zr}_do{J4voPPGT|S= z_UjSoXxOiBgh5ZkNhWReUl!lCA2Z(mC=W?Fix6F9{b@T`r z#?KrOoMLKY8DRZ|U4ybLW;MH{Rk`#&ETp*!6Z$IL~V zjnD>}xGzT|Rfv%ic8l^cTbmy3O&fR^;FGwd0gwUfqA)&6ShGF?{D4!MFBRYW zr|^1!K01$`VL(99q6iY~jZUmvqAEIBfb|TVHmc7JNA_ApnsX8AB(z^d86&Bc&O2gu zr7HCb0?5`XW-!TO3k>$DL>9?C{FLl#y*bhXi!jgyvCTw2(N+!6c5x)Rg&E4m^K3n##wQZcH#S-ll-ei^DI=2a5y z$rg}dmtlQnCeT!in8?-WGpIFb3aQJgfewW#qSc^yeFgR zcSU?=F_{|Dv(rqH4G z%)qR9T1+AE=ok0^^^wCB44%_e0`6ccL0`jILKp%j6C50E4KoS)cPRKB4t__RL2d;J zwqkIL2w0Hvh-_g1P;zSA4~OffL#ju?i*equMeuIibZ=dNf&kiiZ7E&sqGRiVKAZ_) zPZQA4VB-St>KU>Rg0YX|7&cEa=BjW-Si#8jmybLNw{Tk!HrflIhv10pz2M_z4?X$3a9af7_lbcgoMD^M{#%&%eD4 z5H#JNfz4sd>I+PABPiFVm}O?%y$3}Iu*k^>i%k6|ItmT!x8pdgy%&p|;|V#?MB;=P zAPMr&$qs;nCHnVXbUaI*6>u(jBD@xaF;E5o9-SmY87Ie_1fl=~K=yzWP{n7^zO4>6 z?+W6&?4{Otuqe^V(A=`4Vn2Km`aBD8C1@5UEu0@eE_Z9#(Z0aE41$M7JnaE5wJ-n< z9lpRE;{-7dz11O`C%rHhdgK8vaMi=j zd+FCs$~Did;k%6aKY3#z0vr4BnbX1BCf>s}G&6jYdyN>H`{e)38xKS4xY}7rl2Yv< z;Iebjg3Fm~g4iBrn?Z_@;GXv`%=QF!ix1)aA)ZUW4c|%brpFYOuiio*lZlv<@wbi8 zo_aHLew(7PLn~Z;3;PjRHizLV;vY7~WkckEWls#Gt^;HTVC>G^hLdDz^bm_}$Ld@( z@fA;;C?3N?2dbebNiN?Ji2A@eU{(*G$6@7w^AZOM$aS!Ev$FV^S)c4l9dRGc62i1Q zk)IaJgpbkO*3yk&F=Uj}I8yQq=byhpN8m>v-E1C!zdoaW<-}>d*u(6WjlJ|j3gEc( z6%Y={5YN;Fwb8Tov#0@`qd~3UkO{Ve1U;$~e>)A4)v@)LXi#kdIwC3k$;NWst8Ms9 z{Q|N!wOwOUKF;i4pmPEH<#>RNgPX8ZJv^nYz zuvHI7W|NDg&5Z$QHUoxoOX{bT&(N5QMU z8tn?(d<8}E3BXkkpapRFXfNBT&l2qLlpX$&ZLCifjPq)c6@VBv-d`-O#;Vwy*ZVHK ze1#Q7mOdTJ52rI5?=Jqfc)R@A|4@vX@+ZVQ<*5Q2H3b@&wjR`oD4u7BlqOZ{-jd?> zZ$I0y5--Q%%6Yl|M8~&u3HA%2UlL0N`XtfajJrsO0y)SKvE8PMZ59wWQ4ggLpitZS zEK9l(0~n{9=^lTOue}qE?+@#?qPkbrF5ygZ+s}2~XbGbT?_S|MloUeFp_QQ0a(m(J z4>TOe>!3X@@dR>p7lZEOgQQbywjiFn8#m>G?cl^A+NCW~x2x`9u>E`zY%0dvvZ?q0 z{f3xI%_(UOY6&NJOwoHd@J4Y;x=7)cMXPN=i9J z2I`LlAv9Bu?IAECdYrG0_CC-t+{gFsr}GXv570RW$4{Qq=}*(APUn+!{wAFtr6aS3 z2wcCw+h3qFf(n&0w$yuh^%$J;L}=#V{Do6bUFfoMaGpU#o+WG#g)1`NyQunzwD;2w z>-wNtAm;upIM%2>pp0j=j)(w{lj;yeb=b^c6peg38RzwwnT3IgycnP01Wk1;zL&3->9U!iy<6`T9u8de$74>Hk|JmNjmk X(br=ZLVY*G^3Qw%zIpR&<3s-+!t&0O diff --git a/tests/models/mistral2/__pycache__/modeltp.cpython-310.pyc b/tests/models/mistral2/__pycache__/modeltp.cpython-310.pyc deleted file mode 100644 index f7c6a28cecdfc2502d5bbb914f4ff15a9b802990..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 52277 zcmd753wRvYeIL3rJNv`}SiC_HBv%wA5)=uNdfKucmMPJeO+m6q+wn^BVu=}m3+w|t zGayA|5zD4rD-R})+dOV#e+6SFvD`FinlDY_=H@!jo5a0MZf??>Ow+V!+M76a>f}1H zBT6#w@Ap45vpWkwlH<1DU0~0iIdk66|NFH1`|}BWe(fE%E&hwEM&eKT(EUl{;jQ?( zr;>?;ohT=4!#3+i*^pneY|3x4oWyUkZZ%TnR3lwZH!|f+BU{cII)1L4lX&??pP4Wc zjY7FUcrzexta`4|SMHOrRDG~fEEgL?<)OxKd04{I^^wMCc~tzF`dDMUJl>cnPsnq& zKH1n+-qn~YPswwxzPmA9o^I?Z?`iBU@0GB8eP5$gE;Vi{-z3j{_5F>T%QrU;ln*p+ zDc{n#wR~&iVEJI$8nx zQSL%Q~6Ee-&KFS@kIHF#=Fb!Zai6j zQo^R{XBubAXB+P+zo+ru@_QTaE5A>|ch|qA@&5Ar#XnsyH=Zg#)p)x6bmN)wGmQ_F 
zKY;K(_TJ|c<;vR=_CCAxe8Mg{earJ#lI1EwZnF0yWPcEH&c4|`fVd0x!}cxqt-rjFNfVU6m-0=PaQaotC6=836-jCWS#C<5p>23B&gr9U; zl9OwjXA(1ytWT7lxO}#y7S5L*tGS-4)=Q0+?bK(#$fY+oWBJx;g!|dYSG`)RS*`o2 zvumq$$4|YdiRT%^H=E6wq(AhwlaIWw=AA#?TB>>O>6YvI#ghs@k2F2iTD>eGna9pN zLJv}5Ow> z&1o(;C+k($tu59Z6_0^h4nCorv#MHaI<{o(58B@4Ri|?9vgcIX+PX87_D8F>U2(mt z@+x*~0exI+F1h|K+l5xVRz

6~9zfw!f?9Ru)nIc{aXTxxG@oSgqBo=P&{{$6qZe zr|LPXf)=CR=&jo;Cyx7>dTVJ31^T7JM zTbHDB)NWmJ-{Z=5lw&_!Tkufs0EWFV@CQJjP98~Bp`KEzsUG@l5{J&p0K zDo&8rn)kL^)%8bvB0TmEr)vAtPqr@YmmT{(f#0kSh==gr-F1zJAKXEu-azZY9|+y*po7IU%Y|Oi>9BnYYjgoE9w-+D~qo4 zxcMlVv8I|rCA2^9C*3u~^|+Jysan&!<90vOY&F*%)$%i#x7<4XWTX15Z`C}f;Txy? zBxa?bURY}&=}gKW=-FP1u5$rX+(my^&Q(3+o59qrEjX?}f_y6ab?0qEg9$CT4(dCt-Et%h@~e&KShdF*8C^y%6o$2`aN++)(q_|aSM?Z;+kgZ~AYwp-py zd~bDj^|C)$S#rFJURrk5t8zr#BAlWzZWN3u!!%99G^|(EV+dTpH}n=;vhNuVn8ft|T^%YXG-Ab2I5#o2j;O4S(%KC5?ekR5G@K?9KK2^QGVuFQ-&< zOW4$E7poW$ujI8#XYZ0#&PqgS=4P^f%3WK!S<-`=d;yn zViePtoZ8ZP&o5#xt5vblSDaP1R>!H9^exw^FZzAwYqpJDjypM)>#F-Aww_95fd>Ne z<`}Efblwfe#@A(Wd286n8|xGCHq3@qS}+kO{AA(b_awgVkHcvj%X-lmp1GX7W^5Q( zK$eZ(T266bxUe>oZM@w~x3S!oGZTqx#%KcZ63bb{HQJ^;TWxDIhlQ8Nw-1XiWvAMy znz51g3U=DgTrt0=xRLStmj~MEcE--yxvNPd@jkS|!rvA6o7ocy_t8s+fpIeu@5eV@ z0!A!9_eC~F-2>+=R%m9$aaOTt*N0?uN*tb&J~Y>dsuk66E7(zSCd@9ot)_2W`XZ+x zIyKGnqv_2Is(TUOn^?y(P*}OGD%WyywROq28cwz8r<8j^dFnkZ_$*_mZO6mj=qDFe z*Zj2CsuN(bTB}&_mXu@JYmL>Jtl|>!`@=Ei+2Z^A!BuA?+}98Ma`aTI?=qnOMZ1#{S#zEMoBkN3=vAlZze z9!H`V)11TriJVHs&sPW*tkvl+R4P2)^&6RZN=dtddAZ=!>*3s#O1&R%^(>u(ZvkJK zrP;)QQOv3^48fgGv^g9{ShP9hWpm)Bz|8@3h?|4m$GHPAIwjjg8ndF{M@?%1?793% z{K>(Hu+#tda7Y!GIY#7hN)7dcE+$2odB$Ax&6)XizL(5Aor)_K< zID3-tXRs`+%`B#Ru5EE+SvGekMiRR*?cvXJ>a)U};&|?3e#?dBelN8-fYU0=a65sj z=jN7lrT*$cIat(Fczn^&voc$2UaTxC=fWw>*gUow7ZVM$_Qj;1!C_Fr-o&|h5l4h1 zz-)d3$@Lh$89sl4=f-NwtqBn8g#F_B);R#DddjD%P;D0ZAx0nK0bx6f)wMc)375{KWeq4+Mtzvhl`ytW zCvzp5%2M;^5W^jYlQ+_4+DIG6F_Eq0fRVqkzAv1@@p0~Kb>T!pa)BK<>ax|X!y{vx zARK_vNVHAD54LeNxoP1!g{{x>3~X}x$=o2skZEHi7XfeBNo<2@#LL=NJFPdmHnvL} z;~b1~*3RG!w!Y1L_Zu1KEHDr0BpE2#zc9)&MtOFpz*_bxwSj8-`KWVk%y}M>m*9h1DZt6GZ{T5&oMd2O;0w%!_Z@{ z5ln2H>)B(}3yAmK___r+X=BNJKL^r#^^W)wjYyi?7npMs!-NzLSrf=jyxQ z%p~=8Z9J=j$?-NjJ128owlnn|s0KislzYC4O^oNP`j`Z%@5hgCK6?V++q+uL z2K#+VN~^H2Pa{AMkts~NQA7O@o)@@_haYwdn=aFd$uWVfU5U*^o9o)dPTqOKuHH6B zu@AHpSB!RYG%=CD%EpwFFe3`XYHnv_G8xq-`t+gIRbX~*iM^mMm!1f=)lzia5Kd&T z0E@@_YSS*QIG0Nr8FOdz0yjRSmfU+I2=YSd(7d1!vu6e6oS(V3bT-OCr<{#quj~-J zQ%WZ;mi!K}@doH-3COJ%jv0FzHIB}QPBw_TLpI+)CeZ;@v$v^K)I37sYU5vf6?o_ZM&~HEs=Z};Q zw*peu;UlHsbaYwXiJ8ts2-|)zmc_wRRV}SGh=RDFHjpHQl;|R4kFEe=s#FE~fEt~z zfuM#wqvYr&_o=W66?FVFoRG10V+4C*xSYRL;5+9rZ*@*^A#s0274#zKa z&v3s#iqFCEGZ!kdqXA3Qp57B?w2B@5az*cc1d|!+Ad(;^iR6nU4GbU81N-1~L`VyF}X z&{t9}3;=~)1@JkNW3IeTBd+Bj!dANT_%V@s{X`YoM~h=BLm z`|7QQYTbPR9-RV+j&+XY(Eg<=rVMD!g3J?#2CW73mGdoo#~P|1VP&G)sUO{1H}zu( z`wtBN44kvybXNaQnAM9(^;!7+A&u;VaE{|vNX+WT5kAx3LDWDH6#BP%UyuN>d&9A7 z0JJpC`Zk7V)>grMXf69W^n@c`b(Al1plvtxGa*wyACM5L3!a~bQ(0`jXlj7}C-Giw zgK+&8!reRJ0O7)H&l+aRGAxktOg_JCW_ErmX4gly8Ohk(4w3DXEQU}lNVfa}{MrPB zujz7|nAxNv39u%F+zt^JP-+r%B+JWio`a4w1k@}GqOJub8`ObbI#L_ZDkw>J#VAPv z!OpGcvGTeIM2-4tH-Sh!gX*ad(5b+g$*Z5H|7Ym@3pzhVXORxkc6FXkjm|Qi6*wWr zE0AqEn6LRxIck+LFVN{g`2ye*{Qs}`3hrBqNH_T#RgWKwP;NY*4z@sm z`8S!F?)`#Myh$-k)#3Q5Cb%-Iep<6;wu}~cA{j0A1on!=)7A!PfEyNw8`y1DOx4eo z5Bvd$@kfXp2X6QaJR$|)n^+vimE&|RwZSnbVFe+h2??`^r>*~<9Ajq*oZ+7*Ll@|(TK~v4^#N6wG@fjDX!lGRa;2nKRxZATob;)c%3|#vCzA9_2wT86 z_8ASHT<&<`sbw2Dt;X%7W7(FIa?-mJiy*>-Fc0F!^Zn(FopG?Ac+s z=5#)1%3mQ@uqW((i8Fxohzm?}P<%K~ARdr%3iusz`t4zSN1Or131i|{o_8fsE^ySv z0ejRQdp=bj6bfGvvt-gQ2DAU6kR`kRucV_N1FflclpO@HG#VT|+6SY^rAOV;L-l&; z{jIgqoa5LB0qg^+oh?1qvI$iS{0}^!t`xfiDn6ID<_ed4gZtbahg9zXopR6gOi(C+svp=g)4TjYQDES)90ApaJqQ zl5NvYtR>VG`5S`Crn5%rucJ%O~C zCqyMm?wS&00Ehj>n(g#dK+wI40#7&9hDkut_L(18{|d6=EN_~ zU6$=jHm%a3CA4C`E5ZD|rS)M!-{!duNi2q$GqXE`fNvDQI))i60M`0#y^HuYU_bRD zR>(IEk=-4Z{Y~d||9`if9tm?F({jtJv3yE|~9>=5c#5JMg2UQvK}MiZl60gK;J zWrhnSQNTtyJfhuCkDs55hEjc=(SMQIjK+_dCDmG6b)^P=o|t|G2e4Q5a{yVj5R5<6 
z$=5~`>T+e9=1rvy(eW5BR(WYbyv>Tg!UhrxZ$<8Ateb;HbA@OJ~SJ3eKZCW@IQsB0O8 zSJ!v#I90*%6Iz{mj_paFbJrJ$3xyy033Q#r*Db*Ti!=eChdc)otj%lICXp}mnt-{Q zR6T`wLMw$>fOWCvTvETzkk7;Ma}bhhg5KpiK3E0Vtd`bVYwnDtJMVWH?>RdE7o9jv z>8C3KZ_Ke1iFuVeT;k|#oYiqoX^`X;bNMH5C=~kvj^IP1c!Q&5MgG2YYYX!0kmASQ($p?9HkC{ zMb29Bmo?&Vf4i_eYy(uu{er1L&(v=MJ} z0xn`*Gw)%##PVdjzg@&pXFhFi3~dbK2t+yUq4sb)c_oD~^i7bvkFX)T_z81k1o=!M z7E2guk6g154?XDZMp$Bb`U;-l=Ue*wy*-Dmhd)n%>WN{v^~m^ zZ4&^v3E#ox{ks$GT)Vhvtt3BYsNHDG@OE#}vJsv3hZ3K(q=f`>45WFmZR$J$YIJgX z3vwE5W0ZB6q?dS6`q8!t`u?bQ^YVe^TiO6Dy*;%$~9K*tYFKdxI8z1ofEQ(js{;b&~W^oyNlXC!@~Xk05Q%JBre0 z!~VKrTuY+l#OAT~7-~7zPJPU9f5b*x>|IFFHBW&Yk8hcm<2%gD+=&G8`Hk4j978)T zk%gL?15FK*f;?y_^LSlBWKl?V^WDJ$3fT@MCc$JR;2lCifH^3EbP-|T99+C}_KthD zfo}+D&v%X$7K9uDivfn3oxeVMt^}^b`I0?TI{H9~5AcHpZrEA^OL*kT^}@S_&!SPh z_58a)vkv?PIiJF($!cfM3b1`w5> zKS2;yj$I6)!6PN`D7HX>q;u7(b%Jz6jH52oP@m4ES`r8q{RfH2SrFl#!;TA@&&Tm2 zq|z^9^G&QDh&KF^7DIADE^e*WZNiB_vng^^x}G~yItTCGi?Kg_IZ8<=&a&el~z`Ae}%TkY7|u{cASk|Df}4 z=-fi*^K^cn&LW+iXfYbe%B-$xLfXG&T;k&zs`>(b5get37a;WuV40luCGWiY9me<+ z96!Cb3LzXv{Srg|cRC_1lx|eL##$Y6LLg6~z7!MU!3~CdP-dD>c?zut(##pRwdjGP zug)=-DxG^JMF3kVQsdNL(D?<1CqeNL(xCcnJkCt$Q(veD(NV4bfYJYuj?Xmd74+Yd zD-eO7(sCjO1bKuquVq|aKhKk0uNS|dX&}K<7r>{Fd{^jINO6(!hU}eg61=RHiZ0SG zV2U;?!36XB{&IWt$3+a^hd!#22XuAP%R7NxMf{2qz{BM#(GzK%4@oXaPXXr^XR) zih!gw4Mu*!*pIiqY@N?Y)stu1{_O2?NYdh|Djh=H}*3othcy zVetEz3T2-mA$5oY#I>kMtwy(G@q>avhd~LBV8!(09z(0m5%BILRX_K-aczDd7Oe+B>`k45rbrzn{o?Mt^;--+gGGLRN%Es4N**FLGG z*hmt@mZl%DQG%5Kr2$POn(rOe`4TpJ?zZ7J83;B6sj)~dYlh!i(`&6kL>d@8&x}?R zOi~EYtkob{9-J0n8HYP;Ge!##UNf+HoEJ5()C5UNSi-{FUht5S5DQp~)wOf=+Cu4} zCmzKvEp6>>BM)EOfg**RDZw!fx}|W-aSDTHtB((1Mu212^sr64IJ7SZJiCUbO2u$w zPE=01L3{;|ArT4~j8TL1#HWq#2K?@sR}ux1laeH;$SMF@@&pp-$xRf%-`lNDfP)_* z)vZS(+qEnc+-N@47W!*IE}Zb^(AjF;uf)iK%kg- zL+eXYQgM0TibaUt{Tr{ed6PXX*r#Avz%2d|aG(9GHKh4Q5O1tE7sPqKlaD-&(G{X6 z@}5j=jEmIZD7Lo|U~d!A6WAadlkJJ1b<)Q6xU^LKi@mnXn&{hM zdmOmoF7(f>0d6?j-WB172L(4gB)H*0!42srjyMAh<45g@CTSjrH>QAR-L|p2y&E{# zkrfj?_g_%@(QCj%)Gxp{i}p`9hrDBKmpyrfF@8hZcpTv*EL+Pbki)c$-LCdDX46za z^guRU1tZ))#B$$Nm8neHCoWqa*ur0QCmz{~b|y{m8EYv5(3`_|a{+tc0~ z;AMN})W5?frT&P{zo+xZbf)MK9a8@h&idYJXR{4JtpI}C4&PCK!i4{c&Y#k`ht7Y7 z;|~OEkcwu5tp702w}myxT|pA5_xK4hA-e_PB$CIW5wY5WaM@10lp`85!f}k#uK{e4 zqGyYXwS^k9uTfZwB2S=kL%u0S6kl6iQegDxDjX$nS*q90DbN#VMS6N23Va}Z1G3c= za1r>a24XQd#t+SR%MV55QHuT47iTpeem36rqu?)w?6>)uBj6;0{{J29ss94!&+wIx zKLHx2LqT~!xa(4NEdjm$i|jjn@&?k98q7wS4x}c*UIhLHEfBO=j(w4@!l@l7A_QV< z4JwW>rgG|naU&g}gGI3T1@n}quB`_0S&<}P2{@=m51Ukt5HI+wQTDT?_W{r$|AHw? 
z=c)@UlCQ2X;`zobDAa$4M*1V9_QiY0RsR)lX5Os6gdg>nj0MGsRrTNa_|0N&>dVaK zHL*7JSB&xyoww2HU~IdCl@LruH<-$T3Hlibs$=#jH2}Cm$m{A?k=!4k0DFMq5CJ4~ zE~2V_jtV@LdTqr~f6c&Oq9cf)plX7&5swqJ&L08woR~lRwBj_0Q`-JGGmk=-R@VX@ zkPL_mQkfzvbC#J@;rNqV)^dn$5HZv^#QW&`TRMUrmfiq%s079mV)4sN_zIm@>3p5e z4LX$a(J~5v@)G5NfM&l1`BTGuJ*+W6Gy|Fe;LkTbx6B=6fkY$?hC=HwASOePfj&Vv zmC59V0Q0SYlM2xnl2aHMJjR*;_%2DGG%GDU>u#rv` zkS}moVz4H#FQ~|+zYgp(%}`>c1w%pwKhs!R`lV8yiVr=wXDDXg3-X3jVh}yBKrlA^ zjKV|g?IgY~#~4Rg!p*=nHgZk*ZS_Nb##l*iK$6BbL2P~ooL)%S0AvY>@gl7u(i=~w zHvs-Z21FCq3m^{N;&nTj-Ky2~Q+eN@lW7%{lOK-S59@&4-(z8X>`8-$0oJ0m4}!wA7nMCn+G6xl-| z_%;)(%QnCaYD8LFOk{tX3cU{;a!_tq2B0tK2zxur4RtzDON43d@r+;S=x$do#GY3C zLYJ~P-gP~#K-^Qa;47eR=2U?ZDMP4b*!n7Km>Uql!}g%|F!(3w2n4l{zG*l!qZ$I~fhS0*4$02}ZJiLM zE|V~X)XL9ypmPWyXw`T0PM_{8(w*I?JLIX;RDZ0in?n70q}?`nMPwR6_Jmex*H8wD zKpMa}P2X-*?TU#5)WcJ{1aX68Hc5ejyio+@7{Um0`s;f4We`+vJI9j*!2Pfc#!1;!ye}`ep#G`8?%Kl$1nO@>mel~vLg+<96h#ok z3X3M7mi|R^k^00|pu9%x8jLi7>puiPKVsiN?WIp3alsOv=i>u(7!&%rICh~3 zq)K=KPvE@_?9)fFAYp!vr=<4w=_5=YlyG$pVz3nKF~QmdonJ?4VS1H!#m}A!ao7OF zj#kWyUi>iDl+X+02z@hjdgeFh#Gi3~--iOgQL*yIB%swva}t~r(-HnMk|RTeal)ARmFJE) ztT0>p>;gn_`-CY{z$6{;M}s~;1A9Z!xx4;~oN~ddE=Vz8Uj+I$B2(RL&UQGHvvMdy zkE-QST+0UMyTdkr4F-x7_xEC=02toNK+#-w0KGWg0RrQoC8z>^CShRc=kd_BtE;rh zAtj%A_=$(e90}xVUx)I)RZM9N14%I@&NTNGtc7+;jR@%Tp(LKPAXE6H1)1bY7y)?^(8+JYh=7PL z@-9I|K+fB-#=e6$b~a%v9?0UBqJ2f{LP+>KMG4}J$L*S|)v}f8` z?;z40!gpwSMiZTZ$0jxpLomzO7y+?=qu9|3{bathg<3qXDb<}l$G!ioT65;1P6}+DK)xnU zYPx5m+JsG#uI(*OHrCbBIH&aEhlbj$2_WPR?MJwAIJ+Y+}U zH`o&?s>0d@>awBqQ%BDcq%5t%JCC!wS&0{a|9+&<5Dz?l_sw-ykLRFES zggtg<2JESdCAviLzhd+Yd?gg@h;&MVf(MW#IH{opoCHMku<{yx(Io zk=W>=wvN3KYU?*bXzfVV2xU~GBtrA-VTrSEr*UGr-vtk(F954qH_}!J-0(MjeIWOZ zQ9y?Q??nXfd>YsHt$3+acHm z2x|e^#y+eQKXHR*hLf~S#|rePr~zE)vQ(7vLv_iADG2DO)3m3jF=@`|Eowk41dU33 z3`h~kwggkIFdVW7aMK{A8n9*oB_I#TkC;=*({dg{`nH6~ z;Z-5`HoZ(K0jmE{wxdYUr2>xnnZ%1gjP65cLB{Y~P@PjR4tEKAaB=vf+--XIIDgk; zkDfXEx$lTdSAD^q7JpIs_Gmo6{bL6~9BHam}W-dvnONyN+zd#FK<4kB3)tSS=s0MZO4KK=^kK_7*&atO=oP&^S6 zNiX*WVSV&dW&kRAa2FxA1so&=hE2Z%RM{S|2cZNwAQ;IYFp?pu@Ee%S>H#!NJxJ#v zIy();kD@o!yBOm%oVgwa1ufL`Idn(NwilToBzmxm-4DQN!vr0GC;XB;fcXt;%^^T< z8=nABPiZEQH4KtPK+rHCbAojxsYF4UsX(&O>7)1Utk{OJs&7R#1s5sdvz7iHs#$5% zIibCmQ?xUu#r$aWMKc$G-0KE0IWOdKUMu~TC=O*wlop;cA1kOxuXYh+GZa)j& zSD-NJ+ozB%H>{KRy5A4SBcB$V56se01Pkh7%SlCS&{79! zf^azlssc4@p+dz2A7X(Jbh}2HDkuozGp|v}2ip3egeEp$J0{h95R*Fjws{}(%X`GB zK`EuxFRi47tvyZib&s;^DWKgmRKY8Mpy^!FM=-2ixZX@b-H8_IWaJgVJn?yFVP&-i zk#-j`Y!628#Uv!WXNtYxYtQ+< z9Ux06&hP75GrD;e?K8iQP&Z+KA7K;rBadtXV(Y^MYps_7^hd(n1qgFb=ki}0zpr{4 zab;D763@ioV*<}l&G*<4pNHL~lN=e@L&cu$0u9arM~AIqED|k=#f^`r(-M{5A(5A=&ROJ0rx*LIoxHl zNrS$nLr;YS+LHnc&p*?{)kPGGy=lZotQyO?$`Bbf1OYhwl5Hqxk2%*`py3;;)Wd#4RKO7}T+ioN9m9DGV1kij zxj!}kZ9VgRmXsZe>4}<>W6wVmihW>MqRL2%LfQY>{Q7&&qSSLdbAzpcHY3nk8h{Z! 
zd{w;^*c&+-$S|<_tQ1A;1Iuj-B1R}Q?bhn-0#<;s1e3qx0;U}vyPXCpm{o|gM5Y1LT)(EY*q$sRwEuze)B zU7ciIDCdaN0Ox`5^$ah$3=W#w4`fo#BA}nJ{c;szTnNbu)we|7vKUC!CT@v1lIMrC zd8B=4+q8X>_N0CX7gj^4;MmDQEH=QVCz(e$Z){Wy7T7)5M-_a7hL%E2IR!y7+Oy5m zDjCHjpe|3+kDu0N_O%UXi$;}cv0XKAb%D0ll%nzWil2KVw3F-i1#m4*bcC7;Y4jyF z5QS?%dJi1GUt0kWCiKstBVD4BFv1K35=>(dQ)rHvN=eqMmbrc)m{UFDH5YC2c84|cqPg)qLSc_2!EgxPEv_K)_E3SP*+i%kN21y)^bii zg%_V?XOXoA+Kt$s0$(xsE0~Hgs;7bb${LR&S(+-)ZCkRVLN&hyjp|6$#1WaE9q~~u ze!%4V7=rMlB)}dXD%Nm6x(Ff9bKLYVL(er3{1SOukd$jnm|ob-fc*q^A45$@>WT&y zK4P0z&Fd27;vN}RBLMMR=cu|v=w5VkpPGLNQl@8FA*xF9ASYVWeZogCXK}L#HZE*> zI6Gp6ZXX@iy>S>-DoDVSUt>F=6;oD?pQqkObCKK}btgUgu>0UMhj6LDQXSW>pi^#^ z@S_-*Pa^cZnjZ0gmEtr`#!N1+7`9!*iv+=oM)RR$CdkgFD8 z5{QWfUx9`!@Yw~PN05Y`f9m*=(urs0@72(T?tl{H0|Q|yJuSq#B|GHsVdAI6svZ0A zVQq4%ljgV9ZEhPc@jYU9hxJWNSo}V*Gv!$ac z6M|kxBvPP94YNEp)}9=-m~0hPd6wV?pe7iE)%4yjZVX3g{+U>zgzAvIwg{`iEoKcO z2UP?0vW87S?Bfm0H_XPT*a2e1ssqVZ)w8vWvxH--=iFl_-hATt?1?+>y!*tAOg0iN zVx`rZToxMG3s~*ZHU$vLbCxdmbPDIXtP{Kxz+b~O7^K&m?K`FF%=d7XhLVf4Sqq|r zhQ9wk8~SVP58+CnaAaV`Ss+>mlnXSF0 z8ly7Nq0h|sbaH2{hw(~Vx7F}fd}r+e?9hEDNsGZDs7j&vR^Df`4H2i8iW93jz>}ka zjI!MW=^i3^Dx&a+Y6-i56c;M!0BRsEaSoO}SAZd5eq-)3!UFH=>cof@r(?tP9uJqD zh+BGqXdNMx7?efAP;6UsW&L{n6}b;+^gqP0g+2Ou;W zm=a>itRC5C&@>I>z9rU{qa~r%v8Lg`i^xwG4Fa@Vyw!Bv(tI>Ig|6a)rY9(vtOzmu z%T1L>=;saR?FDWDu5(JM%xu?kLTu~{`UFV-8epWt5*_?0H; z1ffE9-C3$GT#mIqOGDxXs8p>S6Q4dUBwg55e(FA3- zXH`Kz9LGBXL2ecK{aA%`9pkx&@L)K>dMdi@!N)R4(3KMCQ?e4ejzPUo^&Ey0$gOj} zbO{7sD*CDK9<1U}mygl0;QGk()rdU0ZLj;JOKpi^fjt0U@U$K|8@u}7f~nLCB&X88*r># z8Qp`N_aHn`!6c%&fiBWo0V@Guf~q~+8I)*G!+^Y=%>4fxCI?81Zve&%H!Hmxbf7U# ztk-KtDEE)U^nV2F3DbiYB|35B>~+~rK}*7^9da4Jsm8<=Rf83+4+ag2&c})icxO=q zq7wq=TF*SguGY&p)6E7q6Pm2w#30B4sgK6TNT7RM>>;7xh)@il3exWKA1cy;I|)K z_Ja6m7`Gq30fY{c$8HZuIQ%f3TX2f_Ee5|s!S8VJJK_u>-LP=wVgHQAa&Z^yh%+jp zCnik!PCO4${qmUb=Ep%A+~pSnQj>Jf`ah9z>5_8O01ZzW4Ks0{!@wU7;R0F()w`#j z2Ztj@>LUc2B~VUnz$x4CP=M%2iE+w}&ivkxvU*a8(-KdpdNEcR38Z9~B{@DHhSQOT zd4X~&<_lsFzhdE+X7E!Um}T_i3;7q^$%UJq5Z>n%Hep(35ezRFzt4IDct3c>c$m8V z2HqEK$9IGbF@%iTkS3^%2|+S5hdbzCFAZ23sAZHiirZ7Y+NzF*Sr9`eo%{uBP zpJ;l?iL}r7_Eb#uFHxkHv)Hs_u@ql1JWO*7PCH`qa)Ql z8jfAsSidP&`gXCmECSA|llXom&zf+`LdzwnjruU_qV4_rDS_voejcFO7Te8R68AnB_Yj|lduErTo7hK1q!#2 zBELJKZX~}R6r#m~iVlAOtcWyNXe0> zR!@65h<1x1Zrj>O+bP&ZNh<~By5$wFn1uw@msG>+-yGomfndQ~n^1j%ub;MXVAlKl zgSu^?4neOD6?^!78pBwQQkfX=I5b$4v}~D zW%S{I^m>pA0`yQWlbmiqc;AH8SP1h&oE{gIHcOKCpa|%XwKK3ZGv<{bcn>uKC~>9Q z{ZJb>oBiHR-C;6U_QQTq3PKer*e5k$@ltLDgSb5`n0_Jy`!Ho!gJ`kpYaY0tdKx&Oe5XI zqQ!IY3NFX>ZbKMCAP6#f)glBjw|@5hpojw55hE{h9~EweYJpK35;vn=K`1@3&~Qf} zJw*sa#KApWW3~PWbJgf*D`1ww9t;?$07hb2aJwO0B^DUwyI$Txf&n%KQ=K?1fo{7SiOLIugbb$H?h2Gj_m>XR zj@!ZcP#-&pD6IkYMW(}SNuZ7yLsW$!9{Zi_#yWQNo};54LGsRJc2RhA;u60 z?RiGC>aa4YKEXiIZy*fwI>uB5CZnRLHnlTAA1#c?xcWH<&>SzraMZUUFMqlcuCzGM zxk8OY-ZZSai(0ztgG@1q8;-l{jmy>L?)5>GF|$X5S2A}xsxa(D^<8w>b9w?vM}9ZM z4l&m}C`$oRNle}N<0N-=#QtE^MLOpaO6jl4x*WLFCYHq#;%pDhF!?jPy$beK5&t3@6hN!7J+0GwHvB zHg3jxDWA*2)ap2Fkq(>IbCj@zf;|9s2;AeYPsu^lPSP*0hYGJ6(FxXI3_eEF#rVE# z37dEzrsVvR(lx`ZfBvk(4xY?;CO?VwuNx(9QrvgB(vAx2pogO2UQ3okepgiDNZ?{Xs!^ zNtk*&M1eH|B-F;*ssp_@r$cnRSMLhzy(b-jBG$HvS&^cu|{JsF}=EIhq zleml^rzCs`a+)P_>KTHbCcDv7Q{9R?oPNwWohSg$T0b7! 
zI9G#4X*p51YXgl?TVyYaR^P(%xR)qy50Ja$PDi01C6;vk!5(ogf5fXf72MHjcb&xQ zAaglJhX|~?%>16EBV|aN+I-xgGsh^zIB^X_a-r_({aNmT7{R~v%cfj6Ay2R5Gte)&VPd*tQSt37X~?5O>gCl)kXTRjo&s=Zc=&-i z=pMl!QKA=#R8cm0o~`jJ2a?%LTm6vs>7|6z>_%WGArdaZy8Oee1o# z`rDXXD-0B8VW2pjHUBQ1GryM3n_o@$nSYxv6#fQ!`m>XUV%`hLHuj--6hH27;cbVG z2LU>*ob{TNJmNl}=w(IQB>_>petQ7Vc`;9%fXJOTPh5bp<57h7gAxb&ky8XsZkU!c zbK_h8?7Qs=X9Nn;lg=1^$3ffMB|cgVoq~^VCxr^QTf*VDr=u8q;M;|ey}|Fk;I|a~ z-sDU-*>Ui()2;PZg*%lklu6^f>P$es~O z;Z4ro++q7o_HE97yqR(=XFqeZk2t$<-PcijRw#@&Ggj`H{pRiB*~gJ4YoF+jiyUuv z4g|3+$t`Q&Vc#ja-4f;26Gw8&+joWWgo=49Hp_ec@DjXJEp<}t-JE{x`aoUZSTw7} z?p}v<_%!B)CLIz9riOo@BnKJzuDdlsJOm1tmiskLAslA44nJQlBu~ibYx#T%Nzz9plSz1ohA^gE|OShbLNK7kBwP`>}Z${7d^s=T~_YW z;d3PX-qpD~6U4YXoWxi3ErEl|`LPTw!yg7oYB~A6(zyud1z|KRXIz7rsC1AC>mMno z3NFB8S~CtJ>8_Ez_NUQA3KoJ|6)~IC6}DTN6t#;R5%UpvXb;=RnoUBLHM*O!)B!!_$=+BV&XzE(XOxUJx6!-rFC|F3vStG(?x(oish>k=u zNOW9IRDfZHIMu&GIqIWy;_a8S=U`ZY!uAzl%QeK(fwgz+1oa`7@&l*^78&QrwsT^P zjS3*2PNs)Z5%m$ISw9p2CB6NyCCQdK!V%=OSN|Zahx*^(3+8~{PF9le~d?p?^3{WN)&w&(3y5d;`OyeuW*h&7}1E zI5gok8!5a@NVbI>NQ|(&$!Fv}1Yf>gTqaR@2t?-u?5GUW(h6*(47T&` zN81C7u)3x5+m(RQE6K@h&qrV_#b}RAV0qL2rq>S}F0he>v#tQe@B)lv=XHfZcz`@f~_?R-0wiJNBV4GnN3AbSjI8h7+`C^xP8x$Rki6BUFZK%Y)X5 z%oCzk7aaF~A+66mb-W|~{Ec+u)6GW@Hbe#|>TYfD=TVW73p*U+G27{JOxro=41Xt2 zrEN}(ZBGisP0VV$_*3w{h|3wd`f$3mmT-ZLP;oWM@8gIH0|;cKcab6gAqx{YQ6QCn zz_Ge4uj4OiFm3ye#w*e|%} zYq*0&w4E8}ReLwiZTFq`T8Rx9!vvoUWI5p%Jk(C&jo4Drzc6_Ql{L-3 zjNZX4W|wJ6qdA0Q9x8E6w~|m#M)8reKB~cw&|K07b|l{vbulo;IWGZ^I@tN-z|;o$JA0Jog1nb=`1|NQ zf*fcOmO3^SD4If%@2_>Hf{;ns0u#!Dm-=u`lna891sflg{J;?i*#Io9`?bCdH83Tk zH_)C1;qM8RJoIZC4yXj%BsfGXb>lZC-0T-c7q0IGb^ z7{(WGKzBz>Q0BQnfTbJOD=@^FjYz3ohv)2T79GP9h~1(pnq4*v4`5ESod7$#2mu1q zOLT;y+DXj@7B;eZ!D>TFiB?g$5yU`$VnbYtRL>VBdttxzll``dN%sP-dc`z{9f50R zz=i|r3XBAUmzaair2>E<=UMVZz^_zQgvwrs7%rj&cLfLw4c9+r&pFiwj-6+9|DR^; zk0IK>VF$By=5zZK>vwLmi-4U5vM_|qWnjCA&KxE3`oNw1$H)nDRHn-xvhaZ*5vCR= zhg- zsDP4FAI>XOKybsa@S#KW;A+5(`J<@z|DHZ>FufbZB(=*ii?rCBzI9Oq zUcbzC3F#>lU5?&8O?FD>TU4vZts1zQqXqKe0&5n2#^Iq+5uTH-kH%~_fjkQ+DCCni zC{QY2(%>_m3&x zq@JQ=(?-^Vl#ZPR>(0is9or}X#Wt45fkYcHA`0IW5oy#N^_hfuP}s7Cy!YF2H}&Sej~b8)rB$me(DKqXY>ku& z1&R13Z+{!DKsz#a`UL~|{uJWh3`8ALx)S3+JBz*mTp({hAG{rky&VqT{&w&dGDA$; z%=7BQ=V5qu^Cpf1OWnwLw?Niy%!4^lSa8eXzGCZp_5p1l#CAOjB)_Xg?cz1lo`7@~ zZhLO0AyqWQT)%DT!kGZSpDyhqBzWUm@bb(=jUsopNV#rJRTesAv< z>8-M-drkKXW0kvOAs4g+q0Jzk?p+D##DfIXRIJ`h-wsN&J^jNllL0~uuRRotnKo;VKD$_c<#w}ZSg#u)bljzXHxcz0~x zNuva93B3>V`NBnM?LOAi{hatqQs+O~l7HrP^Z$eG^4BX1rT>N0;VW@(-UPb0bj>PHxE6-FxpuV3IL=)sY4jez&3etF9eP>_T9M` zm=p!kmXa$mC3=*}{Q0$|C6G681r?;EqTq;0-KoZGX|9Dk5=5O(a-@=iu3)FAU>ojH ze?TaNqM%FzN(rh9NpofpUPT6S!%C;*^~_?cj#Iw0-Y3Z6Vhbj{X6JnC$%p3N(hbd7i~0Q_FSzA%$Rp5FT1U#KX~%-)2APOr1HcQl_ws0^6VMkfC+Wn z^b0zYpVyow=&!HFz#$$S*wHml{YiL)r+y2h1!+J7^9(oNV~t>^FaYr%44;XQ@P2Wa zc)}j1<1h^IVD$b4?qiUL)ZBO+mqR>eB7hqLM@hi0eoE{(_?en(*MOXg#Zp8TI4TM5 zzj^`50OtsEg-4eblqSpm9)NHzG0$`YQW&sOVcwWBaYd}P{@^yi)J|to7g!bX^4E9C zjG7gScmULgY`T9QHe-EAN1~8-@G{n`JRiY51~n4Zxo zgccp#k3xc508HwewWQ(k*1wajC(xEom;~Ei1q^XxeV9v$^g@}&Q0iLn0IyoM<1Roj zsRbH?pRsFlU9@kibGlnOGS^48Y3zDmSL0sHtoL7_jrHZAwd?&W@yCnZkBS|&UR>gu z=}t5N+Dg~6`Wr;sw%;h>@f89Eq51*#Da^%dL7!=0<-Z~Iivif3259fsp9 zwU0tufy{Yp%g zLadPJZ&hpsolhk-2=-?`6PU8`Q_??vKa_xE8p@6Kex3za1h?`nsf6_1=h>t`XO*p0 zmaUo8%%KbRMGsQh16JQ!bt3PWaej_|*~rxIG7qj)tY4XivT>be=zl=Ye@DpK$5Xxt zb8aTg5JT98q@-!cv%%0?GH*^prYsGTQx-1l?%;bz@1Z#%r%$L&{sH31plQ5#(8;TO1E8?yBoMot8*!dQ^A$#tE zXuBXI>W|?QMyT$kltk|xV2-U`z|p4XPr}$5#&9b(^0V9t1sln3i;e7*E=HZBC>gO2 zo`vq0kQ*;k5FLjUMeuOS4YhHG!Dg|!3@Me3G;jp4yXf=M!qUp%iUbJeMR)=P^aRsL ztt5fgCO>9;&v9TQS-~d`5cgOffJuEqXs6&GZ0CSaJkUlw_mo%khF&muV=G}lK(*Cx 
z$Nd`{dBlT07xWXJ{%8VFKg9Cv9EAB;r_n%|4+4YBV=q7w4rlZDHAv*QnQFq$U$uY{ zgz0|_ux1}%%|35ZV9f&aY4-)NCK!I|dvL!ouWZ1T2h0sd7L)3`y{XOJ0>z!eUOa6N z0Q>r1#5ai3?8hVpu#>J7Kh{YRmL0Tr4}R~liwFhtZX#h1{g8!g2hg|T8~G$OPoRA= zP)lqKq9>qxfnFE`M8a$PhNy=D?TMk~o30q6WDt_oS415L+QZOh7`X78zfQKQ9hi)2r zSzI$VWlDpX4nhL&+UOP-P3e;=Pn&AN&8bynqi<8X8|4MihDc?@FMB+9ydHStPrAS} ztMDp?mD8_o3z+V-J#7b=0+vhKIF9=ORzcAOH@o5?wWL~WtLkr2xh;Lct(Sn5`YPV) zDiUN;U!(I<6ei~r!6XwTybhS8t~18VaMlN6SjSO6dHgPW-8$M@JgWW@?;+=q7w}R; zEIdTF)P-BC0Ki0<8}o9y#NA_RvS>j)hdhVruz z>cky807Docn8sO5lzHR}d+p%qigh^O$gBU1_v=Hk1`+y3rUWczW);Ca^$K$_Sb#}~ zkeR;P_h#m}pF$p*(HUB=i%urBKwEw*8blHk>Nl7;!E6ZoDFNs*_NqklGpm6o&00FP z>Sx3Zoe?^`DA!Nx5m%goe%4!so&pDbh!IE?S7Y>z(;?8MEcyy?{1i1D>Plw8y^L9A z7X85tYMrBLW02&+90i)QI4J{w(Ozq;y83`lGt&t=dztf~p1KhpJ~O%PVJ^_xBvb97 zbBNA9I+96W(21PNYCnT-h69jlW!0;QRA4m7P^hJppnzq40WM3s-;^9DJ)YPO;N-{; zF>(Ocoi|9k1hTk9)A^IapeNlymMqKQ0DQm1)%#g=R=&Rg7DpC}2%0|#+W4y`7$Vc4 zl2iQv$_ZBb6$>)ce2+E$N*=dPm?JcqXjyQfV_-f9RffP<7|)t}5fAs|lEVm;4IrOR z36%#6+(q71#`oIzA|l^H;}Om9ACUfuh0;WD3J%!lIDb93rj zFvK&)sTuQ>`j7M;>=C&3hbxdew?n(qvNALw{N0=(KgSvL^K>YQrzNpJPv0-o`87Jf zPKVOj>YvfKi$1Pvf3R}$@wrovzP)nl@h2&%y{(Kkx_FC>HX%lBRrvRjBUrmM;JXk= zYX3VH@)LAOS682*LwH&yv*M8LEXVzmh?gha{p%4X7iw+gR7 zBK$!!rNhBFLtGth<_N`GFEf58+yp#7`m$o1sy>=6$M#temjaSUVCEH+u~4KXG;AB% zs?ip5Yq+>xWE;BeyOs`7bE6vEy&l?@i!396#D69*CwY=m^idx3&(HvjHmyc2GA$ZH zp(MevNCmUC31N}GkJ3Q^1p+}l$shp%E5R@#OnE0)8gB zj%@+h3WEJ-SSTgMk=8g!(G%I{!1S;bhSyN{jOU0)&>)eT8mKVl(S6Y}a}sM+s&$Da zOS#&7)AmJ6DK4*L0(B8uk6ndnLr7g2hK&c4-X^B-T|5ja3VxSm$9! zX+B^L3vI~N;$E~5Bt77Iv>b^3)3HE2p#80IXxIfpgFVDrc6ubNq?8%9JXX=bp#K6^ zYwR->Jh{IBPpBRt=nDvr&u5_|K_e##*dEr_g?Tk$2F6V?TmOI(WS#6XFEY&Imqtx^ zi3mdA>w~WVKiDjIg838ehuM=b28^FL19&$W{1$`Xq2PDe8Kh!Gpx8i#3m_ilVNnQx zWa7-YA1>D&msE_m%I%Vn93d$8t#Ge+J3+ zUoyh`>2%;V9wUo{*YfaCgO9d@3(3gn$BymZS#lpW&?3yAV_gJX6#4Bde3X(q#q-!| z5tRJ}c-HUgkb}41zaVP1jVw2NJ^awm?@4^s{~E8rP6Ff?gn#>Z?IZY&xTX+y$rQkO z?FZgUaogK4@ho{a&;&O&^ovL@loTQ9^8iJHaed(fVCG{1d|2e~f#LHVcx)Sk&6|U~ z&U+bN`JmHYGdWRVhawHGodx+I@DHV|^sdCXJZ-{~#4_tL2+SGzwELk5iEAMl3L#%y zip!-3%McVN*5b*q=n$m>-x%5m5MD+-ejNTm)Y4QRhms@p0|uH?L71gJ(@rlJkvA?@ zbUzjNL8nolfhweqYuW5Qq@s&Fvv~Vk-mr)*hdKW$1(<2g0(F23LFS?6Z6eg17XK|l zTIT#G-e@TD3_BRC>EJaJ>EH(khAG36@`(LqZ#<0Mmb}_|(2p|hA_SrP!0VQ4hDAhr zh+_uYL8?6z6Ox|bb(OdTa}r-%R=GT7_lb~np-PQp*`!POe8VEqGdLFz47sHty z2<*9j|7xpM2MQL*>6@i?Vtbd)zI_WnU?wU0~#sKlA0Am2la z8Bcq`9022hgnS93!d%|ik1t}%y>O!-`ZMmdqySq1jDX2r8P_hwqa99A@a1V*LOp^J zKv#Nf#2TH8bOfvH;F2A@QA|7J*lfZ!T7efiY>ynusIdoj#`PXW;`gwlh{|UZg`q6< z^h1r^Sf_mhxLslNZxnOJLLd-2<0&dEgK6EQ47sPi!a98kj-RF#Uzn(nHb|2MttId# z^SL|{-OJLs!&-w_Uu7H~)Bd=Qqpbo$2QRpM2xOn&{%a+ZSmrdRczLpG2D-CVTvc^Z za64BK{OWnFr9>t3jscGHEwL;F4MRIeBSboLx9k%Flt#xT@gM%hQFM2B=ZkPe~WQiWav$V_BD>5F94C?`Oka3dBluENzb zo)l;Gb|xFrV?`#fs-0T{SUHFLVxfFx{PW?xJ%yoyXw#>Bn^WRKr)#()l4epQ7_=I$7=?LTCLYKK?SD zK8D>z=Omr`;glyrI|66ULNun!jKEWjB6KW4dq`Z7_1?7&N4SNby-)YY1BjJye;AH6 zqA!}A#HKndggb7kMUd#AJYbZJLN=YGf7Hywa!Em~ns9?A4;Ei+zXiOVOp&*m%?$Ss zPiJq*=Z2@Jjt>_SA1pqc_#oUl^IN&)r(Ye;3>On0wARcaIRd^uY+2*xDeWJ%5Oag^ P`DZ?k(1Q7u@#6mn+3oyM diff --git a/tests/models/mistral2/configuration_mistraltp.py b/tests/models/mistral2/configuration_mistraltp.py deleted file mode 100644 index ad6691b..0000000 --- a/tests/models/mistral2/configuration_mistraltp.py +++ /dev/null @@ -1,155 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Mistral model configuration""" - -from transformers.configuration_utils import PretrainedConfig -# from transformers.utils import logging -from collie.log.logger import logger - - -# logger = logging.get_logger(__name__) - -MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", - "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json", -} - - -class MistralConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an - Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. - - [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`MistralModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 14336): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 8): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to `4096*32`): - The maximum sequence length that this model might ever be used with. Mistral's sliding window attention - allows sequence of up to 4096*32 tokens. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
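# Illustrative sketch of the mean-pooling conversion mentioned in the docstring above for
# turning a multi-head (MHA) key/value projection into a grouped-query (GQA) one. The
# function name, shapes and sizes are assumptions chosen for the example; this is not the
# conversion script used for any released checkpoint.
import torch

def meanpool_kv_heads(kv_proj: torch.Tensor, num_heads: int, num_kv_heads: int) -> torch.Tensor:
    # kv_proj: (num_heads * head_dim, hidden_size) -> (num_kv_heads * head_dim, hidden_size),
    # assuming the usual head-major weight layout so heads of one group are adjacent.
    out_dim, hidden_size = kv_proj.shape
    head_dim = out_dim // num_heads
    group = num_heads // num_kv_heads
    grouped = kv_proj.view(num_kv_heads, group, head_dim, hidden_size)
    return grouped.mean(dim=1).reshape(num_kv_heads * head_dim, hidden_size)

# 32 query heads pooled down to 8 key/value heads, Mistral-7B-sized dimensions.
pooled = meanpool_kv_heads(torch.randn(32 * 128, 4096), num_heads=32, num_kv_heads=8)
assert pooled.shape == (8 * 128, 4096)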
- rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*): - The id of the padding token. - bos_token_id (`int`, *optional*, defaults to 1): - The id of the "beginning-of-sequence" token. - eos_token_id (`int`, *optional*, defaults to 2): - The id of the "end-of-sequence" token. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention window size. If not specified, will default to `4096`. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - - ```python - >>> from transformers import MistralModel, MistralConfig - - >>> # Initializing a Mistral 7B style configuration - >>> configuration = MistralConfig() - - >>> # Initializing a model from the Mistral 7B style configuration - >>> model = MistralModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "mistral" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - sliding_window=4096, - attention_dropout=0.0, - attn_implementation="flash_attention_2", - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - # 调用父类的初始化函数,将一些公共参数传递给父类处理 - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/tests/models/mistral2/model.py b/tests/models/mistral2/model.py deleted file mode 100644 index 60d9553..0000000 --- a/tests/models/mistral2/model.py +++ /dev/null @@ -1,2026 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
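# Quick, framework-free illustration of the attention geometry that the modules in
# model.py derive from the configuration fields defined above. The tensor-parallel degree
# is a hypothetical value chosen for the example; it is not a field of this config class.
hidden_size = 4096
num_attention_heads = 32
num_key_value_heads = 8
tp_size = 2  # hypothetical tensor-parallel world size

head_dim = hidden_size // num_attention_heads                        # 128
num_key_value_groups = num_attention_heads // num_key_value_heads    # 4 query heads share each KV head
num_heads_per_tp_rank = num_attention_heads // tp_size               # 16 query heads per TP rank

# The same divisibility requirement is enforced again in Mistral2Attention.__init__ below.
assert head_dim * num_attention_heads == hidden_size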
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Mistral model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel, dtype_byte_size -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_mistraltp import Mistral2Config - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "Mistral2Config" - -#modified for collie -import torch.distributed as dist -import gc -import json -import os -from collections import OrderedDict -from megatron.core import parallel_state, tensor_parallel -from einops import rearrange -from deepspeed.pipe import LayerSpec, TiedLayerSpec - -from collie.config import CollieConfig -from collie.driver.io import IODriver -from collie.log.logger import logger -from collie.module import ( - ColumnParallelLinearWithoutBias, - ColumnParallelLMHead, - RowParallelLinearWithoutBias, -) -from collie.utils import concat_tensor, dict_as_params, env, progress -from collie.models.base import CollieModelForCausalLM -from collie.models.utils import ( - kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, - kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, -) - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral -class Mistral2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - MistralRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, 
hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - ans = self.weight * hidden_states.to(input_dtype) - # -------------------------------------------------------- - # # 将Tensor转换为列表 - # ans_list = ans.tolist() - # # 指定.json文件的路径 - # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/rms_ans.json' - - # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - # try: - # with open(file_path, 'r', encoding='utf-8') as file: - # results_list = json.load(file) - # except FileNotFoundError: - # results_list = [] - # # 将当前结果添加到列表中 - # results_list.append(ans_list) - # # 将更新后的列表写回.json文件 - # with open(file_path, 'w', encoding='utf-8') as file: - # json.dump(results_list, file, ensure_ascii=False, indent=4) - # file.write('\n') # 在文件末尾添加一个换行符 - # -------------------------------------------------------- - return ans - - -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class Mistral2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -# TODO @Arthur no longer copied from LLama after static cache -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. 
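# A self-contained numerical sketch of the rotary embedding machinery defined above: it
# rebuilds the cos/sin cache the same way Mistral2RotaryEmbedding does and rotates a toy
# query tensor. Sizes are toy values chosen only for the example.
import torch

dim, seq_len, base = 8, 4, 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
t = torch.arange(seq_len).float()
freqs = torch.outer(t, inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)          # (seq_len, dim)
cos, sin = emb.cos(), emb.sin()

q = torch.randn(1, 1, seq_len, dim)              # (batch, heads, seq_len, head_dim)
pos = torch.arange(seq_len).unsqueeze(0)         # (batch, seq_len)

def rotate_half_ref(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
    return torch.cat((-x2, x1), dim=-1)

q_rot = q * cos[pos].unsqueeze(1) + rotate_half_ref(q) * sin[pos].unsqueeze(1)
# Each pair of dimensions is rotated by an orthogonal 2D rotation, so the norm of q is unchanged.
assert torch.allclose(q_rot.norm(dim=-1), q.norm(dim=-1), atol=1e-5)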
For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. - """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class Mistral2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - - self.up_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.gate_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.down_proj = RowParallelLinearWithoutBias( - self.intermediate_size, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Mistral2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." 
- ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." - ) - # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.q_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.k_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.v_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.o_proj = RowParallelLinearWithoutBias( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.rotary_emb = Mistral2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads_tp, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads_tp, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads_tp, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads_tp, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # 
-------------------------------------------------------- - # 将Tensor转换为列表 - ans_list = attn_output.tolist() - # 指定.json文件的路径 - file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/attn_output_1.json' - - # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - try: - with open(file_path, 'r', encoding='utf-8') as file: - results_list = json.load(file) - except FileNotFoundError: - results_list = [] - # 将当前结果添加到列表中 - results_list.append(ans_list) - # 将更新后的列表写回.json文件 - with open(file_path, 'w', encoding='utf-8') as file: - json.dump(results_list, file, ensure_ascii=False, indent=4) - file.write('\n\n\n') # 在文件末尾添加一个换行符 - # -------------------------------------------------------- - - - - return attn_output, attn_weights, past_key_value - - -class Mistral2FlashAttention2(Mistral2Attention): - """ - Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." 
- ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # -------------------------------------------------------- - # 将Tensor转换为列表 - ans_list = attn_output.tolist() - # 指定.json文件的路径 - file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/attn_output_1.json' - - # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - try: - with open(file_path, 'r', encoding='utf-8') as file: - results_list = json.load(file) - except FileNotFoundError: - results_list = [] - # 将当前结果添加到列表中 - results_list.append(ans_list) - # 将更新后的列表写回.json文件 - with open(file_path, 'w', encoding='utf-8') as file: - json.dump(results_list, file, ensure_ascii=False, indent=4) - file.write('\n\n\n') # 在文件末尾添加一个换行符 - # -------------------------------------------------------- - - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
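# A toy illustration of the rolling KV-cache trimming done in Mistral2FlashAttention2.forward
# above: once the cached sequence is longer than the sliding window, only the most recent
# (sliding_window - 1) positions are kept before the new key/value states are appended.
# Shapes are toy values chosen for the example.
import torch

sliding_window = 4
past_key = torch.randn(1, 8, 10, 128)        # (batch, kv_heads, cached_len, head_dim)
slicing_tokens = 1 - sliding_window          # -3, same convention as the code above
trimmed = past_key[:, :, slicing_tokens:, :].contiguous()
assert trimmed.shape[-2] == sliding_window - 1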
- causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
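# A framework-free sketch of the unpadding bookkeeping that _get_unpad_data and this
# _upad_input method perform: from a 0/1 padding mask, derive the flat indices of real
# tokens, the cumulative sequence lengths, and the longest sequence, which is the layout
# the varlen flash-attention kernel expects. Mask values are toy data.
import torch
import torch.nn.functional as F

mask = torch.tensor([[1, 1, 1, 0],
                     [1, 1, 0, 0]], dtype=torch.int32)   # two sequences of length 3 and 2

seqlens_in_batch = mask.sum(dim=-1, dtype=torch.int32)   # tensor([3, 2])
indices = torch.nonzero(mask.flatten(), as_tuple=False).flatten()   # positions of real tokens
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))  # tensor([0, 3, 5])
max_seqlen_in_batch = int(seqlens_in_batch.max())        # 3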
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class Mistral2SdpaAttention(Mistral2Attention): - """ - Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from MistralAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, 
value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.tp_size)) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -MISTRAL_ATTENTION_CLASSES = { - "eager": Mistral2Attention, - "flash_attention_2": Mistral2FlashAttention2, - "sdpa": Mistral2SdpaAttention, -} - - -class MistralDecoderLayer(nn.Module): - def __init__(self, config: CollieConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - - self.mlp = Mistral2MLP(config) - self.input_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). 
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - # -------------------------------------------------------- - # # 将Tensor转换为列表 - # ans_list = [tensor.tolist() for tensor in outputs] - # # 指定.json文件的路径 - # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/decoder_outputs.json' - - # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - # try: - # with open(file_path, 'r', encoding='utf-8') as file: - # results_list = json.load(file) - # except FileNotFoundError: - # results_list = [] - # # 将当前结果添加到列表中 - # results_list.append(ans_list) - # # 将更新后的列表写回.json文件 - # with open(file_path, 'w', encoding='utf-8') as file: - # json.dump(results_list, file, ensure_ascii=False, indent=4) - # file.write('\n') # 在文件末尾添加一个换行符 - # -------------------------------------------------------- - - return outputs - - -MISTRAL_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`MistralConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class Mistral2PreTrainedModel(PreTrainedModel): - config_class = Mistral2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["MistralDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -MISTRAL_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
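# A minimal sketch of the two cache formats described above, using only cache calls that
# this file already relies on (DynamicCache from transformers >= 4.36). Tensor sizes are
# toy values chosen for the example.
import torch
from transformers.cache_utils import DynamicCache

# Legacy format: one (key, value) pair per layer, each of shape
# (batch, num_kv_heads, seq_len, head_dim).
legacy = ((torch.randn(1, 8, 5, 128), torch.randn(1, 8, 5, 128)),)

cache = DynamicCache.from_legacy_cache(legacy)      # same conversion Mistral2Model.forward performs
new_k = torch.randn(1, 8, 1, 128)                   # one decoding step for layer 0
new_v = torch.randn(1, 8, 1, 128)
k, v = cache.update(new_k, new_v, layer_idx=0)      # returns the concatenated key/value states
assert k.shape[-2] == 6 and cache.get_seq_length(0) == 6
round_tripped = cache.to_legacy_cache()             # back to the tuple-of-tuples format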
-""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class Mistral2Model(nn.Module): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] - - Args: - config: MistralConfig - """ - - def __init__(self, config: CollieConfig): - # super().__init__(config) - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - # self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - - # -------------------------------------------------------- - # # 将Tensor转换为列表 - # ans_list = inputs_embeds.tolist() - # # 指定.json文件的路径 - # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/inputs_embeds.json' - - # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - # try: - # with open(file_path, 'r', encoding='utf-8') as file: - # results_list = json.load(file) - # except FileNotFoundError: - # results_list = [] - # # 将当前结果添加到列表中 - # results_list.append(ans_list) - # # 将更新后的列表写回.json文件 - # with open(file_path, 'w', encoding='utf-8') as file: - # json.dump(results_list, file, ensure_ascii=False, indent=4) - # file.write('\n') # 在文件末尾添加一个换行符 - # # -------------------------------------------------------- - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] 
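For the eager path prepared earlier in this forward pass, each decoder layer receives an additive 4D causal mask restricted to `config.sliding_window`. The toy builder below only illustrates the shape and semantics of such a mask; it is not the `_prepare_4d_causal_attention_mask` helper itself, and the window size is an arbitrary example value:

# Toy additive causal mask with a sliding window (illustration only).
import torch

def toy_sliding_window_causal_mask(seq_len: int, window: int, dtype=torch.float32):
    # 0.0 where attention is allowed, a large negative value elsewhere.
    i = torch.arange(seq_len).unsqueeze(-1)  # query positions
    j = torch.arange(seq_len).unsqueeze(0)   # key positions
    # Causal AND within the window: each query sees itself and the previous window-1 keys.
    allowed = (j <= i) & (j > i - window)
    mask = torch.full((seq_len, seq_len), torch.finfo(dtype).min, dtype=dtype)
    return mask.masked_fill(allowed, 0.0)[None, None]  # (1, 1, seq_len, seq_len)

print(toy_sliding_window_causal_mask(5, window=3)[0, 0])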
- - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Mistral2ForCausalLM(CollieModelForCausalLM): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config:CollieConfig): - super().__init__(config) - self.model = Mistral2Model(config) - self.vocab_size = config.vocab_size - # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.lm_head = ColumnParallelLinearWithoutBias( - self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False - ) - # Initialize weights and apply final processing - # self.post_init() - # GenerationMixin 需要的额外参数 - self.config.is_decoder = True - if config.model_config.tie_word_embeddings: - self.lm_head.weight = self.embed_tokens.weight - self.main_input_name = "input_ids" - - def clean_cache(self): - self._clean_hidden_states([*self.model.layers, self.lm_head]) - self._set_use_cache(self.model.layers, False) - - def set_cache(self, use_cache): - self._set_use_cache(self.model.layers, use_cache) - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MistralForCausalLM - - >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" 
- >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Ensure tensors are on the same device - shift_labels = shift_labels.to(shift_logits.device) - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
- if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - @classmethod - def pipeline_layers(cls, config: CollieConfig): - """ - Get layers of pipeline. - :return: list - """ - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - - if config.tie_word_embeddings: - output = TiedLayerSpec( - "embed_tokens", - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - else: - output = LayerSpec( - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - - return [("model", Mistral2Model.pipeline_layers(config)), ("lm_head", output)] - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - protocol: str = "file", # 指定加载state_dict时使用的协议 - **kwargs, - ): - """ - Load state_dict from ``path``. - The format of pretrained model should be the same as that of - `huggingface`. - :return: state_dict. Note that the state_dict should be processed - properly to match the current rank. 
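The loader below keys off the standard sharded-checkpoint index, `pytorch_model.bin.index.json`, whose `weight_map` tells each pipeline stage which shard files it actually needs. A rough sketch of that structure and of the per-stage filtering; every name in it is illustrative:

# Rough sketch of the sharded-checkpoint index consumed below.
# All key names, file names and layer counts are illustrative.
index = {
    "metadata": {"total_size": 0},  # placeholder; real indices store the byte total
    "weight_map": {
        "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
        "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
        "model.layers.31.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
        "model.norm.weight": "pytorch_model-00002-of-00002.bin",
        "lm_head.weight": "pytorch_model-00002-of-00002.bin",
    },
}

# A pipeline stage that owns decoder layers 0..15 only needs the shard files
# holding those layers (plus embedding / norm / lm_head when it owns them).
owned_layers = set(range(0, 16))
needed_shards = {
    shard
    for name, shard in index["weight_map"].items()
    if name.startswith("model.layers.") and int(name.split(".")[2]) in owned_layers
}
print(sorted(needed_shards))  # ['pytorch_model-00001-of-00002.bin']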
- """ - # 配置加载 - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - # IO驱动初始化 - io_driver = IODriver.from_protocol(protocol) - # 检查文件路径是否存在 - if not io_driver.exists(path): - raise FileNotFoundError(f"folder {path} not found.") - # 初始化存储和处理变量 - state_dict = OrderedDict() - weights = [] - parts = None # 变量用于存储模型分割的部分信息 - # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 - hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 - rank_order = range(dist.get_world_size()) - else: - # 不开启只进行一次循环 - rank_order = range(1) - # 权重文件加载和处理 - for rank in rank_order: - # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 - if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: - # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 - if env.is_pipeline: - # 保存的是 json 格式 - parts = env.pipeline_parts - if hasattr(config, "num_key_value_heads"): - # llama2 (transformers >= 4.31.0) - num_key_value_heads = config.num_key_value_heads - else: - num_key_value_heads = config.num_attention_heads - head_dim = config.hidden_size // config.num_attention_heads - # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 - if ( - io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) - and "COLLIE_PP_PARTS" in os.environ.keys() - ): - weight_map = json.loads( - io_driver.load( - os.path.join(path, "pytorch_model.bin.index.json"), mode="r" - ) - )["weight_map"] - # layers 表示自己需要的层 - layers = env.pipeline_layers_idx - # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) - weights.extend( - [ - value - for key, value in weight_map.items() - if len(key.split(".")) > 2 - and key.split(".")[2].isdigit() - and (int(key.split(".")[2]) + 1) in layers - ] - ) - # 去重 - weights = list(set(weights)) - # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm - if 0 in layers: - weights.append(weight_map["model.tok_embeddings.weight"]) - if max(parts) - 1 in layers: - weights.append(weight_map["output.weight"]) - if max(parts) - 2 in layers: - weights.append(weight_map["model.norm.weight"]) - else: - # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 - weights = [ - weight - for weight in io_driver.list(path) - if weight.endswith(".bin") - ] - with progress( - weights, - desc="Loading state dict", - total=len(weights), - disable=hide_progress, - ) as pbar: - for weight in pbar: - part_state_dict = io_driver.load( - os.path.join(path, weight), mode="rb" - ) - # for key in list(part_state_dict.keys()): - # if "attention.wqkv.weight" in key: - # # qkv_weights = part_state_dict.pop(key) - # qkv_weights = part_state_dict[key] - # print(qkv_weights.shape) - # (wq, wk, wv) = qkv_weights.split( - # [ - # config.hidden_size, - # config.num_key_value_heads * head_dim, - # config.num_key_value_heads * head_dim, - # ], - # dim=0, - # ) - # wq_name = key.replace("wqkv", "wq") - # wk_name = key.replace("wqkv", "wk") - # wv_name = key.replace("wqkv", "wv") - # part_state_dict[wq_name] = wq - # part_state_dict[wk_name] = wk - # part_state_dict[wv_name] = wv - state_dict.update(part_state_dict) - del part_state_dict - if parts is not None: - # 这一步是 pp 的复筛 - layers = env.pipeline_layers_idx - for key in list(state_dict.keys()): - if key.startswith("layers"): - layer = int(key.split(".")[1]) - if layer + 1 not in layers: - state_dict.pop(key) - # if key.endswith("tok_embeddings.weight"): - if 
key.endswith("embed_tokens.weight"): - if 0 not in layers: - state_dict.pop(key) - if key == "norm.weight": - if max(parts) - 2 not in layers: - state_dict.pop(key) - # if key.endswith("output.weight"): - if key.endswith("lm_head.weight"): - if max(parts) - 1 not in layers: - state_dict.pop(key) - # 根据用户配置的新的 tp size 进行分割 - for key in list(state_dict.keys()): - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - "o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - "down_proj.weight", - "embed_tokens.weight", - ] - col_split = any([key.endswith(filter) for filter in col_filter]) - - if col_split: - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - if dist.is_initialized() and process_exclusion: - # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 - dist.barrier() - return state_dict - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - protocol: str = "file", - ): - """ - Save state_dict to ``path``. - The format of saved state dict should be the same as that of - `huggingface`. - """ - io_driver = IODriver.from_protocol(protocol) - # gather to tp rank 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 pp_size 次循环 - rank_order = range(config.pp_size) - else: - # 不开启只进行一次循环 - rank_order = range(1) - dst = parallel_state.get_tensor_model_parallel_src_rank() - with progress( - rank_order, - desc="Saving model", - disable=int(os.environ.get("RANK", "0")) != 0, - ) as pbar: - for rank in pbar: - if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): - for key in sorted(list(state_dict.keys())): - tensor_list = None - if env.tp_rank == 0: - tensor_list = [ - torch.zeros_like(state_dict[key]) - .to(state_dict[key].dtype) - .cuda() - for _ in range(config.tp_size) - ] - dist.gather( - state_dict[key].cuda(), - dst=dst, - gather_list=tensor_list, - group=env.tp_group, - ) - if env.tp_rank == 0: - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - "o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - "down_proj.weight", - "embed_tokens.weight", - ] - col_split = any( - [key.endswith(filter) for filter in col_filter] - ) - - if col_split: - state_dict[key] = concat_tensor(tensor_list, dim=0) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - state_dict[key] = concat_tensor(tensor_list, dim=1) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - # 似乎不需要? 
- # state_dict_keys = state_dict.keys() - # for layer_id in range(config.num_layers): - # qkv_names = [None, None, None] - # for key in state_dict_keys: - # if f"layers.{layer_id}.attention.wq.weight" in key: - # qkv_names[0] = key - # elif f"layers.{layer_id}.attention.wk.weight" in key: - # qkv_names[1] = key - # elif f"layers.{layer_id}.attention.wv.weight" in key: - # qkv_names[2] = key - # qkv_name = qkv_names[0].replace("wq", "wqkv") - # state_dict[qkv_name] = torch.cat( - # [ - # state_dict.pop(qkv_names[0]), - # state_dict.pop(qkv_names[1]), - # state_dict.pop(qkv_names[2]), - # ], - # dim=0 - # ) - - if env.tp_rank == 0: - # Save gathered weights - if env.is_pipeline: - ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" - total_size = 0 - weight_map = {} - for name, weight in state_dict.items(): - weight_size = weight.numel() * dtype_byte_size( - weight.dtype - ) - weight_map[name] = ckpt_name - total_size += weight_size - index_dict = dict( - total_size=total_size, weight_map=weight_map - ) - index_dicts = [None for _ in range(env.pp_size)] - dist.gather_object( - index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group - ) - if env.pp_rank == 0: - total_size = 0 - weight_map = {} - for _index_dict in index_dicts: - total_size += _index_dict["total_size"] - weight_map.update(_index_dict["weight_map"]) - merged_dict = { - "metadata": {"total_size": total_size}, - "weight_map": weight_map, - } - io_driver.save( - json.dumps(merged_dict, indent=2, sort_keys=True) - + "\n", - os.path.join(path, "pytorch_model.bin.index.json"), - ) - - else: - ckpt_name = f"pytorch_model.bin" - ckpt_path = os.path.join(path, ckpt_name) - io_driver.save(state_dict, ckpt_path) - if dist.is_initialized() and process_exclusion: - dist.barrier() - if env.rank == 0: - config.save_pretrained(path, protocol=protocol) - dist.barrier() - - -@add_start_docstrings( - """ - The Mistral Model transformer with a sequence classification head on top (linear layer). - - [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
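A small numeric sketch of the pooling rule described above, using the same argmax-minus-one-with-modulo arithmetic as the classification head below (the pad id and token ids are made up):

# Numeric sketch of the "last non-padding token" pooling described above.
import torch

pad_token_id = 0
input_ids = torch.tensor([
    [5, 6, 7, 0, 0],   # 3 real tokens -> pool position 2
    [9, 8, 4, 3, 2],   # no padding    -> pool position 4 (last column)
])

# Index of the first pad token minus one, wrapped with a modulo so the
# no-padding row maps to the last column.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]
print(sequence_lengths)  # tensor([2, 4])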
- """, - MISTRAL_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL -class MistralForSequenceClassification(Mistral2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Mistral2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - 
loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/tests/models/mistral2/modelpp.py b/tests/models/mistral2/modelpp.py deleted file mode 100644 index 1180a10..0000000 --- a/tests/models/mistral2/modelpp.py +++ /dev/null @@ -1,1922 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Mistral model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel, dtype_byte_size -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_mistraltp import Mistral2Config - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "Mistral2Config" - -#modified for collie -import torch.distributed as dist -import gc -import json -import os -from collections import OrderedDict -from megatron.core import parallel_state, tensor_parallel -from einops import rearrange -from deepspeed.pipe import LayerSpec, TiedLayerSpec - -from collie.config import CollieConfig -from collie.driver.io import IODriver -from collie.log.logger import logger -from collie.module import ( - 
ColumnParallelLinearWithoutBias, - ColumnParallelLMHead, - RowParallelLinearWithoutBias, -) -from collie.utils import concat_tensor, dict_as_params, env, progress -from collie.models.base import CollieModelForCausalLM -from collie.models.utils import ( - kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, - kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, -) - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral -class Mistral2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - MistralRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class Mistral2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -# TODO @Arthur no longer copied from LLama after static cache -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
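A shape-only sketch of the gather-and-broadcast described above, with illustrative dimensions and random values standing in for the cached cosine table:

# Shape-only sketch of the rotary broadcast described above (illustrative sizes).
import torch

batch, heads, seq_len, head_dim = 2, 32, 7, 128
q = torch.randn(batch, heads, seq_len, head_dim)
cos = torch.randn(4096, head_dim)              # stands in for cos_cached: (max_seq_len, head_dim)
position_ids = torch.arange(seq_len).expand(batch, seq_len)

picked = cos[position_ids]                     # (batch, seq_len, head_dim)
broadcastable = picked.unsqueeze(1)            # unsqueeze_dim=1 -> (batch, 1, seq_len, head_dim)
print((q * broadcastable).shape)               # torch.Size([2, 32, 7, 128])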
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class Mistral2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - - self.up_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.gate_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.down_proj = RowParallelLinearWithoutBias( - self.intermediate_size, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Mistral2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.q_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.k_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.v_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.o_proj = RowParallelLinearWithoutBias( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.rotary_emb = Mistral2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads_tp, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads_tp, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads_tp, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads_tp, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class Mistral2FlashAttention2(Mistral2Attention): - """ - Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. 
- - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class Mistral2SdpaAttention(Mistral2Attention): - """ - Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from MistralAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
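            # At this point query/key/value are laid out as (bsz, num_heads_tp, seq_len, head_dim), where
            # num_heads_tp is this tensor-parallel rank's share of the heads (num_heads / tp_size); when
            # `attn_mask` is None, `is_causal=True` lets SDPA construct the causal mask internally instead.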
- is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.tp_size)) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -MISTRAL_ATTENTION_CLASSES = { - "eager": Mistral2Attention, - "flash_attention_2": Mistral2FlashAttention2, - "sdpa": Mistral2SdpaAttention, -} - - -class MistralDecoderLayer(nn.Module): - def __init__(self, config: CollieConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - - self.mlp = Mistral2MLP(config) - self.input_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -MISTRAL_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. 
- - Parameters: - config ([`MistralConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class Mistral2PreTrainedModel(PreTrainedModel): - config_class = Mistral2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["MistralDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -MISTRAL_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. 
- - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class Mistral2Model(nn.Module): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] - - Args: - config: MistralConfig - """ - - def __init__(self, config: CollieConfig): - # super().__init__(config) - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - # self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if 
input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
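            # For reference, a minimal self-contained sketch (toy sizes, float32, no padding assumed) of the
            # 4D additive mask these helpers return: shape (batch_size, 1, seq_length, kv_length), with 0.0
            # where attention is allowed and the dtype minimum where it is blocked:
            #   import torch
            #   neg = torch.finfo(torch.float32).min
            #   causal_4d = torch.triu(torch.full((3, 3), neg), diagonal=1)[None, None]  # (1, 1, 3, 3)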
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Mistral2ForCausalLM(CollieModelForCausalLM): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config:CollieConfig): - super().__init__(config) - self.model = Mistral2Model(config) - self.vocab_size = config.vocab_size - # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.lm_head = ColumnParallelLinearWithoutBias( - self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False - ) - # Initialize weights and apply final processing - # self.post_init() - # GenerationMixin 需要的额外参数 - self.config.is_decoder = True - if config.model_config.tie_word_embeddings: - self.lm_head.weight = self.embed_tokens.weight - self.main_input_name = "input_ids" - - def clean_cache(self): - self._clean_hidden_states([*self.model.layers, self.lm_head]) - self._set_use_cache(self.model.layers, False) - - def set_cache(self, use_cache): - self._set_use_cache(self.model.layers, use_cache) - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = 
None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MistralForCausalLM - - >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Ensure tensors are on the same device - shift_labels = shift_labels.to(shift_logits.device) - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = 
past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - @classmethod - def pipeline_layers(cls, config: CollieConfig): - """ - Get layers of pipeline. - :return: list - """ - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - - if config.tie_word_embeddings: - output = TiedLayerSpec( - "embed_tokens", - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - else: - output = LayerSpec( - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - - return [("model", Mistral2Model.pipeline_layers(config)), ("lm_head", output)] - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - protocol: str = "file", # 指定加载state_dict时使用的协议 - **kwargs, - ): - """ - Load state_dict from ``path``. - The format of pretrained model should be the same as that of - `huggingface`. - :return: state_dict. Note that the state_dict should be processed - properly to match the current rank. 
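        Concretely, each tensor-parallel rank keeps only its own `torch.chunk` slice of the parallelised
        projection, embedding and lm_head weights rather than the full tensors, so the returned dict
        already matches this rank's layer shapes.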
- """ - # 配置加载 - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - # IO驱动初始化 - io_driver = IODriver.from_protocol(protocol) - # 检查文件路径是否存在 - if not io_driver.exists(path): - raise FileNotFoundError(f"folder {path} not found.") - # 初始化存储和处理变量 - state_dict = OrderedDict() - weights = [] - parts = None # 变量用于存储模型分割的部分信息 - # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 - hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 - rank_order = range(dist.get_world_size()) - else: - # 不开启只进行一次循环 - rank_order = range(1) - # 权重文件加载和处理 - for rank in rank_order: - # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 - if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: - # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 - if env.is_pipeline: - # 保存的是 json 格式 - parts = env.pipeline_parts - if hasattr(config, "num_key_value_heads"): - # llama2 (transformers >= 4.31.0) - num_key_value_heads = config.num_key_value_heads - else: - num_key_value_heads = config.num_attention_heads - head_dim = config.hidden_size // config.num_attention_heads - # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 - if ( - io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) - and "COLLIE_PP_PARTS" in os.environ.keys() - ): - weight_map = json.loads( - io_driver.load( - os.path.join(path, "pytorch_model.bin.index.json"), mode="r" - ) - )["weight_map"] - # layers 表示自己需要的层 - layers = env.pipeline_layers_idx - # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) - weights.extend( - [ - value - for key, value in weight_map.items() - if len(key.split(".")) > 2 - and key.split(".")[2].isdigit() - and (int(key.split(".")[2]) + 1) in layers - ] - ) - # 去重 - weights = list(set(weights)) - # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm - if 0 in layers: - weights.append(weight_map["model.tok_embeddings.weight"]) - if max(parts) - 1 in layers: - weights.append(weight_map["output.weight"]) - if max(parts) - 2 in layers: - weights.append(weight_map["model.norm.weight"]) - else: - # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 - weights = [ - weight - for weight in io_driver.list(path) - if weight.endswith(".bin") - ] - with progress( - weights, - desc="Loading state dict", - total=len(weights), - disable=hide_progress, - ) as pbar: - for weight in pbar: - part_state_dict = io_driver.load( - os.path.join(path, weight), mode="rb" - ) - # for key in list(part_state_dict.keys()): - # if "attention.wqkv.weight" in key: - # # qkv_weights = part_state_dict.pop(key) - # qkv_weights = part_state_dict[key] - # print(qkv_weights.shape) - # (wq, wk, wv) = qkv_weights.split( - # [ - # config.hidden_size, - # config.num_key_value_heads * head_dim, - # config.num_key_value_heads * head_dim, - # ], - # dim=0, - # ) - # wq_name = key.replace("wqkv", "wq") - # wk_name = key.replace("wqkv", "wk") - # wv_name = key.replace("wqkv", "wv") - # part_state_dict[wq_name] = wq - # part_state_dict[wk_name] = wk - # part_state_dict[wv_name] = wv - state_dict.update(part_state_dict) - del part_state_dict - if parts is not None: - # 这一步是 pp 的复筛 - layers = env.pipeline_layers_idx - for key in list(state_dict.keys()): - if key.startswith("layers"): - layer = int(key.split(".")[1]) - if layer + 1 not in layers: - state_dict.pop(key) - # if key.endswith("tok_embeddings.weight"): - if 
key.endswith("embed_tokens.weight"): - if 0 not in layers: - state_dict.pop(key) - if key == "norm.weight": - if max(parts) - 2 not in layers: - state_dict.pop(key) - # if key.endswith("output.weight"): - if key.endswith("lm_head.weight"): - if max(parts) - 1 not in layers: - state_dict.pop(key) - # 根据用户配置的新的 tp size 进行分割 - for key in list(state_dict.keys()): - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - "o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - "down_proj.weight", - "embed_tokens.weight", - ] - col_split = any([key.endswith(filter) for filter in col_filter]) - - if col_split: - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - if dist.is_initialized() and process_exclusion: - # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 - dist.barrier() - return state_dict - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - protocol: str = "file", - ): - """ - Save state_dict to ``path``. - The format of saved state dict should be the same as that of - `huggingface`. - """ - io_driver = IODriver.from_protocol(protocol) - # gather to tp rank 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 pp_size 次循环 - rank_order = range(config.pp_size) - else: - # 不开启只进行一次循环 - rank_order = range(1) - dst = parallel_state.get_tensor_model_parallel_src_rank() - with progress( - rank_order, - desc="Saving model", - disable=int(os.environ.get("RANK", "0")) != 0, - ) as pbar: - for rank in pbar: - if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): - for key in sorted(list(state_dict.keys())): - tensor_list = None - if env.tp_rank == 0: - tensor_list = [ - torch.zeros_like(state_dict[key]) - .to(state_dict[key].dtype) - .cuda() - for _ in range(config.tp_size) - ] - dist.gather( - state_dict[key].cuda(), - dst=dst, - gather_list=tensor_list, - group=env.tp_group, - ) - if env.tp_rank == 0: - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - "o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - "down_proj.weight", - "embed_tokens.weight", - ] - col_split = any( - [key.endswith(filter) for filter in col_filter] - ) - - if col_split: - state_dict[key] = concat_tensor(tensor_list, dim=0) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - state_dict[key] = concat_tensor(tensor_list, dim=1) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - # 似乎不需要? 
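                            # A rough, self-contained illustration (hypothetical 6x4 weight, 2 ranks) of the
                            # round trip performed here: the per-rank shards produced at load time with
                            # torch.chunk are gathered and stitched back along the same dimension before saving:
                            #   import torch
                            #   full = torch.arange(24).reshape(6, 4).float()
                            #   shards = list(torch.chunk(full, 2, dim=0))           # per-rank slices, as in load_parallel_state_dict
                            #   assert torch.equal(torch.cat(shards, dim=0), full)   # reassembled full weight, as gathered above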
- # state_dict_keys = state_dict.keys() - # for layer_id in range(config.num_layers): - # qkv_names = [None, None, None] - # for key in state_dict_keys: - # if f"layers.{layer_id}.attention.wq.weight" in key: - # qkv_names[0] = key - # elif f"layers.{layer_id}.attention.wk.weight" in key: - # qkv_names[1] = key - # elif f"layers.{layer_id}.attention.wv.weight" in key: - # qkv_names[2] = key - # qkv_name = qkv_names[0].replace("wq", "wqkv") - # state_dict[qkv_name] = torch.cat( - # [ - # state_dict.pop(qkv_names[0]), - # state_dict.pop(qkv_names[1]), - # state_dict.pop(qkv_names[2]), - # ], - # dim=0 - # ) - - if env.tp_rank == 0: - # Save gathered weights - if env.is_pipeline: - ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" - total_size = 0 - weight_map = {} - for name, weight in state_dict.items(): - weight_size = weight.numel() * dtype_byte_size( - weight.dtype - ) - weight_map[name] = ckpt_name - total_size += weight_size - index_dict = dict( - total_size=total_size, weight_map=weight_map - ) - index_dicts = [None for _ in range(env.pp_size)] - dist.gather_object( - index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group - ) - if env.pp_rank == 0: - total_size = 0 - weight_map = {} - for _index_dict in index_dicts: - total_size += _index_dict["total_size"] - weight_map.update(_index_dict["weight_map"]) - merged_dict = { - "metadata": {"total_size": total_size}, - "weight_map": weight_map, - } - io_driver.save( - json.dumps(merged_dict, indent=2, sort_keys=True) - + "\n", - os.path.join(path, "pytorch_model.bin.index.json"), - ) - - else: - ckpt_name = f"pytorch_model.bin" - ckpt_path = os.path.join(path, ckpt_name) - io_driver.save(state_dict, ckpt_path) - if dist.is_initialized() and process_exclusion: - dist.barrier() - if env.rank == 0: - config.save_pretrained(path, protocol=protocol) - dist.barrier() - - -@add_start_docstrings( - """ - The Mistral Model transformer with a sequence classification head on top (linear layer). - - [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - MISTRAL_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL -class MistralForSequenceClassification(Mistral2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Mistral2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - 
loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/tests/models/mistral2/modeltp.py b/tests/models/mistral2/modeltp.py deleted file mode 100644 index e91037f..0000000 --- a/tests/models/mistral2/modeltp.py +++ /dev/null @@ -1,2254 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Mistral model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel, dtype_byte_size -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_mistraltp import MistralConfig - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "MistralConfig" - -#modified for collie -import torch.distributed as dist -import gc -import json -import os -from collections import OrderedDict -from megatron.core import parallel_state, tensor_parallel -from einops import rearrange -from deepspeed.pipe import LayerSpec, TiedLayerSpec - -from collie.config import CollieConfig -from collie.driver.io import IODriver -from collie.log.logger import logger -from collie.module import ( - 
ColumnParallelLinearWithoutBias, - ColumnParallelLMHead, - RowParallelLinearWithoutBias, -) -from collie.utils import concat_tensor, dict_as_params, env, progress -from collie.models.base import CollieModelForCausalLM -from collie.models.utils import ( - kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, - kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, -) - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral -class MistralRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - MistralRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - ans = self.weight * hidden_states.to(input_dtype) - - # # 打印层标准化的输出 - hidden_states_output = ans.detach().cpu().tolist() - data_to_save = {"Layer Norm Output": hidden_states_output} - # 将输出写入 JSON 文件 - with open('a_rms_output.json', 'w') as f: - json.dump(data_to_save, f, indent=4) - - return self.weight * hidden_states.to(input_dtype) - - -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class MistralRotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
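        # Shape note: `inv_freq` is (dim // 2,); `_set_cos_sin_cache` below builds
        # freqs = outer(t, inv_freq) of shape (seq_len, dim // 2), duplicates it to (seq_len, dim),
        # and caches `cos_cached` / `sin_cached` with that (seq_len, dim) shape for later slicing.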
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -# TODO @Arthur no longer copied from LLama after static cache -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class MistralMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - - self.up_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.gate_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.down_proj = RowParallelLinearWithoutBias( - self.intermediate_size, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - # 打印MLP层输出 - mlp_output = output.detach().cpu().tolist() - data_to_save = {"MLP Output": mlp_output} - # 将输出写入 JSON 文件 - with open('a_mlp_output.json', 'w') as f: - json.dump(data_to_save, f, indent=4) - - return output - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class MistralAttention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.q_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.k_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.v_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - # aaaa - self.o_proj = RowParallelLinearWithoutBias( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.rotary_emb = MistralRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, # 输入维度 [bsz, q_len, hidden_size] - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) # [bsz, q_len, num_heads * head_dim] - key_states = self.k_proj(hidden_states) # [bsz, q_len, num_key_value_heads * head_dim] - value_states = self.v_proj(hidden_states) # [bsz, q_len, num_key_value_heads * head_dim] - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_heads, head_dim] - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_key_value_heads, head_dim] - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_key_value_heads, head_dim] - ) - - query_states = query_states.transpose(1, 2) # [bsz, num_heads, q_len, head_dim] - key_states = key_states.transpose(1, 2) # [bsz, num_key_value_heads, q_len, head_dim] - value_states = value_states.transpose(1, 2) # [bsz, num_key_value_heads, q_len, head_dim] - - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("a_attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)) - - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # 打印注意力模块的输出 - attention_result = { - "Output weights:": attn_output.detach().cpu().tolist(), - # "Attention weights:": attn_weights.detach().cpu().tolist(), - } - # 将数据写入 JSON 文件 - with open("a_attention_outputs.json", "w") as f: - json.dump(attention_result, f, indent=4) - - return attn_output, attn_weights, past_key_value - - -class MistralFlashAttention2(MistralAttention): - """ - Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
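To keep the eager attention path above readable outside the Collie parallel layers: with gather_output=False the column-parallel q/k/v projections leave each tensor-parallel rank holding only num_heads // tp_size query heads (hence the tp_size-adjusted shape checks), and grouped-query attention expands the smaller key/value head set with repeat_kv before the matmul. A minimal stand-alone sketch of those two pieces in plain PyTorch, with toy shapes rather than the model's real config:

import torch

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # (bsz, num_kv_heads, seq, head_dim) -> (bsz, num_kv_heads * n_rep, seq, head_dim)
    bsz, num_kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(bsz, num_kv_heads, n_rep, seq_len, head_dim)
    return hidden_states.reshape(bsz, num_kv_heads * n_rep, seq_len, head_dim)

# Toy numbers: 32 query heads and 8 key/value heads sharded over tp_size = 4 ranks,
# so this rank holds 8 query heads and 2 key/value heads.
bsz, q_len, head_dim, tp_size = 2, 5, 64, 4
num_heads, num_kv_heads = 32 // tp_size, 8 // tp_size
q = torch.randn(bsz, num_heads, q_len, head_dim)
k = torch.randn(bsz, num_kv_heads, q_len, head_dim)
k = repeat_kv(k, num_heads // num_kv_heads)          # expand kv heads to match q's head count
attn_weights = q @ k.transpose(2, 3) / head_dim ** 0.5
assert attn_weights.shape == (bsz, num_heads, q_len, q_len)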
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("a_flash_attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." 
- ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # 打印注意力模块的输出 - attention_result = { - "Output weights:": attn_output.detach().cpu().tolist(), - # "Attention weights:": attn_weights.detach().cpu().tolist(), - } - # 将数据写入 JSON 文件 - with open("a_flash_attention_outputs.json", "w") as f: - json.dump(attention_result, f, indent=4) - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
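The cache-trimming branch above (slicing_tokens = 1 - self.config.sliding_window) is what keeps the key/value cache bounded under sliding-window attention: before a new token is appended, only the most recent sliding_window - 1 positions are retained. A rough stand-alone illustration of that rolling-window bookkeeping, using plain tensors rather than the Cache object used here:

import torch

sliding_window = 4
bsz, num_heads, head_dim = 1, 2, 8

past_key = torch.randn(bsz, num_heads, 10, head_dim)     # cache that has grown past the window
if past_key.shape[-2] >= sliding_window:
    # keep the last (sliding_window - 1) positions; the incoming token fills the window
    past_key = past_key[:, :, 1 - sliding_window:, :].contiguous()

new_key = torch.randn(bsz, num_heads, 1, head_dim)
past_key = torch.cat([past_key, new_key], dim=-2)
assert past_key.shape[-2] == sliding_window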
- causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
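_upad_input packs the batch for flash_attn_varlen_func by dropping padding positions and describing the ragged result with cumulative sequence lengths. The sketch below shows roughly what the _get_unpad_data helper computes; treat it as an illustration rather than the exact library code:

import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])            # 1 = real token, 0 = padding
seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)      # tokens per sequence: [3, 5]
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = int(seqlens.max())
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))   # [0, 3, 8]

# A (batch, seq, ...) tensor is flattened and gathered at `indices` so that only the
# real tokens are handed to the varlen kernel; cu_seqlens marks the sequence boundaries.
hidden = torch.randn(2, 5, 16)
packed = hidden.reshape(-1, 16)[indices]
assert packed.shape[0] == int(cu_seqlens[-1]) == 8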
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class MistralSdpaAttention(MistralAttention): - """ - Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from MistralAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("a_sdpa_attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, 
key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.config.tp_size)) - - attn_output = self.o_proj(attn_output) - - # 打印注意力模块的输出 - attention_result = { - "Output weights:": attn_output.detach().cpu().tolist(), - # "Attention weights:": attn_weights.detach().cpu().tolist(), - } - # 将数据写入 JSON 文件 - with open("a_sdpa_attention_outputs.json", "w") as f: - json.dump(attention_result, f, indent=4) - - return attn_output, None, past_key_value - - -MISTRAL_ATTENTION_CLASSES = { - "eager": MistralAttention, - "flash_attention_2": MistralFlashAttention2, - "sdpa": MistralSdpaAttention, -} - - -class MistralDecoderLayer(nn.Module): - def __init__(self, config: CollieConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - config._attn_implementation = "sdpa" - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.config = config - self.mlp = MistralMLP(config) - self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.idx = layer_idx - # 务必保持变量名一致 - self.use_cache = self.config.model_config.use_cache - self.hidden_states = None - self.output_attentions = False - -class MistralDecoderLayer(nn.Module): - def __init__(self, config: CollieConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - config._attn_implementation = "sdpa" - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.config = config - self.mlp = MistralMLP(config) - self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.idx = layer_idx - # 务必保持变量名一致 - self.use_cache = self.config.model_config.use_cache - self.hidden_states = None - self.output_attentions = False - - def _forward( - self, - hidden_states: torch.Tensor, - attention_mask: 
Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - # output_attentions: Optional[bool] = False, - # use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - # if "padding_mask" in kwargs: - # warnings.warn( - # "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - # ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - # output_attentions=output_attentions, - # use_cache=use_cache, - **kwargs, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - # outputs = (hidden_states,) - - # if output_attentions: - # outputs += (self_attn_weights,) - - # if use_cache: - # outputs += (present_key_value,) - - return hidden_states, present_key_value - - def forward(self, inputs: dict): - layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) - - if self.config.checkpointing and self.training: - hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( - self._forward, - inputs["hidden_states"], - inputs.get("attention_mask", None), - inputs.get("position_ids", None), - layer_past, # inputs.get("past_key_values", None), - ) - else: - hidden_states, new_layer_past = self._forward( - inputs["hidden_states"], - inputs.get("attention_mask", None), - inputs.get("position_ids", None), - layer_past - ) # **inputs - inputs["hidden_states"] = hidden_states - - inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) - return inputs - - - # def _forward( - # self, - # hidden_states: torch.Tensor, - # attention_mask: Optional[torch.Tensor] = None, - # position_ids: Optional[torch.LongTensor] = None, - # past_key_value: Optional[Tuple[torch.Tensor]] = None, - # # output_attentions: Optional[bool] = False, - # # use_cache: Optional[bool] = False, - # **kwargs, - # ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - # # if "padding_mask" in kwargs: - # # warnings.warn( - # # "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - # # ) - # """ - # Args: - # hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - # attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - # `(batch, sequence_length)` where padding elements are indicated by 0. - # output_attentions (`bool`, *optional*): - # Whether or not to return the attentions tensors of all attention layers. See `attentions` under - # returned tensors for more detail. - # use_cache (`bool`, *optional*): - # If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - # (see `past_key_values`). - # past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - # """ - - # residual = hidden_states - - # hidden_states = self.input_layernorm(hidden_states) - - # # Self Attention - # hidden_states, self_attn_weights, present_key_value = self.self_attn( - # hidden_states=hidden_states, - # attention_mask=attention_mask, - # position_ids=position_ids, - # past_key_value=past_key_value, - # # output_attentions=output_attentions, - # # use_cache=use_cache, - # **kwargs, - # ) - # hidden_states = residual + hidden_states - - # # Fully Connected - # residual = hidden_states - # hidden_states = self.post_attention_layernorm(hidden_states) - # hidden_states = self.mlp(hidden_states) - # hidden_states = residual + hidden_states - - # # outputs = (hidden_states,) - - # # if output_attentions: - # # outputs += (self_attn_weights,) - - # # if use_cache: - # # outputs += (present_key_value,) - - # return hidden_states, present_key_value - - # def forward(self, inputs: dict): - # layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) - - # if self.config.checkpointing and self.training: - # hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( - # self._forward, - # inputs["hidden_states"], - # inputs.get("attention_mask", None), - # inputs.get("position_ids", None), - # layer_past, # inputs.get("past_key_values", None), - # ) - # else: - # hidden_states, new_layer_past = self._forward( - # inputs["hidden_states"], - # inputs.get("attention_mask", None), - # inputs.get("position_ids", None), - # layer_past - # ) # **inputs - # inputs["hidden_states"] = hidden_states - - # inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) - # return inputs - - # def forward( - # self, - # hidden_states: torch.Tensor, - # attention_mask: Optional[torch.Tensor] = None, - # position_ids: Optional[torch.LongTensor] = None, - # past_key_value: Optional[Tuple[torch.Tensor]] = None, - # output_attentions: Optional[bool] = False, - # use_cache: Optional[bool] = False, - # **kwargs, - # ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - # if "padding_mask" in kwargs: - # warnings.warn( - # "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - # ) - # """ - # Args: - # hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - # attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - # `(batch, sequence_length)` where padding elements are indicated by 0. - # output_attentions (`bool`, *optional*): - # Whether or not to return the attentions tensors of all attention layers. See `attentions` under - # returned tensors for more detail. 
- # use_cache (`bool`, *optional*): - # If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - # (see `past_key_values`). - # past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - # """ - - # residual = hidden_states - - # hidden_states = self.input_layernorm(hidden_states) - - # # Self Attention - # hidden_states, self_attn_weights, present_key_value = self.self_attn( - # hidden_states=hidden_states, - # attention_mask=attention_mask, - # position_ids=position_ids, - # past_key_value=past_key_value, - # output_attentions=output_attentions, - # use_cache=use_cache, - # **kwargs, - # ) - # hidden_states = residual + hidden_states - - # # Fully Connected - # residual = hidden_states - # hidden_states = self.post_attention_layernorm(hidden_states) - # hidden_states = self.mlp(hidden_states) - # hidden_states = residual + hidden_states - - # outputs = (hidden_states,) - - # if output_attentions: - # outputs += (self_attn_weights,) - - # if use_cache: - # outputs += (present_key_value,) - - # return outputs - - -MISTRAL_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`MistralConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class MistralPreTrainedModel(PreTrainedModel): - config_class = MistralConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["MistralDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -MISTRAL_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. 
- - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class MistralModel(nn.Module): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`MistralDecoderLayer`] - - Args: - config: MistralConfig - """ - - def __init__(self, config: CollieConfig): - # super().__init__(config) - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - # aaaa - # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.embed_tokens = tensor_parallel.VocabParallelEmbedding( - config.vocab_size, config.hidden_size, params_dtype=torch.float32 - ) - self.layers = nn.ModuleList( - [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - config._attn_implementation = "sdpa" - self._attn_implementation = config._attn_implementation - self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.gradient_checkpointing = False - # Initialize weights and apply final processing - # self.post_init() - - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - # aaaa - past_key_values: Optional[Tuple[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - # 打印嵌入层输出 - embeddings_output = inputs_embeds.detach().cpu().tolist() - data_to_save = {"Embeddings Output": embeddings_output} - # 将输出写入 JSON 文件 - with open('a_embeddings_output.json', 'w') as f: - json.dump(data_to_save, f, indent=4) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
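The position_ids handling above is what keeps rotary embeddings consistent during incremental decoding: when the cache already holds past_key_values_length tokens, the tokens fed in this step are numbered from that offset instead of from zero. A tiny self-contained sketch of the offset arithmetic:

import torch

def make_position_ids(seq_length: int, past_key_values_length: int, device="cpu") -> torch.Tensor:
    # positions for the tokens fed in this step, continuing after the cached ones
    position_ids = torch.arange(
        past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
    )
    return position_ids.unsqueeze(0)            # shape (1, seq_length), broadcast over the batch

# prefill: 6 prompt tokens, empty cache           -> positions 0..5
# decode:  1 new token, 6 tokens already cached   -> position 6
assert make_position_ids(6, 0).tolist() == [[0, 1, 2, 3, 4, 5]]
assert make_position_ids(1, 6).tolist() == [[6]]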
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - inputs = { - "input_ids": input_ids, - "hidden_states": hidden_states, - "attention_mask": attention_mask, - "position_ids": position_ids, - "past_key_values": past_key_values, - "output_attentions": output_attentions, - "use_cache": use_cache, - } - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - # for decoder_layer in self.layers: - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - # all_hidden_states += (hidden_states,) - all_hidden_states += (inputs["hidden_states"],) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - # hidden_states, - # attention_mask, - # position_ids, - # past_key_values, - # output_attentions, - # use_cache, - inputs, - ) - else: - layer_outputs = decoder_layer( - # hidden_states, - # attention_mask=attention_mask, - # position_ids=position_ids, - # past_key_value=past_key_values, - # output_attentions=output_attentions, - # use_cache=use_cache, - inputs, - ) - inputs.update(layer_outputs) - - # hidden_states = layer_outputs[0] - hidden_states = inputs["hidden_states"] - - if use_cache: - # next_decoder_cache = layer_outputs[2 if output_attentions else 1] - next_decoder_cache = inputs["addition_info"][1 if output_attentions else 0] - - if output_attentions: - # all_self_attns += (layer_outputs[1],) - all_self_attns += (inputs["addition_info"][0],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - # past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - past_key_values=past_key_values, - ) - - @classmethod - def pipeline_layers(cls, config: CollieConfig): - """ - Get layers of pipeline. 
- :return: list - """ - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - - if config.tie_word_embeddings: - embed_tokens = TiedLayerSpec( - "embed_tokens", - dict_as_params(input_keys="input_ids", output_keys="hidden_states"), - tensor_parallel.VocabParallelEmbedding, - config.vocab_size, - config.hidden_size, - ) - else: - embed_tokens = LayerSpec( - dict_as_params(input_keys="input_ids", output_keys="hidden_states"), - tensor_parallel.VocabParallelEmbedding, - config.vocab_size, - config.hidden_size, - ) - - layers = [ - LayerSpec(MistralDecoderLayer, config, i) for i in range(config.num_hidden_layers) - ] - norm = LayerSpec( - dict_as_params(input_keys="hidden_states", output_keys="hidden_states"), - MistralRMSNorm, - hidden_size=config.hidden_size, - eps=config.rms_norm_eps, - ) - - return [ - ("embed_tokens", embed_tokens), - ("layers", layers), - ("norm", norm), - ] - -class MistralForCausalLM(CollieModelForCausalLM): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config:CollieConfig): - super().__init__(config) - self.model = MistralModel(config) - self.vocab_size = config.vocab_size - # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.lm_head = ColumnParallelLinearWithoutBias( - self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False - ) - # Initialize weights and apply final processing - # self.post_init() - # GenerationMixin 需要的额外参数 - self.config.is_decoder = True - if config.model_config.tie_word_embeddings: - self.lm_head.weight = self.embed_tokens.weight - self.main_input_name = "input_ids" - - def clean_cache(self): - self._clean_hidden_states([*self.model.layers, self.lm_head]) - self._set_use_cache(self.model.layers, False) - - def set_cache(self, use_cache): - self._set_use_cache(self.model.layers, use_cache) - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
- - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MistralForCausalLM - - >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Ensure tensors are on the same device - shift_labels = shift_labels.to(shift_logits.device) - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. 
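The loss block above is the standard next-token objective: logits at position t are scored against the label at position t + 1, so both tensors are shifted by one before the flattened cross-entropy (with -100 as the ignore index). A compact, runnable restatement of just that step with a toy vocabulary:

import torch
from torch.nn import CrossEntropyLoss

vocab_size, bsz, seq_len = 11, 2, 6
logits = torch.randn(bsz, seq_len, vocab_size)
labels = torch.randint(0, vocab_size, (bsz, seq_len))
labels[0, :2] = -100                                    # e.g. prompt tokens excluded from the loss

shift_logits = logits[..., :-1, :].contiguous()          # predictions for positions 0..T-2
shift_labels = labels[..., 1:].contiguous()              # targets are the *next* tokens 1..T-1
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
print(float(loss))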
- - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - @classmethod - def pipeline_layers(cls, config: CollieConfig): - """ - Get layers of pipeline. - :return: list - """ - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - - if config.tie_word_embeddings: - output = TiedLayerSpec( - "embed_tokens", - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - else: - output = LayerSpec( - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - - return [("model", MistralModel.pipeline_layers(config)), ("lm_head", output)] - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - protocol: str = "file", # 指定加载state_dict时使用的协议 - **kwargs, - ): - """ - Load state_dict from ``path``. - The format of pretrained model should be the same as that of - `huggingface`. - :return: state_dict. Note that the state_dict should be processed - properly to match the current rank. 
- """ - # 配置加载 - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - # IO驱动初始化 - io_driver = IODriver.from_protocol(protocol) - # 检查文件路径是否存在 - if not io_driver.exists(path): - raise FileNotFoundError(f"folder {path} not found.") - # 初始化存储和处理变量 - state_dict = OrderedDict() - weights = [] - parts = None # 变量用于存储模型分割的部分信息 - # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 - hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 - rank_order = range(dist.get_world_size()) - else: - # 不开启只进行一次循环 - rank_order = range(1) - # 权重文件加载和处理 - for rank in rank_order: - # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 - if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: - # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 - if env.is_pipeline: - # 保存的是 json 格式 - parts = env.pipeline_parts - if hasattr(config, "num_key_value_heads"): - # llama2 (transformers >= 4.31.0) - num_key_value_heads = config.num_key_value_heads - else: - num_key_value_heads = config.num_attention_heads - head_dim = config.hidden_size // config.num_attention_heads - # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 - if ( - io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) - and "COLLIE_PP_PARTS" in os.environ.keys() - ): - weight_map = json.loads( - io_driver.load( - os.path.join(path, "pytorch_model.bin.index.json"), mode="r" - ) - )["weight_map"] - # layers 表示自己需要的层 - layers = env.pipeline_layers_idx - # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) - weights.extend( - [ - value - for key, value in weight_map.items() - if len(key.split(".")) > 2 - and key.split(".")[2].isdigit() - and (int(key.split(".")[2]) + 1) in layers - ] - ) - # 去重 - weights = list(set(weights)) - # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm - if 0 in layers: - weights.append(weight_map["model.embed_tokens.weight"]) - if max(parts) - 1 in layers: - weights.append(weight_map["lm_head.weight"]) - if max(parts) - 2 in layers: - weights.append(weight_map["model.norm.weight"]) - else: - # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 - weights = [ - weight - for weight in io_driver.list(path) - if weight.endswith(".bin") - ] - with progress( - weights, - desc="Loading state dict", - total=len(weights), - disable=hide_progress, - ) as pbar: - for weight in pbar: - part_state_dict = io_driver.load( - os.path.join(path, weight), mode="rb" - ) - # for key in list(part_state_dict.keys()): - # if "attention.wqkv.weight" in key: - # # qkv_weights = part_state_dict.pop(key) - # qkv_weights = part_state_dict[key] - # print(qkv_weights.shape) - # (wq, wk, wv) = qkv_weights.split( - # [ - # config.hidden_size, - # config.num_key_value_heads * head_dim, - # config.num_key_value_heads * head_dim, - # ], - # dim=0, - # ) - # wq_name = key.replace("wqkv", "wq") - # wk_name = key.replace("wqkv", "wk") - # wv_name = key.replace("wqkv", "wv") - # part_state_dict[wq_name] = wq - # part_state_dict[wk_name] = wk - # part_state_dict[wv_name] = wv - state_dict.update(part_state_dict) - del part_state_dict - if parts is not None: - # 这一步是 pp 的复筛 - layers = env.pipeline_layers_idx - for key in list(state_dict.keys()): - if key.startswith("layers"): - layer = int(key.split(".")[1]) - if layer + 1 not in layers: - state_dict.pop(key) - # if key.endswith("tok_embeddings.weight"): - if 
key.endswith("embed_tokens.weight"): - if 0 not in layers: - state_dict.pop(key) - if key == "norm.weight": - if max(parts) - 2 not in layers: - state_dict.pop(key) - # if key.endswith("output.weight"): - if key.endswith("lm_head.weight"): - if max(parts) - 1 not in layers: - state_dict.pop(key) - # 根据用户配置的新的 tp size 进行分割 - for key in list(state_dict.keys()): - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - #"o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - #"down_proj.weight", - "embed_tokens.weight", - ] - col_split = any([key.endswith(filter) for filter in col_filter]) - - if col_split: - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - if dist.is_initialized() and process_exclusion: - # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 - dist.barrier() - return state_dict - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - protocol: str = "file", - ): - """ - Save state_dict to ``path``. - The format of saved state dict should be the same as that of - `huggingface`. - """ - io_driver = IODriver.from_protocol(protocol) - # gather to tp rank 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 pp_size 次循环 - rank_order = range(config.pp_size) - else: - # 不开启只进行一次循环 - rank_order = range(1) - dst = parallel_state.get_tensor_model_parallel_src_rank() - with progress( - rank_order, - desc="Saving model", - disable=int(os.environ.get("RANK", "0")) != 0, - ) as pbar: - for rank in pbar: - if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): - for key in sorted(list(state_dict.keys())): - tensor_list = None - if env.tp_rank == 0: - tensor_list = [ - torch.zeros_like(state_dict[key]) - .to(state_dict[key].dtype) - .cuda() - for _ in range(config.tp_size) - ] - dist.gather( - state_dict[key].cuda(), - dst=dst, - gather_list=tensor_list, - group=env.tp_group, - ) - if env.tp_rank == 0: - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - #"o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - #"down_proj.weight", - "embed_tokens.weight", - ] - col_split = any( - [key.endswith(filter) for filter in col_filter] - ) - - if col_split: - state_dict[key] = concat_tensor(tensor_list, dim=0) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - state_dict[key] = concat_tensor(tensor_list, dim=1) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - # 似乎不需要? 
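load_parallel_state_dict re-shards a full Hugging Face checkpoint for the configured tensor-parallel size: weights feeding column-parallel layers (q/k/v_proj, gate_proj, up_proj, embed_tokens, lm_head) are chunked along dim 0, the row-parallel o_proj and down_proj along dim 1, and each rank keeps only its own slice; save_parallel_state_dict reverses this with dist.gather plus concatenation. The same idea in isolation, with hypothetical shapes:

import torch

hidden_size, tp_size, tp_rank = 16, 4, 1

full_q_proj = torch.randn(hidden_size, hidden_size)      # column-parallel: split output rows
full_o_proj = torch.randn(hidden_size, hidden_size)      # row-parallel: split input columns

q_shard = torch.chunk(full_q_proj, tp_size, dim=0)[tp_rank].clone()
o_shard = torch.chunk(full_o_proj, tp_size, dim=1)[tp_rank].clone()

assert q_shard.shape == (hidden_size // tp_size, hidden_size)    # (4, 16)
assert o_shard.shape == (hidden_size, hidden_size // tp_size)    # (16, 4)
# Concatenating the shards back along the same dims recovers the full matrices,
# which is what the save path does after gathering them from all tp ranks.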
- # state_dict_keys = state_dict.keys() - # for layer_id in range(config.num_layers): - # qkv_names = [None, None, None] - # for key in state_dict_keys: - # if f"layers.{layer_id}.attention.wq.weight" in key: - # qkv_names[0] = key - # elif f"layers.{layer_id}.attention.wk.weight" in key: - # qkv_names[1] = key - # elif f"layers.{layer_id}.attention.wv.weight" in key: - # qkv_names[2] = key - # qkv_name = qkv_names[0].replace("wq", "wqkv") - # state_dict[qkv_name] = torch.cat( - # [ - # state_dict.pop(qkv_names[0]), - # state_dict.pop(qkv_names[1]), - # state_dict.pop(qkv_names[2]), - # ], - # dim=0 - # ) - - if env.tp_rank == 0: - # Save gathered weights - if env.is_pipeline: - ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" - total_size = 0 - weight_map = {} - for name, weight in state_dict.items(): - weight_size = weight.numel() * dtype_byte_size( - weight.dtype - ) - weight_map[name] = ckpt_name - total_size += weight_size - index_dict = dict( - total_size=total_size, weight_map=weight_map - ) - index_dicts = [None for _ in range(env.pp_size)] - dist.gather_object( - index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group - ) - if env.pp_rank == 0: - total_size = 0 - weight_map = {} - for _index_dict in index_dicts: - total_size += _index_dict["total_size"] - weight_map.update(_index_dict["weight_map"]) - merged_dict = { - "metadata": {"total_size": total_size}, - "weight_map": weight_map, - } - io_driver.save( - json.dumps(merged_dict, indent=2, sort_keys=True) - + "\n", - os.path.join(path, "pytorch_model.bin.index.json"), - ) - - else: - ckpt_name = f"pytorch_model.bin" - ckpt_path = os.path.join(path, ckpt_name) - io_driver.save(state_dict, ckpt_path) - if dist.is_initialized() and process_exclusion: - dist.barrier() - if env.rank == 0: - config.save_pretrained(path, protocol=protocol) - dist.barrier() - - -@add_start_docstrings( - """ - The Mistral Model transformer with a sequence classification head on top (linear layer). - - [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - MISTRAL_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL -class MistralForSequenceClassification(MistralPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = MistralModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - 
loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) From 0068f2395de5886ede219eef283e2be5c0c7bc75 Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:45:14 +0800 Subject: [PATCH 14/16] Add tests for mistral --- tests/models/mistral/test_generation.py | 37 +++++++++++++++++++++++++ tests/models/mistral/test_raw.py | 22 +++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 tests/models/mistral/test_generation.py create mode 100644 tests/models/mistral/test_raw.py diff --git a/tests/models/mistral/test_generation.py b/tests/models/mistral/test_generation.py new file mode 100644 index 0000000..ac06de8 --- /dev/null +++ b/tests/models/mistral/test_generation.py @@ -0,0 +1,37 @@ +import sys +sys.path.append("../../../") + +from transformers import AutoTokenizer, GenerationConfig + +from collie.models.mistral2 import MistralForCausalLM, MistralConfig +from collie import CollieConfig, env + +model_name_or_path = "mistralai/Mistral-7B-v0.1" + +tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) + +config = CollieConfig.from_pretrained(model_name_or_path) + +config.dp_size = 1 +config.tp_size = 2 +config.pp_size = 2 +# config.architectures = ["MistralForCausalLM"] +print("------------------------------") +model = MistralForCausalLM.from_pretrained(model_name_or_path, config=config).cuda() +model.eval() +print("------------------------------") +prompt = "Llama is a" +# prompt = "Q:What do we eat for tonight?A:" +inputs = tokenizer(prompt, return_tensors="pt") +print("inputs:") +print(inputs) + + +gen_config = GenerationConfig(max_new_tokens=256, early_stopping=False, eos_token_id=2) + +outs = model.generate(inputs["input_ids"].cuda(), generation_config=gen_config) +if env.local_rank == 0: + print("outs:") + print(outs) + print("last:") + print(tokenizer.decode(outs[0], skip_special_tokens=True)) \ No newline at end of file diff --git a/tests/models/mistral/test_raw.py b/tests/models/mistral/test_raw.py new file mode 100644 index 0000000..1f50b39 --- /dev/null +++ b/tests/models/mistral/test_raw.py @@ -0,0 +1,22 @@ +import sys + +import torch + +sys.path.append("../../../") + +from transformers import AutoTokenizer, GenerationConfig, AutoModelForCausalLM +from collie.models.mistral.modeling_mistral import MistralForCausalLM + +model_name_or_path = "mistralai/Mistral-7B-v0.1" +tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) +model = MistralForCausalLM.from_pretrained(model_name_or_path).cuda() +model.eval() +prompt = "Llama is a" +# prompt = "Q:What do we eat for tonight?A:" +inputs = tokenizer(prompt, return_tensors="pt") +print(inputs) +gen_config = GenerationConfig(max_new_tokens=256, early_stopping=False, eos_token_id=2) +outs = model.generate(inputs["input_ids"].cuda(), generation_config=gen_config) + +print(outs) +print(tokenizer.decode(outs[0], skip_special_tokens=True)) \ No newline at end of file From 2c3af7fc62ed102e0a9fd5f4dfee7191e9560431 Mon 
Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Thu, 16 May 2024 10:51:20 +0800 Subject: [PATCH 15/16] Add MistralConfig --- .../models/mistral/configuration_mistral.py | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 collie/models/mistral/configuration_mistral.py diff --git a/collie/models/mistral/configuration_mistral.py b/collie/models/mistral/configuration_mistral.py new file mode 100644 index 0000000..ad6691b --- /dev/null +++ b/collie/models/mistral/configuration_mistral.py @@ -0,0 +1,155 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Mistral model configuration""" + +from transformers.configuration_utils import PretrainedConfig +# from transformers.utils import logging +from collie.log.logger import logger + + +# logger = logging.get_logger(__name__) + +MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", + "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json", +} + + +class MistralConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an + Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. + + [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) + [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MistralModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 14336): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. 
When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to `4096*32`): + The maximum sequence length that this model might ever be used with. Mistral's sliding window attention + allows sequence of up to 4096*32 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention window size. If not specified, will default to `4096`. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+
+    ```python
+    >>> from transformers import MistralModel, MistralConfig
+
+    >>> # Initializing a Mistral 7B style configuration
+    >>> configuration = MistralConfig()
+
+    >>> # Initializing a model from the Mistral 7B style configuration
+    >>> model = MistralModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "mistral"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=4096 * 32,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        sliding_window=4096,
+        attention_dropout=0.0,
+        attn_implementation="flash_attention_2",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+
+        # Call the parent class's __init__ to handle the shared arguments
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )

From 86445698f75f94acdad180216993d2eaa1537954 Mon Sep 17 00:00:00 2001
From: LinqiY <100989140+LinqiY@users.noreply.github.com>
Date: Sat, 18 May 2024 09:27:21 +0800
Subject: [PATCH 16/16] Update __init__.py

---
 collie/models/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/collie/models/__init__.py b/collie/models/__init__.py
index 9a11a47..a60817e 100644
--- a/collie/models/__init__.py
+++ b/collie/models/__init__.py
@@ -6,3 +6,4 @@ from .chatglm2 import ChatGLM2ForCausalLM
 from .moss_moon import Moss003MoonForCausalLM
 from .internlm2 import InternLM2ForCausalLM
+from .mistral import MistralForCausalLM
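
A note on the tensor-parallel splitting rule used by `load_parallel_state_dict` and `save_parallel_state_dict` above: column-parallel weights (`q_proj`, `k_proj`, `v_proj`, `gate_proj`, `up_proj`, `embed_tokens`, `lm_head`) are sharded along dim 0, while row-parallel weights (`o_proj`, `down_proj`) are sharded along dim 1. The snippet below is a minimal standalone sketch of that rule; the tensor shapes, `tp_size`, and `tp_rank` values are assumptions chosen for illustration, not values taken from these patches.

```python
import torch

# Illustrative sizes only (assumed for this sketch, not taken from the patches).
hidden_size, intermediate_size = 4096, 14336
tp_size, tp_rank = 2, 0

state_dict = {
    "model.layers.0.self_attn.q_proj.weight": torch.empty(hidden_size, hidden_size),
    "model.layers.0.self_attn.o_proj.weight": torch.empty(hidden_size, hidden_size),
    "model.layers.0.mlp.up_proj.weight": torch.empty(intermediate_size, hidden_size),
    "model.layers.0.mlp.down_proj.weight": torch.empty(hidden_size, intermediate_size),
}

# Column-parallel weights are split along dim 0, row-parallel weights along dim 1,
# mirroring the col_filter logic in load_parallel_state_dict.
COL_SUFFIXES = (
    "q_proj.weight", "k_proj.weight", "v_proj.weight",
    "gate_proj.weight", "up_proj.weight",
    "embed_tokens.weight", "lm_head.weight",
)
ROW_SUFFIXES = ("o_proj.weight", "down_proj.weight")

for key, tensor in state_dict.items():
    if key.endswith(COL_SUFFIXES):
        shard = torch.chunk(tensor, tp_size, dim=0)[tp_rank].detach().clone()
    elif key.endswith(ROW_SUFFIXES):
        shard = torch.chunk(tensor, tp_size, dim=1)[tp_rank].detach().clone()
    else:
        shard = tensor
    # e.g. q_proj -> (2048, 4096), o_proj -> (4096, 2048) for tp_size=2
    print(key, tuple(shard.shape))
```

At save time the same rule runs in reverse: the shards gathered from every tp rank are concatenated back along dim 0 or dim 1 (via `concat_tensor`) before the merged tensors are written to the checkpoint files.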