From 666fb6269d88b4790bc0f3eb4936f1f72f934fac Mon Sep 17 00:00:00 2001
From: LinqiY <100989140+LinqiY@users.noreply.github.com>
Date: Fri, 26 Apr 2024 15:24:03 +0800
Subject: [PATCH 01/16] Add Mistral tp&pp model

---
 tests/models/mistral2/__init__.py             |    2 +
 .../__pycache__/__init__.cpython-310.pyc      |  Bin 0 -> 295 bytes
 .../configuration_mistraltp.cpython-310.pyc   |  Bin 0 -> 6283 bytes
 .../__pycache__/model.cpython-310.pyc         |  Bin 0 -> 49178 bytes
 .../__pycache__/modeltp.cpython-310.pyc       |  Bin 0 -> 52277 bytes
 .../mistral2/configuration_mistraltp.py       |  155 ++
 tests/models/mistral2/model.py                | 2026 +++++++++++++++
 tests/models/mistral2/modelpp.py              | 1922 ++++++++++++++
 tests/models/mistral2/modeltp.py              | 2254 +++++++++++++++++
 9 files changed, 6359 insertions(+)
 create mode 100644 tests/models/mistral2/__init__.py
 create mode 100644 tests/models/mistral2/__pycache__/__init__.cpython-310.pyc
 create mode 100644 tests/models/mistral2/__pycache__/configuration_mistraltp.cpython-310.pyc
 create mode 100644 tests/models/mistral2/__pycache__/model.cpython-310.pyc
 create mode 100644 tests/models/mistral2/__pycache__/modeltp.cpython-310.pyc
 create mode 100644 tests/models/mistral2/configuration_mistraltp.py
 create mode 100644 tests/models/mistral2/model.py
 create mode 100644 tests/models/mistral2/modelpp.py
 create mode 100644 tests/models/mistral2/modeltp.py

diff --git a/tests/models/mistral2/__init__.py b/tests/models/mistral2/__init__.py
new file mode 100644
index 0000000..9dc3f79
--- /dev/null
+++ b/tests/models/mistral2/__init__.py
@@ -0,0 +1,2 @@
+from .modeltp import MistralForCausalLM
+from .configuration_mistraltp import MistralConfig
\ No newline at end of file

[GIT binary patches for the four committed __pycache__/*.cpython-310.pyc files omitted]
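For orientation, a minimal usage sketch of what this patch exports. The `tests.models.mistral2` import path and the direct `MistralForCausalLM(config)` construction are illustrative assumptions, not part of the patch; the distributed tensor/pipeline-parallel environment that the `modeltp`/`modelpp` variants expect must already be initialized and is not shown here.

```python
from tests.models.mistral2 import MistralConfig, MistralForCausalLM

# Mistral-7B-v0.1 style defaults: 32 layers, hidden size 4096, 32 attention heads,
# 8 key/value heads (grouped-query attention), sliding window of 4096 tokens.
config = MistralConfig()

# Assumes the distributed setup (megatron parallel_state / deepspeed pipeline) is already in place.
model = MistralForCausalLM(config)
```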
z$Nd`{dBlT07xWXJ{%8VFKg9Cv9EAB;r_n%|4+4YBV=q7w4rlZDHAv*QnQFq$U$uY{ zgz0|_ux1}%%|35ZV9f&aY4-)NCK!I|dvL!ouWZ1T2h0sd7L)3`y{XOJ0>z!eUOa6N z0Q>r1#5ai3?8hVpu#>J7Kh{YRmL0Tr4}R~liwFhtZX#h1{g8!g2hg|T8~G$OPoRA= zP)lqKq9>qxfnFE`M8a$PhNy=D?TMk~o30q6WDt_oS415L+QZOh7`X78zfQKQ9hi)2r zSzI$VWlDpX4nhL&+UOP-P3e;=Pn&AN&8bynqi<8X8|4MihDc?@FMB+9ydHStPrAS} ztMDp?mD8_o3z+V-J#7b=0+vhKIF9=ORzcAOH@o5?wWL~WtLkr2xh;Lct(Sn5`YPV) zDiUN;U!(I<6ei~r!6XwTybhS8t~18VaMlN6SjSO6dHgPW-8$M@JgWW@?;+=q7w}R; zEIdTF)P-BC0Ki0<8}o9y#NA_RvS>j)hdhVruz z>cky807Docn8sO5lzHR}d+p%qigh^O$gBU1_v=Hk1`+y3rUWczW);Ca^$K$_Sb#}~ zkeR;P_h#m}pF$p*(HUB=i%urBKwEw*8blHk>Nl7;!E6ZoDFNs*_NqklGpm6o&00FP z>Sx3Zoe?^`DA!Nx5m%goe%4!so&pDbh!IE?S7Y>z(;?8MEcyy?{1i1D>Plw8y^L9A z7X85tYMrBLW02&+90i)QI4J{w(Ozq;y83`lGt&t=dztf~p1KhpJ~O%PVJ^_xBvb97 zbBNA9I+96W(21PNYCnT-h69jlW!0;QRA4m7P^hJppnzq40WM3s-;^9DJ)YPO;N-{; zF>(Ocoi|9k1hTk9)A^IapeNlymMqKQ0DQm1)%#g=R=&Rg7DpC}2%0|#+W4y`7$Vc4 zl2iQv$_ZBb6$>)ce2+E$N*=dPm?JcqXjyQfV_-f9RffP<7|)t}5fAs|lEVm;4IrOR z36%#6+(q71#`oIzA|l^H;}Om9ACUfuh0;WD3J%!lIDb93rj zFvK&)sTuQ>`j7M;>=C&3hbxdew?n(qvNALw{N0=(KgSvL^K>YQrzNpJPv0-o`87Jf zPKVOj>YvfKi$1Pvf3R}$@wrovzP)nl@h2&%y{(Kkx_FC>HX%lBRrvRjBUrmM;JXk= zYX3VH@)LAOS682*LwH&yv*M8LEXVzmh?gha{p%4X7iw+gR7 zBK$!!rNhBFLtGth<_N`GFEf58+yp#7`m$o1sy>=6$M#temjaSUVCEH+u~4KXG;AB% zs?ip5Yq+>xWE;BeyOs`7bE6vEy&l?@i!396#D69*CwY=m^idx3&(HvjHmyc2GA$ZH zp(MevNCmUC31N}GkJ3Q^1p+}l$shp%E5R@#OnE0)8gB zj%@+h3WEJ-SSTgMk=8g!(G%I{!1S;bhSyN{jOU0)&>)eT8mKVl(S6Y}a}sM+s&$Da zOS#&7)AmJ6DK4*L0(B8uk6ndnLr7g2hK&c4-X^B-T|5ja3VxSm$9! zX+B^L3vI~N;$E~5Bt77Iv>b^3)3HE2p#80IXxIfpgFVDrc6ubNq?8%9JXX=bp#K6^ zYwR->Jh{IBPpBRt=nDvr&u5_|K_e##*dEr_g?Tk$2F6V?TmOI(WS#6XFEY&Imqtx^ zi3mdA>w~WVKiDjIg838ehuM=b28^FL19&$W{1$`Xq2PDe8Kh!Gpx8i#3m_ilVNnQx zWa7-YA1>D&msE_m%I%Vn93d$8t#Ge+J3+ zUoyh`>2%;V9wUo{*YfaCgO9d@3(3gn$BymZS#lpW&?3yAV_gJX6#4Bde3X(q#q-!| z5tRJ}c-HUgkb}41zaVP1jVw2NJ^awm?@4^s{~E8rP6Ff?gn#>Z?IZY&xTX+y$rQkO z?FZgUaogK4@ho{a&;&O&^ovL@loTQ9^8iJHaed(fVCG{1d|2e~f#LHVcx)Sk&6|U~ z&U+bN`JmHYGdWRVhawHGodx+I@DHV|^sdCXJZ-{~#4_tL2+SGzwELk5iEAMl3L#%y zip!-3%McVN*5b*q=n$m>-x%5m5MD+-ejNTm)Y4QRhms@p0|uH?L71gJ(@rlJkvA?@ zbUzjNL8nolfhweqYuW5Qq@s&Fvv~Vk-mr)*hdKW$1(<2g0(F23LFS?6Z6eg17XK|l zTIT#G-e@TD3_BRC>EJaJ>EH(khAG36@`(LqZ#<0Mmb}_|(2p|hA_SrP!0VQ4hDAhr zh+_uYL8?6z6Ox|bb(OdTa}r-%R=GT7_lb~np-PQp*`!POe8VEqGdLFz47sHty z2<*9j|7xpM2MQL*>6@i?Vtbd)zI_WnU?wU0~#sKlA0Am2la z8Bcq`9022hgnS93!d%|ik1t}%y>O!-`ZMmdqySq1jDX2r8P_hwqa99A@a1V*LOp^J zKv#Nf#2TH8bOfvH;F2A@QA|7J*lfZ!T7efiY>ynusIdoj#`PXW;`gwlh{|UZg`q6< z^h1r^Sf_mhxLslNZxnOJLLd-2<0&dEgK6EQ47sPi!a98kj-RF#Uzn(nHb|2MttId# z^SL|{-OJLs!&-w_Uu7H~)Bd=Qqpbo$2QRpM2xOn&{%a+ZSmrdRczLpG2D-CVTvc^Z za64BK{OWnFr9>t3jscGHEwL;F4MRIeBSboLx9k%Flt#xT@gM%hQFM2B=ZkPe~WQiWav$V_BD>5F94C?`Oka3dBluENzb zo)l;Gb|xFrV?`#fs-0T{SUHFLVxfFx{PW?xJ%yoyXw#>Bn^WRKr)#()l4epQ7_=I$7=?LTCLYKK?SD zK8D>z=Omr`;glyrI|66ULNun!jKEWjB6KW4dq`Z7_1?7&N4SNby-)YY1BjJye;AH6 zqA!}A#HKndggb7kMUd#AJYbZJLN=YGf7Hywa!Em~ns9?A4;Ei+zXiOVOp&*m%?$Ss zPiJq*=Z2@Jjt>_SA1pqc_#oUl^IN&)r(Ye;3>On0wARcaIRd^uY+2*xDeWJ%5Oag^ P`DZ?k(1Q7u@#6mn+3oyM literal 0 HcmV?d00001 diff --git a/tests/models/mistral2/configuration_mistraltp.py b/tests/models/mistral2/configuration_mistraltp.py new file mode 100644 index 0000000..ad6691b --- /dev/null +++ b/tests/models/mistral2/configuration_mistraltp.py @@ -0,0 +1,155 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Mistral model configuration""" + +from transformers.configuration_utils import PretrainedConfig +# from transformers.utils import logging +from collie.log.logger import logger + + +# logger = logging.get_logger(__name__) + +MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", + "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json", +} + + +class MistralConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an + Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. + + [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) + [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MistralModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 14336): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to `4096*32`): + The maximum sequence length that this model might ever be used with. Mistral's sliding window attention + allows sequence of up to 4096*32 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention window size. If not specified, will default to `4096`. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + + ```python + >>> from transformers import MistralModel, MistralConfig + + >>> # Initializing a Mistral 7B style configuration + >>> configuration = MistralConfig() + + >>> # Initializing a model from the Mistral 7B style configuration + >>> model = MistralModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mistral" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + max_position_embeddings=4096 * 32, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + sliding_window=4096, + attention_dropout=0.0, + attn_implementation="flash_attention_2", + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + # 调用父类的初始化函数,将一些公共参数传递给父类处理 + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/tests/models/mistral2/model.py b/tests/models/mistral2/model.py new file mode 100644 index 0000000..60d9553 --- /dev/null +++ b/tests/models/mistral2/model.py @@ -0,0 +1,2026 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Mistral model."""
+import inspect
+import math
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel, dtype_byte_size
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+    replace_return_docstrings,
+)
+# configuration_mistraltp.py defines `MistralConfig`; alias it to the `Mistral2Config` name used in this file.
+from .configuration_mistraltp import MistralConfig as Mistral2Config
+
+
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+
+    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "Mistral2Config"
+
+# modified for CoLLiE
+import torch.distributed as dist
+import gc
+import json
+import os
+from collections import OrderedDict
+from megatron.core import parallel_state, tensor_parallel
+from einops import rearrange
+from deepspeed.pipe import LayerSpec, TiedLayerSpec
+
+from collie.config import CollieConfig
+from collie.driver.io import IODriver
+from collie.log.logger import logger
+from collie.module import (
+    ColumnParallelLinearWithoutBias,
+    ColumnParallelLMHead,
+    RowParallelLinearWithoutBias,
+)
+from collie.utils import concat_tensor, dict_as_params, env, progress
+from collie.models.base import CollieModelForCausalLM
+from collie.models.utils import (
+    kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer,
+    kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model,
+)
+
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
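+# Tensor-parallel layout used by the modules further below (the usual Megatron pattern):
+# the q/k/v/gate/up projections are column-parallel with gather_output=False, so each
+# rank keeps only its slice of the attention heads / intermediate features, while the
+# o/down projections are row-parallel with input_is_parallel=True, reducing the partial
+# results back to the full hidden size.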
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral
+class Mistral2RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        MistralRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        ans = self.weight * hidden_states.to(input_dtype)
+        # --------------------------------------------------------
+        # (commented-out debugging leftover that dumped intermediate RMSNorm outputs to a JSON file)
+        # # Convert the tensor to a list
+        # ans_list = ans.tolist()
+        # # Path of the .json file
+        # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/rms_ans.json'
+
+        # # Try to open the existing .json file and read its contents; create a new list if the file does not exist
+        # try:
+        #     with open(file_path, 'r', encoding='utf-8') as file:
+        #         results_list = json.load(file)
+        # except FileNotFoundError:
+        #     results_list = []
+        # # Append the current result to the list
+        # results_list.append(ans_list)
+        # # Write the updated list back to the .json file
+        # with open(file_path, 'w', encoding='utf-8') as file:
+        #     json.dump(results_list, file, ensure_ascii=False, indent=4)
+        #     file.write('\n')  # add a newline at the end of the file
+        # --------------------------------------------------------
+        return ans
+
+
+# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral
+# TODO @Arthur no longer copied from LLama after static cache
+class Mistral2RotaryEmbedding(nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
+
+        freqs = torch.outer(t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+        return (
+            self.cos_cached[:seq_len].to(dtype=x.dtype),
+            self.sin_cached[:seq_len].to(dtype=x.dtype),
+        )
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+# TODO @Arthur no longer copied from LLama after static cache
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`):
+            The position indices of the tokens corresponding to the query and key tensors.
For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Mistral2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + self.up_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.gate_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.down_proj = RowParallelLinearWithoutBias( + self.intermediate_size, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Mistral2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.q_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.k_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.v_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.o_proj = RowParallelLinearWithoutBias( + self.num_heads * self.head_dim, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.rotary_emb = Mistral2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
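Concretely, with hypothetical Mistral-7B-like numbers (hidden_size=4096, 32 query heads, 8 key/value heads, head_dim=128) and tp_size=4, the column-parallel q/k/v projections above leave each rank with only its share of heads, and the row-parallel o_proj consumes hidden_size // tp_size features:

hidden_size, num_heads, num_kv_heads, head_dim = 4096, 32, 8, 128   # illustrative values, not read from any config
tp_size = 4

heads_per_rank = num_heads // tp_size            # 8 query heads per rank (num_heads_tp in the forward pass)
kv_heads_per_rank = num_kv_heads // tp_size      # 2 key/value heads per rank
o_proj_in_per_rank = heads_per_rank * head_dim   # 1024 == hidden_size // tp_size, the reshape target before o_proj
print(heads_per_rank, kv_heads_per_rank, o_proj_in_per_rank)        # 8 2 1024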
Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads_tp, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads_tp, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads_tp, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads_tp, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # 
-------------------------------------------------------- + # 将Tensor转换为列表 + ans_list = attn_output.tolist() + # 指定.json文件的路径 + file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/attn_output_1.json' + + # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + try: + with open(file_path, 'r', encoding='utf-8') as file: + results_list = json.load(file) + except FileNotFoundError: + results_list = [] + # 将当前结果添加到列表中 + results_list.append(ans_list) + # 将更新后的列表写回.json文件 + with open(file_path, 'w', encoding='utf-8') as file: + json.dump(results_list, file, ensure_ascii=False, indent=4) + file.write('\n\n\n') # 在文件末尾添加一个换行符 + # -------------------------------------------------------- + + + + return attn_output, attn_weights, past_key_value + + +class Mistral2FlashAttention2(Mistral2Attention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
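A rough numeric check of the sliding-window trigger above, with made-up lengths (the released Mistral-7B config ships sliding_window=4096):

kv_seq_len, sliding_window = 5000, 4096          # invented lengths
flash_supports_window = True                     # stands in for _flash_supports_window_size
use_sliding_windows = flash_supports_window and sliding_window is not None and kv_seq_len > sliding_window
print(use_sliding_windows)                       # True: only sequences longer than the window take the windowed path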
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
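A small sketch of the cache trim above: once the cached length exceeds the window, only the most recent `sliding_window - 1` positions are kept (tensor sizes here are invented):

import torch

batch, kv_heads, cached_len, head_dim, window = 1, 2, 4100, 8, 4096
past_key = torch.randn(batch, kv_heads, cached_len, head_dim)
slicing_tokens = 1 - window                      # -4095
trimmed = past_key[:, :, slicing_tokens:, :]     # shape (1, 2, 4095, 8)
assert trimmed.shape[-2] == window - 1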
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # -------------------------------------------------------- + # 将Tensor转换为列表 + ans_list = attn_output.tolist() + # 指定.json文件的路径 + file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/attn_output_1.json' + + # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + try: + with open(file_path, 'r', encoding='utf-8') as file: + results_list = json.load(file) + except FileNotFoundError: + results_list = [] + # 将当前结果添加到列表中 + results_list.append(ans_list) + # 将更新后的列表写回.json文件 + with open(file_path, 'w', encoding='utf-8') as file: + json.dump(results_list, file, ensure_ascii=False, indent=4) + file.write('\n\n\n') # 在文件末尾添加一个换行符 + # -------------------------------------------------------- + + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
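When a padding mask is present, `_flash_attention_forward` takes the unpad path and hands flash-attn cumulative sequence lengths instead of a padded batch. A sketch of what those `cu_seqlens` look like, mirroring the `_get_unpad_data`-style computation used by `_upad_input` below (the mask is made up):

import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])            # two sequences, right padding
seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)     # tensor([3, 5])
cu_seqlens = torch.nn.functional.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
max_seqlen = int(seqlens.max())                             # 5
print(cu_seqlens)   # tensor([0, 3, 8]): token boundaries inside the packed (total_tokens, heads, head_dim) tensor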
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class Mistral2SdpaAttention(Mistral2Attention): + """ + Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MistralAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, 
value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.tp_size)) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +MISTRAL_ATTENTION_CLASSES = { + "eager": Mistral2Attention, + "flash_attention_2": Mistral2FlashAttention2, + "sdpa": Mistral2SdpaAttention, +} + + +class MistralDecoderLayer(nn.Module): + def __init__(self, config: CollieConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + + self.mlp = Mistral2MLP(config) + self.input_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
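As a sanity reference for the SDPA variant above: with no mask it computes the same softmax(QK^T / sqrt(d)) V as the eager path (shapes below are arbitrary):

import math
import torch

q, k, v = (torch.randn(1, 2, 4, 8) for _ in range(3))       # (batch, heads, seq, head_dim)
eager = torch.softmax(q @ k.transpose(-2, -1) / math.sqrt(8), dim=-1) @ v
sdpa = torch.nn.functional.scaled_dot_product_attention(q, k, v)   # attn_mask=None, is_causal=False
assert torch.allclose(eager, sdpa, atol=1e-5)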
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + # -------------------------------------------------------- + # # 将Tensor转换为列表 + # ans_list = [tensor.tolist() for tensor in outputs] + # # 指定.json文件的路径 + # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/decoder_outputs.json' + + # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + # try: + # with open(file_path, 'r', encoding='utf-8') as file: + # results_list = json.load(file) + # except FileNotFoundError: + # results_list = [] + # # 将当前结果添加到列表中 + # results_list.append(ans_list) + # # 将更新后的列表写回.json文件 + # with open(file_path, 'w', encoding='utf-8') as file: + # json.dump(results_list, file, ensure_ascii=False, indent=4) + # file.write('\n') # 在文件末尾添加一个换行符 + # -------------------------------------------------------- + + return outputs + + +MISTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MistralConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class Mistral2PreTrainedModel(PreTrainedModel): + config_class = Mistral2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
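A sketch of the "legacy" cache format described above, with invented sizes; the `DynamicCache` class used later in `Mistral2Model.forward` converts to and from it:

import torch
from transformers.cache_utils import DynamicCache

num_layers, batch, kv_heads, seq, head_dim = 2, 1, 2, 5, 8   # invented sizes
legacy = tuple(
    (torch.zeros(batch, kv_heads, seq, head_dim), torch.zeros(batch, kv_heads, seq, head_dim))
    for _ in range(num_layers)
)
cache = DynamicCache.from_legacy_cache(legacy)
assert cache.to_legacy_cache()[0][0].shape == (batch, kv_heads, seq, head_dim)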
+""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class Mistral2Model(nn.Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + def __init__(self, config: CollieConfig): + # super().__init__(config) + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + # self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + + # -------------------------------------------------------- + # # 将Tensor转换为列表 + # ans_list = inputs_embeds.tolist() + # # 指定.json文件的路径 + # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/inputs_embeds.json' + + # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + # try: + # with open(file_path, 'r', encoding='utf-8') as file: + # results_list = json.load(file) + # except FileNotFoundError: + # results_list = [] + # # 将当前结果添加到列表中 + # results_list.append(ans_list) + # # 将更新后的列表写回.json文件 + # with open(file_path, 'w', encoding='utf-8') as file: + # json.dump(results_list, file, ensure_ascii=False, indent=4) + # file.write('\n') # 在文件末尾添加一个换行符 + # # -------------------------------------------------------- + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] 
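The `_prepare_4d_causal_attention_mask(..., sliding_window=...)` call above produces an additive causal mask restricted to a window; a pure-torch sketch of that idea (invented sizes, and not the transformers utility itself, which also folds in padding and past length):

import torch

seq_len, window = 6, 3
i = torch.arange(seq_len).unsqueeze(-1)
j = torch.arange(seq_len)
visible = (j <= i) & (j > i - window)                        # causal, limited to roughly the last `window` positions
additive_mask = torch.where(visible, 0.0, float("-inf"))     # (seq_len, seq_len), added to attention scores
print(additive_mask)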
+ + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Mistral2ForCausalLM(CollieModelForCausalLM): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config:CollieConfig): + super().__init__(config) + self.model = Mistral2Model(config) + self.vocab_size = config.vocab_size + # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.lm_head = ColumnParallelLinearWithoutBias( + self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False + ) + # Initialize weights and apply final processing + # self.post_init() + # GenerationMixin 需要的额外参数 + self.config.is_decoder = True + if config.model_config.tie_word_embeddings: + self.lm_head.weight = self.embed_tokens.weight + self.main_input_name = "input_ids" + + def clean_cache(self): + self._clean_hidden_states([*self.model.layers, self.lm_head]) + self._set_use_cache(self.model.layers, False) + + def set_cache(self, use_cache): + self._set_use_cache(self.model.layers, use_cache) + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" 
+ >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
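The loss above uses the standard next-token shift; a tiny self-contained version of that computation (vocab size and shapes invented):

import torch
import torch.nn.functional as F

vocab = 11
logits = torch.randn(1, 5, vocab)                       # (batch, seq, vocab)
labels = torch.randint(0, vocab, (1, 5))
shift_logits = logits[..., :-1, :].reshape(-1, vocab)   # position t predicts token t + 1
shift_labels = labels[..., 1:].reshape(-1)
loss = F.cross_entropy(shift_logits, shift_labels)      # equivalent to CrossEntropyLoss()(shift_logits, shift_labels)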
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. + :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + output = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + else: + output = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + + return [("model", Mistral2Model.pipeline_layers(config)), ("lm_head", output)] + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + protocol: str = "file", # 指定加载state_dict时使用的协议 + **kwargs, + ): + """ + Load state_dict from ``path``. + The format of pretrained model should be the same as that of + `huggingface`. + :return: state_dict. Note that the state_dict should be processed + properly to match the current rank. 
+ """ + # 配置加载 + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + # IO驱动初始化 + io_driver = IODriver.from_protocol(protocol) + # 检查文件路径是否存在 + if not io_driver.exists(path): + raise FileNotFoundError(f"folder {path} not found.") + # 初始化存储和处理变量 + state_dict = OrderedDict() + weights = [] + parts = None # 变量用于存储模型分割的部分信息 + # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 + hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 + rank_order = range(dist.get_world_size()) + else: + # 不开启只进行一次循环 + rank_order = range(1) + # 权重文件加载和处理 + for rank in rank_order: + # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 + if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: + # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 + if env.is_pipeline: + # 保存的是 json 格式 + parts = env.pipeline_parts + if hasattr(config, "num_key_value_heads"): + # llama2 (transformers >= 4.31.0) + num_key_value_heads = config.num_key_value_heads + else: + num_key_value_heads = config.num_attention_heads + head_dim = config.hidden_size // config.num_attention_heads + # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 + if ( + io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) + and "COLLIE_PP_PARTS" in os.environ.keys() + ): + weight_map = json.loads( + io_driver.load( + os.path.join(path, "pytorch_model.bin.index.json"), mode="r" + ) + )["weight_map"] + # layers 表示自己需要的层 + layers = env.pipeline_layers_idx + # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) + weights.extend( + [ + value + for key, value in weight_map.items() + if len(key.split(".")) > 2 + and key.split(".")[2].isdigit() + and (int(key.split(".")[2]) + 1) in layers + ] + ) + # 去重 + weights = list(set(weights)) + # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm + if 0 in layers: + weights.append(weight_map["model.tok_embeddings.weight"]) + if max(parts) - 1 in layers: + weights.append(weight_map["output.weight"]) + if max(parts) - 2 in layers: + weights.append(weight_map["model.norm.weight"]) + else: + # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 + weights = [ + weight + for weight in io_driver.list(path) + if weight.endswith(".bin") + ] + with progress( + weights, + desc="Loading state dict", + total=len(weights), + disable=hide_progress, + ) as pbar: + for weight in pbar: + part_state_dict = io_driver.load( + os.path.join(path, weight), mode="rb" + ) + # for key in list(part_state_dict.keys()): + # if "attention.wqkv.weight" in key: + # # qkv_weights = part_state_dict.pop(key) + # qkv_weights = part_state_dict[key] + # print(qkv_weights.shape) + # (wq, wk, wv) = qkv_weights.split( + # [ + # config.hidden_size, + # config.num_key_value_heads * head_dim, + # config.num_key_value_heads * head_dim, + # ], + # dim=0, + # ) + # wq_name = key.replace("wqkv", "wq") + # wk_name = key.replace("wqkv", "wk") + # wv_name = key.replace("wqkv", "wv") + # part_state_dict[wq_name] = wq + # part_state_dict[wk_name] = wk + # part_state_dict[wv_name] = wv + state_dict.update(part_state_dict) + del part_state_dict + if parts is not None: + # 这一步是 pp 的复筛 + layers = env.pipeline_layers_idx + for key in list(state_dict.keys()): + if key.startswith("layers"): + layer = int(key.split(".")[1]) + if layer + 1 not in layers: + state_dict.pop(key) + # if key.endswith("tok_embeddings.weight"): + if 
key.endswith("embed_tokens.weight"): + if 0 not in layers: + state_dict.pop(key) + if key == "norm.weight": + if max(parts) - 2 not in layers: + state_dict.pop(key) + # if key.endswith("output.weight"): + if key.endswith("lm_head.weight"): + if max(parts) - 1 not in layers: + state_dict.pop(key) + # 根据用户配置的新的 tp size 进行分割 + for key in list(state_dict.keys()): + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + "embed_tokens.weight", + ] + col_split = any([key.endswith(filter) for filter in col_filter]) + + if col_split: + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + if dist.is_initialized() and process_exclusion: + # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 + dist.barrier() + return state_dict + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + protocol: str = "file", + ): + """ + Save state_dict to ``path``. + The format of saved state dict should be the same as that of + `huggingface`. + """ + io_driver = IODriver.from_protocol(protocol) + # gather to tp rank 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 pp_size 次循环 + rank_order = range(config.pp_size) + else: + # 不开启只进行一次循环 + rank_order = range(1) + dst = parallel_state.get_tensor_model_parallel_src_rank() + with progress( + rank_order, + desc="Saving model", + disable=int(os.environ.get("RANK", "0")) != 0, + ) as pbar: + for rank in pbar: + if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): + for key in sorted(list(state_dict.keys())): + tensor_list = None + if env.tp_rank == 0: + tensor_list = [ + torch.zeros_like(state_dict[key]) + .to(state_dict[key].dtype) + .cuda() + for _ in range(config.tp_size) + ] + dist.gather( + state_dict[key].cuda(), + dst=dst, + gather_list=tensor_list, + group=env.tp_group, + ) + if env.tp_rank == 0: + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + "embed_tokens.weight", + ] + col_split = any( + [key.endswith(filter) for filter in col_filter] + ) + + if col_split: + state_dict[key] = concat_tensor(tensor_list, dim=0) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + state_dict[key] = concat_tensor(tensor_list, dim=1) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + # 似乎不需要? 
+ # state_dict_keys = state_dict.keys() + # for layer_id in range(config.num_layers): + # qkv_names = [None, None, None] + # for key in state_dict_keys: + # if f"layers.{layer_id}.attention.wq.weight" in key: + # qkv_names[0] = key + # elif f"layers.{layer_id}.attention.wk.weight" in key: + # qkv_names[1] = key + # elif f"layers.{layer_id}.attention.wv.weight" in key: + # qkv_names[2] = key + # qkv_name = qkv_names[0].replace("wq", "wqkv") + # state_dict[qkv_name] = torch.cat( + # [ + # state_dict.pop(qkv_names[0]), + # state_dict.pop(qkv_names[1]), + # state_dict.pop(qkv_names[2]), + # ], + # dim=0 + # ) + + if env.tp_rank == 0: + # Save gathered weights + if env.is_pipeline: + ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" + total_size = 0 + weight_map = {} + for name, weight in state_dict.items(): + weight_size = weight.numel() * dtype_byte_size( + weight.dtype + ) + weight_map[name] = ckpt_name + total_size += weight_size + index_dict = dict( + total_size=total_size, weight_map=weight_map + ) + index_dicts = [None for _ in range(env.pp_size)] + dist.gather_object( + index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group + ) + if env.pp_rank == 0: + total_size = 0 + weight_map = {} + for _index_dict in index_dicts: + total_size += _index_dict["total_size"] + weight_map.update(_index_dict["weight_map"]) + merged_dict = { + "metadata": {"total_size": total_size}, + "weight_map": weight_map, + } + io_driver.save( + json.dumps(merged_dict, indent=2, sort_keys=True) + + "\n", + os.path.join(path, "pytorch_model.bin.index.json"), + ) + + else: + ckpt_name = f"pytorch_model.bin" + ckpt_path = os.path.join(path, ckpt_name) + io_driver.save(state_dict, ckpt_path) + if dist.is_initialized() and process_exclusion: + dist.barrier() + if env.rank == 0: + config.save_pretrained(path, protocol=protocol) + dist.barrier() + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
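The index file assembled above follows the usual sharded-checkpoint layout; roughly (file names and sizes invented):

index_json = {
    "metadata": {"total_size": 14_483_464_192},  # sum of weight sizes across all shards, in bytes
    "weight_map": {
        "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
        "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
        "model.norm.weight": "pytorch_model-00002-of-00002.bin",
        "lm_head.weight": "pytorch_model-00002-of-00002.bin",
    },
}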
+ """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForSequenceClassification(Mistral2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Mistral2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + 
loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/tests/models/mistral2/modelpp.py b/tests/models/mistral2/modelpp.py new file mode 100644 index 0000000..1180a10 --- /dev/null +++ b/tests/models/mistral2/modelpp.py @@ -0,0 +1,1922 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Mistral model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel, dtype_byte_size +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_mistraltp import Mistral2Config + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "Mistral2Config" + +#modified for collie +import torch.distributed as dist +import gc +import json +import os +from collections import OrderedDict +from megatron.core import parallel_state, tensor_parallel +from einops import rearrange +from deepspeed.pipe import LayerSpec, TiedLayerSpec + +from collie.config import CollieConfig +from collie.driver.io import IODriver +from collie.log.logger import logger +from collie.module import ( + 
ColumnParallelLinearWithoutBias, + ColumnParallelLMHead, + RowParallelLinearWithoutBias, +) +from collie.utils import concat_tensor, dict_as_params, env, progress +from collie.models.base import CollieModelForCausalLM +from collie.models.utils import ( + kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, + kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, +) + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral +class Mistral2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + MistralRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class Mistral2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
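+        # The cos/sin tables are precomputed once for positions [0, max_position_embeddings) and are
+        # lazily regrown by forward() when a longer sequence is seen. Shapes, for reference:
+        #   inv_freq:   (dim // 2,)
+        #   cos_cached: (max_seq_len_cached, dim)   # emb = cat((freqs, freqs), dim=-1)
+        #   sin_cached: (max_seq_len_cached, dim)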
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +# TODO @Arthur no longer copied from LLama after static cache +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
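+
+    Example (an illustrative sketch; `rope` is assumed to be a `Mistral2RotaryEmbedding(64)`
+    instance and is not part of this function):
+
+    ```python
+    q = torch.randn(1, 4, 10, 64)  # (batch, num_heads, seq_len, head_dim)
+    k = torch.randn(1, 4, 10, 64)
+    cos, sin = rope(q, seq_len=10)                # each of shape (10, 64)
+    position_ids = torch.arange(10).unsqueeze(0)  # (1, 10)
+    q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
+    ```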
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Mistral2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + self.up_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.gate_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.down_proj = RowParallelLinearWithoutBias( + self.intermediate_size, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Mistral2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.q_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.k_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.v_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.o_proj = RowParallelLinearWithoutBias( + self.num_heads * self.head_dim, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.rotary_emb = Mistral2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads_tp, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads_tp, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads_tp, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads_tp, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class Mistral2FlashAttention2(Mistral2Attention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
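+        Two code paths are used below: `flash_attn_varlen_func` on unpadded (packed) sequences when an
+        `attention_mask` is provided, and the dense `flash_attn_func` otherwise; in both cases attention
+        can additionally be restricted to a window of `config.sliding_window` past tokens.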
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class Mistral2SdpaAttention(Mistral2Attention): + """ + Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MistralAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
+ is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.tp_size)) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +MISTRAL_ATTENTION_CLASSES = { + "eager": Mistral2Attention, + "flash_attention_2": Mistral2FlashAttention2, + "sdpa": Mistral2SdpaAttention, +} + + +class MistralDecoderLayer(nn.Module): + def __init__(self, config: CollieConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + + self.mlp = Mistral2MLP(config) + self.input_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +MISTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. 
+ + Parameters: + config ([`MistralConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class Mistral2PreTrainedModel(PreTrainedModel): + config_class = Mistral2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. 
+ + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class Mistral2Model(nn.Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + def __init__(self, config: CollieConfig): + # super().__init__(config) + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + # self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if 
input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
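+            # For SDPA the 2-D padding mask is expanded into a 4-D causal mask here; the sliding window
+            # is only folded into the mask in the eager fallback below.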
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Mistral2ForCausalLM(CollieModelForCausalLM): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config:CollieConfig): + super().__init__(config) + self.model = Mistral2Model(config) + self.vocab_size = config.vocab_size + # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.lm_head = ColumnParallelLinearWithoutBias( + self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False + ) + # Initialize weights and apply final processing + # self.post_init() + # GenerationMixin 需要的额外参数 + self.config.is_decoder = True + if config.model_config.tie_word_embeddings: + self.lm_head.weight = self.embed_tokens.weight + self.main_input_name = "input_ids" + + def clean_cache(self): + self._clean_hidden_states([*self.model.layers, self.lm_head]) + self._set_use_cache(self.model.layers, False) + + def set_cache(self, use_cache): + self._set_use_cache(self.model.layers, use_cache) + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = 
None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = 
past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. + :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + output = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + else: + output = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + + return [("model", Mistral2Model.pipeline_layers(config)), ("lm_head", output)] + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + protocol: str = "file", # 指定加载state_dict时使用的协议 + **kwargs, + ): + """ + Load state_dict from ``path``. + The format of pretrained model should be the same as that of + `huggingface`. + :return: state_dict. Note that the state_dict should be processed + properly to match the current rank. 
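+
+        A minimal usage sketch (the checkpoint path is a placeholder; a distributed
+        environment already initialized by CoLLiE is assumed):
+
+        ```python
+        config = CollieConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
+        config.tp_size, config.pp_size = 2, 1
+        # Returns only the shard of the checkpoint that the current tp/pp rank needs.
+        state_dict = Mistral2ForCausalLM.load_parallel_state_dict(
+            path="/path/to/hf_checkpoint", config=config
+        )
+        ```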
+ """ + # 配置加载 + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + # IO驱动初始化 + io_driver = IODriver.from_protocol(protocol) + # 检查文件路径是否存在 + if not io_driver.exists(path): + raise FileNotFoundError(f"folder {path} not found.") + # 初始化存储和处理变量 + state_dict = OrderedDict() + weights = [] + parts = None # 变量用于存储模型分割的部分信息 + # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 + hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 + rank_order = range(dist.get_world_size()) + else: + # 不开启只进行一次循环 + rank_order = range(1) + # 权重文件加载和处理 + for rank in rank_order: + # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 + if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: + # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 + if env.is_pipeline: + # 保存的是 json 格式 + parts = env.pipeline_parts + if hasattr(config, "num_key_value_heads"): + # llama2 (transformers >= 4.31.0) + num_key_value_heads = config.num_key_value_heads + else: + num_key_value_heads = config.num_attention_heads + head_dim = config.hidden_size // config.num_attention_heads + # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 + if ( + io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) + and "COLLIE_PP_PARTS" in os.environ.keys() + ): + weight_map = json.loads( + io_driver.load( + os.path.join(path, "pytorch_model.bin.index.json"), mode="r" + ) + )["weight_map"] + # layers 表示自己需要的层 + layers = env.pipeline_layers_idx + # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) + weights.extend( + [ + value + for key, value in weight_map.items() + if len(key.split(".")) > 2 + and key.split(".")[2].isdigit() + and (int(key.split(".")[2]) + 1) in layers + ] + ) + # 去重 + weights = list(set(weights)) + # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm + if 0 in layers: + weights.append(weight_map["model.tok_embeddings.weight"]) + if max(parts) - 1 in layers: + weights.append(weight_map["output.weight"]) + if max(parts) - 2 in layers: + weights.append(weight_map["model.norm.weight"]) + else: + # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 + weights = [ + weight + for weight in io_driver.list(path) + if weight.endswith(".bin") + ] + with progress( + weights, + desc="Loading state dict", + total=len(weights), + disable=hide_progress, + ) as pbar: + for weight in pbar: + part_state_dict = io_driver.load( + os.path.join(path, weight), mode="rb" + ) + # for key in list(part_state_dict.keys()): + # if "attention.wqkv.weight" in key: + # # qkv_weights = part_state_dict.pop(key) + # qkv_weights = part_state_dict[key] + # print(qkv_weights.shape) + # (wq, wk, wv) = qkv_weights.split( + # [ + # config.hidden_size, + # config.num_key_value_heads * head_dim, + # config.num_key_value_heads * head_dim, + # ], + # dim=0, + # ) + # wq_name = key.replace("wqkv", "wq") + # wk_name = key.replace("wqkv", "wk") + # wv_name = key.replace("wqkv", "wv") + # part_state_dict[wq_name] = wq + # part_state_dict[wk_name] = wk + # part_state_dict[wv_name] = wv + state_dict.update(part_state_dict) + del part_state_dict + if parts is not None: + # 这一步是 pp 的复筛 + layers = env.pipeline_layers_idx + for key in list(state_dict.keys()): + if key.startswith("layers"): + layer = int(key.split(".")[1]) + if layer + 1 not in layers: + state_dict.pop(key) + # if key.endswith("tok_embeddings.weight"): + if 
key.endswith("embed_tokens.weight"): + if 0 not in layers: + state_dict.pop(key) + if key == "norm.weight": + if max(parts) - 2 not in layers: + state_dict.pop(key) + # if key.endswith("output.weight"): + if key.endswith("lm_head.weight"): + if max(parts) - 1 not in layers: + state_dict.pop(key) + # 根据用户配置的新的 tp size 进行分割 + for key in list(state_dict.keys()): + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + "embed_tokens.weight", + ] + col_split = any([key.endswith(filter) for filter in col_filter]) + + if col_split: + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + if dist.is_initialized() and process_exclusion: + # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 + dist.barrier() + return state_dict + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + protocol: str = "file", + ): + """ + Save state_dict to ``path``. + The format of saved state dict should be the same as that of + `huggingface`. + """ + io_driver = IODriver.from_protocol(protocol) + # gather to tp rank 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 pp_size 次循环 + rank_order = range(config.pp_size) + else: + # 不开启只进行一次循环 + rank_order = range(1) + dst = parallel_state.get_tensor_model_parallel_src_rank() + with progress( + rank_order, + desc="Saving model", + disable=int(os.environ.get("RANK", "0")) != 0, + ) as pbar: + for rank in pbar: + if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): + for key in sorted(list(state_dict.keys())): + tensor_list = None + if env.tp_rank == 0: + tensor_list = [ + torch.zeros_like(state_dict[key]) + .to(state_dict[key].dtype) + .cuda() + for _ in range(config.tp_size) + ] + dist.gather( + state_dict[key].cuda(), + dst=dst, + gather_list=tensor_list, + group=env.tp_group, + ) + if env.tp_rank == 0: + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + "embed_tokens.weight", + ] + col_split = any( + [key.endswith(filter) for filter in col_filter] + ) + + if col_split: + state_dict[key] = concat_tensor(tensor_list, dim=0) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + state_dict[key] = concat_tensor(tensor_list, dim=1) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + # 似乎不需要? 
+ # state_dict_keys = state_dict.keys() + # for layer_id in range(config.num_layers): + # qkv_names = [None, None, None] + # for key in state_dict_keys: + # if f"layers.{layer_id}.attention.wq.weight" in key: + # qkv_names[0] = key + # elif f"layers.{layer_id}.attention.wk.weight" in key: + # qkv_names[1] = key + # elif f"layers.{layer_id}.attention.wv.weight" in key: + # qkv_names[2] = key + # qkv_name = qkv_names[0].replace("wq", "wqkv") + # state_dict[qkv_name] = torch.cat( + # [ + # state_dict.pop(qkv_names[0]), + # state_dict.pop(qkv_names[1]), + # state_dict.pop(qkv_names[2]), + # ], + # dim=0 + # ) + + if env.tp_rank == 0: + # Save gathered weights + if env.is_pipeline: + ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" + total_size = 0 + weight_map = {} + for name, weight in state_dict.items(): + weight_size = weight.numel() * dtype_byte_size( + weight.dtype + ) + weight_map[name] = ckpt_name + total_size += weight_size + index_dict = dict( + total_size=total_size, weight_map=weight_map + ) + index_dicts = [None for _ in range(env.pp_size)] + dist.gather_object( + index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group + ) + if env.pp_rank == 0: + total_size = 0 + weight_map = {} + for _index_dict in index_dicts: + total_size += _index_dict["total_size"] + weight_map.update(_index_dict["weight_map"]) + merged_dict = { + "metadata": {"total_size": total_size}, + "weight_map": weight_map, + } + io_driver.save( + json.dumps(merged_dict, indent=2, sort_keys=True) + + "\n", + os.path.join(path, "pytorch_model.bin.index.json"), + ) + + else: + ckpt_name = f"pytorch_model.bin" + ckpt_path = os.path.join(path, ckpt_name) + io_driver.save(state_dict, ckpt_path) + if dist.is_initialized() and process_exclusion: + dist.barrier() + if env.rank == 0: + config.save_pretrained(path, protocol=protocol) + dist.barrier() + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForSequenceClassification(Mistral2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Mistral2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + 
loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/tests/models/mistral2/modeltp.py b/tests/models/mistral2/modeltp.py new file mode 100644 index 0000000..e91037f --- /dev/null +++ b/tests/models/mistral2/modeltp.py @@ -0,0 +1,2254 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Mistral model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel, dtype_byte_size +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_mistraltp import MistralConfig + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MistralConfig" + +#modified for collie +import torch.distributed as dist +import gc +import json +import os +from collections import OrderedDict +from megatron.core import parallel_state, tensor_parallel +from einops import rearrange +from deepspeed.pipe import LayerSpec, TiedLayerSpec + +from collie.config import CollieConfig +from collie.driver.io import IODriver +from collie.log.logger import logger +from collie.module import ( + 
    ColumnParallelLinearWithoutBias,
+    ColumnParallelLMHead,
+    RowParallelLinearWithoutBias,
+)
+from collie.utils import concat_tensor, dict_as_params, env, progress
+from collie.models.base import CollieModelForCausalLM
+from collie.models.utils import (
+    kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer,
+    kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model,
+)
+
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral
+class MistralRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        MistralRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        ans = self.weight * hidden_states.to(input_dtype)
+
+        # Dump the layer-norm output for debugging
+        hidden_states_output = ans.detach().cpu().tolist()
+        data_to_save = {"Layer Norm Output": hidden_states_output}
+        # Write the output to a JSON file
+        with open('a_rms_output.json', 'w') as f:
+            json.dump(data_to_save, f, indent=4)
+
+        return ans
+
+
+# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral
+# TODO @Arthur no longer copied from LLama after static cache
+class MistralRotaryEmbedding(nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
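+        # Shape note: `inv_freq` holds dim // 2 inverse frequencies (base ** (-2 * i / dim));
+        # `_set_cos_sin_cache` expands them into `cos_cached` / `sin_cached` buffers of shape
+        # [max_position_embeddings, dim] by concatenating the frequencies with themselves
+        # along the last axis (`emb = torch.cat((freqs, freqs), dim=-1)` below).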
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +# TODO @Arthur no longer copied from LLama after static cache +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
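+
+    Example (an illustrative sketch only; the batch size, head count and `head_dim` below are
+    arbitrary and not tied to any particular checkpoint):
+
+    ```python
+    >>> import torch
+    >>> bsz, num_heads, seq_len, head_dim = 1, 8, 16, 64
+    >>> q = torch.randn(bsz, num_heads, seq_len, head_dim)
+    >>> k = torch.randn(bsz, num_heads, seq_len, head_dim)
+    >>> rotary_emb = MistralRotaryEmbedding(head_dim, max_position_embeddings=seq_len)
+    >>> cos, sin = rotary_emb(q, seq_len=seq_len)
+    >>> position_ids = torch.arange(seq_len).unsqueeze(0)
+    >>> q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
+    >>> q_rot.shape
+    torch.Size([1, 8, 16, 64])
+    ```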
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class MistralMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + self.up_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.gate_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.down_proj = RowParallelLinearWithoutBias( + self.intermediate_size, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + # 打印MLP层输出 + mlp_output = output.detach().cpu().tolist() + data_to_save = {"MLP Output": mlp_output} + # 将输出写入 JSON 文件 + with open('a_mlp_output.json', 'w') as f: + json.dump(data_to_save, f, indent=4) + + return output + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class MistralAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.q_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.k_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.v_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + # aaaa + self.o_proj = RowParallelLinearWithoutBias( + self.num_heads * self.head_dim, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.rotary_emb = MistralRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, # 输入维度 [bsz, q_len, hidden_size] + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure to use `attention_mask` instead."
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)  # [bsz, q_len, num_heads * head_dim]
+        key_states = self.k_proj(hidden_states)  # [bsz, q_len, num_key_value_heads * head_dim]
+        value_states = self.v_proj(hidden_states)  # [bsz, q_len, num_key_value_heads * head_dim]
+
+        # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        query_states, key_states, value_states = (
+            rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim),  # [bsz, q_len, num_heads, head_dim]
+            rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim),  # [bsz, q_len, num_key_value_heads, head_dim]
+            rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim),  # [bsz, q_len, num_key_value_heads, head_dim]
+        )
+
+        query_states = query_states.transpose(1, 2)  # [bsz, num_heads, q_len, head_dim]
+        key_states = key_states.transpose(1, 2)  # [bsz, num_key_value_heads, q_len, head_dim]
+        value_states = value_states.transpose(1, 2)  # [bsz, num_key_value_heads, q_len, head_dim]
+
+        # Dump the projected Q/K/V of the attention module for debugging
+        # Prepare the data to be written to a JSON file
+        attention_outputs = {
+            "Query states": query_states.detach().cpu().tolist(),
+            "Key states": key_states.detach().cpu().tolist(),
+            "Value states": value_states.detach().cpu().tolist()
+        }
+        # Write the data to a JSON file
+        with open("a_attention_outputs.json", "w") as f:
+            json.dump(attention_outputs, f, indent=4)
+
+        if self.config.pp_size > 1:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)) + + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # 打印注意力模块的输出 + attention_result = { + "Output weights:": attn_output.detach().cpu().tolist(), + # "Attention weights:": attn_weights.detach().cpu().tolist(), + } + # 将数据写入 JSON 文件 + with open("a_attention_outputs.json", "w") as f: + json.dump(attention_result, f, indent=4) + + return attn_output, attn_weights, past_key_value + + +class MistralFlashAttention2(MistralAttention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + # 打印注意力模块的输出 + # 准备数据以写入 JSON 文件 + attention_outputs = { + "Query states": query_states.detach().cpu().tolist(), + "Key states": key_states.detach().cpu().tolist(), + "Value states": value_states.detach().cpu().tolist() + } + # 将数据写入 JSON 文件 + with open("a_flash_attention_outputs.json", "w") as f: + json.dump(attention_outputs, f, indent=4) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # 打印注意力模块的输出 + attention_result = { + "Output weights:": attn_output.detach().cpu().tolist(), + # "Attention weights:": attn_weights.detach().cpu().tolist(), + } + # 将数据写入 JSON 文件 + with open("a_flash_attention_outputs.json", "w") as f: + json.dump(attention_result, f, indent=4) + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
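+            # e.g. with left padding, `attention_mask = [[0, 0, 1, 1, 1]]` and `query_length = 3`
+            # keep only the mask columns aligned with the query tokens: `[[1, 1, 1]]`.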
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class MistralSdpaAttention(MistralAttention): + """ + Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MistralAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + # 打印注意力模块的输出 + # 准备数据以写入 JSON 文件 + attention_outputs = { + "Query states": query_states.detach().cpu().tolist(), + "Key states": key_states.detach().cpu().tolist(), + "Value states": value_states.detach().cpu().tolist() + } + # 将数据写入 JSON 文件 + with open("a_sdpa_attention_outputs.json", "w") as f: + json.dump(attention_outputs, f, indent=4) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, 
key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+            is_causal=self.is_causal and attention_mask is None and q_len > 1,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.config.tp_size))
+
+        attn_output = self.o_proj(attn_output)
+
+        # Dump the attention module output for debugging
+        attention_result = {
+            "Output weights:": attn_output.detach().cpu().tolist(),
+            # "Attention weights:": attn_weights.detach().cpu().tolist(),
+        }
+        # Write the data to a JSON file
+        with open("a_sdpa_attention_outputs.json", "w") as f:
+            json.dump(attention_result, f, indent=4)
+
+        return attn_output, None, past_key_value
+
+
+MISTRAL_ATTENTION_CLASSES = {
+    "eager": MistralAttention,
+    "flash_attention_2": MistralFlashAttention2,
+    "sdpa": MistralSdpaAttention,
+}
+
+
+class MistralDecoderLayer(nn.Module):
+    def __init__(self, config: CollieConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        config._attn_implementation = "sdpa"
+        self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+        self.config = config
+        self.mlp = MistralMLP(config)
+        self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.idx = layer_idx
+        # Be sure to keep these attribute names unchanged
+        self.use_cache = self.config.model_config.use_cache
+        self.hidden_states = None
+        self.output_attentions = False
+
+    def _forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: 
Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + # output_attentions: Optional[bool] = False, + # use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + # if "padding_mask" in kwargs: + # warnings.warn( + # "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + # ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + # output_attentions=output_attentions, + # use_cache=use_cache, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + # outputs = (hidden_states,) + + # if output_attentions: + # outputs += (self_attn_weights,) + + # if use_cache: + # outputs += (present_key_value,) + + return hidden_states, present_key_value + + def forward(self, inputs: dict): + layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) + + if self.config.checkpointing and self.training: + hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( + self._forward, + inputs["hidden_states"], + inputs.get("attention_mask", None), + inputs.get("position_ids", None), + layer_past, # inputs.get("past_key_values", None), + ) + else: + hidden_states, new_layer_past = self._forward( + inputs["hidden_states"], + inputs.get("attention_mask", None), + inputs.get("position_ids", None), + layer_past + ) # **inputs + inputs["hidden_states"] = hidden_states + + inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) + return inputs + + + # def _forward( + # self, + # hidden_states: torch.Tensor, + # attention_mask: Optional[torch.Tensor] = None, + # position_ids: Optional[torch.LongTensor] = None, + # past_key_value: Optional[Tuple[torch.Tensor]] = None, + # # output_attentions: Optional[bool] = False, + # # use_cache: Optional[bool] = False, + # **kwargs, + # ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + # # if "padding_mask" in kwargs: + # # warnings.warn( + # # "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + # # ) + # """ + # Args: + # hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + # attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + # `(batch, sequence_length)` where padding elements are indicated by 0. + # output_attentions (`bool`, *optional*): + # Whether or not to return the attentions tensors of all attention layers. See `attentions` under + # returned tensors for more detail. + # use_cache (`bool`, *optional*): + # If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + # (see `past_key_values`). + # past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + # """ + + # residual = hidden_states + + # hidden_states = self.input_layernorm(hidden_states) + + # # Self Attention + # hidden_states, self_attn_weights, present_key_value = self.self_attn( + # hidden_states=hidden_states, + # attention_mask=attention_mask, + # position_ids=position_ids, + # past_key_value=past_key_value, + # # output_attentions=output_attentions, + # # use_cache=use_cache, + # **kwargs, + # ) + # hidden_states = residual + hidden_states + + # # Fully Connected + # residual = hidden_states + # hidden_states = self.post_attention_layernorm(hidden_states) + # hidden_states = self.mlp(hidden_states) + # hidden_states = residual + hidden_states + + # # outputs = (hidden_states,) + + # # if output_attentions: + # # outputs += (self_attn_weights,) + + # # if use_cache: + # # outputs += (present_key_value,) + + # return hidden_states, present_key_value + + # def forward(self, inputs: dict): + # layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) + + # if self.config.checkpointing and self.training: + # hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( + # self._forward, + # inputs["hidden_states"], + # inputs.get("attention_mask", None), + # inputs.get("position_ids", None), + # layer_past, # inputs.get("past_key_values", None), + # ) + # else: + # hidden_states, new_layer_past = self._forward( + # inputs["hidden_states"], + # inputs.get("attention_mask", None), + # inputs.get("position_ids", None), + # layer_past + # ) # **inputs + # inputs["hidden_states"] = hidden_states + + # inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) + # return inputs + + # def forward( + # self, + # hidden_states: torch.Tensor, + # attention_mask: Optional[torch.Tensor] = None, + # position_ids: Optional[torch.LongTensor] = None, + # past_key_value: Optional[Tuple[torch.Tensor]] = None, + # output_attentions: Optional[bool] = False, + # use_cache: Optional[bool] = False, + # **kwargs, + # ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + # if "padding_mask" in kwargs: + # warnings.warn( + # "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + # ) + # """ + # Args: + # hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + # attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + # `(batch, sequence_length)` where padding elements are indicated by 0. + # output_attentions (`bool`, *optional*): + # Whether or not to return the attentions tensors of all attention layers. See `attentions` under + # returned tensors for more detail. 
+ # use_cache (`bool`, *optional*): + # If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + # (see `past_key_values`). + # past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + # """ + + # residual = hidden_states + + # hidden_states = self.input_layernorm(hidden_states) + + # # Self Attention + # hidden_states, self_attn_weights, present_key_value = self.self_attn( + # hidden_states=hidden_states, + # attention_mask=attention_mask, + # position_ids=position_ids, + # past_key_value=past_key_value, + # output_attentions=output_attentions, + # use_cache=use_cache, + # **kwargs, + # ) + # hidden_states = residual + hidden_states + + # # Fully Connected + # residual = hidden_states + # hidden_states = self.post_attention_layernorm(hidden_states) + # hidden_states = self.mlp(hidden_states) + # hidden_states = residual + hidden_states + + # outputs = (hidden_states,) + + # if output_attentions: + # outputs += (self_attn_weights,) + + # if use_cache: + # outputs += (present_key_value,) + + # return outputs + + +MISTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MistralConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class MistralPreTrainedModel(PreTrainedModel): + config_class = MistralConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class MistralModel(nn.Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`MistralDecoderLayer`]
+
+    Args:
+        config: MistralConfig
+    """
+
+    def __init__(self, config: CollieConfig):
+        # super().__init__(config)
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        # vocab-parallel embedding replaces the dense nn.Embedding used upstream
+        # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.embed_tokens = tensor_parallel.VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size, params_dtype=torch.float32
+        )
+        self.layers = nn.ModuleList(
+            [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        # force the SDPA attention implementation
+        config._attn_implementation = "sdpa"
+        self._attn_implementation = config._attn_implementation
+        self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        # self.post_init()
+
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        # past_key_values is accepted in the legacy tuple format and converted to a Cache below
+        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        past_key_values_length = 0
+
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # Dump the embedding-layer output (debugging aid)
+        embeddings_output = inputs_embeds.detach().cpu().tolist()
+        data_to_save = {"Embeddings Output": embeddings_output}
+        # Write the output to a JSON file
+        with open('a_embeddings_output.json', 'w') as f:
+            json.dump(data_to_save, f, indent=4)
+
+        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
+            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
+            if is_padding_right:
+                raise ValueError(
+                    "You are attempting to perform batched generation with padding_side='right'"
+                    " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to "
+                    " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
+                )
+
+        if self._attn_implementation == "flash_attention_2":
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self._attn_implementation == "sdpa" and not output_attentions:
+            # output_attentions=True can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + inputs = { + "input_ids": input_ids, + "hidden_states": hidden_states, + "attention_mask": attention_mask, + "position_ids": position_ids, + "past_key_values": past_key_values, + "output_attentions": output_attentions, + "use_cache": use_cache, + } + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + # for decoder_layer in self.layers: + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + # all_hidden_states += (hidden_states,) + all_hidden_states += (inputs["hidden_states"],) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + # hidden_states, + # attention_mask, + # position_ids, + # past_key_values, + # output_attentions, + # use_cache, + inputs, + ) + else: + layer_outputs = decoder_layer( + # hidden_states, + # attention_mask=attention_mask, + # position_ids=position_ids, + # past_key_value=past_key_values, + # output_attentions=output_attentions, + # use_cache=use_cache, + inputs, + ) + inputs.update(layer_outputs) + + # hidden_states = layer_outputs[0] + hidden_states = inputs["hidden_states"] + + if use_cache: + # next_decoder_cache = layer_outputs[2 if output_attentions else 1] + next_decoder_cache = inputs["addition_info"][1 if output_attentions else 0] + + if output_attentions: + # all_self_attns += (layer_outputs[1],) + all_self_attns += (inputs["addition_info"][0],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + # past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + past_key_values=past_key_values, + ) + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. 
+ :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + embed_tokens = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="input_ids", output_keys="hidden_states"), + tensor_parallel.VocabParallelEmbedding, + config.vocab_size, + config.hidden_size, + ) + else: + embed_tokens = LayerSpec( + dict_as_params(input_keys="input_ids", output_keys="hidden_states"), + tensor_parallel.VocabParallelEmbedding, + config.vocab_size, + config.hidden_size, + ) + + layers = [ + LayerSpec(MistralDecoderLayer, config, i) for i in range(config.num_hidden_layers) + ] + norm = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="hidden_states"), + MistralRMSNorm, + hidden_size=config.hidden_size, + eps=config.rms_norm_eps, + ) + + return [ + ("embed_tokens", embed_tokens), + ("layers", layers), + ("norm", norm), + ] + +class MistralForCausalLM(CollieModelForCausalLM): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config:CollieConfig): + super().__init__(config) + self.model = MistralModel(config) + self.vocab_size = config.vocab_size + # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.lm_head = ColumnParallelLinearWithoutBias( + self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False + ) + # Initialize weights and apply final processing + # self.post_init() + # GenerationMixin 需要的额外参数 + self.config.is_decoder = True + if config.model_config.tie_word_embeddings: + self.lm_head.weight = self.embed_tokens.weight + self.main_input_name = "input_ids" + + def clean_cache(self): + self._clean_hidden_states([*self.model.layers, self.lm_head]) + self._set_use_cache(self.model.layers, False) + + def set_cache(self, use_cache): + self._set_use_cache(self.model.layers, use_cache) + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. 
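The three numbered cases above decide how much of `input_ids` is re-fed at each generation step. A minimal standalone sketch of the same slicing arithmetic, with hypothetical toy shapes and a helper name that are not part of this patch, might look like this:

```python
import torch

def trim_input_ids(input_ids: torch.Tensor, attention_mask: torch.Tensor, past_length: int) -> torch.Tensor:
    # Case 1: part of the prompt lives only in the cache (e.g. it was fed as inputs_embeds),
    # so the attention mask is longer than input_ids and only the uncovered tail is kept.
    if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
        return input_ids[:, -(attention_mask.shape[1] - past_length):]
    # Case 2: input_ids holds the full sequence; drop the prefix already covered by the cache.
    if past_length < input_ids.shape[1]:
        return input_ids[:, past_length:]
    # Case 3: everything in input_ids is new (typical single-token decoding step).
    return input_ids

input_ids = torch.arange(10).unsqueeze(0)              # (1, 10): a 10-token prompt
attention_mask = torch.ones(1, 10, dtype=torch.long)   # no padding
print(trim_input_ids(input_ids, attention_mask, past_length=7))  # tensor([[7, 8, 9]])
```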
+ + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. + :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + output = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + else: + output = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + + return [("model", MistralModel.pipeline_layers(config)), ("lm_head", output)] + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + protocol: str = "file", # 指定加载state_dict时使用的协议 + **kwargs, + ): + """ + Load state_dict from ``path``. + The format of pretrained model should be the same as that of + `huggingface`. + :return: state_dict. Note that the state_dict should be processed + properly to match the current rank. 
+        """
+        # Load the configuration
+        if isinstance(config, str):
+            config = CollieConfig.from_pretrained(config)
+        # Initialize the IO driver
+        io_driver = IODriver.from_protocol(protocol)
+        # Check that the path exists
+        if not io_driver.exists(path):
+            raise FileNotFoundError(f"folder {path} not found.")
+        # Initialize containers and bookkeeping variables
+        state_dict = OrderedDict()
+        weights = []
+        parts = None  # holds the pipeline partition boundaries of the model
+        # If process exclusion is enabled, every process shows a progress bar; otherwise only RANK 0 does
+        hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0
+        if dist.is_initialized() and process_exclusion:
+            # With process exclusion enabled, loop dist.get_world_size() times
+            rank_order = range(dist.get_world_size())
+        else:
+            # Without it, a single pass is enough
+            rank_order = range(1)
+        # Load and process the weight files
+        for rank in rank_order:
+            # With process exclusion, only the matching RANK may enter this iteration; without it, every rank may
+            if int(os.environ.get("RANK", "0")) == rank or not process_exclusion:
+                # The pp partition is stored in os.environ["COLLIE_PP_PARTS"], e.g. [0, 17, 35], left-closed right-open
+                if env.is_pipeline:
+                    # stored in json format
+                    parts = env.pipeline_parts
+                if hasattr(config, "num_key_value_heads"):
+                    # llama2 (transformers >= 4.31.0)
+                    num_key_value_heads = config.num_key_value_heads
+                else:
+                    num_key_value_heads = config.num_attention_heads
+                head_dim = config.hidden_size // config.num_attention_heads
+                # If pytorch_model.bin.index.json exists, each pp process can load only the weights it needs
+                if (
+                    io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json"))
+                    and "COLLIE_PP_PARTS" in os.environ.keys()
+                ):
+                    weight_map = json.loads(
+                        io_driver.load(
+                            os.path.join(path, "pytorch_model.bin.index.json"), mode="r"
+                        )
+                    )["weight_map"]
+                    # layers holds the layer indices this rank needs
+                    layers = env.pipeline_layers_idx
+                    # Keep keys shaped like model.layers.0; two conditions: 1. the key carries a layer number;
+                    # 2. that number plus one is in layers (the embedding occupies the very first pipeline layer)
+                    weights.extend(
+                        [
+                            value
+                            for key, value in weight_map.items()
+                            if len(key.split(".")) > 2
+                            and key.split(".")[2].isdigit()
+                            and (int(key.split(".")[2]) + 1) in layers
+                        ]
+                    )
+                    # Deduplicate
+                    weights = list(set(weights))
+                    # Further filtering: layer 0 also needs the embedding, the last layer needs lm_head,
+                    # and the second-to-last layer needs the final norm
+                    if 0 in layers:
+                        weights.append(weight_map["model.embed_tokens.weight"])
+                    if max(parts) - 1 in layers:
+                        weights.append(weight_map["lm_head.weight"])
+                    if max(parts) - 2 in layers:
+                        weights.append(weight_map["model.norm.weight"])
+                else:
+                    # Without pytorch_model.bin.index.json, load every weight file
+                    weights = [
+                        weight
+                        for weight in io_driver.list(path)
+                        if weight.endswith(".bin")
+                    ]
+                with progress(
+                    weights,
+                    desc="Loading state dict",
+                    total=len(weights),
+                    disable=hide_progress,
+                ) as pbar:
+                    for weight in pbar:
+                        part_state_dict = io_driver.load(
+                            os.path.join(path, weight), mode="rb"
+                        )
+                        # for key in list(part_state_dict.keys()):
+                        #     if "attention.wqkv.weight" in key:
+                        #         # qkv_weights = part_state_dict.pop(key)
+                        #         qkv_weights = part_state_dict[key]
+                        #         print(qkv_weights.shape)
+                        #         (wq, wk, wv) = qkv_weights.split(
+                        #             [
+                        #                 config.hidden_size,
+                        #                 config.num_key_value_heads * head_dim,
+                        #                 config.num_key_value_heads * head_dim,
+                        #             ],
+                        #             dim=0,
+                        #         )
+                        #         wq_name = key.replace("wqkv", "wq")
+                        #         wk_name = key.replace("wqkv", "wk")
+                        #         wv_name = key.replace("wqkv", "wv")
+                        #         part_state_dict[wq_name] = wq
+                        #         part_state_dict[wk_name] = wk
+                        #         part_state_dict[wv_name] = wv
+                        state_dict.update(part_state_dict)
+                        del part_state_dict
+                if parts is not None:
+                    # Second round of filtering for pp
+                    layers = env.pipeline_layers_idx
+                    for key in list(state_dict.keys()):
+                        if key.startswith("layers"):
+                            layer = int(key.split(".")[1])
+                            if layer + 1 not in layers:
+                                state_dict.pop(key)
+                        # if key.endswith("tok_embeddings.weight"):
+                        if key.endswith("embed_tokens.weight"):
+                            if 0 not in layers:
+                                state_dict.pop(key)
+                        if key == "norm.weight":
+                            if max(parts) - 2 not in layers:
+                                state_dict.pop(key)
+                        # if key.endswith("output.weight"):
+                        if key.endswith("lm_head.weight"):
+                            if max(parts) - 1 not in layers:
+                                state_dict.pop(key)
+                # Re-split according to the user-configured tp size: column-parallel weights are
+                # chunked along dim 0, row-parallel weights (o_proj / down_proj) along dim 1
+                for key in list(state_dict.keys()):
+                    col_filter = [
+                        # "wq.weight",
+                        # "wk.weight",
+                        # "wv.weight",
+                        # "wqkv.weight",
+                        # "w1.weight",
+                        # "w3.weight",
+                        # "tok_embeddings.weight",
+                        # "output.weight",
+                        "q_proj.weight",
+                        "k_proj.weight",
+                        "v_proj.weight",
+                        #"o_proj.weight",
+                        "lm_head.weight",
+                        "gate_proj.weight",
+                        "up_proj.weight",
+                        #"down_proj.weight",
+                        "embed_tokens.weight",
+                    ]
+                    col_split = any([key.endswith(filter) for filter in col_filter])
+
+                    if col_split:
+                        tensor = (
+                            list(torch.chunk(state_dict[key], config.tp_size, dim=0))[
+                                env.tp_rank
+                            ]
+                            .detach()
+                            .clone()
+                        )
+                        del state_dict[key]
+                        if process_exclusion:
+                            # Reclaim CPU memory (very slow)
+                            gc.collect()
+                        state_dict[key] = tensor
+                    elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"):
+                        tensor = (
+                            list(torch.chunk(state_dict[key], config.tp_size, dim=1))[
+                                env.tp_rank
+                            ]
+                            .detach()
+                            .clone()
+                        )
+                        del state_dict[key]
+                        if process_exclusion:
+                            # Reclaim CPU memory (very slow)
+                            gc.collect()
+                        state_dict[key] = tensor
+            if dist.is_initialized() and process_exclusion:
+                # With process exclusion, ranks that do not load weights in this iteration wait here
+                dist.barrier()
+        return state_dict
+
+    @staticmethod
+    def save_parallel_state_dict(
+        state_dict: dict,
+        path: str,
+        config: CollieConfig,
+        process_exclusion: bool = False,
+        **kwargs,
+    ):
+        ...
+
+    @staticmethod
+    def save_parallel_state_dict(
+        state_dict: dict,
+        path: str,
+        config: CollieConfig,
+        process_exclusion: bool = False,
+        protocol: str = "file",
+    ):
+        """
+        Save state_dict to ``path``.
+        The format of saved state dict should be the same as that of
+        `huggingface`.
+        """
+        io_driver = IODriver.from_protocol(protocol)
+        # gather to tp rank 0
+        if dist.is_initialized() and process_exclusion:
+            # With process exclusion enabled, loop pp_size times
+            rank_order = range(config.pp_size)
+        else:
+            # Without it, a single pass is enough
+            rank_order = range(1)
+        dst = parallel_state.get_tensor_model_parallel_src_rank()
+        with progress(
+            rank_order,
+            desc="Saving model",
+            disable=int(os.environ.get("RANK", "0")) != 0,
+        ) as pbar:
+            for rank in pbar:
+                if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion):
+                    for key in sorted(list(state_dict.keys())):
+                        tensor_list = None
+                        if env.tp_rank == 0:
+                            tensor_list = [
+                                torch.zeros_like(state_dict[key])
+                                .to(state_dict[key].dtype)
+                                .cuda()
+                                for _ in range(config.tp_size)
+                            ]
+                        dist.gather(
+                            state_dict[key].cuda(),
+                            dst=dst,
+                            gather_list=tensor_list,
+                            group=env.tp_group,
+                        )
+                        if env.tp_rank == 0:
+                            col_filter = [
+                                # "wq.weight",
+                                # "wk.weight",
+                                # "wv.weight",
+                                # "wqkv.weight",
+                                # "w1.weight",
+                                # "w3.weight",
+                                # "tok_embeddings.weight",
+                                # "output.weight",
+                                "q_proj.weight",
+                                "k_proj.weight",
+                                "v_proj.weight",
+                                #"o_proj.weight",
+                                "lm_head.weight",
+                                "gate_proj.weight",
+                                "up_proj.weight",
+                                #"down_proj.weight",
+                                "embed_tokens.weight",
+                            ]
+                            col_split = any(
+                                [key.endswith(filter) for filter in col_filter]
+                            )
+
+                            if col_split:
+                                state_dict[key] = concat_tensor(tensor_list, dim=0)
+
+                                if process_exclusion:
+                                    # Reclaim CPU memory (very slow)
+                                    gc.collect()
+
+                            elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"):
+                                state_dict[key] = concat_tensor(tensor_list, dim=1)
+
+                                if process_exclusion:
+                                    # Reclaim CPU memory (very slow)
+                                    gc.collect()
+                    # Does not seem to be needed?
+ # state_dict_keys = state_dict.keys() + # for layer_id in range(config.num_layers): + # qkv_names = [None, None, None] + # for key in state_dict_keys: + # if f"layers.{layer_id}.attention.wq.weight" in key: + # qkv_names[0] = key + # elif f"layers.{layer_id}.attention.wk.weight" in key: + # qkv_names[1] = key + # elif f"layers.{layer_id}.attention.wv.weight" in key: + # qkv_names[2] = key + # qkv_name = qkv_names[0].replace("wq", "wqkv") + # state_dict[qkv_name] = torch.cat( + # [ + # state_dict.pop(qkv_names[0]), + # state_dict.pop(qkv_names[1]), + # state_dict.pop(qkv_names[2]), + # ], + # dim=0 + # ) + + if env.tp_rank == 0: + # Save gathered weights + if env.is_pipeline: + ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" + total_size = 0 + weight_map = {} + for name, weight in state_dict.items(): + weight_size = weight.numel() * dtype_byte_size( + weight.dtype + ) + weight_map[name] = ckpt_name + total_size += weight_size + index_dict = dict( + total_size=total_size, weight_map=weight_map + ) + index_dicts = [None for _ in range(env.pp_size)] + dist.gather_object( + index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group + ) + if env.pp_rank == 0: + total_size = 0 + weight_map = {} + for _index_dict in index_dicts: + total_size += _index_dict["total_size"] + weight_map.update(_index_dict["weight_map"]) + merged_dict = { + "metadata": {"total_size": total_size}, + "weight_map": weight_map, + } + io_driver.save( + json.dumps(merged_dict, indent=2, sort_keys=True) + + "\n", + os.path.join(path, "pytorch_model.bin.index.json"), + ) + + else: + ckpt_name = f"pytorch_model.bin" + ckpt_path = os.path.join(path, ckpt_name) + io_driver.save(state_dict, ckpt_path) + if dist.is_initialized() and process_exclusion: + dist.barrier() + if env.rank == 0: + config.save_pretrained(path, protocol=protocol) + dist.barrier() + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
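To make the pooling rule described above concrete, a small standalone sketch (toy token ids and an assumed `pad_token_id`, not part of the patch) of the index computation used in the forward pass below:

```python
import torch

pad_token_id = 0  # assumption for the toy example; the real value comes from the model config
input_ids = torch.tensor([
    [11, 12, 13,  0,  0],   # right-padded: last real token at index 2
    [21, 22, 23, 24, 25],   # no padding: last real token at index 4
])
# argmax(-1) finds the first pad position; -1 steps back to the last real token.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
# Rows without padding yield -1, which the modulo wraps to the final position (ONNX-friendly).
sequence_lengths = sequence_lengths % input_ids.shape[-1]
print(sequence_lengths)  # tensor([2, 4])

# One logit row is then pooled per sequence:
logits = torch.randn(2, 5, 3)                          # (batch, seq_len, num_labels)
pooled = logits[torch.arange(2), sequence_lengths]     # (batch, num_labels)
```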
+ """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForSequenceClassification(MistralPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = MistralModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + 
loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) From f1a19a9ef405c9a30c5cacf725920b50fa2ff697 Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Fri, 26 Apr 2024 15:41:46 +0800 Subject: [PATCH 02/16] Add files via upload --- collie/models/mistral2/__init__.py | 2 + .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 295 bytes .../configuration_mistraltp.cpython-310.pyc | Bin 0 -> 6283 bytes .../__pycache__/model.cpython-310.pyc | Bin 0 -> 49178 bytes .../__pycache__/modeltp.cpython-310.pyc | Bin 0 -> 52277 bytes .../mistral2/configuration_mistraltp.py | 155 ++ collie/models/mistral2/model.py | 2026 +++++++++++++++ collie/models/mistral2/modelpp.py | 1922 ++++++++++++++ collie/models/mistral2/modeltp.py | 2254 +++++++++++++++++ 9 files changed, 6359 insertions(+) create mode 100644 collie/models/mistral2/__init__.py create mode 100644 collie/models/mistral2/__pycache__/__init__.cpython-310.pyc create mode 100644 collie/models/mistral2/__pycache__/configuration_mistraltp.cpython-310.pyc create mode 100644 collie/models/mistral2/__pycache__/model.cpython-310.pyc create mode 100644 collie/models/mistral2/__pycache__/modeltp.cpython-310.pyc create mode 100644 collie/models/mistral2/configuration_mistraltp.py create mode 100644 collie/models/mistral2/model.py create mode 100644 collie/models/mistral2/modelpp.py create mode 100644 collie/models/mistral2/modeltp.py diff --git a/collie/models/mistral2/__init__.py b/collie/models/mistral2/__init__.py new file mode 100644 index 0000000..9dc3f79 --- /dev/null +++ b/collie/models/mistral2/__init__.py @@ -0,0 +1,2 @@ +from .modeltp import MistralForCausalLM +from .configuration_mistraltp import MistralConfig \ No newline at end of file diff --git a/collie/models/mistral2/__pycache__/__init__.cpython-310.pyc b/collie/models/mistral2/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76a01ca4171928aebb54f37b4541ecbf0bd2731f GIT binary patch literal 295 zcmd1j<>g`kf)fuV(xQO$V-N=!FabFZKwK;XBvKes7;_kM8KW2(L2RZRrd;MIW+0n6 zm_d`}B_mLYCgUw3-^}8YqQo4x{37SX(&EG%A77v-FI3byKQApa-A|Jxiaj?!B{ip{ zpa^6~lz1{&qO>TnBr`uRJ{MvJP?i}eyON=Z1xSI3U(xzSsk!+jsk#~YxvBa&g_W6k z`p)@2KAEoiC8@B?0;HGr)ME}k^pZ=_OaFov{Ui2P^k5ih&js3|=u2pi;`G=BYjBvSiM}3dainiBma(TGk3WBzOvsP<`&8Bdh4a4MB)2lhLT0_(l z1M?q!GQYP_U93s&d-fi$IZ#-O?Ny)oo_o0UCaTt5EITxV<8@Z@hi@OblEayyU-fA= z$pyVHay`BLEeNjdn8_UWC4#FZXk?+Gs9ITlL-kn z4Ab_jE3GYAdKY!flf57H#ke}ItUhs$ zg6z}TH$f(WymE}jKHG1mx^HSFUUm$VBx9Mk`wV^tPTUrkB47>aIf(pV0@t4+#1=CR zm+kTt0Z7I`dbaXk962ux+F{^V*%mhdk``C{vOH5oTam;uCK_bE9vw9t&Q>~B zWYfQ?o(S7}o@wmrzBuG;wl3VDKF6+kZQ>xqzNRo0;{d>0URqdKsAN-V`e!sf1SSpu z4(RB|K#amFyGzLAaRRxav&$t7v_zx9C9n$wJ?Acc4HPAwk-QDM!k57K?m_ARvPmy2 zmE5BX%dfLGk`Tl8TinHT+a`1m=3Khrmar`Do^Hq27k>Psfeqgk$TlDD>25XF$I|sG zU;EY69(>BB!!640(^*}-zPx;**^LJekG2UP>&ZTCUXDS8yv z$)BnAc+Izl_!y~6{2TL!;%yWdwgDO!>^`wbYJ$yf++V58F>wk*i$^ORSU9vkTB$G( zi;(*Qrq>>=bjw_o{S4XJw=Hnq9+yE}iX2+yJjIupEp9$lpPaE6M3 
zj==STthUd^#WyL%=21$jY;Lof`SGNeVmO_D0NU|ld5XN_Kf4|>Oys4X-F)nC$tP{ zp|9IR@#rZzXun+(<;2omD(<%4W0es*rtej90pR;wIRuS>6ejPfPSWv}5}mgxGAAu{ zk zI9VP1f&*Js_<92UTWXEj_S%M}tIh^p3U)ZyTn!01#w86D<|h3tgS$BeIExMy^k$ULdHGA_K$={ ziV0-AY8h3%h?{Fhwpfp?tae}}R^|igwYd&WfpMA?L6eM(9^(6512qw4L3nAHx4eg| zJuApeChVjq?DRT%`a?aWQGI*_zoDoo!lQOwRi2Y-4(aYGJ89FUTks22zmsWWxIen{ z)s=_hEs9pEpS7j8YwQXej7pTHGO=E-w-17r=c;$NZ{Lm&LAWF7wxfu%)AnT0rTPsz zbI*0(-BC#!>!R83;ZU3~w&Kd}IQG#eC~yb%z=1!t8Yh4E?02g>Eml9991w>Zh}$!D zI;8D(5BewAi)W>F)TUEz0y)wtKo&ZqkfWU`$YQ5xX`v>sc1FLzGd>yXj6s$`5ers!zUclwK_hnyCov;7n`ol9e~8LyL662W&Wxtgl*_L3*X7$7{;y}aEp@zr&$m2>*X$PuxH7%tJ$N8iYXSEIzorre|Kh8wrCPO` z{F)waAaKb_ywmf2U$W}Sw4hyWAIJ&}3jFyC5XCFST=r8MDddXz?0qp;8p-G4pTd9g zqvQWRyqr)-%stiArPGp0wxh=Y literal 0 HcmV?d00001 diff --git a/collie/models/mistral2/__pycache__/model.cpython-310.pyc b/collie/models/mistral2/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab53c9573dc702d9ab95ac9870bc99c46c10a54a GIT binary patch literal 49178 zcmd75d7K>AeIM9WU48fToR|RyR|5nI30P`8CU?{Km_1{Ko56BT-H?lI3J0RZcb1<+P#0XUZ7~mu=+C zm=SB_%LBoaf;_S6nMSUh6Td`#urX8~Y7Cc$8zbcr@k`c68)M}$xlYx$G{(#0jfwJv z+^6eX8{5j;8k6NoxzE(MH+GbFG4werZG3V}^G4-R!d!c@B<52lf<8b+~#4uPd zHI9^zG>(>!Hjb5#!EeYp?ig$LIfrf=3-{kNuExsmaQ3e~;5>NSD8JJlF274^;GxEM zl^>QE%=(G)3Gq8wJ}JMClpn$GNd4W7Q{__YU3?*C7oFU~%*}YY3ZFgpUij<{d@k8{+4~T7);?|DZQt{P zQMM6gzkL8<4g_Hwd&dHY^o%_Z$a_Td+-@@0fA*+&reND#ILpQH9M_#AU; zNO1x8$L;%Yf1lhhIyGm(S+wu}iFmn=XYa5dz_SN}w;T3@<)(9&{Z6NO)5Ozv*$?6A zLr(E#{H9TE+22)O>U#RHeF9HU=y&n_3f?$rKLWo;oM(`>LfCf)F`csC1ON9pEs4ps z&GWITM^`3_Pu{rDQnQzf=W4E}s`X-{WjpoK%N%IaQHShA-)_l!#&$e9GA3Ck@^H|eUt)&~{lR9_)F<#)U zg;95&p&#jS#8v2= zs_vqH@GXN>=31)a+DldBaN<2x*Ez?sJ+bU9Eql!ElT{RL{ItY=_FSKT@w{_o*=f!? zr|VVMt5o-yyW)CP+PRwOJ487SQh1dflnB+C0Y}M68wCs?B)^;WKBeHym|- z$(i*BF4P=5x<$bJ=~lg7bAmD%e){nz9#yrgj$$Wd<=`sr??$NQM)S!a-Lq)Ws$vgm zEqm{&Rb79qFTlC?IaS-=@l@;Drte_v49vEgvsJI6^Y0gIl&?~Cr63L0k2}q)etJo@ z&~`E^#&u?;E1hYnuwx>o9g9~n3NTREMWj~IFHj|QmfYVF_~2O-uIp*!*ZZ#Yv`ycY zr?c!E;U|HM6uzhNbw3EF9b3Q>>KO~BZQe3hc@=NR)~wqxJ8oOIOkS;;ZDTEQJGPd@ zb=*s>r4cIAHta<3UffRJHdn2-*|ys8xi}`J)bz`5;PaB{$L(6fPsorugZ4_J>O9Uq zO1iA6CXoq!o%Q4HGQxVC%KSvF>D_nSPc>W36-TxF6#6aa4nN+gUiYn<=QModj2}m@ z^pms84ZJ#)@C$wO%aH3_K^J#X-<3;M5AmkZb!)SZ>yILyihkUAnw?z^{M?sf>ix*u zo$?XoG+Lf>_;Rb^9I0QqQEMJK-8y@=_Sg~6aXt5l)G~hb-0LkJDV2i%S?Ray1IQbt zr5pZWW!~{BdT7~IugVs2hu{nu<3`??G)&VpOv8FzorC8rzC9l+xO4Bpg%bnO6a&GO z{8(%bM3fzWAp^3>vJ+Sst&hd5SUJI#@dv|6;;D1zr(3G=N4Ngu7uLS{#sBr>EJ(2E z!)x{y{!08_>}G7uxDE2kGuPstwU%fbxAE7GRg!4=SS4i}h~Hc}ku3(FAfX_gV$CgL zTB}{HqD8!-*D79khYWLOB6??fD(xrSbOjAl^C}fZD$&nym^K_tkm>~QtyUAw zn7roH<}Z8x5GJ!)6;pl1S#oQ2tZQ-Ka-I5|pSxVMZA^2V%Q0S;+?O!-R4TJv5)d~V zS)HZx0XR0kE|bezBSzL(+1giGr7%ykCIW>YRusRd@pY%+w2cKlYz)s_h~G9=4Gba6 zMs+PD*fbnktMN9Tt|i+Ta0{u4*llAhhH$ZkG{PEfQ|_&{g-T6Kzs%&+gK#cj6r~m& zX9)vqDx)5Pmv3UsNS$Dmw5l98@uk)^-)cD3rk_ym73Ha?8NmfOQwha><`0DR!U@{- zb20%{WTg7(ur~d8wdv|QiYf|YQ7>(EMC0b+B+*&N@Hb>wV=EK=RThMvGSm|Y`%;n( z3qrxERQzm(i{)~i*ZE3?i=KWWjiKfwpFvyAI`w+kUXqt*@Km=Pt9%w;X}NT)U<{>I z=m+o4CmJzF;TMgVtc)0r6pomro#KeGb9UO!fRs$g*h!)p@_wgmEl>kD9&223Y#VFy z%dvDU_VUnv&|9;e^Z4>Z2zB^Hu4#uH}bIIdIZ zD%M&Wvu~ztakMR%+he1#?XlGauCsQconVe{#SPBoImWk;Ul{NbYX!{VY5LnS{OfJ7*H(7puKY?GN1VC#0{ci+p`hIrB_X zRW2>h&0*}1U>f!+vn?0HwyBp*wZtTs8GM-2rtQpCm+SZ?xi*!QPOX?3b(PMwFtmP8 zI!n}rCFh?&2zLZd)<~L3BWWCE2aXqv>>Dc~DSiB@-iGds|FA3JeOXeY@xKBOY1|l> zM>I#QZE{()ja%_G3-<}k2bO1G8;@o^j}=oXD5=Bbo!6pyd7;dE~^JhVMJ8>y$p zrM}5UwGr;e@C_!G9Y(<%F(-{(#>&BHV(D*_b$w;SrK~eMORFqeY7n)InHIA(ztTc7 
zj4T^%_WkBIw3Yaqw>a>U>wV|QfyJkl%fhwbpvd3HYnGx@Wc9tgWlvYn!EE<;3CLQd>Ih_B1Ji}Pu^Fp^dD?V%>)@QAwo`*711aSW;o)Z@08ZB53$ z-<(4=a#@qvE%LWfz4C-Z1G8WbLk68cZYOTBN|P%GIqE$w`sgEEVSo#z0TWl@X=EycNnRo6mvA4}J%mnS&O`{zji%rBZ zw$bOr&xrgmp4%zuO-8keIz6?t1eV7w>glKWWH7T9qZNnr6Ke(ht-4XHHtph~bEBxa z6t|QWdg_##cOQxyZ;+{|c%)ES_%L`eU<<-rdaXrNQgODWz}GFc zz%w-G-@M9;XIiT0T(363+#EvFfZCm{Hj9^>o-~(T$7VcBAU=`S9A+*LKAmhJ*VisP z&EmC|T4b8nz`+))#rJ>kuuR;Y+FEY9XnSYHsbEB)-oP)-94a1c1wx*Khl;@h=`y`z zQ=N_wmVJLu6#I)+HNV^-Y3_mzr?3QXqKXhbssbXLVijaHa&)-{Q4`{fUPm>#&xS>) zpyDsGL-v##E!Y>rW&f>$54enet81zVJ{p6zexrCvwW{_kL_4~+da@y-|BCMNrQTX^ zf=}y7R#Mn7cg6KNx?H{LNQq_cC_XC;qp%H!iUAdLsCXGd1J?M(68lf{P_Z^wl+5&` zyNGFP^T-&7yHT9%H)wly3buVz!A$VPv0|gT=(s)UyxpQj^+~Z)*pH={y|sD1)_`aV zEzl`fQJ1ABS+>VpRT2sq)m5ilePueX`}0#G(CV72W<*ACpnJNSnn5>fEU6kAZay5Q z8_s@3J(=6ZWeLj6ZaR(X^eKRfDqALeaG}N(-&j=NkNI8w031Kx-NXHXC_Ed-PhF|V zj0V{}A!YDqq-Az)~}2qz&~q;WvwK&xb25Kkhy_A>Sfg z3po83(%@RaQugrq2|pK-3+K*0*~6`pm81}ek-A(Q*D)m`1gd~PT>ynz0ZW1Wf6OB8%TyqK44JJ^bi-65Ak~* z->;F_ZsSsY0vA%XplP|F_g8cJ;d-@k$*!LKL%zn6TCtUTx@saK;7bqJTeH==dlDD= z6>vM&B}xhh=Bwy5ki&|UnVd1C5Rg1vZrPh=QITftWKaG0`pl_M!|yBf{|Pu3{?bwX zc^K78arL)w?GI}f9zta-xFUU3Yw({M=x}S0m5FFreJFSV^n1gxYaq3>IQL%qr zm~Sok8PtR$m+CMdWFYo!>iaD3vw;kuI_vpKIF-5POQt6JKa1x|NJ^nFNhb;Y`MbDq z?}r1f3;jK9m10J?WwbkI0Y$FJb9Z&o4u$=r zc<^O>1Bpf67}}%gr!K+q6HUnYm;9s_^!F5xgeV8E6z{{h!0?JaXRShnfC0K_ zssRob(A;1rzn?5+;D*1Ty%Qi<$@-uoz=kGnCstWWlt;j4EGB*wJ;hhPEK~ml5;XYd zY}zHoBOy+CZ3+2>@&W=obf~(6#qYp%P#|2k4i)t?_k@&cN*2zLniuVWTbyqd=Tr-1 zjI6nbB*o@(qXKHP0(s%GQ|YXkU`{(r%M=&0;H@Ix=m>x}dqW}@x$`s_u^(c6C!Om{ zpjiAAh&}QRkS4Gl)X(r?LJN*ZH^>hpj1-Ne!-cM^soCGD;*eTdy!t9`^-P~gqqiR= zWAO#}&Enhh84K>*pTxjlgIC{GhmP4JL(h|CJc#R zx!)E;y5Q^P3ig=2<%L9fP$XkR=p|eI;jsIk3N?Hy|5$4J9Aq@L!>l3Da#8E(=H4F# zEPYD<~ukSU*iWwj5KF6qXD7n_NhJ8iVpPkmf+nU_jCvt3cWdWlF4)#X#nInVrNy ztmNRE$1lDMe3BR60r(c+Tfi@57nMPqg2N%)4dWMI2mlct!xwQa#?>?42xLgOMyet% z#BcI5iJQhp4R4hG42^GYEQZxNB?8Zwos|`OEDFIqgduVeiH0GzY#1Wn6=K|*=%j!v z$gM!A3`mHAq_Wl9#@O*ixfTZP!CNfRq)-Gy(<_C?o0uV>FxG9j@{gq5ur3P&MMAJz zoQc|Kh7%Nc08BU_6~VPgmqS0eRzugqtR%e%`h%8oi}o_LxH6vwvo*L71Oq)!PK(bF z8~L?T@kzXkenR@Y=q@Xvps_w)t=Udr284H;r5;yDjwnRYP`a@4NGNZI9-`P-c0KTh zm?5u%pe0d$>4wZ-GHVqN%%c=DT`$Z$R9qPmo@a*hkc6V2IaS)6Yy`(;aA@Y}|Vt^dciT|ZzdgQMxCLmyRp44p=5mZ1VQJ&h~p`~PsP``)}>L4eJ zJkSuTC#jZ)1PiYKNGd3`Ued8bBt!JccGbAt>q$z7Qx2N8aFO*-l{zM4cdKq6wg#uP+BuX)>A zBg5J^LFgH_-cAJvx=2M z)6J`xuc65R1-**3K`Vk+2fSoE$Mr~5R&m_rd+r80ck#FxU(K!NR|ncTtO+1RG3R3q zl(i@mtbe(+Jb1;r$=H zhB3guXYl<{teLlB-jPokH-19<&c{|Ko{2S$Yq9pk+R^qHM~uY3HHd$l@h==}PqfGF zZT6(S{gwr~Z+z{zy@O#lNby&@Q*4i-{wx91?3{-D33xLuFGSAWQ2@S)cx$HHTPPsg z0qzbeInvu9@dHwWFqW(EgOvU1{iXXJ+<>ehfj!e%OqdX21PKPxsWfwE>{1aD`pZRo zs(AQhkvF)8#@Mj6?uAq2%ijX-=gyf6U=u}PT?ATgc@}szC~&GEhAvYDY*YF=>lA_E z62UU_^FXQi_+0S@=zkyv1JT``t3}Xe#p<%xIt&dlAy&YBL9jgwt^fp@5b=vgu7(ub zp(2Fu>nJsfZ)(*#$u4rQQTa4wrX#5qg^EOBfq^SQxWxGyqKqHFkH`RD#{3vt*%!^p zMU6XgAqQOMam5IxZJ^9md>szCj3thRqf=YOYjHyxpHgx$_Z zWKYFgl4TTnf(h$z+0lXl18{_4MR=X`Xz1A`iybuX+fonCP zssxZ_xdz~Ku(UyO8!iXU9^ry$d?6md;#TvDO~|-JF)JG8Sq~A3^ntlpTE0}T%@$8R z`8bvoDQkZjxvbj`fc~-o1_S^kp`x?K`VP&hUh7050KHk$!gx`XbYi-nIqI7%dl;WRh&F?-Fg`zn zL;}XA(8c(q)=a`R1^XQDVSzN`Ll#I7u)+cX(t)r8UMgUL0NuI;q;$Xng?E|-lDi%j zC=;+iB8me!t3`l$urvc*EKqn&)+W3r`%r+`j4V(K;fDeih`$WSU$Q`%9v0|RcxzZ# zppQX7NS0!BVXTh@+9E8_ux5e4@D$o;avOLaJ4@Wo+Bm>x6SreGfj0(wKp+*F8L&RW zBn_<&L$F6C39L_tN%AIJzvgXco-q}G&?alr`ZYIf44QD_}N<+Y=G9xlfqQyM@`@C(I_V z*=z0rd-D-{q)C~_{?)Bu;1e{yu(G z`qBU-G@15RFrnkOQmfnB+t-c>8+aV)--mBrvw_lD=w)m7x3?qR?ZOKF53qte@Wwk( zlAZR%>MpQ?543mLTgjLSE4bTxaP6J#-FTz8jxjUc(Qd{pUSi$1j 
zL+zcy3NrrfLHy&Zh=1X`+Qs&6dk43+2`jjJ?O_`!Zomqf?%D1X+uPea0#>j%t^PG9 z&EKN)+jRa79pZWQZi?UG)tBi|*Q)u8e~T->5GYS7T6wbalU&4RmjQ7C=%;^;33)TS z1so8C>Hy_0wSeQ=OapUB^J-XgvBWjNY!c-)H&>gjG5Q*DQA8I(G7czFk0^b4X z=8E8oHN)D3#!Fv9JsU+|Dt-vm47DD7vv{dGyD0JM3?rOxlpxmrItuBJQbgEWJFfco zc%uG*&VQiuAL;yOIA_!!^76mXVJ%IaP=Czpm*H^J!IBby!<+Jd>c8;~k)D2)R}ssn z{)D%iv3u%I`ScW>_t5!OI>PF04+xH+hq^)k7KYMK0V|F!rPLNs6e6fszl1ma0x{$P z|3l7^gtCPX^_9v?8Aq6MgCHKlg?kzd4(u zQ)G5jhEA4Fj!vEqDLL(&<<%UV--o};E*tWg*#tPtx8gG6-CXA0IQUG{0*?sJ&@^AS zlA2r1CWg#0vml&nK3>4pK69V36`HzM zylRrZn^$Y&@||NX{=p&kXDr$fZCNAYc&zk35%L1*r@Oik9l+2qNWpg6eZ&LVEa3|xHyWtm>-F!8F9jk(Vwg%-SHT4ct5wA3H1xolAq?sd%NfC6 zwTnzHQ{=tw;6L>J&DiF?FG z3!NoQA9{)?y5Jq*NoFv7Yp#a<-CfME4idn@S+r9uZL>ZyLVVhCkrd|}Fg_<-H% zSomU3Ky1Z5+#6U65f~vq!4!@H3z38U1?$r7jGG{EPl!P3T5%x?aiVCtLG2qv$;#sk z)CZt6A9(+VFMyyz!6E4<9OLGBBj3vLNaiwCodZiMaU5)b3G{g z4Knm0sOQlNOaoLLw!3s%F#3E~46uL@0UofVfD2$j1{@8_+S~DV4zdRm{_zd`l=8OT#iv4MIH10xA+xAQ|NVSZCWHHUb(BtD2R{tIKVt`f z7}yI7LBnn}w?g|ujSKWGaR7RtpCKp*o$?LRvg4qe&A9Hagalc4R_@xgqjpUj zL=s`DA+b7r278PzBh{|miv{K|Qlh9q-o_bZh*vpY4fBfBzv9?b%>U--MByUIeMd?c(8KE$ zE*e4rx)&=vBuQ9nNv%K?N-`L`>bUOO)g@Y(kd)6q`s67pQvzV`TPOz)z z>i4jspibKUuT)h4WTbWIz*l3krGy@OO}Z;sF4o!*dI zJ}I)#K!YkBD`MmnS;P}?;vR##> z4ZG7bk{_~qn(`(YJ+;3hoYWX&S;zir#Hk`W{Y7TxSLsk{+ArJu75WKnVu`>_z$$=LvEE2p;nL3E>0n`M5lr+pSXmH6G@XhiU(f9?30F$o_ zWeG6JnkzduTzyNyTBU^88Q74BMWSf7R(KP^t zq{sh4@lBdfkg?$p&xLmD0;%+IAf`Y|%p%&JPek$jb$EARBTn2l9V^(Y#2u>nGC(8p zm&_#XAyPV^*3%AlE-jGq$;g!kjif>4h%fwe@aImYeDFIU&v?f$d48!m#oi>i*q>Ce z2LUwc07{1aaC_dP#11;@ERC*p0Q~EA_pp*tf!at~_wF8Abg7sg7W@S$hPmwDq)!5P zX9Lz8UogG8qCU7HFxV*G9A9zTEKsZ(bwr!HK0 zZ2H3EPfS-%pFMT{{A1_+G16T9lbN5Ps!N{8HbPiQt1pQOB!ICRZiQMR8piSmg+bSR z1wH+(5CwISah)gX*N~BokQqPBj+Mv5Ec*Wlde#VFSSve%-rilmZyTMlp6FuyB)T}L z-5-Lp9-E#)?XLp7pm%kO<*Z&ZEPg8qW)$*lkTz%&z!ZRGkvBAigF8Ysgv01OpQEBW zp-zAh$;=tRf9Kjs^(ZCI!Cg)>fE^X=2@aWLM-8UV!b8DmYD}G@L&OqV_=6}!AArAq zL7^(fPeZ0#v$4eb!%>@7LImD0w2>~xVi)K?rC0BMBV95>{3x61e#9E;K+L#Rfc_!i z&Gd|rzmrZ|5L>W0p>2SMLcZfG+c#>`p6P2Hi*|s8l!<>+K24@C@@anV42_iq$TwN+ z7Wul&>d7#{qK)EG>a@#?a8{e#+#jC;#grG1I zS3Q`pXJ`7%fX=|c!|BUTpf?qRbh42PtF$jSj1pfY06xVt(eL=drCy$X_q`)2{F4T%%2XNWYxFtf0vmW2+I zO<|I%V70D$MUgqx!4OAb`#wU!EW9+Az!F5?m!5KuhuY{-+I zmM!r!7yE>{nh4jSm${1++@|gW+9(Th#e8$Gy_hA8%o0hf0?r=d&e?SO6+-dmYV$?H zWFtacYOEN1*!&W0h(!cT+nVM6d~RdJp4EszJ*4r6yzj|l*l#f}FmO-o&(3_Lub-Dl zbQ9ztuyet~QL7*X|L%S}V*VGttR`k~;=bq*&6z}9Vg^wkL{_?{ zaL(TTfVSfq^ym~;0Cy|=(@=WT$R>>Ly^#kMK{&Vdx@E519a7GH4OZ%jXBu`b`Xq$g znOa_t25sQeAEy1mz#=nx`CRR~zdOQ-5Ozd#QU#da*~O(6VEh=X{xImJiVFi=0IW)> zTz`ATU92r}*>z!fkcJQ|VvCt8fj?e}VvGo87<%>#9e+~I{+7C!dvi2;P-3u=wF!5u;V!} zQ5EiO;qIScUvk{$Jh*I<8-u+T;MeJM|8k_#@gO?gT0Bm^c;5}Q9OFJRfDk9Q7A!!i&K;| zXW||eFA5;$l3|E}FN-1qd!C%SGs_hTNV1t{j~*%>yEyZZrZ>VWI2-2~fpnjP=x5d zyYK!7j!j8tBV4s7wOW(ILX&%eej%EsKqPt2{Efa!VPBVVf`=NULsFdL+qdfNo4wWP z?_n$E+iSwCB5K2{wxm zATnR{)VE%)OE4=HrHKl?IMY|jov|Kc>qhJiK%6e3$bq5izu2Va=)vmEQgD7^a&T!u;E3| z+1=zq8)+CyaVF}VqI`GJ8-xjy5g|4TIjM39Js#R#Mch3}YCDO&R8|i}UDHni!F5QF zAi;H2utJNW5Kv)38)8Zk7Jjh_NkO=4t?tZMXK(Z*Dm4<+Uw_3$O(^u3=Gb)S1RINf zg^>}wse2mIoI~q_mzOC(wrMw8>pP9mJJi5(T|!6c$&k)tZ|osG7<+MPjdYQh-v_t?(1i0)&2tLO2qyo>;(EtSqi0%ooX$tPOYXD;*uY-Rs%7RwP&O+MI^P#YYANQBxgkm_cNl5LUDd?j>yBCPw zXzVo&5ghCiI+>eBX59w{Qh;h&$U(E0$M1mX_IO1t-vImyxE`cN&n`#^T*G2v-WkI0 zQ1Ck({Eh^_qt381;*5$85awQJqYwwGjR_)Z3q*kv{y->MlDb&=U!)nj5C@uO;-aES zBu;B+=Yt^`K+|h_veeCfFxYxkDn3RUSv*Piv(`7zo`A2B6nh0VI`W4?QR-+4AY)qWkdvziRqs0<6+iDnYD2^^T^P} z6wZ{2A#27>^GD1pucwa!xr>FQG-M!$Qt=cMBX#M*Xs@-p5#`rCdzK5|!)t#$reHd4?06*CJmN 
z5g^rQ4h5#HP}r>y$iX=F1dRi0nFh`hK!fp~P94=qPvam$!YTyn`!yDj0_ywOIFJu>wN_=bSmB4WC+}Ha*NUUODUs^~8QrA0H0lw66BECob~^Agp{T}z zBar`3wv&Nh97jiF#BX0a6Zl!cY2(~4gu5S@X!-T9ZAG6Z8t~b;8lbfRm_GfVgS@RG4@1~%o>&+HLU4G27{Xk80Eq2j;QwI1 z;Fg8su;OcDKrI?;TX4Kh4rasR%{2UiyR}hZ5`hwzcs~c#{kUK!69K00i^%naod(M9 zi^$^_=}U~djYBn{HQ5Sl1>1le%wjv+P4Fcl;3R^kPob-NSAS+gNdOE0#;8Pya;~B=gQeG#zz_JdbWmt~DrqE!k z2m->X9E?tv28pz+KEk0Z7Jw;TfGSI+GqTG^J%E`;Z_YoBtCg=Z-z4uz0XM*Df%E_^ z9Z2b+Y{lt?V3xGjTXfNqflw6(%!qShzv6!K{Q+-Uy%E{GU2ej{wHDb2GFr$`NjuBF zk*@8G;jVuWJ;Ah2$$yvGTQnHiSkd-Sxl-!RK(TnDxSxhz_RoZSu!4Y68nlx%X{o3l zMUg=S$o3#EI5?E<41UC}Wj`z?vTV?^qHzhqWEyk%<>sRABe{_w#z&?>C}uy1CGQF* zf^cDxgiN)-Dc-o+zo=v$X5C(2QYUwU!V=pv=JS)Wl>@!&8)}Up)Gm4>d<|!O4%fM_ zxtj}6Tu`)C_iTr^ncAnRoi8EZ>Lx>oMXy2fkKK!jfHHrGfvz&p@AK*)onNB!J@m`* zDh~%}J*x7xq$RNVDY#F9Iy;+Cd{`oD&Z|$+Q*fGG`!%8RF#X;`r$C4MgLMme2;%Rk zgq^)tT`K3t@=QOi7*op(Jcx5jyK{nLJ>}F-+7=3}n|&V>?zFm8)JN&Zy3lscq-tKE zA2++;xH5eh9rx>FQHuHF6w-9SP}sFEX5h+0tjO;41=(62lA{r{6-5~;x?zUVrF$C9 z#X+L*ittyiKZ2gYBj*G>s4LeW2ycbh^9ST7FxLmeYx0JO)B1XvK(n~v*!`C%=-2@4 z;=VB!A2Wxr`J6g&v6*Y;LFDP}=GbBbSQa*jLqiUb77(aI(WTx^96{k?Rypkn;3AqevE z=_jAQaK7^B6Q==+e0=)7x-$XhUjwm}MJ-{{Xn? z!JCFNM84SGa?8w&um7_juqPbYf3mkaTkty${(75Sjp1q%SA4ovIPvY`k868J6k;c? zw!vpt@Vh(sEe5}PoJnW9Gb}v$xV_iDi#$2F@*SCR`$2o3eK!oi?R0k8_t^Vi0Ig8o z4M~Vd73>4{)C<^8Sl;98${e)ci9_4>;>o0AIeQtKeaP8{lMW8sCE@b#VyMg!9Jane zIQuBxN!!P|!y?Ax&b}bDC9$RL`|SHAw!5R)`oc&|SseHtgcI4qJ)mbEiV()f&vcG` zzEh}cW9=mket8Rq{yB6E&Crv!<=&zIR4i~~a7p?o9$<_>6DH+29#ahFr}ge393lXe z&_Z5lMwmt>^R|5U_BQUr{AfeF5Z~^Rb8OdqH`}Y!{e`c_BL&L0Jo+Y z;#5#3DGUKc;k=05w-hl2f&EbrFX9|CTc3!;?G-pTQW9a*gAK@>0xph+zIO4yT&O$z z?-Z9~LSSe^16CR_6{W?zIPW9iad)+E&I4nRYewu4E@Nqz@J94V0D8}q)bu4uxnZ3k@ewS zA-%B`W-7akd35?wsVnq4W;!Yp>xgC2c}Eut@~-Xl#noq}?&&bE0E`L0FRN`|?f@qT zD2l3}6U-auR^%+&(a_byOHs%k>J0IB%KC5Vhp6Tb{}LO77stYM95NIeCb(Q@0WX23J3inN#HK zMSOZ?^7PS-5UE!GiorHa;jA>XFyb0V60*q}|AQPf=a-4Bkad+0i611gTx&A}gjFEj zAha^T<3Y;M3$0^nffA!5>Aolkn8U}89u+NNkHVCof9N)w^MoE~Jf6rKLPaXZlc+BF zHtgp@ZV|vbqY7n`{2b7Qfss85YFGejNJK#&foJ9*ir`a`76gYg1=ZhOq>YNug&KG& zJVr9g!(;k9)-G(~%*+h-He7BsqcA5=p44`M0q%;9f#QY@8awO(k{$|WB3%-?UyF*aFUk^C`6Zak6 zq}EE>YK)$;4}_611=xoqrdzfm0PU{(p5T-YVV?)7JRE}gA zH5`3dU0U-nhTV}&sTaA{4)=?sKF&2z_yM6-sBP578AWU)Ji)*+#;iFYv)e5+5;4_yIn>#AttsSHB3y&%2jvuqCZa5#7mD>Ze)4${dfQ)az{@ z$smG&r?76j-;GGe4rJT9GM6E2jLVDE#F8D4rtJIGSp4Lx z(Pp_GHU!feWredyFZyt)rqLZ)u1V(xiif6$D-UW?wMC!Q0%ec65Ql0=%o(hvn=pS% zV?6xAa0r%A3ulS1LYV|rk(g4(Hj+4=V8I0&D8Df8fUPB36%svz(Vi~DH%4<9%X}Cf zB9tgRV%AGFWU6VAsWyXjnlS4VyfFdeK1fBLFnl}R6*ts1Yh3hWSlTneyPp9mLe!#2cj76+7(`QsMP^slk}~Jd8kCHDfi~ zw4gvTcm+cbSr9N#!yq4N=(gi9KLJ%0;?9Bu$WCifd?aDk^5+N9JnEzDZ_;boIdPU# zq#o40m3?_C*Cm79$-W7S)WOUr3#K+P+L@!Yp{zK_LiGto{upAQ!74)GE7%c+4L7tm zib#bole9%BD#J$d!!c3L3u6{ceB6u!5ojp)WNO`yCD$Qet|!pG5#jF)vDo@CEss`$ zX%ai$euiV^-AFbK<3Bvd3%hh6HQbTh3u{+6gaOA{V*f9sl|$GIjGa8j4r2&1!ZeLs zVuXNQH>}rSCMdmfY$ML)P1EjS6v*W!@fDeW2^Szb&2|i`vmvMjO)u6tp`abtY9~=@ znOCefUE=|GnN4i-!aS}%sf zL;XCcA?G^f#L%tib3!;7B82D|w3SoY{l>FZE;-c()}8CR0nQ?z`T@lB71l6IXFk6- zw(^dRY6&_4h*5wBm8RWD^5Uq>jBvOr+b zlfeJO9Fbq<+{dFc(lGPX(W!S+HN7wPG#DF-s{&b0cL@$AI7245tVhdW5Vf$P7c8}l z8fy=Bb8Mi9ZkaIg$)hoGEGE%J&}EvKSgwQ*sc9KiId^QmyfY{iqmqgbpQA+Fph%t@rSjB020#405YC=ey{5@mX+( zpJj-D%t+X=yL#2XhTosE4H*Yb%qC^6;_)e3r;xs5W;%DtSI5&`2D-YWPJIcV@hv&! 
z1-sB&po5?*y*^vVmMffh$i@yo?90f3tZ-lBO-J0qjG~kNHZv{R|2MpnmAd*sISW5T*U&s}4qw+tVK(Zk z9*%Ks0}X*wma&|)ad@=5!{csG*c#%YuEY~~X>h7Z91NDSacEu}7>U+V4~LyE@Hjh* zHkoeXD0CXtC2Hb!Or69cchVcg-4ISwkj2Xzv{SrlXCH-?p^HBSTS$E|v~!E5`*a&t z_}xDsnnFEG9K~wd8^hwA1_@zf^lVwpgReIh#=*H83lqGWq%9}p9r@Y1hJ#ma@Z7fz z_1kt}#h{7a&ID=|2me;(Ug#rz64B z-wK|N>Zi>t503%PAhEWG?Z8x5Q{LS`)@yTwa%CH2a@GCZ-Y%yy zlwnPZwWjNZp~~H%5DN~LgKskkr)yVyI^lrwnC!{-{Kp{%w|D*|j?ME9$uhc2T69z| zqri%cAHe}@?~y$J{<`>6ZyEo;+bDiLvXJ^$BoAM~{X3Y4E04Bm zy>V|fiAg6~7|wFTy{yq1Yb!>mASA$XRx{XxR17{BAe{hZ1i-hfA6PATOd9|hn&t1D z0!J~1pi$&ZAqgHuGIM!(ejYLvnD++~B=U~do=!GO#pxF8#>!S7iID(ioxx60!8F_l zjfO8kBLLb1J%a24mdXgiV;&&mSnQ;{lA3GP39??v2}e4|bID56zV+0p>G%7_QB59` zdz7UT_bE7n8@(HdHSVuRPs3rP7$xZS1{jU~;XTPI&vl3lZ!$&&Ic>z_oUzxE*2=p#BH=b$mAZ(z2=2PFT{=|>Y($qp zK2Ra93~0tS)F`q{E5rH)!b=1Ho+>7t3wi(&QRJDeDOD_38Pd5}0-$INs>--yv0e3H zfVNAaUqH`_iGY4ugH(_M;r&YcLww! z$HA>cu`)EzVTBR15WZA^%%|)A&Jb;~bo;3PinKSZU3TEFk&+03&zRiMeQyUft84B* zGUow*kOb3*oh49q`~iIX`q$5+QdNWgA3>Drk9qYnoT&#i&-o>K9N?pW#j7b?`2zuo zsVN*k>0Adf>*}*1_A=-<7|Lw9e#&WHt*KU1{R&=2qad7LfF`$NFv3!~z@%J{*Jy0}H-IYFQ4h@Dm&quAE=t zk4CX*DxHiH3jiUa?I&<1Vgn~ka)f}Q#@zcFqdrCF7@hx{X_9#cLl&2Am1S}Gy1sc1Q9S64Qk9Xb4?oJ|I z+5y4g9FVUO;grQ4ujyLP1LI_jk0xPs7;lKVVOV%d0SJ#AZH2|XeqW^U{1Bd+&s&Dd zVCI|cCiFQAqU^$(sQ(jJqL9_Kl#t;44ZI<90?SN1@lpXL2E0S3MSh2M%Gnn3jzb$Q z1Dn0rBO)YXoZJXkM)#|Tj@D)cVVxl7j5QBus;OZdnqe*gO|zN=f*U$FUU9nzXfPU~ z13w+{4dAPV^GGbl!BOMjtg@T8&IPV=ioY(ICCZ^Wq?8Hbwr+uIHW}N)IICU#FprlOs_qP? z*&TKP+}MxF)gYF$&&eC$R=VE!T<482ZP17;?@s(ah=Z+BavNsi?BP#aHjcf92$ff( zp8@V2*!MyW=iH$tc9F}uxCQhHk2V=z+kHFc0WrO>=cX}6EhY7hL&zfz;)eOS!j<0u zCj@J__87R0Q5-#;c>#9HkrrA`_?9u$(p~LQNuBliN_z|#?G*0!frp7pO$`RE^FWx7 z_LvC1cuL$FwY3J;TcA;hv+!B?F`dI2FJ zSXf*O6~QU5kCb?wvr<5xS^3wABu&1)F`2sA{In67N;#B`1vv*I3g~)$7;s`UDg#k~gX1T#SI(&` zlFD*!0!dHMR|7#8YF%0&xs^sHbd*BVS?rYo$;eZ(UA;zsld-0lNCA#NsQYfjnFD4n zV__qLEzP$E=xn7kNJk>c1+_>@Q;pDj6b`7W#U-yID8;Civ73*mMWM`c0dLF0&nY6N zRzM1t>s&)#4D!131nfdwxejZhp9l)X3pv)+h=N}~V zuba@z?7$V#;lPet$=57MeEAfk{Ix8s6q}>8ux43s0R}gp2hu#a%8#ebT?l7^Vk2e7 zErnVxoCG3WZ!*l|7e|l@jYc^#jxDoz^qC2pZPx3cvT$T~UfwsKH*f8FHGhNmFJ(j2 z_cRnGxSlf3Oqpjy1`A8qm)ZU`Ity_8kqV%qc4*~PhK6Qs1Q!0c+0H*jXN8Uc#D9iY ze~-@3(fN5g1dXd7<<&M`abWv{mD5j5pLzVfl`~H~MF9D?2OXbdJpUXqLd#5ppR?M$ zr}_do{J4voPPGT|S= z_UjSoXxOiBgh5ZkNhWReUl!lCA2Z(mC=W?Fix6F9{b@T`r z#?KrOoMLKY8DRZ|U4ybLW;MH{Rk`#&ETp*!6Z$IL~V zjnD>}xGzT|Rfv%ic8l^cTbmy3O&fR^;FGwd0gwUfqA)&6ShGF?{D4!MFBRYW zr|^1!K01$`VL(99q6iY~jZUmvqAEIBfb|TVHmc7JNA_ApnsX8AB(z^d86&Bc&O2gu zr7HCb0?5`XW-!TO3k>$DL>9?C{FLl#y*bhXi!jgyvCTw2(N+!6c5x)Rg&E4m^K3n##wQZcH#S-ll-ei^DI=2a5y z$rg}dmtlQnCeT!in8?-WGpIFb3aQJgfewW#qSc^yeFgR zcSU?=F_{|Dv(rqH4G z%)qR9T1+AE=ok0^^^wCB44%_e0`6ccL0`jILKp%j6C50E4KoS)cPRKB4t__RL2d;J zwqkIL2w0Hvh-_g1P;zSA4~OffL#ju?i*equMeuIibZ=dNf&kiiZ7E&sqGRiVKAZ_) zPZQA4VB-St>KU>Rg0YX|7&cEa=BjW-Si#8jmybLNw{Tk!HrflIhv10pz2M_z4?X$3a9af7_lbcgoMD^M{#%&%eD4 z5H#JNfz4sd>I+PABPiFVm}O?%y$3}Iu*k^>i%k6|ItmT!x8pdgy%&p|;|V#?MB;=P zAPMr&$qs;nCHnVXbUaI*6>u(jBD@xaF;E5o9-SmY87Ie_1fl=~K=yzWP{n7^zO4>6 z?+W6&?4{Otuqe^V(A=`4Vn2Km`aBD8C1@5UEu0@eE_Z9#(Z0aE41$M7JnaE5wJ-n< z9lpRE;{-7dz11O`C%rHhdgK8vaMi=j zd+FCs$~Did;k%6aKY3#z0vr4BnbX1BCf>s}G&6jYdyN>H`{e)38xKS4xY}7rl2Yv< z;Iebjg3Fm~g4iBrn?Z_@;GXv`%=QF!ix1)aA)ZUW4c|%brpFYOuiio*lZlv<@wbi8 zo_aHLew(7PLn~Z;3;PjRHizLV;vY7~WkckEWls#Gt^;HTVC>G^hLdDz^bm_}$Ld@( z@fA;;C?3N?2dbebNiN?Ji2A@eU{(*G$6@7w^AZOM$aS!Ev$FV^S)c4l9dRGc62i1Q zk)IaJgpbkO*3yk&F=Uj}I8yQq=byhpN8m>v-E1C!zdoaW<-}>d*u(6WjlJ|j3gEc( z6%Y={5YN;Fwb8Tov#0@`qd~3UkO{Ve1U;$~e>)A4)v@)LXi#kdIwC3k$;NWst8Ms9 z{Q|N!wOwOUKF;i4pmPEH<#>RNgPX8ZJv^nYz zuvHI7W|NDg&5Z$QHUoxoOX{bT&(N5QMU 
z8tn?(d<8}E3BXkkpapRFXfNBT&l2qLlpX$&ZLCifjPq)c6@VBv-d`-O#;Vwy*ZVHK ze1#Q7mOdTJ52rI5?=Jqfc)R@A|4@vX@+ZVQ<*5Q2H3b@&wjR`oD4u7BlqOZ{-jd?> zZ$I0y5--Q%%6Yl|M8~&u3HA%2UlL0N`XtfajJrsO0y)SKvE8PMZ59wWQ4ggLpitZS zEK9l(0~n{9=^lTOue}qE?+@#?qPkbrF5ygZ+s}2~XbGbT?_S|MloUeFp_QQ0a(m(J z4>TOe>!3X@@dR>p7lZEOgQQbywjiFn8#m>G?cl^A+NCW~x2x`9u>E`zY%0dvvZ?q0 z{f3xI%_(UOY6&NJOwoHd@J4Y;x=7)cMXPN=i9J z2I`LlAv9Bu?IAECdYrG0_CC-t+{gFsr}GXv570RW$4{Qq=}*(APUn+!{wAFtr6aS3 z2wcCw+h3qFf(n&0w$yuh^%$J;L}=#V{Do6bUFfoMaGpU#o+WG#g)1`NyQunzwD;2w z>-wNtAm;upIM%2>pp0j=j)(w{lj;yeb=b^c6peg38RzwwnT3IgycnP01Wk1;zL&3->9U!iy<6`T9u8de$74>Hk|JmNjmk X(br=ZLVY*G^3Qw%zIpR&<3s-+!t&0O literal 0 HcmV?d00001 diff --git a/collie/models/mistral2/__pycache__/modeltp.cpython-310.pyc b/collie/models/mistral2/__pycache__/modeltp.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7c6a28cecdfc2502d5bbb914f4ff15a9b802990 GIT binary patch literal 52277 zcmd753wRvYeIL3rJNv`}SiC_HBv%wA5)=uNdfKucmMPJeO+m6q+wn^BVu=}m3+w|t zGayA|5zD4rD-R})+dOV#e+6SFvD`FinlDY_=H@!jo5a0MZf??>Ow+V!+M76a>f}1H zBT6#w@Ap45vpWkwlH<1DU0~0iIdk66|NFH1`|}BWe(fE%E&hwEM&eKT(EUl{;jQ?( zr;>?;ohT=4!#3+i*^pneY|3x4oWyUkZZ%TnR3lwZH!|f+BU{cII)1L4lX&??pP4Wc zjY7FUcrzexta`4|SMHOrRDG~fEEgL?<)OxKd04{I^^wMCc~tzF`dDMUJl>cnPsnq& zKH1n+-qn~YPswwxzPmA9o^I?Z?`iBU@0GB8eP5$gE;Vi{-z3j{_5F>T%QrU;ln*p+ zDc{n#wR~&iVEJI$8nx zQSL%Q~6Ee-&KFS@kIHF#=Fb!Zai6j zQo^R{XBubAXB+P+zo+ru@_QTaE5A>|ch|qA@&5Ar#XnsyH=Zg#)p)x6bmN)wGmQ_F zKY;K(_TJ|c<;vR=_CCAxe8Mg{earJ#lI1EwZnF0yWPcEH&c4|`fVd0x!}cxqt-rjFNfVU6m-0=PaQaotC6=836-jCWS#C<5p>23B&gr9U; zl9OwjXA(1ytWT7lxO}#y7S5L*tGS-4)=Q0+?bK(#$fY+oWBJx;g!|dYSG`)RS*`o2 zvumq$$4|YdiRT%^H=E6wq(AhwlaIWw=AA#?TB>>O>6YvI#ghs@k2F2iTD>eGna9pN zLJv}5Ow> z&1o(;C+k($tu59Z6_0^h4nCorv#MHaI<{o(58B@4Ri|?9vgcIX+PX87_D8F>U2(mt z@+x*~0exI+F1h|K+l5xVRz

6~9zfw!f?9Ru)nIc{aXTxxG@oSgqBo=P&{{$6qZe zr|LPXf)=CR=&jo;Cyx7>dTVJ31^T7JM zTbHDB)NWmJ-{Z=5lw&_!Tkufs0EWFV@CQJjP98~Bp`KEzsUG@l5{J&p0K zDo&8rn)kL^)%8bvB0TmEr)vAtPqr@YmmT{(f#0kSh==gr-F1zJAKXEu-azZY9|+y*po7IU%Y|Oi>9BnYYjgoE9w-+D~qo4 zxcMlVv8I|rCA2^9C*3u~^|+Jysan&!<90vOY&F*%)$%i#x7<4XWTX15Z`C}f;Txy? zBxa?bURY}&=}gKW=-FP1u5$rX+(my^&Q(3+o59qrEjX?}f_y6ab?0qEg9$CT4(dCt-Et%h@~e&KShdF*8C^y%6o$2`aN++)(q_|aSM?Z;+kgZ~AYwp-py zd~bDj^|C)$S#rFJURrk5t8zr#BAlWzZWN3u!!%99G^|(EV+dTpH}n=;vhNuVn8ft|T^%YXG-Ab2I5#o2j;O4S(%KC5?ekR5G@K?9KK2^QGVuFQ-&< zOW4$E7poW$ujI8#XYZ0#&PqgS=4P^f%3WK!S<-`=d;yn zViePtoZ8ZP&o5#xt5vblSDaP1R>!H9^exw^FZzAwYqpJDjypM)>#F-Aww_95fd>Ne z<`}Efblwfe#@A(Wd286n8|xGCHq3@qS}+kO{AA(b_awgVkHcvj%X-lmp1GX7W^5Q( zK$eZ(T266bxUe>oZM@w~x3S!oGZTqx#%KcZ63bb{HQJ^;TWxDIhlQ8Nw-1XiWvAMy znz51g3U=DgTrt0=xRLStmj~MEcE--yxvNPd@jkS|!rvA6o7ocy_t8s+fpIeu@5eV@ z0!A!9_eC~F-2>+=R%m9$aaOTt*N0?uN*tb&J~Y>dsuk66E7(zSCd@9ot)_2W`XZ+x zIyKGnqv_2Is(TUOn^?y(P*}OGD%WyywROq28cwz8r<8j^dFnkZ_$*_mZO6mj=qDFe z*Zj2CsuN(bTB}&_mXu@JYmL>Jtl|>!`@=Ei+2Z^A!BuA?+}98Ma`aTI?=qnOMZ1#{S#zEMoBkN3=vAlZze z9!H`V)11TriJVHs&sPW*tkvl+R4P2)^&6RZN=dtddAZ=!>*3s#O1&R%^(>u(ZvkJK zrP;)QQOv3^48fgGv^g9{ShP9hWpm)Bz|8@3h?|4m$GHPAIwjjg8ndF{M@?%1?793% z{K>(Hu+#tda7Y!GIY#7hN)7dcE+$2odB$Ax&6)XizL(5Aor)_K< zID3-tXRs`+%`B#Ru5EE+SvGekMiRR*?cvXJ>a)U};&|?3e#?dBelN8-fYU0=a65sj z=jN7lrT*$cIat(Fczn^&voc$2UaTxC=fWw>*gUow7ZVM$_Qj;1!C_Fr-o&|h5l4h1 zz-)d3$@Lh$89sl4=f-NwtqBn8g#F_B);R#DddjD%P;D0ZAx0nK0bx6f)wMc)375{KWeq4+Mtzvhl`ytW zCvzp5%2M;^5W^jYlQ+_4+DIG6F_Eq0fRVqkzAv1@@p0~Kb>T!pa)BK<>ax|X!y{vx zARK_vNVHAD54LeNxoP1!g{{x>3~X}x$=o2skZEHi7XfeBNo<2@#LL=NJFPdmHnvL} z;~b1~*3RG!w!Y1L_Zu1KEHDr0BpE2#zc9)&MtOFpz*_bxwSj8-`KWVk%y}M>m*9h1DZt6GZ{T5&oMd2O;0w%!_Z@{ z5ln2H>)B(}3yAmK___r+X=BNJKL^r#^^W)wjYyi?7npMs!-NzLSrf=jyxQ z%p~=8Z9J=j$?-NjJ128owlnn|s0KislzYC4O^oNP`j`Z%@5hgCK6?V++q+uL z2K#+VN~^H2Pa{AMkts~NQA7O@o)@@_haYwdn=aFd$uWVfU5U*^o9o)dPTqOKuHH6B zu@AHpSB!RYG%=CD%EpwFFe3`XYHnv_G8xq-`t+gIRbX~*iM^mMm!1f=)lzia5Kd&T z0E@@_YSS*QIG0Nr8FOdz0yjRSmfU+I2=YSd(7d1!vu6e6oS(V3bT-OCr<{#quj~-J zQ%WZ;mi!K}@doH-3COJ%jv0FzHIB}QPBw_TLpI+)CeZ;@v$v^K)I37sYU5vf6?o_ZM&~HEs=Z};Q zw*peu;UlHsbaYwXiJ8ts2-|)zmc_wRRV}SGh=RDFHjpHQl;|R4kFEe=s#FE~fEt~z zfuM#wqvYr&_o=W66?FVFoRG10V+4C*xSYRL;5+9rZ*@*^A#s0274#zKa z&v3s#iqFCEGZ!kdqXA3Qp57B?w2B@5az*cc1d|!+Ad(;^iR6nU4GbU81N-1~L`VyF}X z&{t9}3;=~)1@JkNW3IeTBd+Bj!dANT_%V@s{X`YoM~h=BLm z`|7QQYTbPR9-RV+j&+XY(Eg<=rVMD!g3J?#2CW73mGdoo#~P|1VP&G)sUO{1H}zu( z`wtBN44kvybXNaQnAM9(^;!7+A&u;VaE{|vNX+WT5kAx3LDWDH6#BP%UyuN>d&9A7 z0JJpC`Zk7V)>grMXf69W^n@c`b(Al1plvtxGa*wyACM5L3!a~bQ(0`jXlj7}C-Giw zgK+&8!reRJ0O7)H&l+aRGAxktOg_JCW_ErmX4gly8Ohk(4w3DXEQU}lNVfa}{MrPB zujz7|nAxNv39u%F+zt^JP-+r%B+JWio`a4w1k@}GqOJub8`ObbI#L_ZDkw>J#VAPv z!OpGcvGTeIM2-4tH-Sh!gX*ad(5b+g$*Z5H|7Ym@3pzhVXORxkc6FXkjm|Qi6*wWr zE0AqEn6LRxIck+LFVN{g`2ye*{Qs}`3hrBqNH_T#RgWKwP;NY*4z@sm z`8S!F?)`#Myh$-k)#3Q5Cb%-Iep<6;wu}~cA{j0A1on!=)7A!PfEyNw8`y1DOx4eo z5Bvd$@kfXp2X6QaJR$|)n^+vimE&|RwZSnbVFe+h2??`^r>*~<9Ajq*oZ+7*Ll@|(TK~v4^#N6wG@fjDX!lGRa;2nKRxZATob;)c%3|#vCzA9_2wT86 z_8ASHT<&<`sbw2Dt;X%7W7(FIa?-mJiy*>-Fc0F!^Zn(FopG?Ac+s z=5#)1%3mQ@uqW((i8Fxohzm?}P<%K~ARdr%3iusz`t4zSN1Or131i|{o_8fsE^ySv z0ejRQdp=bj6bfGvvt-gQ2DAU6kR`kRucV_N1FflclpO@HG#VT|+6SY^rAOV;L-l&; z{jIgqoa5LB0qg^+oh?1qvI$iS{0}^!t`xfiDn6ID<_ed4gZtbahg9zXopR6gOi(C+svp=g)4TjYQDES)90ApaJqQ zl5NvYtR>VG`5S`Crn5%rucJ%O~C zCqyMm?wS&00Ehj>n(g#dK+wI40#7&9hDkut_L(18{|d6=EN_~ zU6$=jHm%a3CA4C`E5ZD|rS)M!-{!duNi2q$GqXE`fNvDQI))i60M`0#y^HuYU_bRD zR>(IEk=-4Z{Y~d||9`if9tm?F({jtJv3yE|~9>=5c#5JMg2UQvK}MiZl60gK;J zWrhnSQNTtyJfhuCkDs55hEjc=(SMQIjK+_dCDmG6b)^P=o|t|G2e4Q5a{yVj5R5<6 
z$=5~`>T+e9=1rvy(eW5BR(WYbyv>Tg!UhrxZ$<8Ateb;HbA@OJ~SJ3eKZCW@IQsB0O8 zSJ!v#I90*%6Iz{mj_paFbJrJ$3xyy033Q#r*Db*Ti!=eChdc)otj%lICXp}mnt-{Q zR6T`wLMw$>fOWCvTvETzkk7;Ma}bhhg5KpiK3E0Vtd`bVYwnDtJMVWH?>RdE7o9jv z>8C3KZ_Ke1iFuVeT;k|#oYiqoX^`X;bNMH5C=~kvj^IP1c!Q&5MgG2YYYX!0kmASQ($p?9HkC{ zMb29Bmo?&Vf4i_eYy(uu{er1L&(v=MJ} z0xn`*Gw)%##PVdjzg@&pXFhFi3~dbK2t+yUq4sb)c_oD~^i7bvkFX)T_z81k1o=!M z7E2guk6g154?XDZMp$Bb`U;-l=Ue*wy*-Dmhd)n%>WN{v^~m^ zZ4&^v3E#ox{ks$GT)Vhvtt3BYsNHDG@OE#}vJsv3hZ3K(q=f`>45WFmZR$J$YIJgX z3vwE5W0ZB6q?dS6`q8!t`u?bQ^YVe^TiO6Dy*;%$~9K*tYFKdxI8z1ofEQ(js{;b&~W^oyNlXC!@~Xk05Q%JBre0 z!~VKrTuY+l#OAT~7-~7zPJPU9f5b*x>|IFFHBW&Yk8hcm<2%gD+=&G8`Hk4j978)T zk%gL?15FK*f;?y_^LSlBWKl?V^WDJ$3fT@MCc$JR;2lCifH^3EbP-|T99+C}_KthD zfo}+D&v%X$7K9uDivfn3oxeVMt^}^b`I0?TI{H9~5AcHpZrEA^OL*kT^}@S_&!SPh z_58a)vkv?PIiJF($!cfM3b1`w5> zKS2;yj$I6)!6PN`D7HX>q;u7(b%Jz6jH52oP@m4ES`r8q{RfH2SrFl#!;TA@&&Tm2 zq|z^9^G&QDh&KF^7DIADE^e*WZNiB_vng^^x}G~yItTCGi?Kg_IZ8<=&a&el~z`Ae}%TkY7|u{cASk|Df}4 z=-fi*^K^cn&LW+iXfYbe%B-$xLfXG&T;k&zs`>(b5get37a;WuV40luCGWiY9me<+ z96!Cb3LzXv{Srg|cRC_1lx|eL##$Y6LLg6~z7!MU!3~CdP-dD>c?zut(##pRwdjGP zug)=-DxG^JMF3kVQsdNL(D?<1CqeNL(xCcnJkCt$Q(veD(NV4bfYJYuj?Xmd74+Yd zD-eO7(sCjO1bKuquVq|aKhKk0uNS|dX&}K<7r>{Fd{^jINO6(!hU}eg61=RHiZ0SG zV2U;?!36XB{&IWt$3+a^hd!#22XuAP%R7NxMf{2qz{BM#(GzK%4@oXaPXXr^XR) zih!gw4Mu*!*pIiqY@N?Y)stu1{_O2?NYdh|Djh=H}*3othcy zVetEz3T2-mA$5oY#I>kMtwy(G@q>avhd~LBV8!(09z(0m5%BILRX_K-aczDd7Oe+B>`k45rbrzn{o?Mt^;--+gGGLRN%Es4N**FLGG z*hmt@mZl%DQG%5Kr2$POn(rOe`4TpJ?zZ7J83;B6sj)~dYlh!i(`&6kL>d@8&x}?R zOi~EYtkob{9-J0n8HYP;Ge!##UNf+HoEJ5()C5UNSi-{FUht5S5DQp~)wOf=+Cu4} zCmzKvEp6>>BM)EOfg**RDZw!fx}|W-aSDTHtB((1Mu212^sr64IJ7SZJiCUbO2u$w zPE=01L3{;|ArT4~j8TL1#HWq#2K?@sR}ux1laeH;$SMF@@&pp-$xRf%-`lNDfP)_* z)vZS(+qEnc+-N@47W!*IE}Zb^(AjF;uf)iK%kg- zL+eXYQgM0TibaUt{Tr{ed6PXX*r#Avz%2d|aG(9GHKh4Q5O1tE7sPqKlaD-&(G{X6 z@}5j=jEmIZD7Lo|U~d!A6WAadlkJJ1b<)Q6xU^LKi@mnXn&{hM zdmOmoF7(f>0d6?j-WB172L(4gB)H*0!42srjyMAh<45g@CTSjrH>QAR-L|p2y&E{# zkrfj?_g_%@(QCj%)Gxp{i}p`9hrDBKmpyrfF@8hZcpTv*EL+Pbki)c$-LCdDX46za z^guRU1tZ))#B$$Nm8neHCoWqa*ur0QCmz{~b|y{m8EYv5(3`_|a{+tc0~ z;AMN})W5?frT&P{zo+xZbf)MK9a8@h&idYJXR{4JtpI}C4&PCK!i4{c&Y#k`ht7Y7 z;|~OEkcwu5tp702w}myxT|pA5_xK4hA-e_PB$CIW5wY5WaM@10lp`85!f}k#uK{e4 zqGyYXwS^k9uTfZwB2S=kL%u0S6kl6iQegDxDjX$nS*q90DbN#VMS6N23Va}Z1G3c= za1r>a24XQd#t+SR%MV55QHuT47iTpeem36rqu?)w?6>)uBj6;0{{J29ss94!&+wIx zKLHx2LqT~!xa(4NEdjm$i|jjn@&?k98q7wS4x}c*UIhLHEfBO=j(w4@!l@l7A_QV< z4JwW>rgG|naU&g}gGI3T1@n}quB`_0S&<}P2{@=m51Ukt5HI+wQTDT?_W{r$|AHw? 
z=c)@UlCQ2X;`zobDAa$4M*1V9_QiY0RsR)lX5Os6gdg>nj0MGsRrTNa_|0N&>dVaK zHL*7JSB&xyoww2HU~IdCl@LruH<-$T3Hlibs$=#jH2}Cm$m{A?k=!4k0DFMq5CJ4~ zE~2V_jtV@LdTqr~f6c&Oq9cf)plX7&5swqJ&L08woR~lRwBj_0Q`-JGGmk=-R@VX@ zkPL_mQkfzvbC#J@;rNqV)^dn$5HZv^#QW&`TRMUrmfiq%s079mV)4sN_zIm@>3p5e z4LX$a(J~5v@)G5NfM&l1`BTGuJ*+W6Gy|Fe;LkTbx6B=6fkY$?hC=HwASOePfj&Vv zmC59V0Q0SYlM2xnl2aHMJjR*;_%2DGG%GDU>u#rv` zkS}moVz4H#FQ~|+zYgp(%}`>c1w%pwKhs!R`lV8yiVr=wXDDXg3-X3jVh}yBKrlA^ zjKV|g?IgY~#~4Rg!p*=nHgZk*ZS_Nb##l*iK$6BbL2P~ooL)%S0AvY>@gl7u(i=~w zHvs-Z21FCq3m^{N;&nTj-Ky2~Q+eN@lW7%{lOK-S59@&4-(z8X>`8-$0oJ0m4}!wA7nMCn+G6xl-| z_%;)(%QnCaYD8LFOk{tX3cU{;a!_tq2B0tK2zxur4RtzDON43d@r+;S=x$do#GY3C zLYJ~P-gP~#K-^Qa;47eR=2U?ZDMP4b*!n7Km>Uql!}g%|F!(3w2n4l{zG*l!qZ$I~fhS0*4$02}ZJiLM zE|V~X)XL9ypmPWyXw`T0PM_{8(w*I?JLIX;RDZ0in?n70q}?`nMPwR6_Jmex*H8wD zKpMa}P2X-*?TU#5)WcJ{1aX68Hc5ejyio+@7{Um0`s;f4We`+vJI9j*!2Pfc#!1;!ye}`ep#G`8?%Kl$1nO@>mel~vLg+<96h#ok z3X3M7mi|R^k^00|pu9%x8jLi7>puiPKVsiN?WIp3alsOv=i>u(7!&%rICh~3 zq)K=KPvE@_?9)fFAYp!vr=<4w=_5=YlyG$pVz3nKF~QmdonJ?4VS1H!#m}A!ao7OF zj#kWyUi>iDl+X+02z@hjdgeFh#Gi3~--iOgQL*yIB%swva}t~r(-HnMk|RTeal)ARmFJE) ztT0>p>;gn_`-CY{z$6{;M}s~;1A9Z!xx4;~oN~ddE=Vz8Uj+I$B2(RL&UQGHvvMdy zkE-QST+0UMyTdkr4F-x7_xEC=02toNK+#-w0KGWg0RrQoC8z>^CShRc=kd_BtE;rh zAtj%A_=$(e90}xVUx)I)RZM9N14%I@&NTNGtc7+;jR@%Tp(LKPAXE6H1)1bY7y)?^(8+JYh=7PL z@-9I|K+fB-#=e6$b~a%v9?0UBqJ2f{LP+>KMG4}J$L*S|)v}f8` z?;z40!gpwSMiZTZ$0jxpLomzO7y+?=qu9|3{bathg<3qXDb<}l$G!ioT65;1P6}+DK)xnU zYPx5m+JsG#uI(*OHrCbBIH&aEhlbj$2_WPR?MJwAIJ+Y+}U zH`o&?s>0d@>awBqQ%BDcq%5t%JCC!wS&0{a|9+&<5Dz?l_sw-ykLRFES zggtg<2JESdCAviLzhd+Yd?gg@h;&MVf(MW#IH{opoCHMku<{yx(Io zk=W>=wvN3KYU?*bXzfVV2xU~GBtrA-VTrSEr*UGr-vtk(F954qH_}!J-0(MjeIWOZ zQ9y?Q??nXfd>YsHt$3+acHm z2x|e^#y+eQKXHR*hLf~S#|rePr~zE)vQ(7vLv_iADG2DO)3m3jF=@`|Eowk41dU33 z3`h~kwggkIFdVW7aMK{A8n9*oB_I#TkC;=*({dg{`nH6~ z;Z-5`HoZ(K0jmE{wxdYUr2>xnnZ%1gjP65cLB{Y~P@PjR4tEKAaB=vf+--XIIDgk; zkDfXEx$lTdSAD^q7JpIs_Gmo6{bL6~9BHam}W-dvnONyN+zd#FK<4kB3)tSS=s0MZO4KK=^kK_7*&atO=oP&^S6 zNiX*WVSV&dW&kRAa2FxA1so&=hE2Z%RM{S|2cZNwAQ;IYFp?pu@Ee%S>H#!NJxJ#v zIy();kD@o!yBOm%oVgwa1ufL`Idn(NwilToBzmxm-4DQN!vr0GC;XB;fcXt;%^^T< z8=nABPiZEQH4KtPK+rHCbAojxsYF4UsX(&O>7)1Utk{OJs&7R#1s5sdvz7iHs#$5% zIibCmQ?xUu#r$aWMKc$G-0KE0IWOdKUMu~TC=O*wlop;cA1kOxuXYh+GZa)j& zSD-NJ+ozB%H>{KRy5A4SBcB$V56se01Pkh7%SlCS&{79! zf^azlssc4@p+dz2A7X(Jbh}2HDkuozGp|v}2ip3egeEp$J0{h95R*Fjws{}(%X`GB zK`EuxFRi47tvyZib&s;^DWKgmRKY8Mpy^!FM=-2ixZX@b-H8_IWaJgVJn?yFVP&-i zk#-j`Y!628#Uv!WXNtYxYtQ+< z9Ux06&hP75GrD;e?K8iQP&Z+KA7K;rBadtXV(Y^MYps_7^hd(n1qgFb=ki}0zpr{4 zab;D763@ioV*<}l&G*<4pNHL~lN=e@L&cu$0u9arM~AIqED|k=#f^`r(-M{5A(5A=&ROJ0rx*LIoxHl zNrS$nLr;YS+LHnc&p*?{)kPGGy=lZotQyO?$`Bbf1OYhwl5Hqxk2%*`py3;;)Wd#4RKO7}T+ioN9m9DGV1kij zxj!}kZ9VgRmXsZe>4}<>W6wVmihW>MqRL2%LfQY>{Q7&&qSSLdbAzpcHY3nk8h{Z! 
zd{w;^*c&+-$S|<_tQ1A;1Iuj-B1R}Q?bhn-0#<;s1e3qx0;U}vyPXCpm{o|gM5Y1LT)(EY*q$sRwEuze)B zU7ciIDCdaN0Ox`5^$ah$3=W#w4`fo#BA}nJ{c;szTnNbu)we|7vKUC!CT@v1lIMrC zd8B=4+q8X>_N0CX7gj^4;MmDQEH=QVCz(e$Z){Wy7T7)5M-_a7hL%E2IR!y7+Oy5m zDjCHjpe|3+kDu0N_O%UXi$;}cv0XKAb%D0ll%nzWil2KVw3F-i1#m4*bcC7;Y4jyF z5QS?%dJi1GUt0kWCiKstBVD4BFv1K35=>(dQ)rHvN=eqMmbrc)m{UFDH5YC2c84|cqPg)qLSc_2!EgxPEv_K)_E3SP*+i%kN21y)^bii zg%_V?XOXoA+Kt$s0$(xsE0~Hgs;7bb${LR&S(+-)ZCkRVLN&hyjp|6$#1WaE9q~~u ze!%4V7=rMlB)}dXD%Nm6x(Ff9bKLYVL(er3{1SOukd$jnm|ob-fc*q^A45$@>WT&y zK4P0z&Fd27;vN}RBLMMR=cu|v=w5VkpPGLNQl@8FA*xF9ASYVWeZogCXK}L#HZE*> zI6Gp6ZXX@iy>S>-DoDVSUt>F=6;oD?pQqkObCKK}btgUgu>0UMhj6LDQXSW>pi^#^ z@S_-*Pa^cZnjZ0gmEtr`#!N1+7`9!*iv+=oM)RR$CdkgFD8 z5{QWfUx9`!@Yw~PN05Y`f9m*=(urs0@72(T?tl{H0|Q|yJuSq#B|GHsVdAI6svZ0A zVQq4%ljgV9ZEhPc@jYU9hxJWNSo}V*Gv!$ac z6M|kxBvPP94YNEp)}9=-m~0hPd6wV?pe7iE)%4yjZVX3g{+U>zgzAvIwg{`iEoKcO z2UP?0vW87S?Bfm0H_XPT*a2e1ssqVZ)w8vWvxH--=iFl_-hATt?1?+>y!*tAOg0iN zVx`rZToxMG3s~*ZHU$vLbCxdmbPDIXtP{Kxz+b~O7^K&m?K`FF%=d7XhLVf4Sqq|r zhQ9wk8~SVP58+CnaAaV`Ss+>mlnXSF0 z8ly7Nq0h|sbaH2{hw(~Vx7F}fd}r+e?9hEDNsGZDs7j&vR^Df`4H2i8iW93jz>}ka zjI!MW=^i3^Dx&a+Y6-i56c;M!0BRsEaSoO}SAZd5eq-)3!UFH=>cof@r(?tP9uJqD zh+BGqXdNMx7?efAP;6UsW&L{n6}b;+^gqP0g+2Ou;W zm=a>itRC5C&@>I>z9rU{qa~r%v8Lg`i^xwG4Fa@Vyw!Bv(tI>Ig|6a)rY9(vtOzmu z%T1L>=;saR?FDWDu5(JM%xu?kLTu~{`UFV-8epWt5*_?0H; z1ffE9-C3$GT#mIqOGDxXs8p>S6Q4dUBwg55e(FA3- zXH`Kz9LGBXL2ecK{aA%`9pkx&@L)K>dMdi@!N)R4(3KMCQ?e4ejzPUo^&Ey0$gOj} zbO{7sD*CDK9<1U}mygl0;QGk()rdU0ZLj;JOKpi^fjt0U@U$K|8@u}7f~nLCB&X88*r># z8Qp`N_aHn`!6c%&fiBWo0V@Guf~q~+8I)*G!+^Y=%>4fxCI?81Zve&%H!Hmxbf7U# ztk-KtDEE)U^nV2F3DbiYB|35B>~+~rK}*7^9da4Jsm8<=Rf83+4+ag2&c})icxO=q zq7wq=TF*SguGY&p)6E7q6Pm2w#30B4sgK6TNT7RM>>;7xh)@il3exWKA1cy;I|)K z_Ja6m7`Gq30fY{c$8HZuIQ%f3TX2f_Ee5|s!S8VJJK_u>-LP=wVgHQAa&Z^yh%+jp zCnik!PCO4${qmUb=Ep%A+~pSnQj>Jf`ah9z>5_8O01ZzW4Ks0{!@wU7;R0F()w`#j z2Ztj@>LUc2B~VUnz$x4CP=M%2iE+w}&ivkxvU*a8(-KdpdNEcR38Z9~B{@DHhSQOT zd4X~&<_lsFzhdE+X7E!Um}T_i3;7q^$%UJq5Z>n%Hep(35ezRFzt4IDct3c>c$m8V z2HqEK$9IGbF@%iTkS3^%2|+S5hdbzCFAZ23sAZHiirZ7Y+NzF*Sr9`eo%{uBP zpJ;l?iL}r7_Eb#uFHxkHv)Hs_u@ql1JWO*7PCH`qa)Ql z8jfAsSidP&`gXCmECSA|llXom&zf+`LdzwnjruU_qV4_rDS_voejcFO7Te8R68AnB_Yj|lduErTo7hK1q!#2 zBELJKZX~}R6r#m~iVlAOtcWyNXe0> zR!@65h<1x1Zrj>O+bP&ZNh<~By5$wFn1uw@msG>+-yGomfndQ~n^1j%ub;MXVAlKl zgSu^?4neOD6?^!78pBwQQkfX=I5b$4v}~D zW%S{I^m>pA0`yQWlbmiqc;AH8SP1h&oE{gIHcOKCpa|%XwKK3ZGv<{bcn>uKC~>9Q z{ZJb>oBiHR-C;6U_QQTq3PKer*e5k$@ltLDgSb5`n0_Jy`!Ho!gJ`kpYaY0tdKx&Oe5XI zqQ!IY3NFX>ZbKMCAP6#f)glBjw|@5hpojw55hE{h9~EweYJpK35;vn=K`1@3&~Qf} zJw*sa#KApWW3~PWbJgf*D`1ww9t;?$07hb2aJwO0B^DUwyI$Txf&n%KQ=K?1fo{7SiOLIugbb$H?h2Gj_m>XR zj@!ZcP#-&pD6IkYMW(}SNuZ7yLsW$!9{Zi_#yWQNo};54LGsRJc2RhA;u60 z?RiGC>aa4YKEXiIZy*fwI>uB5CZnRLHnlTAA1#c?xcWH<&>SzraMZUUFMqlcuCzGM zxk8OY-ZZSai(0ztgG@1q8;-l{jmy>L?)5>GF|$X5S2A}xsxa(D^<8w>b9w?vM}9ZM z4l&m}C`$oRNle}N<0N-=#QtE^MLOpaO6jl4x*WLFCYHq#;%pDhF!?jPy$beK5&t3@6hN!7J+0GwHvB zHg3jxDWA*2)ap2Fkq(>IbCj@zf;|9s2;AeYPsu^lPSP*0hYGJ6(FxXI3_eEF#rVE# z37dEzrsVvR(lx`ZfBvk(4xY?;CO?VwuNx(9QrvgB(vAx2pogO2UQ3okepgiDNZ?{Xs!^ zNtk*&M1eH|B-F;*ssp_@r$cnRSMLhzy(b-jBG$HvS&^cu|{JsF}=EIhq zleml^rzCs`a+)P_>KTHbCcDv7Q{9R?oPNwWohSg$T0b7! 
zI9G#4X*p51YXgl?TVyYaR^P(%xR)qy50Ja$PDi01C6;vk!5(ogf5fXf72MHjcb&xQ zAaglJhX|~?%>16EBV|aN+I-xgGsh^zIB^X_a-r_({aNmT7{R~v%cfj6Ay2R5Gte)&VPd*tQSt37X~?5O>gCl)kXTRjo&s=Zc=&-i z=pMl!QKA=#R8cm0o~`jJ2a?%LTm6vs>7|6z>_%WGArdaZy8Oee1o# z`rDXXD-0B8VW2pjHUBQ1GryM3n_o@$nSYxv6#fQ!`m>XUV%`hLHuj--6hH27;cbVG z2LU>*ob{TNJmNl}=w(IQB>_>petQ7Vc`;9%fXJOTPh5bp<57h7gAxb&ky8XsZkU!c zbK_h8?7Qs=X9Nn;lg=1^$3ffMB|cgVoq~^VCxr^QTf*VDr=u8q;M;|ey}|Fk;I|a~ z-sDU-*>Ui()2;PZg*%lklu6^f>P$es~O z;Z4ro++q7o_HE97yqR(=XFqeZk2t$<-PcijRw#@&Ggj`H{pRiB*~gJ4YoF+jiyUuv z4g|3+$t`Q&Vc#ja-4f;26Gw8&+joWWgo=49Hp_ec@DjXJEp<}t-JE{x`aoUZSTw7} z?p}v<_%!B)CLIz9riOo@BnKJzuDdlsJOm1tmiskLAslA44nJQlBu~ibYx#T%Nzz9plSz1ohA^gE|OShbLNK7kBwP`>}Z${7d^s=T~_YW z;d3PX-qpD~6U4YXoWxi3ErEl|`LPTw!yg7oYB~A6(zyud1z|KRXIz7rsC1AC>mMno z3NFB8S~CtJ>8_Ez_NUQA3KoJ|6)~IC6}DTN6t#;R5%UpvXb;=RnoUBLHM*O!)B!!_$=+BV&XzE(XOxUJx6!-rFC|F3vStG(?x(oish>k=u zNOW9IRDfZHIMu&GIqIWy;_a8S=U`ZY!uAzl%QeK(fwgz+1oa`7@&l*^78&QrwsT^P zjS3*2PNs)Z5%m$ISw9p2CB6NyCCQdK!V%=OSN|Zahx*^(3+8~{PF9le~d?p?^3{WN)&w&(3y5d;`OyeuW*h&7}1E zI5gok8!5a@NVbI>NQ|(&$!Fv}1Yf>gTqaR@2t?-u?5GUW(h6*(47T&` zN81C7u)3x5+m(RQE6K@h&qrV_#b}RAV0qL2rq>S}F0he>v#tQe@B)lv=XHfZcz`@f~_?R-0wiJNBV4GnN3AbSjI8h7+`C^xP8x$Rki6BUFZK%Y)X5 z%oCzk7aaF~A+66mb-W|~{Ec+u)6GW@Hbe#|>TYfD=TVW73p*U+G27{JOxro=41Xt2 zrEN}(ZBGisP0VV$_*3w{h|3wd`f$3mmT-ZLP;oWM@8gIH0|;cKcab6gAqx{YQ6QCn zz_Ge4uj4OiFm3ye#w*e|%} zYq*0&w4E8}ReLwiZTFq`T8Rx9!vvoUWI5p%Jk(C&jo4Drzc6_Ql{L-3 zjNZX4W|wJ6qdA0Q9x8E6w~|m#M)8reKB~cw&|K07b|l{vbulo;IWGZ^I@tN-z|;o$JA0Jog1nb=`1|NQ zf*fcOmO3^SD4If%@2_>Hf{;ns0u#!Dm-=u`lna891sflg{J;?i*#Io9`?bCdH83Tk zH_)C1;qM8RJoIZC4yXj%BsfGXb>lZC-0T-c7q0IGb^ z7{(WGKzBz>Q0BQnfTbJOD=@^FjYz3ohv)2T79GP9h~1(pnq4*v4`5ESod7$#2mu1q zOLT;y+DXj@7B;eZ!D>TFiB?g$5yU`$VnbYtRL>VBdttxzll``dN%sP-dc`z{9f50R zz=i|r3XBAUmzaair2>E<=UMVZz^_zQgvwrs7%rj&cLfLw4c9+r&pFiwj-6+9|DR^; zk0IK>VF$By=5zZK>vwLmi-4U5vM_|qWnjCA&KxE3`oNw1$H)nDRHn-xvhaZ*5vCR= zhg- zsDP4FAI>XOKybsa@S#KW;A+5(`J<@z|DHZ>FufbZB(=*ii?rCBzI9Oq zUcbzC3F#>lU5?&8O?FD>TU4vZts1zQqXqKe0&5n2#^Iq+5uTH-kH%~_fjkQ+DCCni zC{QY2(%>_m3&x zq@JQ=(?-^Vl#ZPR>(0is9or}X#Wt45fkYcHA`0IW5oy#N^_hfuP}s7Cy!YF2H}&Sej~b8)rB$me(DKqXY>ku& z1&R13Z+{!DKsz#a`UL~|{uJWh3`8ALx)S3+JBz*mTp({hAG{rky&VqT{&w&dGDA$; z%=7BQ=V5qu^Cpf1OWnwLw?Niy%!4^lSa8eXzGCZp_5p1l#CAOjB)_Xg?cz1lo`7@~ zZhLO0AyqWQT)%DT!kGZSpDyhqBzWUm@bb(=jUsopNV#rJRTesAv< z>8-M-drkKXW0kvOAs4g+q0Jzk?p+D##DfIXRIJ`h-wsN&J^jNllL0~uuRRotnKo;VKD$_c<#w}ZSg#u)bljzXHxcz0~x zNuva93B3>V`NBnM?LOAi{hatqQs+O~l7HrP^Z$eG^4BX1rT>N0;VW@(-UPb0bj>PHxE6-FxpuV3IL=)sY4jez&3etF9eP>_T9M` zm=p!kmXa$mC3=*}{Q0$|C6G681r?;EqTq;0-KoZGX|9Dk5=5O(a-@=iu3)FAU>ojH ze?TaNqM%FzN(rh9NpofpUPT6S!%C;*^~_?cj#Iw0-Y3Z6Vhbj{X6JnC$%p3N(hbd7i~0Q_FSzA%$Rp5FT1U#KX~%-)2APOr1HcQl_ws0^6VMkfC+Wn z^b0zYpVyow=&!HFz#$$S*wHml{YiL)r+y2h1!+J7^9(oNV~t>^FaYr%44;XQ@P2Wa zc)}j1<1h^IVD$b4?qiUL)ZBO+mqR>eB7hqLM@hi0eoE{(_?en(*MOXg#Zp8TI4TM5 zzj^`50OtsEg-4eblqSpm9)NHzG0$`YQW&sOVcwWBaYd}P{@^yi)J|to7g!bX^4E9C zjG7gScmULgY`T9QHe-EAN1~8-@G{n`JRiY51~n4Zxo zgccp#k3xc508HwewWQ(k*1wajC(xEom;~Ei1q^XxeV9v$^g@}&Q0iLn0IyoM<1Roj zsRbH?pRsFlU9@kibGlnOGS^48Y3zDmSL0sHtoL7_jrHZAwd?&W@yCnZkBS|&UR>gu z=}t5N+Dg~6`Wr;sw%;h>@f89Eq51*#Da^%dL7!=0<-Z~Iivif3259fsp9 zwU0tufy{Yp%g zLadPJZ&hpsolhk-2=-?`6PU8`Q_??vKa_xE8p@6Kex3za1h?`nsf6_1=h>t`XO*p0 zmaUo8%%KbRMGsQh16JQ!bt3PWaej_|*~rxIG7qj)tY4XivT>be=zl=Ye@DpK$5Xxt zb8aTg5JT98q@-!cv%%0?GH*^prYsGTQx-1l?%;bz@1Z#%r%$L&{sH31plQ5#(8;TO1E8?yBoMot8*!dQ^A$#tE zXuBXI>W|?QMyT$kltk|xV2-U`z|p4XPr}$5#&9b(^0V9t1sln3i;e7*E=HZBC>gO2 zo`vq0kQ*;k5FLjUMeuOS4YhHG!Dg|!3@Me3G;jp4yXf=M!qUp%iUbJeMR)=P^aRsL ztt5fgCO>9;&v9TQS-~d`5cgOffJuEqXs6&GZ0CSaJkUlw_mo%khF&muV=G}lK(*Cx 
z$Nd`{dBlT07xWXJ{%8VFKg9Cv9EAB;r_n%|4+4YBV=q7w4rlZDHAv*QnQFq$U$uY{ zgz0|_ux1}%%|35ZV9f&aY4-)NCK!I|dvL!ouWZ1T2h0sd7L)3`y{XOJ0>z!eUOa6N z0Q>r1#5ai3?8hVpu#>J7Kh{YRmL0Tr4}R~liwFhtZX#h1{g8!g2hg|T8~G$OPoRA= zP)lqKq9>qxfnFE`M8a$PhNy=D?TMk~o30q6WDt_oS415L+QZOh7`X78zfQKQ9hi)2r zSzI$VWlDpX4nhL&+UOP-P3e;=Pn&AN&8bynqi<8X8|4MihDc?@FMB+9ydHStPrAS} ztMDp?mD8_o3z+V-J#7b=0+vhKIF9=ORzcAOH@o5?wWL~WtLkr2xh;Lct(Sn5`YPV) zDiUN;U!(I<6ei~r!6XwTybhS8t~18VaMlN6SjSO6dHgPW-8$M@JgWW@?;+=q7w}R; zEIdTF)P-BC0Ki0<8}o9y#NA_RvS>j)hdhVruz z>cky807Docn8sO5lzHR}d+p%qigh^O$gBU1_v=Hk1`+y3rUWczW);Ca^$K$_Sb#}~ zkeR;P_h#m}pF$p*(HUB=i%urBKwEw*8blHk>Nl7;!E6ZoDFNs*_NqklGpm6o&00FP z>Sx3Zoe?^`DA!Nx5m%goe%4!so&pDbh!IE?S7Y>z(;?8MEcyy?{1i1D>Plw8y^L9A z7X85tYMrBLW02&+90i)QI4J{w(Ozq;y83`lGt&t=dztf~p1KhpJ~O%PVJ^_xBvb97 zbBNA9I+96W(21PNYCnT-h69jlW!0;QRA4m7P^hJppnzq40WM3s-;^9DJ)YPO;N-{; zF>(Ocoi|9k1hTk9)A^IapeNlymMqKQ0DQm1)%#g=R=&Rg7DpC}2%0|#+W4y`7$Vc4 zl2iQv$_ZBb6$>)ce2+E$N*=dPm?JcqXjyQfV_-f9RffP<7|)t}5fAs|lEVm;4IrOR z36%#6+(q71#`oIzA|l^H;}Om9ACUfuh0;WD3J%!lIDb93rj zFvK&)sTuQ>`j7M;>=C&3hbxdew?n(qvNALw{N0=(KgSvL^K>YQrzNpJPv0-o`87Jf zPKVOj>YvfKi$1Pvf3R}$@wrovzP)nl@h2&%y{(Kkx_FC>HX%lBRrvRjBUrmM;JXk= zYX3VH@)LAOS682*LwH&yv*M8LEXVzmh?gha{p%4X7iw+gR7 zBK$!!rNhBFLtGth<_N`GFEf58+yp#7`m$o1sy>=6$M#temjaSUVCEH+u~4KXG;AB% zs?ip5Yq+>xWE;BeyOs`7bE6vEy&l?@i!396#D69*CwY=m^idx3&(HvjHmyc2GA$ZH zp(MevNCmUC31N}GkJ3Q^1p+}l$shp%E5R@#OnE0)8gB zj%@+h3WEJ-SSTgMk=8g!(G%I{!1S;bhSyN{jOU0)&>)eT8mKVl(S6Y}a}sM+s&$Da zOS#&7)AmJ6DK4*L0(B8uk6ndnLr7g2hK&c4-X^B-T|5ja3VxSm$9! zX+B^L3vI~N;$E~5Bt77Iv>b^3)3HE2p#80IXxIfpgFVDrc6ubNq?8%9JXX=bp#K6^ zYwR->Jh{IBPpBRt=nDvr&u5_|K_e##*dEr_g?Tk$2F6V?TmOI(WS#6XFEY&Imqtx^ zi3mdA>w~WVKiDjIg838ehuM=b28^FL19&$W{1$`Xq2PDe8Kh!Gpx8i#3m_ilVNnQx zWa7-YA1>D&msE_m%I%Vn93d$8t#Ge+J3+ zUoyh`>2%;V9wUo{*YfaCgO9d@3(3gn$BymZS#lpW&?3yAV_gJX6#4Bde3X(q#q-!| z5tRJ}c-HUgkb}41zaVP1jVw2NJ^awm?@4^s{~E8rP6Ff?gn#>Z?IZY&xTX+y$rQkO z?FZgUaogK4@ho{a&;&O&^ovL@loTQ9^8iJHaed(fVCG{1d|2e~f#LHVcx)Sk&6|U~ z&U+bN`JmHYGdWRVhawHGodx+I@DHV|^sdCXJZ-{~#4_tL2+SGzwELk5iEAMl3L#%y zip!-3%McVN*5b*q=n$m>-x%5m5MD+-ejNTm)Y4QRhms@p0|uH?L71gJ(@rlJkvA?@ zbUzjNL8nolfhweqYuW5Qq@s&Fvv~Vk-mr)*hdKW$1(<2g0(F23LFS?6Z6eg17XK|l zTIT#G-e@TD3_BRC>EJaJ>EH(khAG36@`(LqZ#<0Mmb}_|(2p|hA_SrP!0VQ4hDAhr zh+_uYL8?6z6Ox|bb(OdTa}r-%R=GT7_lb~np-PQp*`!POe8VEqGdLFz47sHty z2<*9j|7xpM2MQL*>6@i?Vtbd)zI_WnU?wU0~#sKlA0Am2la z8Bcq`9022hgnS93!d%|ik1t}%y>O!-`ZMmdqySq1jDX2r8P_hwqa99A@a1V*LOp^J zKv#Nf#2TH8bOfvH;F2A@QA|7J*lfZ!T7efiY>ynusIdoj#`PXW;`gwlh{|UZg`q6< z^h1r^Sf_mhxLslNZxnOJLLd-2<0&dEgK6EQ47sPi!a98kj-RF#Uzn(nHb|2MttId# z^SL|{-OJLs!&-w_Uu7H~)Bd=Qqpbo$2QRpM2xOn&{%a+ZSmrdRczLpG2D-CVTvc^Z za64BK{OWnFr9>t3jscGHEwL;F4MRIeBSboLx9k%Flt#xT@gM%hQFM2B=ZkPe~WQiWav$V_BD>5F94C?`Oka3dBluENzb zo)l;Gb|xFrV?`#fs-0T{SUHFLVxfFx{PW?xJ%yoyXw#>Bn^WRKr)#()l4epQ7_=I$7=?LTCLYKK?SD zK8D>z=Omr`;glyrI|66ULNun!jKEWjB6KW4dq`Z7_1?7&N4SNby-)YY1BjJye;AH6 zqA!}A#HKndggb7kMUd#AJYbZJLN=YGf7Hywa!Em~ns9?A4;Ei+zXiOVOp&*m%?$Ss zPiJq*=Z2@Jjt>_SA1pqc_#oUl^IN&)r(Ye;3>On0wARcaIRd^uY+2*xDeWJ%5Oag^ P`DZ?k(1Q7u@#6mn+3oyM literal 0 HcmV?d00001 diff --git a/collie/models/mistral2/configuration_mistraltp.py b/collie/models/mistral2/configuration_mistraltp.py new file mode 100644 index 0000000..ad6691b --- /dev/null +++ b/collie/models/mistral2/configuration_mistraltp.py @@ -0,0 +1,155 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Mistral model configuration""" + +from transformers.configuration_utils import PretrainedConfig +# from transformers.utils import logging +from collie.log.logger import logger + + +# logger = logging.get_logger(__name__) + +MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", + "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json", +} + + +class MistralConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an + Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. + + [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) + [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MistralModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 14336): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to `4096*32`): + The maximum sequence length that this model might ever be used with. Mistral's sliding window attention + allows sequence of up to 4096*32 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention window size. If not specified, will default to `4096`. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + + ```python + >>> from transformers import MistralModel, MistralConfig + + >>> # Initializing a Mistral 7B style configuration + >>> configuration = MistralConfig() + + >>> # Initializing a model from the Mistral 7B style configuration + >>> model = MistralModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mistral" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + max_position_embeddings=4096 * 32, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + sliding_window=4096, + attention_dropout=0.0, + attn_implementation="flash_attention_2", + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + # 调用父类的初始化函数,将一些公共参数传递给父类处理 + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/collie/models/mistral2/model.py b/collie/models/mistral2/model.py new file mode 100644 index 0000000..60d9553 --- /dev/null +++ b/collie/models/mistral2/model.py @@ -0,0 +1,2026 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
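Before the model code, a quick numeric illustration of what the grouped-query-attention defaults documented in `MistralConfig` above imply. This is only a sketch using plain integers copied from the docstring (it does not import the patched modules): it spells out the per-head dimension that `Mistral2Attention.__init__` validates and the key/value replication factor that `repeat_kv()` applies; under tensor parallelism the attention layer further divides the head count by the TP degree at runtime, reading the per-rank count from the column-parallel projection output.

```python
# Head bookkeeping implied by the MistralConfig defaults above
# (hidden_size=4096, num_attention_heads=32, num_key_value_heads=8).
hidden_size = 4096
num_attention_heads = 32      # query heads
num_key_value_heads = 8       # shared key/value heads (GQA)

head_dim = hidden_size // num_attention_heads                       # 128
num_key_value_groups = num_attention_heads // num_key_value_heads   # 4

# Mistral2Attention.__init__ raises a ValueError if this does not hold:
assert head_dim * num_attention_heads == hidden_size

# Each key/value head is shared by 4 query heads; repeat_kv() expands the
# KV tensors by this factor so the eager-attention matmul sees 32 heads
# on both the query and the key/value side.
print(head_dim, num_key_value_groups)  # -> 128 4
```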
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Mistral model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel, dtype_byte_size +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_mistraltp import Mistral2Config + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "Mistral2Config" + +#modified for collie +import torch.distributed as dist +import gc +import json +import os +from collections import OrderedDict +from megatron.core import parallel_state, tensor_parallel +from einops import rearrange +from deepspeed.pipe import LayerSpec, TiedLayerSpec + +from collie.config import CollieConfig +from collie.driver.io import IODriver +from collie.log.logger import logger +from collie.module import ( + ColumnParallelLinearWithoutBias, + ColumnParallelLMHead, + RowParallelLinearWithoutBias, +) +from collie.utils import concat_tensor, dict_as_params, env, progress +from collie.models.base import CollieModelForCausalLM +from collie.models.utils import ( + kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, + kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, +) + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral +class Mistral2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + MistralRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, 
hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + ans = self.weight * hidden_states.to(input_dtype) + # -------------------------------------------------------- + # # 将Tensor转换为列表 + # ans_list = ans.tolist() + # # 指定.json文件的路径 + # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/rms_ans.json' + + # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + # try: + # with open(file_path, 'r', encoding='utf-8') as file: + # results_list = json.load(file) + # except FileNotFoundError: + # results_list = [] + # # 将当前结果添加到列表中 + # results_list.append(ans_list) + # # 将更新后的列表写回.json文件 + # with open(file_path, 'w', encoding='utf-8') as file: + # json.dump(results_list, file, ensure_ascii=False, indent=4) + # file.write('\n') # 在文件末尾添加一个换行符 + # -------------------------------------------------------- + return ans + + +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class Mistral2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +# TODO @Arthur no longer copied from LLama after static cache +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. 
For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Mistral2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + self.up_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.gate_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.down_proj = RowParallelLinearWithoutBias( + self.intermediate_size, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Mistral2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.q_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.k_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.v_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.o_proj = RowParallelLinearWithoutBias( + self.num_heads * self.head_dim, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.rotary_emb = Mistral2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads_tp, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads_tp, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads_tp, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads_tp, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # 
-------------------------------------------------------- + # 将Tensor转换为列表 + ans_list = attn_output.tolist() + # 指定.json文件的路径 + file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/attn_output_1.json' + + # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + try: + with open(file_path, 'r', encoding='utf-8') as file: + results_list = json.load(file) + except FileNotFoundError: + results_list = [] + # 将当前结果添加到列表中 + results_list.append(ans_list) + # 将更新后的列表写回.json文件 + with open(file_path, 'w', encoding='utf-8') as file: + json.dump(results_list, file, ensure_ascii=False, indent=4) + file.write('\n\n\n') # 在文件末尾添加一个换行符 + # -------------------------------------------------------- + + + + return attn_output, attn_weights, past_key_value + + +class Mistral2FlashAttention2(Mistral2Attention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # -------------------------------------------------------- + # 将Tensor转换为列表 + ans_list = attn_output.tolist() + # 指定.json文件的路径 + file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/attn_output_1.json' + + # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + try: + with open(file_path, 'r', encoding='utf-8') as file: + results_list = json.load(file) + except FileNotFoundError: + results_list = [] + # 将当前结果添加到列表中 + results_list.append(ans_list) + # 将更新后的列表写回.json文件 + with open(file_path, 'w', encoding='utf-8') as file: + json.dump(results_list, file, ensure_ascii=False, indent=4) + file.write('\n\n\n') # 在文件末尾添加一个换行符 + # -------------------------------------------------------- + + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
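# In other words (see the alignment note in __init__ above): flash_attn<2.1 builds a
# top-left-aligned causal mask, but with a KV cache the query sits at the bottom-right
# of the (q_len, kv_len) score matrix. The two alignments only agree when q_len == kv_len
# or q_len == 1; for the single-token decode step a bottom-right causal mask masks
# nothing anyway, so `causal` is simply disabled there instead of risking the wrong mask.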
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
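# With left padding the query_length most recent tokens sit in the rightmost columns
# of the (batch_size, kv_seq_len) mask, so keeping only the last query_length columns
# hands unpad_input() exactly the mask entries that belong to the current query tokens.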
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class Mistral2SdpaAttention(Mistral2Attention): + """ + Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MistralAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, 
value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.tp_size)) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +MISTRAL_ATTENTION_CLASSES = { + "eager": Mistral2Attention, + "flash_attention_2": Mistral2FlashAttention2, + "sdpa": Mistral2SdpaAttention, +} + + +class MistralDecoderLayer(nn.Module): + def __init__(self, config: CollieConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + + self.mlp = Mistral2MLP(config) + self.input_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + # -------------------------------------------------------- + # # 将Tensor转换为列表 + # ans_list = [tensor.tolist() for tensor in outputs] + # # 指定.json文件的路径 + # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/decoder_outputs.json' + + # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + # try: + # with open(file_path, 'r', encoding='utf-8') as file: + # results_list = json.load(file) + # except FileNotFoundError: + # results_list = [] + # # 将当前结果添加到列表中 + # results_list.append(ans_list) + # # 将更新后的列表写回.json文件 + # with open(file_path, 'w', encoding='utf-8') as file: + # json.dump(results_list, file, ensure_ascii=False, indent=4) + # file.write('\n') # 在文件末尾添加一个换行符 + # -------------------------------------------------------- + + return outputs + + +MISTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MistralConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class Mistral2PreTrainedModel(PreTrainedModel): + config_class = Mistral2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
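+
+    As a rough illustration of the expected shapes (hypothetical values): for `input_ids` of
+    shape `(2, 7)` with a matching `attention_mask`, the model returns `last_hidden_state` of
+    shape `(2, 7, config.hidden_size)`, and with `use_cache=True` it additionally returns one
+    cached key/value pair per decoder layer in the `Cache`/legacy format described above.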
+""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class Mistral2Model(nn.Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + def __init__(self, config: CollieConfig): + # super().__init__(config) + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + # self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + + # -------------------------------------------------------- + # # 将Tensor转换为列表 + # ans_list = inputs_embeds.tolist() + # # 指定.json文件的路径 + # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/inputs_embeds.json' + + # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 + # try: + # with open(file_path, 'r', encoding='utf-8') as file: + # results_list = json.load(file) + # except FileNotFoundError: + # results_list = [] + # # 将当前结果添加到列表中 + # results_list.append(ans_list) + # # 将更新后的列表写回.json文件 + # with open(file_path, 'w', encoding='utf-8') as file: + # json.dump(results_list, file, ensure_ascii=False, indent=4) + # file.write('\n') # 在文件末尾添加一个换行符 + # # -------------------------------------------------------- + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] 
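+            # layer_outputs is a tuple: (hidden_states,), optionally followed by the
+            # attention weights when output_attentions=True and by the updated cache
+            # when use_cache=True, hence the 1/2 index switch below.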
+ + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Mistral2ForCausalLM(CollieModelForCausalLM): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config:CollieConfig): + super().__init__(config) + self.model = Mistral2Model(config) + self.vocab_size = config.vocab_size + # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.lm_head = ColumnParallelLinearWithoutBias( + self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False + ) + # Initialize weights and apply final processing + # self.post_init() + # GenerationMixin 需要的额外参数 + self.config.is_decoder = True + if config.model_config.tie_word_embeddings: + self.lm_head.weight = self.embed_tokens.weight + self.main_input_name = "input_ids" + + def clean_cache(self): + self._clean_hidden_states([*self.model.layers, self.lm_head]) + self._set_use_cache(self.model.layers, False) + + def set_cache(self, use_cache): + self._set_use_cache(self.model.layers, use_cache) + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" 
+ >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
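+            # Worked example (illustrative numbers): with max_cache_length = 8, a
+            # cache_length of 8 and one new token, attention_mask has 9 columns and is
+            # cropped below to its last 8 columns so that it stays aligned with the
+            # rolling cache window.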
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. + :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + output = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + else: + output = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + + return [("model", Mistral2Model.pipeline_layers(config)), ("lm_head", output)] + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + protocol: str = "file", # 指定加载state_dict时使用的协议 + **kwargs, + ): + """ + Load state_dict from ``path``. + The format of pretrained model should be the same as that of + `huggingface`. + :return: state_dict. Note that the state_dict should be processed + properly to match the current rank. 
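+
+        A minimal usage sketch (illustrative only; the path and parallel sizes are
+        placeholders)::
+
+            config = CollieConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
+            config.tp_size = 2
+            state_dict = Mistral2ForCausalLM.load_parallel_state_dict(
+                path="/path/to/hf/checkpoint", config=config, protocol="file"
+            )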
+ """ + # 配置加载 + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + # IO驱动初始化 + io_driver = IODriver.from_protocol(protocol) + # 检查文件路径是否存在 + if not io_driver.exists(path): + raise FileNotFoundError(f"folder {path} not found.") + # 初始化存储和处理变量 + state_dict = OrderedDict() + weights = [] + parts = None # 变量用于存储模型分割的部分信息 + # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 + hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 + rank_order = range(dist.get_world_size()) + else: + # 不开启只进行一次循环 + rank_order = range(1) + # 权重文件加载和处理 + for rank in rank_order: + # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 + if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: + # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 + if env.is_pipeline: + # 保存的是 json 格式 + parts = env.pipeline_parts + if hasattr(config, "num_key_value_heads"): + # llama2 (transformers >= 4.31.0) + num_key_value_heads = config.num_key_value_heads + else: + num_key_value_heads = config.num_attention_heads + head_dim = config.hidden_size // config.num_attention_heads + # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 + if ( + io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) + and "COLLIE_PP_PARTS" in os.environ.keys() + ): + weight_map = json.loads( + io_driver.load( + os.path.join(path, "pytorch_model.bin.index.json"), mode="r" + ) + )["weight_map"] + # layers 表示自己需要的层 + layers = env.pipeline_layers_idx + # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) + weights.extend( + [ + value + for key, value in weight_map.items() + if len(key.split(".")) > 2 + and key.split(".")[2].isdigit() + and (int(key.split(".")[2]) + 1) in layers + ] + ) + # 去重 + weights = list(set(weights)) + # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm + if 0 in layers: + weights.append(weight_map["model.tok_embeddings.weight"]) + if max(parts) - 1 in layers: + weights.append(weight_map["output.weight"]) + if max(parts) - 2 in layers: + weights.append(weight_map["model.norm.weight"]) + else: + # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 + weights = [ + weight + for weight in io_driver.list(path) + if weight.endswith(".bin") + ] + with progress( + weights, + desc="Loading state dict", + total=len(weights), + disable=hide_progress, + ) as pbar: + for weight in pbar: + part_state_dict = io_driver.load( + os.path.join(path, weight), mode="rb" + ) + # for key in list(part_state_dict.keys()): + # if "attention.wqkv.weight" in key: + # # qkv_weights = part_state_dict.pop(key) + # qkv_weights = part_state_dict[key] + # print(qkv_weights.shape) + # (wq, wk, wv) = qkv_weights.split( + # [ + # config.hidden_size, + # config.num_key_value_heads * head_dim, + # config.num_key_value_heads * head_dim, + # ], + # dim=0, + # ) + # wq_name = key.replace("wqkv", "wq") + # wk_name = key.replace("wqkv", "wk") + # wv_name = key.replace("wqkv", "wv") + # part_state_dict[wq_name] = wq + # part_state_dict[wk_name] = wk + # part_state_dict[wv_name] = wv + state_dict.update(part_state_dict) + del part_state_dict + if parts is not None: + # 这一步是 pp 的复筛 + layers = env.pipeline_layers_idx + for key in list(state_dict.keys()): + if key.startswith("layers"): + layer = int(key.split(".")[1]) + if layer + 1 not in layers: + state_dict.pop(key) + # if key.endswith("tok_embeddings.weight"): + if 
key.endswith("embed_tokens.weight"): + if 0 not in layers: + state_dict.pop(key) + if key == "norm.weight": + if max(parts) - 2 not in layers: + state_dict.pop(key) + # if key.endswith("output.weight"): + if key.endswith("lm_head.weight"): + if max(parts) - 1 not in layers: + state_dict.pop(key) + # 根据用户配置的新的 tp size 进行分割 + for key in list(state_dict.keys()): + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + "embed_tokens.weight", + ] + col_split = any([key.endswith(filter) for filter in col_filter]) + + if col_split: + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + if dist.is_initialized() and process_exclusion: + # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 + dist.barrier() + return state_dict + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + protocol: str = "file", + ): + """ + Save state_dict to ``path``. + The format of saved state dict should be the same as that of + `huggingface`. + """ + io_driver = IODriver.from_protocol(protocol) + # gather to tp rank 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 pp_size 次循环 + rank_order = range(config.pp_size) + else: + # 不开启只进行一次循环 + rank_order = range(1) + dst = parallel_state.get_tensor_model_parallel_src_rank() + with progress( + rank_order, + desc="Saving model", + disable=int(os.environ.get("RANK", "0")) != 0, + ) as pbar: + for rank in pbar: + if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): + for key in sorted(list(state_dict.keys())): + tensor_list = None + if env.tp_rank == 0: + tensor_list = [ + torch.zeros_like(state_dict[key]) + .to(state_dict[key].dtype) + .cuda() + for _ in range(config.tp_size) + ] + dist.gather( + state_dict[key].cuda(), + dst=dst, + gather_list=tensor_list, + group=env.tp_group, + ) + if env.tp_rank == 0: + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + "embed_tokens.weight", + ] + col_split = any( + [key.endswith(filter) for filter in col_filter] + ) + + if col_split: + state_dict[key] = concat_tensor(tensor_list, dim=0) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + state_dict[key] = concat_tensor(tensor_list, dim=1) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + # 似乎不需要? 
+ # state_dict_keys = state_dict.keys() + # for layer_id in range(config.num_layers): + # qkv_names = [None, None, None] + # for key in state_dict_keys: + # if f"layers.{layer_id}.attention.wq.weight" in key: + # qkv_names[0] = key + # elif f"layers.{layer_id}.attention.wk.weight" in key: + # qkv_names[1] = key + # elif f"layers.{layer_id}.attention.wv.weight" in key: + # qkv_names[2] = key + # qkv_name = qkv_names[0].replace("wq", "wqkv") + # state_dict[qkv_name] = torch.cat( + # [ + # state_dict.pop(qkv_names[0]), + # state_dict.pop(qkv_names[1]), + # state_dict.pop(qkv_names[2]), + # ], + # dim=0 + # ) + + if env.tp_rank == 0: + # Save gathered weights + if env.is_pipeline: + ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" + total_size = 0 + weight_map = {} + for name, weight in state_dict.items(): + weight_size = weight.numel() * dtype_byte_size( + weight.dtype + ) + weight_map[name] = ckpt_name + total_size += weight_size + index_dict = dict( + total_size=total_size, weight_map=weight_map + ) + index_dicts = [None for _ in range(env.pp_size)] + dist.gather_object( + index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group + ) + if env.pp_rank == 0: + total_size = 0 + weight_map = {} + for _index_dict in index_dicts: + total_size += _index_dict["total_size"] + weight_map.update(_index_dict["weight_map"]) + merged_dict = { + "metadata": {"total_size": total_size}, + "weight_map": weight_map, + } + io_driver.save( + json.dumps(merged_dict, indent=2, sort_keys=True) + + "\n", + os.path.join(path, "pytorch_model.bin.index.json"), + ) + + else: + ckpt_name = f"pytorch_model.bin" + ckpt_path = os.path.join(path, ckpt_name) + io_driver.save(state_dict, ckpt_path) + if dist.is_initialized() and process_exclusion: + dist.barrier() + if env.rank == 0: + config.save_pretrained(path, protocol=protocol) + dist.barrier() + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
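+
+    As a concrete (hypothetical) illustration: with `pad_token_id = 2` and
+    `input_ids = [[5, 8, 2, 2]]`, the first pad token sits at index 2, so the hidden state at
+    index 1 (the last non-padding token) is the one passed to the score head.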
+ """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForSequenceClassification(Mistral2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Mistral2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + 
loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/collie/models/mistral2/modelpp.py b/collie/models/mistral2/modelpp.py new file mode 100644 index 0000000..1180a10 --- /dev/null +++ b/collie/models/mistral2/modelpp.py @@ -0,0 +1,1922 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Mistral model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel, dtype_byte_size +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_mistraltp import Mistral2Config + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "Mistral2Config" + +#modified for collie +import torch.distributed as dist +import gc +import json +import os +from collections import OrderedDict +from megatron.core import parallel_state, tensor_parallel +from einops import rearrange +from deepspeed.pipe import LayerSpec, TiedLayerSpec + +from collie.config import CollieConfig +from collie.driver.io import IODriver +from collie.log.logger import logger +from collie.module import ( + 
ColumnParallelLinearWithoutBias, + ColumnParallelLMHead, + RowParallelLinearWithoutBias, +) +from collie.utils import concat_tensor, dict_as_params, env, progress +from collie.models.base import CollieModelForCausalLM +from collie.models.utils import ( + kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, + kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, +) + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral +class Mistral2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + MistralRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class Mistral2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
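+        # For intuition (small hypothetical example): with dim = 8 and base = 10000,
+        # inv_freq holds the four frequencies
+        # [1.0, 10000 ** -0.25, 10000 ** -0.5, 10000 ** -0.75]; the cache built below
+        # stores cos/sin of position * inv_freq (duplicated along the last axis) for
+        # every position up to max_position_embeddings.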
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +# TODO @Arthur no longer copied from LLama after static cache +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
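+
+    Worked sketch (illustrative): for head_dim = 4, a query row q = [q0, q1, q2, q3] becomes
+    q * cos + rotate_half(q) * sin with rotate_half(q) = [-q2, -q3, q0, q1], i.e. the pairs
+    (q0, q2) and (q1, q3) are each rotated by their position-dependent angle.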
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Mistral2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + self.up_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.gate_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.down_proj = RowParallelLinearWithoutBias( + self.intermediate_size, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Mistral2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.q_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.k_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.v_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.o_proj = RowParallelLinearWithoutBias( + self.num_heads * self.head_dim, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.rotary_emb = Mistral2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads_tp, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads_tp, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads_tp, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads_tp, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class Mistral2FlashAttention2(Mistral2Attention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
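For intuition, here is a minimal, self-contained sketch (an illustrative example, not part of the patch) of how a right-padded attention mask becomes the cumulative sequence lengths consumed by the varlen flash-attention kernels; it mirrors the `_get_unpad_data` helper that `_upad_input` calls below.

import torch
import torch.nn.functional as F

# Two sequences of true lengths 3 and 2, right-padded to length 4.
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)            # tensor([3, 2])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
print(indices)     # tensor([0, 1, 2, 4, 5]) -> flat positions of the non-pad tokens
print(cu_seqlens)  # tensor([0, 3, 5], dtype=torch.int32) -> packed-sequence boundaries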
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class Mistral2SdpaAttention(Mistral2Attention): + """ + Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MistralAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + self.num_heads_tp = query_states.shape[2] + self.tp_size = self.num_heads // self.num_heads_tp + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
+ is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.tp_size)) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +MISTRAL_ATTENTION_CLASSES = { + "eager": Mistral2Attention, + "flash_attention_2": Mistral2FlashAttention2, + "sdpa": Mistral2SdpaAttention, +} + + +class MistralDecoderLayer(nn.Module): + def __init__(self, config: CollieConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + + self.mlp = Mistral2MLP(config) + self.input_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +MISTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. 
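Because the attention classes above infer their tensor-parallel split at runtime from the sharded q/k/v projection outputs, a small bookkeeping sketch may help (Mistral-7B-like sizes with a hypothetical `tp_size`; no distributed setup needed): each rank owns `num_heads / tp_size` query heads and feeds `hidden_size / tp_size` features into the row-parallel `o_proj`.

hidden_size = 4096
num_attention_heads = 32
num_key_value_heads = 8                                  # grouped-query attention
head_dim = hidden_size // num_attention_heads            # 128
tp_size = 4                                              # hypothetical tensor-parallel world size

# Column-parallel q/k/v projections shard the head dimension across ranks.
num_heads_tp = num_attention_heads // tp_size            # 8 query heads per rank
num_kv_heads_tp = num_key_value_heads // tp_size         # 2 kv heads per rank
num_key_value_groups = num_heads_tp // num_kv_heads_tp   # 4, same ratio as the global one

# The attention output each rank hands to the row-parallel o_proj.
local_out_features = num_heads_tp * head_dim             # 1024 == hidden_size // tp_size
assert local_out_features == hidden_size // tp_size
print(num_heads_tp, num_kv_heads_tp, num_key_value_groups, local_out_features)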
+ + Parameters: + config ([`MistralConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class Mistral2PreTrainedModel(PreTrainedModel): + config_class = Mistral2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. 
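A short round-trip sketch (illustrative only, with toy shapes) of the two cache formats described above, using the same `DynamicCache` helpers that `Mistral2Model.forward` relies on:

import torch
from transformers.cache_utils import DynamicCache

num_layers, batch, heads, seq, head_dim = 2, 1, 4, 3, 8
# Legacy format: one (key, value) tuple per layer.
legacy = tuple(
    (torch.zeros(batch, heads, seq, head_dim), torch.zeros(batch, heads, seq, head_dim))
    for _ in range(num_layers)
)
cache = DynamicCache.from_legacy_cache(legacy)   # -> Cache object
print(cache.get_seq_length())                    # 3 tokens already cached
round_trip = cache.to_legacy_cache()             # back to the tuple-of-tuples format
print(len(round_trip), round_trip[0][0].shape)   # 2 torch.Size([1, 4, 3, 8])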
+ + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class Mistral2Model(nn.Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + def __init__(self, config: CollieConfig): + # super().__init__(config) + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + # self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if 
input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
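For reference, a hand-rolled sketch of the additive 4D causal mask (illustrative only; the model itself builds it via the `_prepare_4d_causal_attention_mask*` helpers from `transformers.modeling_attn_mask_utils`):

import torch

batch, seq = 1, 4
min_value = torch.finfo(torch.float32).min
# Future positions get a large negative bias, everything else stays 0.
causal = torch.triu(torch.full((seq, seq), min_value), diagonal=1)
mask_4d = causal[None, None, :, :].expand(batch, 1, seq, seq)
print(mask_4d[0, 0])
# tensor([[ 0.0000e+00, -3.4028e+38, -3.4028e+38, -3.4028e+38],
#         [ 0.0000e+00,  0.0000e+00, -3.4028e+38, -3.4028e+38],
#         [ 0.0000e+00,  0.0000e+00,  0.0000e+00, -3.4028e+38],
#         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]])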
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Mistral2ForCausalLM(CollieModelForCausalLM): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config:CollieConfig): + super().__init__(config) + self.model = Mistral2Model(config) + self.vocab_size = config.vocab_size + # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.lm_head = ColumnParallelLinearWithoutBias( + self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False + ) + # Initialize weights and apply final processing + # self.post_init() + # GenerationMixin 需要的额外参数 + self.config.is_decoder = True + if config.model_config.tie_word_embeddings: + self.lm_head.weight = self.embed_tokens.weight + self.main_input_name = "input_ids" + + def clean_cache(self): + self._clean_hidden_states([*self.model.layers, self.lm_head]) + self._set_use_cache(self.model.layers, False) + + def set_cache(self, use_cache): + self._set_use_cache(self.model.layers, use_cache) + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = 
None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = 
past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. + :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + output = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + else: + output = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + + return [("model", Mistral2Model.pipeline_layers(config)), ("lm_head", output)] + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + protocol: str = "file", # 指定加载state_dict时使用的协议 + **kwargs, + ): + """ + Load state_dict from ``path``. + The format of pretrained model should be the same as that of + `huggingface`. + :return: state_dict. Note that the state_dict should be processed + properly to match the current rank. 
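To make the tensor-parallel splitting rule used below concrete, a standalone sketch (toy shapes, plain tensors instead of a real checkpoint): column-parallel weights such as `q_proj`/`gate_proj` are sharded along dim 0 of the `nn.Linear` weight, while row-parallel weights such as `o_proj`/`down_proj` are sharded along dim 1.

import torch

tp_size, tp_rank = 2, 0
hidden_size, intermediate_size = 8, 32

# Column-parallel weight (e.g. gate_proj): split on dim 0.
gate_proj = torch.randn(intermediate_size, hidden_size)        # full weight: (32, 8)
gate_shard = torch.chunk(gate_proj, tp_size, dim=0)[tp_rank]   # this rank keeps (16, 8)

# Row-parallel weight (e.g. down_proj): split on dim 1.
down_proj = torch.randn(hidden_size, intermediate_size)        # full weight: (8, 32)
down_shard = torch.chunk(down_proj, tp_size, dim=1)[tp_rank]   # this rank keeps (8, 16)

print(gate_shard.shape, down_shard.shape)  # torch.Size([16, 8]) torch.Size([8, 16])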
+ """ + # 配置加载 + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + # IO驱动初始化 + io_driver = IODriver.from_protocol(protocol) + # 检查文件路径是否存在 + if not io_driver.exists(path): + raise FileNotFoundError(f"folder {path} not found.") + # 初始化存储和处理变量 + state_dict = OrderedDict() + weights = [] + parts = None # 变量用于存储模型分割的部分信息 + # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 + hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 + rank_order = range(dist.get_world_size()) + else: + # 不开启只进行一次循环 + rank_order = range(1) + # 权重文件加载和处理 + for rank in rank_order: + # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 + if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: + # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 + if env.is_pipeline: + # 保存的是 json 格式 + parts = env.pipeline_parts + if hasattr(config, "num_key_value_heads"): + # llama2 (transformers >= 4.31.0) + num_key_value_heads = config.num_key_value_heads + else: + num_key_value_heads = config.num_attention_heads + head_dim = config.hidden_size // config.num_attention_heads + # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 + if ( + io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) + and "COLLIE_PP_PARTS" in os.environ.keys() + ): + weight_map = json.loads( + io_driver.load( + os.path.join(path, "pytorch_model.bin.index.json"), mode="r" + ) + )["weight_map"] + # layers 表示自己需要的层 + layers = env.pipeline_layers_idx + # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) + weights.extend( + [ + value + for key, value in weight_map.items() + if len(key.split(".")) > 2 + and key.split(".")[2].isdigit() + and (int(key.split(".")[2]) + 1) in layers + ] + ) + # 去重 + weights = list(set(weights)) + # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm + if 0 in layers: + weights.append(weight_map["model.tok_embeddings.weight"]) + if max(parts) - 1 in layers: + weights.append(weight_map["output.weight"]) + if max(parts) - 2 in layers: + weights.append(weight_map["model.norm.weight"]) + else: + # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 + weights = [ + weight + for weight in io_driver.list(path) + if weight.endswith(".bin") + ] + with progress( + weights, + desc="Loading state dict", + total=len(weights), + disable=hide_progress, + ) as pbar: + for weight in pbar: + part_state_dict = io_driver.load( + os.path.join(path, weight), mode="rb" + ) + # for key in list(part_state_dict.keys()): + # if "attention.wqkv.weight" in key: + # # qkv_weights = part_state_dict.pop(key) + # qkv_weights = part_state_dict[key] + # print(qkv_weights.shape) + # (wq, wk, wv) = qkv_weights.split( + # [ + # config.hidden_size, + # config.num_key_value_heads * head_dim, + # config.num_key_value_heads * head_dim, + # ], + # dim=0, + # ) + # wq_name = key.replace("wqkv", "wq") + # wk_name = key.replace("wqkv", "wk") + # wv_name = key.replace("wqkv", "wv") + # part_state_dict[wq_name] = wq + # part_state_dict[wk_name] = wk + # part_state_dict[wv_name] = wv + state_dict.update(part_state_dict) + del part_state_dict + if parts is not None: + # 这一步是 pp 的复筛 + layers = env.pipeline_layers_idx + for key in list(state_dict.keys()): + if key.startswith("layers"): + layer = int(key.split(".")[1]) + if layer + 1 not in layers: + state_dict.pop(key) + # if key.endswith("tok_embeddings.weight"): + if 
key.endswith("embed_tokens.weight"): + if 0 not in layers: + state_dict.pop(key) + if key == "norm.weight": + if max(parts) - 2 not in layers: + state_dict.pop(key) + # if key.endswith("output.weight"): + if key.endswith("lm_head.weight"): + if max(parts) - 1 not in layers: + state_dict.pop(key) + # 根据用户配置的新的 tp size 进行分割 + for key in list(state_dict.keys()): + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + "embed_tokens.weight", + ] + col_split = any([key.endswith(filter) for filter in col_filter]) + + if col_split: + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + if dist.is_initialized() and process_exclusion: + # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 + dist.barrier() + return state_dict + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + protocol: str = "file", + ): + """ + Save state_dict to ``path``. + The format of saved state dict should be the same as that of + `huggingface`. + """ + io_driver = IODriver.from_protocol(protocol) + # gather to tp rank 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 pp_size 次循环 + rank_order = range(config.pp_size) + else: + # 不开启只进行一次循环 + rank_order = range(1) + dst = parallel_state.get_tensor_model_parallel_src_rank() + with progress( + rank_order, + desc="Saving model", + disable=int(os.environ.get("RANK", "0")) != 0, + ) as pbar: + for rank in pbar: + if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): + for key in sorted(list(state_dict.keys())): + tensor_list = None + if env.tp_rank == 0: + tensor_list = [ + torch.zeros_like(state_dict[key]) + .to(state_dict[key].dtype) + .cuda() + for _ in range(config.tp_size) + ] + dist.gather( + state_dict[key].cuda(), + dst=dst, + gather_list=tensor_list, + group=env.tp_group, + ) + if env.tp_rank == 0: + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + "embed_tokens.weight", + ] + col_split = any( + [key.endswith(filter) for filter in col_filter] + ) + + if col_split: + state_dict[key] = concat_tensor(tensor_list, dim=0) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + state_dict[key] = concat_tensor(tensor_list, dim=1) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + # 似乎不需要? 
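The sharded-checkpoint index written below follows the Hugging Face `pytorch_model.bin.index.json` layout; a minimal sketch (toy tensors and a hypothetical shard name) of how `total_size` and `weight_map` are assembled:

import json
import torch
from transformers.modeling_utils import dtype_byte_size

# Toy stand-in for one pipeline stage's gathered weights.
state_dict = {
    "model.layers.0.self_attn.q_proj.weight": torch.zeros(8, 8, dtype=torch.float16),
    "model.layers.0.mlp.gate_proj.weight": torch.zeros(16, 8, dtype=torch.float16),
}
ckpt_name = "pytorch_model-00001-of-00002.bin"
total_size, weight_map = 0, {}
for name, weight in state_dict.items():
    weight_map[name] = ckpt_name
    total_size += weight.numel() * dtype_byte_size(weight.dtype)
index = {"metadata": {"total_size": total_size}, "weight_map": weight_map}
print(json.dumps(index, indent=2, sort_keys=True))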
+ # state_dict_keys = state_dict.keys() + # for layer_id in range(config.num_layers): + # qkv_names = [None, None, None] + # for key in state_dict_keys: + # if f"layers.{layer_id}.attention.wq.weight" in key: + # qkv_names[0] = key + # elif f"layers.{layer_id}.attention.wk.weight" in key: + # qkv_names[1] = key + # elif f"layers.{layer_id}.attention.wv.weight" in key: + # qkv_names[2] = key + # qkv_name = qkv_names[0].replace("wq", "wqkv") + # state_dict[qkv_name] = torch.cat( + # [ + # state_dict.pop(qkv_names[0]), + # state_dict.pop(qkv_names[1]), + # state_dict.pop(qkv_names[2]), + # ], + # dim=0 + # ) + + if env.tp_rank == 0: + # Save gathered weights + if env.is_pipeline: + ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" + total_size = 0 + weight_map = {} + for name, weight in state_dict.items(): + weight_size = weight.numel() * dtype_byte_size( + weight.dtype + ) + weight_map[name] = ckpt_name + total_size += weight_size + index_dict = dict( + total_size=total_size, weight_map=weight_map + ) + index_dicts = [None for _ in range(env.pp_size)] + dist.gather_object( + index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group + ) + if env.pp_rank == 0: + total_size = 0 + weight_map = {} + for _index_dict in index_dicts: + total_size += _index_dict["total_size"] + weight_map.update(_index_dict["weight_map"]) + merged_dict = { + "metadata": {"total_size": total_size}, + "weight_map": weight_map, + } + io_driver.save( + json.dumps(merged_dict, indent=2, sort_keys=True) + + "\n", + os.path.join(path, "pytorch_model.bin.index.json"), + ) + + else: + ckpt_name = f"pytorch_model.bin" + ckpt_path = os.path.join(path, ckpt_name) + io_driver.save(state_dict, ckpt_path) + if dist.is_initialized() and process_exclusion: + dist.barrier() + if env.rank == 0: + config.save_pretrained(path, protocol=protocol) + dist.barrier() + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
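A quick numeric sketch (toy `pad_token_id` and inputs) of the ONNX-friendly trick used below to locate the last non-padding token of each row:

import torch

pad_token_id = 0
input_ids = torch.tensor([[5, 6, 7, 0, 0],    # 3 real tokens -> last index 2
                          [8, 9, 1, 2, 3]])   # no padding    -> last index 4 (via the modulo)
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]
print(sequence_lengths)  # tensor([2, 4])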
+ """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForSequenceClassification(Mistral2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Mistral2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + 
loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/collie/models/mistral2/modeltp.py b/collie/models/mistral2/modeltp.py new file mode 100644 index 0000000..e91037f --- /dev/null +++ b/collie/models/mistral2/modeltp.py @@ -0,0 +1,2254 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Mistral model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel, dtype_byte_size +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_mistraltp import MistralConfig + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MistralConfig" + +#modified for collie +import torch.distributed as dist +import gc +import json +import os +from collections import OrderedDict +from megatron.core import parallel_state, tensor_parallel +from einops import rearrange +from deepspeed.pipe import LayerSpec, TiedLayerSpec + +from collie.config import CollieConfig +from collie.driver.io import IODriver +from collie.log.logger import logger +from collie.module import ( + 
ColumnParallelLinearWithoutBias, + ColumnParallelLMHead, + RowParallelLinearWithoutBias, +) +from collie.utils import concat_tensor, dict_as_params, env, progress +from collie.models.base import CollieModelForCausalLM +from collie.models.utils import ( + kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, + kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, +) + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral +class MistralRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + MistralRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + ans = self.weight * hidden_states.to(input_dtype) + + # # 打印层标准化的输出 + hidden_states_output = ans.detach().cpu().tolist() + data_to_save = {"Layer Norm Output": hidden_states_output} + # 将输出写入 JSON 文件 + with open('a_rms_output.json', 'w') as f: + json.dump(data_to_save, f, indent=4) + + return self.weight * hidden_states.to(input_dtype) + + +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class MistralRotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
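+        # Pre-compute the cos/sin tables for every position up to `max_position_embeddings`;
+        # `forward` rebuilds the cache lazily if a longer sequence is ever seen.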
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +# TODO @Arthur no longer copied from LLama after static cache +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
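+
+    Example (illustrative shape check only, using the `MistralRotaryEmbedding` defined above with arbitrary sizes):
+
+        >>> rotary = MistralRotaryEmbedding(dim=128, max_position_embeddings=4096)
+        >>> q = torch.randn(1, 32, 16, 128)   # [batch, heads, seq_len, head_dim]
+        >>> k = torch.randn(1, 8, 16, 128)    # [batch, kv_heads, seq_len, head_dim]
+        >>> cos, sin = rotary(k, seq_len=16)  # each of shape [seq_len, head_dim]
+        >>> position_ids = torch.arange(16).unsqueeze(0)
+        >>> q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
+        >>> q_rot.shape, k_rot.shape
+        (torch.Size([1, 32, 16, 128]), torch.Size([1, 8, 16, 128]))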
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class MistralMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + self.up_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.gate_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.down_proj = RowParallelLinearWithoutBias( + self.intermediate_size, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + # 打印MLP层输出 + mlp_output = output.detach().cpu().tolist() + data_to_save = {"MLP Output": mlp_output} + # 将输出写入 JSON 文件 + with open('a_mlp_output.json', 'w') as f: + json.dump(data_to_save, f, indent=4) + + return output + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class MistralAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.q_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.k_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.v_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + # aaaa + self.o_proj = RowParallelLinearWithoutBias( + self.num_heads * self.head_dim, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.rotary_emb = MistralRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, # 输入维度 [bsz, q_len, hidden_size] + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) # [bsz, q_len, num_heads * head_dim] + key_states = self.k_proj(hidden_states) # [bsz, q_len, num_key_value_heads * head_dim] + value_states = self.v_proj(hidden_states) # [bsz, q_len, num_key_value_heads * head_dim] + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_heads, head_dim] + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_key_value_heads, head_dim] + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_key_value_heads, head_dim] + ) + + query_states = query_states.transpose(1, 2) # [bsz, num_heads, q_len, head_dim] + key_states = key_states.transpose(1, 2) # [bsz, num_key_value_heads, q_len, head_dim] + value_states = value_states.transpose(1, 2) # [bsz, num_key_value_heads, q_len, head_dim] + + # 打印注意力模块的输出 + # 准备数据以写入 JSON 文件 + attention_outputs = { + "Query states": query_states.detach().cpu().tolist(), + "Key states": key_states.detach().cpu().tolist(), + "Value states": value_states.detach().cpu().tolist() + } + # 将数据写入 JSON 文件 + with open("a_attention_outputs.json", "w") as f: + json.dump(attention_outputs, f, indent=4) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)) + + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # 打印注意力模块的输出 + attention_result = { + "Output weights:": attn_output.detach().cpu().tolist(), + # "Attention weights:": attn_weights.detach().cpu().tolist(), + } + # 将数据写入 JSON 文件 + with open("a_attention_outputs.json", "w") as f: + json.dump(attention_result, f, indent=4) + + return attn_output, attn_weights, past_key_value + + +class MistralFlashAttention2(MistralAttention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
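+        # The flag is consulted in `_flash_attention_forward` below: with flash-attn < 2.1 the causal flag is
+        # dropped for single-token (q_len == 1) decoding, where a top-left aligned mask would be wrong.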
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + # 打印注意力模块的输出 + # 准备数据以写入 JSON 文件 + attention_outputs = { + "Query states": query_states.detach().cpu().tolist(), + "Key states": key_states.detach().cpu().tolist(), + "Value states": value_states.detach().cpu().tolist() + } + # 将数据写入 JSON 文件 + with open("a_flash_attention_outputs.json", "w") as f: + json.dump(attention_outputs, f, indent=4) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # 打印注意力模块的输出 + attention_result = { + "Output weights:": attn_output.detach().cpu().tolist(), + # "Attention weights:": attn_weights.detach().cpu().tolist(), + } + # 将数据写入 JSON 文件 + with open("a_flash_attention_outputs.json", "w") as f: + json.dump(attention_result, f, indent=4) + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
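+            # Disabling `causal` for q_len == 1 is equivalent to full attention over the cached keys, so
+            # correctness is preserved while avoiding the mis-aligned top-left mask of flash-attn < 2.1.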
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
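+            # General case (1 < query_length < kv_seq_len): keep only the mask entries for the most recent
+            # `query_length` positions before unpadding the query.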
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class MistralSdpaAttention(MistralAttention): + """ + Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MistralAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + # 打印注意力模块的输出 + # 准备数据以写入 JSON 文件 + attention_outputs = { + "Query states": query_states.detach().cpu().tolist(), + "Key states": key_states.detach().cpu().tolist(), + "Value states": value_states.detach().cpu().tolist() + } + # 将数据写入 JSON 文件 + with open("a_sdpa_attention_outputs.json", "w") as f: + json.dump(attention_outputs, f, indent=4) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, 
key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.config.tp_size)) + + attn_output = self.o_proj(attn_output) + + # 打印注意力模块的输出 + attention_result = { + "Output weights:": attn_output.detach().cpu().tolist(), + # "Attention weights:": attn_weights.detach().cpu().tolist(), + } + # 将数据写入 JSON 文件 + with open("a_sdpa_attention_outputs.json", "w") as f: + json.dump(attention_result, f, indent=4) + + return attn_output, None, past_key_value + + +MISTRAL_ATTENTION_CLASSES = { + "eager": MistralAttention, + "flash_attention_2": MistralFlashAttention2, + "sdpa": MistralSdpaAttention, +} + + +class MistralDecoderLayer(nn.Module): + def __init__(self, config: CollieConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + config._attn_implementation = "sdpa" + self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.config = config + self.mlp = MistralMLP(config) + self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.idx = layer_idx + # 务必保持变量名一致 + self.use_cache = self.config.model_config.use_cache + self.hidden_states = None + self.output_attentions = False + +class MistralDecoderLayer(nn.Module): + def __init__(self, config: CollieConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + config._attn_implementation = "sdpa" + self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.config = config + self.mlp = MistralMLP(config) + self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.idx = layer_idx + # 务必保持变量名一致 + self.use_cache = self.config.model_config.use_cache + self.hidden_states = None + self.output_attentions = False + + def _forward( + self, + hidden_states: torch.Tensor, + attention_mask: 
Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + # output_attentions: Optional[bool] = False, + # use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + # if "padding_mask" in kwargs: + # warnings.warn( + # "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + # ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + # output_attentions=output_attentions, + # use_cache=use_cache, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + # outputs = (hidden_states,) + + # if output_attentions: + # outputs += (self_attn_weights,) + + # if use_cache: + # outputs += (present_key_value,) + + return hidden_states, present_key_value + + def forward(self, inputs: dict): + layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) + + if self.config.checkpointing and self.training: + hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( + self._forward, + inputs["hidden_states"], + inputs.get("attention_mask", None), + inputs.get("position_ids", None), + layer_past, # inputs.get("past_key_values", None), + ) + else: + hidden_states, new_layer_past = self._forward( + inputs["hidden_states"], + inputs.get("attention_mask", None), + inputs.get("position_ids", None), + layer_past + ) # **inputs + inputs["hidden_states"] = hidden_states + + inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) + return inputs + + + # def _forward( + # self, + # hidden_states: torch.Tensor, + # attention_mask: Optional[torch.Tensor] = None, + # position_ids: Optional[torch.LongTensor] = None, + # past_key_value: Optional[Tuple[torch.Tensor]] = None, + # # output_attentions: Optional[bool] = False, + # # use_cache: Optional[bool] = False, + # **kwargs, + # ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + # # if "padding_mask" in kwargs: + # # warnings.warn( + # # "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + # # ) + # """ + # Args: + # hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + # attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + # `(batch, sequence_length)` where padding elements are indicated by 0. + # output_attentions (`bool`, *optional*): + # Whether or not to return the attentions tensors of all attention layers. See `attentions` under + # returned tensors for more detail. + # use_cache (`bool`, *optional*): + # If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + # (see `past_key_values`). + # past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + # """ + + # residual = hidden_states + + # hidden_states = self.input_layernorm(hidden_states) + + # # Self Attention + # hidden_states, self_attn_weights, present_key_value = self.self_attn( + # hidden_states=hidden_states, + # attention_mask=attention_mask, + # position_ids=position_ids, + # past_key_value=past_key_value, + # # output_attentions=output_attentions, + # # use_cache=use_cache, + # **kwargs, + # ) + # hidden_states = residual + hidden_states + + # # Fully Connected + # residual = hidden_states + # hidden_states = self.post_attention_layernorm(hidden_states) + # hidden_states = self.mlp(hidden_states) + # hidden_states = residual + hidden_states + + # # outputs = (hidden_states,) + + # # if output_attentions: + # # outputs += (self_attn_weights,) + + # # if use_cache: + # # outputs += (present_key_value,) + + # return hidden_states, present_key_value + + # def forward(self, inputs: dict): + # layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) + + # if self.config.checkpointing and self.training: + # hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( + # self._forward, + # inputs["hidden_states"], + # inputs.get("attention_mask", None), + # inputs.get("position_ids", None), + # layer_past, # inputs.get("past_key_values", None), + # ) + # else: + # hidden_states, new_layer_past = self._forward( + # inputs["hidden_states"], + # inputs.get("attention_mask", None), + # inputs.get("position_ids", None), + # layer_past + # ) # **inputs + # inputs["hidden_states"] = hidden_states + + # inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) + # return inputs + + # def forward( + # self, + # hidden_states: torch.Tensor, + # attention_mask: Optional[torch.Tensor] = None, + # position_ids: Optional[torch.LongTensor] = None, + # past_key_value: Optional[Tuple[torch.Tensor]] = None, + # output_attentions: Optional[bool] = False, + # use_cache: Optional[bool] = False, + # **kwargs, + # ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + # if "padding_mask" in kwargs: + # warnings.warn( + # "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + # ) + # """ + # Args: + # hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + # attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + # `(batch, sequence_length)` where padding elements are indicated by 0. + # output_attentions (`bool`, *optional*): + # Whether or not to return the attentions tensors of all attention layers. See `attentions` under + # returned tensors for more detail. 
+ # use_cache (`bool`, *optional*): + # If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + # (see `past_key_values`). + # past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + # """ + + # residual = hidden_states + + # hidden_states = self.input_layernorm(hidden_states) + + # # Self Attention + # hidden_states, self_attn_weights, present_key_value = self.self_attn( + # hidden_states=hidden_states, + # attention_mask=attention_mask, + # position_ids=position_ids, + # past_key_value=past_key_value, + # output_attentions=output_attentions, + # use_cache=use_cache, + # **kwargs, + # ) + # hidden_states = residual + hidden_states + + # # Fully Connected + # residual = hidden_states + # hidden_states = self.post_attention_layernorm(hidden_states) + # hidden_states = self.mlp(hidden_states) + # hidden_states = residual + hidden_states + + # outputs = (hidden_states,) + + # if output_attentions: + # outputs += (self_attn_weights,) + + # if use_cache: + # outputs += (present_key_value,) + + # return outputs + + +MISTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MistralConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class MistralPreTrainedModel(PreTrainedModel): + config_class = MistralConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class MistralModel(nn.Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + def __init__(self, config: CollieConfig): + # super().__init__(config) + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + # aaaa + # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.embed_tokens = tensor_parallel.VocabParallelEmbedding( + config.vocab_size, config.hidden_size, params_dtype=torch.float32 + ) + self.layers = nn.ModuleList( + [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + config._attn_implementation = "sdpa" + self._attn_implementation = config._attn_implementation + self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + # Initialize weights and apply final processing + # self.post_init() + + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + # aaaa + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + # 打印嵌入层输出 + embeddings_output = inputs_embeds.detach().cpu().tolist() + data_to_save = {"Embeddings Output": embeddings_output} + # 将输出写入 JSON 文件 + with open('a_embeddings_output.json', 'w') as f: + json.dump(data_to_save, f, indent=4) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
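+            # Returns either a 4D additive mask of shape (batch, 1, q_len, kv_len) or None, in which case
+            # scaled_dot_product_attention falls back to its built-in causal masking.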
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + inputs = { + "input_ids": input_ids, + "hidden_states": hidden_states, + "attention_mask": attention_mask, + "position_ids": position_ids, + "past_key_values": past_key_values, + "output_attentions": output_attentions, + "use_cache": use_cache, + } + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + # for decoder_layer in self.layers: + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + # all_hidden_states += (hidden_states,) + all_hidden_states += (inputs["hidden_states"],) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + # hidden_states, + # attention_mask, + # position_ids, + # past_key_values, + # output_attentions, + # use_cache, + inputs, + ) + else: + layer_outputs = decoder_layer( + # hidden_states, + # attention_mask=attention_mask, + # position_ids=position_ids, + # past_key_value=past_key_values, + # output_attentions=output_attentions, + # use_cache=use_cache, + inputs, + ) + inputs.update(layer_outputs) + + # hidden_states = layer_outputs[0] + hidden_states = inputs["hidden_states"] + + if use_cache: + # next_decoder_cache = layer_outputs[2 if output_attentions else 1] + next_decoder_cache = inputs["addition_info"][1 if output_attentions else 0] + + if output_attentions: + # all_self_attns += (layer_outputs[1],) + all_self_attns += (inputs["addition_info"][0],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + # past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + past_key_values=past_key_values, + ) + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. 
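+        The returned ``LayerSpec``/``TiedLayerSpec`` entries are lazy building blocks for the pipeline engine;
+        when ``tie_word_embeddings`` is set, ``embed_tokens`` is registered as a tied layer so its weight can be
+        shared with the LM head stage.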
+ :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + embed_tokens = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="input_ids", output_keys="hidden_states"), + tensor_parallel.VocabParallelEmbedding, + config.vocab_size, + config.hidden_size, + ) + else: + embed_tokens = LayerSpec( + dict_as_params(input_keys="input_ids", output_keys="hidden_states"), + tensor_parallel.VocabParallelEmbedding, + config.vocab_size, + config.hidden_size, + ) + + layers = [ + LayerSpec(MistralDecoderLayer, config, i) for i in range(config.num_hidden_layers) + ] + norm = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="hidden_states"), + MistralRMSNorm, + hidden_size=config.hidden_size, + eps=config.rms_norm_eps, + ) + + return [ + ("embed_tokens", embed_tokens), + ("layers", layers), + ("norm", norm), + ] + +class MistralForCausalLM(CollieModelForCausalLM): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config:CollieConfig): + super().__init__(config) + self.model = MistralModel(config) + self.vocab_size = config.vocab_size + # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.lm_head = ColumnParallelLinearWithoutBias( + self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False + ) + # Initialize weights and apply final processing + # self.post_init() + # GenerationMixin 需要的额外参数 + self.config.is_decoder = True + if config.model_config.tie_word_embeddings: + self.lm_head.weight = self.embed_tokens.weight + self.main_input_name = "input_ids" + + def clean_cache(self): + self._clean_hidden_states([*self.model.layers, self.lm_head]) + self._set_use_cache(self.model.layers, False) + + def set_cache(self, use_cache): + self._set_use_cache(self.model.layers, use_cache) + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. 
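+            # (Worked example with hypothetical shapes: if past_length == 10 and input_ids has 12
+            #  columns, case 2 keeps input_ids[:, 10:], i.e. the 2 newest tokens; if the attention_mask
+            #  instead has 13 columns, case 1 keeps the last 13 - 10 = 3 tokens.)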
+ + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. + :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + output = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + else: + output = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + + return [("model", MistralModel.pipeline_layers(config)), ("lm_head", output)] + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + protocol: str = "file", # 指定加载state_dict时使用的协议 + **kwargs, + ): + """ + Load state_dict from ``path``. + The format of pretrained model should be the same as that of + `huggingface`. + :return: state_dict. Note that the state_dict should be processed + properly to match the current rank. 
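+
+        Example (a minimal sketch; assumes a HuggingFace-format Mistral checkpoint saved
+        locally under ``/path/to/mistral-7b``)::
+
+            config = CollieConfig.from_pretrained("/path/to/mistral-7b")
+            state_dict = MistralForCausalLM.load_parallel_state_dict(
+                path="/path/to/mistral-7b", config=config
+            )
+            # the returned tensors are already sliced to match this process's
+            # tensor-parallel / pipeline-parallel rank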
+ """ + # 配置加载 + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + # IO驱动初始化 + io_driver = IODriver.from_protocol(protocol) + # 检查文件路径是否存在 + if not io_driver.exists(path): + raise FileNotFoundError(f"folder {path} not found.") + # 初始化存储和处理变量 + state_dict = OrderedDict() + weights = [] + parts = None # 变量用于存储模型分割的部分信息 + # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 + hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 + rank_order = range(dist.get_world_size()) + else: + # 不开启只进行一次循环 + rank_order = range(1) + # 权重文件加载和处理 + for rank in rank_order: + # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 + if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: + # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 + if env.is_pipeline: + # 保存的是 json 格式 + parts = env.pipeline_parts + if hasattr(config, "num_key_value_heads"): + # llama2 (transformers >= 4.31.0) + num_key_value_heads = config.num_key_value_heads + else: + num_key_value_heads = config.num_attention_heads + head_dim = config.hidden_size // config.num_attention_heads + # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 + if ( + io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) + and "COLLIE_PP_PARTS" in os.environ.keys() + ): + weight_map = json.loads( + io_driver.load( + os.path.join(path, "pytorch_model.bin.index.json"), mode="r" + ) + )["weight_map"] + # layers 表示自己需要的层 + layers = env.pipeline_layers_idx + # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) + weights.extend( + [ + value + for key, value in weight_map.items() + if len(key.split(".")) > 2 + and key.split(".")[2].isdigit() + and (int(key.split(".")[2]) + 1) in layers + ] + ) + # 去重 + weights = list(set(weights)) + # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm + if 0 in layers: + weights.append(weight_map["model.embed_tokens.weight"]) + if max(parts) - 1 in layers: + weights.append(weight_map["lm_head.weight"]) + if max(parts) - 2 in layers: + weights.append(weight_map["model.norm.weight"]) + else: + # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 + weights = [ + weight + for weight in io_driver.list(path) + if weight.endswith(".bin") + ] + with progress( + weights, + desc="Loading state dict", + total=len(weights), + disable=hide_progress, + ) as pbar: + for weight in pbar: + part_state_dict = io_driver.load( + os.path.join(path, weight), mode="rb" + ) + # for key in list(part_state_dict.keys()): + # if "attention.wqkv.weight" in key: + # # qkv_weights = part_state_dict.pop(key) + # qkv_weights = part_state_dict[key] + # print(qkv_weights.shape) + # (wq, wk, wv) = qkv_weights.split( + # [ + # config.hidden_size, + # config.num_key_value_heads * head_dim, + # config.num_key_value_heads * head_dim, + # ], + # dim=0, + # ) + # wq_name = key.replace("wqkv", "wq") + # wk_name = key.replace("wqkv", "wk") + # wv_name = key.replace("wqkv", "wv") + # part_state_dict[wq_name] = wq + # part_state_dict[wk_name] = wk + # part_state_dict[wv_name] = wv + state_dict.update(part_state_dict) + del part_state_dict + if parts is not None: + # 这一步是 pp 的复筛 + layers = env.pipeline_layers_idx + for key in list(state_dict.keys()): + if key.startswith("layers"): + layer = int(key.split(".")[1]) + if layer + 1 not in layers: + state_dict.pop(key) + # if key.endswith("tok_embeddings.weight"): + if 
key.endswith("embed_tokens.weight"): + if 0 not in layers: + state_dict.pop(key) + if key == "norm.weight": + if max(parts) - 2 not in layers: + state_dict.pop(key) + # if key.endswith("output.weight"): + if key.endswith("lm_head.weight"): + if max(parts) - 1 not in layers: + state_dict.pop(key) + # 根据用户配置的新的 tp size 进行分割 + for key in list(state_dict.keys()): + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + #"o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + #"down_proj.weight", + "embed_tokens.weight", + ] + col_split = any([key.endswith(filter) for filter in col_filter]) + + if col_split: + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + tensor = ( + list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ + env.tp_rank + ] + .detach() + .clone() + ) + del state_dict[key] + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + state_dict[key] = tensor + if dist.is_initialized() and process_exclusion: + # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 + dist.barrier() + return state_dict + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def save_parallel_state_dict( + state_dict: dict, + path: str, + config: CollieConfig, + process_exclusion: bool = False, + protocol: str = "file", + ): + """ + Save state_dict to ``path``. + The format of saved state dict should be the same as that of + `huggingface`. + """ + io_driver = IODriver.from_protocol(protocol) + # gather to tp rank 0 + if dist.is_initialized() and process_exclusion: + # 如果启动了进程互斥,则要进行 pp_size 次循环 + rank_order = range(config.pp_size) + else: + # 不开启只进行一次循环 + rank_order = range(1) + dst = parallel_state.get_tensor_model_parallel_src_rank() + with progress( + rank_order, + desc="Saving model", + disable=int(os.environ.get("RANK", "0")) != 0, + ) as pbar: + for rank in pbar: + if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): + for key in sorted(list(state_dict.keys())): + tensor_list = None + if env.tp_rank == 0: + tensor_list = [ + torch.zeros_like(state_dict[key]) + .to(state_dict[key].dtype) + .cuda() + for _ in range(config.tp_size) + ] + dist.gather( + state_dict[key].cuda(), + dst=dst, + gather_list=tensor_list, + group=env.tp_group, + ) + if env.tp_rank == 0: + col_filter = [ + # "wq.weight", + # "wk.weight", + # "wv.weight", + # "wqkv.weight", + # "w1.weight", + # "w3.weight", + # "tok_embeddings.weight", + # "output.weight", + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + #"o_proj.weight", + "lm_head.weight", + "gate_proj.weight", + "up_proj.weight", + #"down_proj.weight", + "embed_tokens.weight", + ] + col_split = any( + [key.endswith(filter) for filter in col_filter] + ) + + if col_split: + state_dict[key] = concat_tensor(tensor_list, dim=0) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + + elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): + state_dict[key] = concat_tensor(tensor_list, dim=1) + + if process_exclusion: + # CPU 内存回收(速度很慢) + gc.collect() + # 似乎不需要? 
+ # state_dict_keys = state_dict.keys() + # for layer_id in range(config.num_layers): + # qkv_names = [None, None, None] + # for key in state_dict_keys: + # if f"layers.{layer_id}.attention.wq.weight" in key: + # qkv_names[0] = key + # elif f"layers.{layer_id}.attention.wk.weight" in key: + # qkv_names[1] = key + # elif f"layers.{layer_id}.attention.wv.weight" in key: + # qkv_names[2] = key + # qkv_name = qkv_names[0].replace("wq", "wqkv") + # state_dict[qkv_name] = torch.cat( + # [ + # state_dict.pop(qkv_names[0]), + # state_dict.pop(qkv_names[1]), + # state_dict.pop(qkv_names[2]), + # ], + # dim=0 + # ) + + if env.tp_rank == 0: + # Save gathered weights + if env.is_pipeline: + ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" + total_size = 0 + weight_map = {} + for name, weight in state_dict.items(): + weight_size = weight.numel() * dtype_byte_size( + weight.dtype + ) + weight_map[name] = ckpt_name + total_size += weight_size + index_dict = dict( + total_size=total_size, weight_map=weight_map + ) + index_dicts = [None for _ in range(env.pp_size)] + dist.gather_object( + index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group + ) + if env.pp_rank == 0: + total_size = 0 + weight_map = {} + for _index_dict in index_dicts: + total_size += _index_dict["total_size"] + weight_map.update(_index_dict["weight_map"]) + merged_dict = { + "metadata": {"total_size": total_size}, + "weight_map": weight_map, + } + io_driver.save( + json.dumps(merged_dict, indent=2, sort_keys=True) + + "\n", + os.path.join(path, "pytorch_model.bin.index.json"), + ) + + else: + ckpt_name = f"pytorch_model.bin" + ckpt_path = os.path.join(path, ckpt_name) + io_driver.save(state_dict, ckpt_path) + if dist.is_initialized() and process_exclusion: + dist.barrier() + if env.rank == 0: + config.save_pretrained(path, protocol=protocol) + dist.barrier() + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
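+
+    (Illustrative sketch: with `pad_token_id = 0` and an input row `[5, 7, 9, 0, 0]`, the last
+    non-padding position is index 2, so the logits at position 2 are the ones pooled for that row.)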
+ """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForSequenceClassification(MistralPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = MistralModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + 
loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) From f61b34e08080f43d5e11e1deec1c913f154f5dca Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:06:52 +0800 Subject: [PATCH 03/16] Add Raw Model --- collie/models/mistral/__init__.py | 82 + .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 1210 bytes .../configuration_mistral.cpython-310.pyc | Bin 0 -> 6270 bytes .../modeling_mistral.cpython-310.pyc | Bin 0 -> 41165 bytes .../models/mistral/configuration_mistral.py | 152 ++ .../mistral/convert_mistral_weights_to_hf.py | 276 +++ .../models/mistral/modeling_flax_mistral.py | 741 +++++++++ collie/models/mistral/modeling_mistral.py | 1473 +++++++++++++++++ 8 files changed, 2724 insertions(+) create mode 100644 collie/models/mistral/__init__.py create mode 100644 collie/models/mistral/__pycache__/__init__.cpython-310.pyc create mode 100644 collie/models/mistral/__pycache__/configuration_mistral.cpython-310.pyc create mode 100644 collie/models/mistral/__pycache__/modeling_mistral.cpython-310.pyc create mode 100644 collie/models/mistral/configuration_mistral.py create mode 100644 collie/models/mistral/convert_mistral_weights_to_hf.py create mode 100644 collie/models/mistral/modeling_flax_mistral.py create mode 100644 collie/models/mistral/modeling_mistral.py diff --git a/collie/models/mistral/__init__.py b/collie/models/mistral/__init__.py new file mode 100644 index 0000000..c5fa66e --- /dev/null +++ b/collie/models/mistral/__init__.py @@ -0,0 +1,82 @@ +# Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_flax_available, is_torch_available + + +_import_structure = { + "configuration_mistral": ["MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP", "MistralConfig"], +} + + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_mistral"] = [ + "MistralForCausalLM", + "MistralModel", + "MistralPreTrainedModel", + "MistralForSequenceClassification", + ] + +try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_flax_mistral"] = [ + "FlaxMistralForCausalLM", + "FlaxMistralModel", + "FlaxMistralPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_mistral import MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP, MistralConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_mistral import ( + MistralForCausalLM, + MistralForSequenceClassification, + MistralModel, + MistralPreTrainedModel, + ) + + try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_flax_mistral import ( + FlaxMistralForCausalLM, + FlaxMistralModel, + FlaxMistralPreTrainedModel, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/collie/models/mistral/__pycache__/__init__.cpython-310.pyc b/collie/models/mistral/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0eeae894122b29f98b82399a2066890e840baa6a GIT binary patch literal 1210 zcmZuw%Wm306dgaX&BMGBQqo1*O&4Aiy6Xm2MFbL5ASj7i;mydw1DR^ZCNrifSte3< zRsW&8{*t#{r-38r5B$rrZ=YTtncRuluQ~iWS$qv5;{G_zR@nfm`Jh2+}%0+QLzQ z{ER?gUV|y{jh_lKeX{`sKPC8YxA@I1zLoIP4S*E(!_W9xV$rmlrWrS*NDaB>XiH?majGA9OOuP$%;wSwK<9#+Msd zWgG?*Je@O7N<)fy#5@we>UYo1Pn$hBJZ+0-chEkBt&>5gdjy-Ot>f;;HtaWtvE5(t zTZxFUSY830khQ!y_eifF=T?|7hDf}(%7zS`GY<#IU*a#HI?vG6Tx<@th{rje;Bn$D zE>bCggXyD7M~nA5VvA3m>otoT_)`D*2C}_&Wv!e{j4n5IFU~Y@R3;BO6Q>_II+K3~ zX3LP3At%Bhwji8ff*=$xEi0tN_#vO6v9li=(PAbXGgT1dvKVc^Cn2MV@!CAXgfmUb z=rW{aFeI-8F`b1hg5q7w$I+Z2X3FB)2+Pa#KVDc_778*HWhjXdYka|_$H1e)V{A@I zc;S%*k}f5*G|hhjb|Bki_%o_AM8gQZz6>daDQX!|i&6c*N_;(8Oaot$MAt z`i?_FFxoCdhyxr!Seq?ktFc`0ot!WqijcFbs-^1>YRXUyMSsYtx}rYhG}Zh!*Uh4$ Rs+(AKgJX!Es%BX))xXwfO11z1 literal 0 HcmV?d00001 diff --git a/collie/models/mistral/__pycache__/configuration_mistral.cpython-310.pyc b/collie/models/mistral/__pycache__/configuration_mistral.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0731f2e2d91d302df8b2d492eb9e55ec9352b625 GIT binary patch literal 6270 zcmbtYOK%(36`r9eilimKV>fl$+$v2gx)v$fPU8l(8^xAw0hZ#2kODM};b`WPy!Jfk zJ2O(GGC+X@NH^`O%Pdl$n{0}1`U?v5N6fD1!tf&N0&P+BJ9maNBuzbxQsUt}?mg$+ z^Z3rW*PolKYWUn;eY<_`Elv9e-3)( zUR!h`X@rvk7qNE-{}aiv$xg_!(Q+~qtT7jgSKh$y5%?A zgs&+Ysf5M%KU&;gsxLPr4}52vH(YRRBm(PC1K-g}pcT)nspe z;;RwP7TI9*VOxlR8@qiC+|OnxWsJU7M>b}5lKv$(&~zkTVW(Qq-JtxlsUQ< zC3Z)IT_pg#%*Y7?wj&&e?F#Nd6bpna98-=?Oms3shP9Gzsp^B6W4;`IG~(fURY`s9 z?ia967G5VVDa30Ba2%2SdaOsL))KXoFi8`Z`CCt6XT*uuwp|Miix%k-P`6!b`;X)^271 zl1U$^O6Dsk}jJHvFrEl?=KTJHqv!E<}R1P5qvwg&W+4X=>k(m z!c#%IDNQe+(1I(uX93q_zGM<_h9R5>I~iRB@c*Si#)>7ZD7d+4by@yGDTiP`nr_ed zuCZ-85kI1C*T!r!To3Ya&bX#(s%Q?negx-m`Lj!j 
z3Ewgo)!|wFy2dWExB2;bSj=B-x1uypJ@u4!f(yH$=p8iN20*uf2^+;@p}g9)495m_ z%t!hO0&1V-y2pC^$bqKz)?06cPBD%@@zLYgVfJ(&HaBFRkr<486V>T82)}ohiRXp= zCd7Yac;I6WUIobRyG4iv8kGgwiaZUUQZ0${>Z@Xo*mRygkjO$qs>mX$-w~ZFcR^4Y zSPRsDXAZ1+c%;P52dmHN*8;@`2z=B4cyI)8Iip*1CX76yOPqHUx9QTaiGt;k~ zj1A!4Pcy#Wdj4=zw6IZg>%G6viws- zrXEI&$sSZ~ z@16aVjj4WfUjsS|l=i|833v|Q7w~n-7zW^S%fpcejhhq}0MIhc}^w0MQp$!+?g{+?`<}6;eySL3v%d)88KzB|j%Em{LCnhQfl% zpxCmYcrbVb{|7~|HGtVs5BRh|?u9Qy#euwwUVQNt(mV=s^&q@>qK zOprVf80*_Wi}D*M`%I8~7M+kCFlS@s`V3CMdzzgT^dc2#lT0ra!xNpf)+va_!n|hj z6`6E~#HAo15Ku$}yyn}>!w=n2_kZ)tZpTFYNn;<_o^^zoFFKcs-4a}!AKYJjx@mQZm z0&Lq>Ft*Vh+YcQ!)GBhq7?t7jWgNrJslEwAivz?jY8;Tqx&l9eR01zyCrqM%r&uA1 z&s_NUUWh!fzDN<<#RAPF?|Z8lB5AW|92`&Y?*Y{Ok($>)g@Ua{vSFteTKgQUb(_KW zGxz}pgxl0j9#MHNA7F1+*#p||%3x{(a1{}MTZcyOQSg5DLXev*aM zc`>FNF%EYNSMQ*!+~S;U}wF9n;1&OzbeMtQ&}3DCASprV^Q+@8`%g0Yo2>FJpN||h*s8N z*RTmht$KvD++Im3Gko$yMKSIuA}(MaB|Ze(`Emaaf)IX|l^w=V-LJzAOxfsdEC&;| z-1a}{nc>+2P-+2QD)M5wMh|6(0Skp!u)r)WPt!J>!L&^u!?-O>0TVZ;jB7)|ZzcF0 z4t__RA*bRD2XOMJJw`ZrZbU$1IQyNR@Wbi4eZ`}CPq!$`DrZ;NAji+NW)_*y9ZxmU5FW2JF zp;?rmwtWyfMcDWXidUOB*!aME7D}8>i;*$dx~&)36M~$6U$O!_!Nv|N@R|Fo0HMj= z3hayy?kp5-g9b?=DOm_K_%I$Jfc&x%6f_Yu_a$EF(E%)L@n*b4E zm;~6*w!r@n0p?>_cyrQ(oWZNuAmBj*Ihf3+GYG$B@a)D|J9h`ZE;;>RDg-m30EUE# zPjnYw{E}KGI0zT_I37YRP4#0iETUGR+~SK`7-y-kb+XG9Zx3K)vz$eW~BI)I>j5z20Sr{K$uU}&j+2P0P&V=l<=8~VKGmV_N)&K>xa6X8n$ z!K(BHJnw~c1}JB|zQN@=H>&ZqgY(-jo;g!J1!4!zLBDq}D3T;RLMM;B2r%gQCA=>K zG?5homKl(5HWoj9a(+r@+{3r<_GISgWH;Hf+#<1dDau&XiMdFJEPLcikfg|WU zEx2x+0dPm&16w6*m>vl`ATGGwDk?{vR7kD(rIDwx8lA>Ivh8gq#QGsjc_YFk( zqg@PlXiw1vOqEc+UkOKcAdtKM;2K9EM-oNZO z^8*yGp`^u29cY*Vx3=bfm_NV(0xoYUwDI)}G|D)DFH#C-H3ngYY4^$<=9L@%Wdpq?Ep8;qkqeQbhfWWg!Gz7Xmuv( zCyD(uivJiJ>LOC)Q>9_bVg2kO-kGhaUkK9~k>>R+-tzkvYn zn#O)jm^#Tb$RkmL*H#^uc*F;u8;yMM#Qtex=YHxqbDGd;^d^wlL=f)@4Sd@w8ObuR zInL*HbqzyzCh*)2ALH3 zz@H2)iY{F~`^x1mgQ8PRA^?c+>_i4+pmxm-nI#03{QO7IquAu|6jB9#7)Q0&M7YDo zdN6{p1XzcVU4_4R!+I&7P4jrnEW)fSZ;cF&?9D$^ER5`(ni?skK2sS>eFotfb5*eM k?~LTE=geVQo4z|@Srg`k@EEB+4Byiq!FkF2!->lO3(fVAQ~&?~ literal 0 HcmV?d00001 diff --git a/collie/models/mistral/configuration_mistral.py b/collie/models/mistral/configuration_mistral.py new file mode 100644 index 0000000..20ffba5 --- /dev/null +++ b/collie/models/mistral/configuration_mistral.py @@ -0,0 +1,152 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Mistral model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", + "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json", +} + + +class MistralConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MistralModel`]. 
It is used to instantiate an + Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. + + [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) + [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MistralModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 14336): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to `4096*32`): + The maximum sequence length that this model might ever be used with. Mistral's sliding window attention + allows sequence of up to 4096*32 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention window size. If not specified, will default to `4096`. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
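+
+    (With the defaults above the model uses grouped-query attention: the 32 query heads share
+    8 key/value heads, i.e. each key/value head serves 4 query heads.)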
+ + ```python + >>> from transformers import MistralModel, MistralConfig + + >>> # Initializing a Mistral 7B style configuration + >>> configuration = MistralConfig() + + >>> # Initializing a model from the Mistral 7B style configuration + >>> model = MistralModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mistral" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + max_position_embeddings=4096 * 32, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + sliding_window=4096, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/collie/models/mistral/convert_mistral_weights_to_hf.py b/collie/models/mistral/convert_mistral_weights_to_hf.py new file mode 100644 index 0000000..4ba6236 --- /dev/null +++ b/collie/models/mistral/convert_mistral_weights_to_hf.py @@ -0,0 +1,276 @@ +# Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import gc +import json +import os +import shutil +import warnings + +import torch + +from transformers import ( + LlamaTokenizer, + MistralConfig, + MistralForCausalLM, +) + + +try: + from transformers import LlamaTokenizerFast + + tokenizer_class = LlamaTokenizerFast +except ImportError as e: + warnings.warn(e) + warnings.warn( + "The converted tokenizer will be the `slow` tokenizer. 
To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" + ) + tokenizer_class = LlamaTokenizer + +""" +Sample usage: + +``` +python src/transformers/models/mistral/convert_mistral_weights_to_hf.py \ + --input_dir /path/to/downloaded/mistral/weights --model_size 7B --output_dir /output/path +``` + +Thereafter, models can be loaded via: + +```py +from transformers import MistralForCausalLM, LlamaTokenizer + +model = MistralForCausalLM.from_pretrained("/output/path") +tokenizer = LlamaTokenizer.from_pretrained("/output/path") +``` + +Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions +come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). +""" + +NUM_SHARDS = {"7B": 1} + + +def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): + return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) + + +def read_json(path): + with open(path, "r") as f: + return json.load(f) + + +def write_json(text, path): + with open(path, "w") as f: + json.dump(text, f) + + +def write_model(model_path, input_base_path, model_size, tokenizer_path=None, safe_serialization=True): + # for backward compatibility, before you needed the repo to be called `my_repo/model_size` + if not os.path.isfile(os.path.join(input_base_path, "params.json")): + input_base_path = os.path.join(input_base_path, model_size) + + os.makedirs(model_path, exist_ok=True) + tmp_model_path = os.path.join(model_path, "tmp") + os.makedirs(tmp_model_path, exist_ok=True) + + params = read_json(os.path.join(input_base_path, "params.json")) + num_shards = NUM_SHARDS[model_size] + + # For some reason this is a string in the params.json + sliding_window = int(params["sliding_window"]) + n_layers = params["n_layers"] + n_heads = params["n_heads"] + n_heads_per_shard = n_heads // num_shards + dim = params["dim"] + dims_per_head = dim // n_heads + base = params.get("rope_theta", 10000.0) + inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) + max_position_embeddings = 4096 * 8 + + if tokenizer_path is not None: + tokenizer = tokenizer_class(tokenizer_path) + tokenizer.save_pretrained(model_path) + vocab_size = tokenizer.vocab_size if tokenizer_path is not None else 32000 + + if "n_kv_heads" in params: + num_key_value_heads = params["n_kv_heads"] # for GQA / MQA + num_local_key_value_heads = num_key_value_heads // num_shards + key_value_dim = dims_per_head * num_local_key_value_heads + else: # compatibility with other checkpoints + num_key_value_heads = n_heads + num_local_key_value_heads = n_heads_per_shard + key_value_dim = dim + + # permute for sliced rotary + def permute(w, n_heads=n_heads, dim1=dim, dim2=dim): + return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) + + print(f"Fetching all parameters from the checkpoint at {input_base_path}.") + # Load weights + loaded = [ + torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu") + for i in range(num_shards) + ] + param_count = 0 + index_dict = {"weight_map": {}} + for layer_i in range(n_layers): + filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" + + # Sharded + # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share + # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is + # 
redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned. + + state_dict = { + f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ + f"layers.{layer_i}.attention_norm.weight" + ].clone(), + f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ + f"layers.{layer_i}.ffn_norm.weight" + ].clone(), + } + state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( + torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) + for i in range(num_shards) + ], + dim=0, + ).reshape(dim, dim) + ) + state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( + torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( + num_local_key_value_heads, dims_per_head, dim + ) + for i in range(num_shards) + ], + dim=0, + ).reshape(key_value_dim, dim), + num_key_value_heads, + key_value_dim, + dim, + ) + state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(num_local_key_value_heads, dims_per_head, dim) + for i in range(num_shards) + ], + dim=0, + ).reshape(key_value_dim, dim) + + state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 + ) + state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 + ) + state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 + ) + state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 + ) + + state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq + for k, v in state_dict.items(): + index_dict["weight_map"][k] = filename + param_count += v.numel() + torch.save(state_dict, os.path.join(tmp_model_path, filename)) + + filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" + state_dict = { + "model.norm.weight": loaded[0]["norm.weight"], + "model.embed_tokens.weight": torch.cat([loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1), + "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), + } + + for k, v in state_dict.items(): + index_dict["weight_map"][k] = filename + param_count += v.numel() + torch.save(state_dict, os.path.join(tmp_model_path, filename)) + + # Write configs + index_dict["metadata"] = {"total_size": param_count * 2} + write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) + config = MistralConfig( + hidden_size=dim, + intermediate_size=params["hidden_dim"], + num_attention_heads=params["n_heads"], + num_hidden_layers=params["n_layers"], + rms_norm_eps=params["norm_eps"], + num_key_value_heads=num_key_value_heads, + vocab_size=vocab_size, + rope_theta=base, + max_position_embeddings=max_position_embeddings, + sliding_window=sliding_window, + ) + config.save_pretrained(tmp_model_path) + + # Make space so we can load the model properly now. 
+ del state_dict + del loaded + gc.collect() + + print("Loading the checkpoint in a Mistral model.") + model = MistralForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True) + # Avoid saving this as part of the config. + del model.config._name_or_path + model.config.torch_dtype = torch.float16 + print("Saving in the Transformers format.") + model.save_pretrained(model_path, safe_serialization=safe_serialization) + shutil.rmtree(tmp_model_path) + + +def write_tokenizer(tokenizer_path, input_tokenizer_path): + # Initialize the tokenizer based on the `spm` model + print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") + tokenizer = tokenizer_class(input_tokenizer_path) + tokenizer.save_pretrained(tokenizer_path) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_dir", + help="Location of Mistral weights, which contains tokenizer.model and model folders", + ) + parser.add_argument( + "--model_size", + choices=["7B", "tokenizer_only"], + help="'f' models correspond to the finetuned versions, and are specific to the Mistral2 official release. For more details on Mistral2, checkout the original repo: https://huggingface.co/meta-mistral", + ) + parser.add_argument( + "--output_dir", + help="Location to write HF model and tokenizer", + ) + parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.") + args = parser.parse_args() + spm_path = os.path.join(args.input_dir, "tokenizer.model") + if args.model_size != "tokenizer_only": + write_model( + model_path=args.output_dir, + input_base_path=args.input_dir, + model_size=args.model_size, + safe_serialization=args.safe_serialization, + tokenizer_path=spm_path, + ) + else: + write_tokenizer(args.output_dir, spm_path) + + +if __name__ == "__main__": + main() diff --git a/collie/models/mistral/modeling_flax_mistral.py b/collie/models/mistral/modeling_flax_mistral.py new file mode 100644 index 0000000..0a837f4 --- /dev/null +++ b/collie/models/mistral/modeling_flax_mistral.py @@ -0,0 +1,741 @@ +# coding=utf-8 +# Copyright 2024 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Flax Mistral model.""" +from typing import Optional, Tuple + +import flax.linen as nn +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from transformers.modeling_flax_outputs import ( + FlaxBaseModelOutput, + FlaxBaseModelOutputWithPast, + FlaxCausalLMOutput, + FlaxCausalLMOutputWithCrossAttentions, +) +from transformers.modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, logging +from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward +from .configuration_mistral import MistralConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MistralConfig" +_REAL_CHECKPOINT_FOR_DOC = "mistralai/Mistral-7B-v0.1" +_CHECKPOINT_FOR_DOC = "ksmcg/Mistral-tiny" + +MISTRAL_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a Flax Linen + [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a + regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`MistralConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16`, or + `jax.numpy.bfloat16`. + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`): + Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast + auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaRMSNorm with Llama->Mistral +class FlaxMistralRMSNorm(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.epsilon = self.config.rms_norm_eps + self.weight = self.param("weight", lambda _, shape: jnp.ones(shape), self.config.hidden_size) + + def __call__(self, hidden_states): + variance = jnp.asarray(hidden_states, dtype=jnp.float32) + variance = jnp.power(variance, 2) + variance = variance.mean(-1, keepdims=True) + # use `jax.numpy.sqrt` as `jax.lax.rsqrt` does not match `torch.rsqrt` + hidden_states = hidden_states / jnp.sqrt(variance + self.epsilon) + + return self.weight * jnp.asarray(hidden_states, dtype=self.dtype) + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaRotaryEmbedding with Llama->Mistral +class FlaxMistralRotaryEmbedding(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + head_dim = self.config.hidden_size // self.config.num_attention_heads + self.sincos = create_sinusoidal_positions(self.config.max_position_embeddings, head_dim) + + def __call__(self, key, query, position_ids): + sincos = self.sincos[position_ids] + sin_pos, cos_pos = jnp.split(sincos, 2, axis=-1) + + key = apply_rotary_pos_emb(key, sin_pos, cos_pos) + query = apply_rotary_pos_emb(query, sin_pos, cos_pos) + + key = jnp.asarray(key, dtype=self.dtype) + query = jnp.asarray(query, dtype=self.dtype) + + return key, query + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaMLP with Llama->Mistral +class FlaxMistralMLP(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + embed_dim = self.config.hidden_size + inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * embed_dim + + kernel_init = jax.nn.initializers.normal(self.config.initializer_range) + self.act = ACT2FN[self.config.hidden_act] + + self.gate_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init) + self.down_proj = nn.Dense(embed_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init) + self.up_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init) + + def __call__(self, hidden_states): + up_proj_states = self.up_proj(hidden_states) + gate_states = self.act(self.gate_proj(hidden_states)) + + hidden_states = self.down_proj(up_proj_states * gate_states) + return hidden_states + + +# Copied from transformers.models.llama.modeling_flax_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(tensor, sin_pos, cos_pos): + return (tensor * cos_pos) + (rotate_half(tensor) * sin_pos) + + +# Copied from transformers.models.llama.modeling_flax_llama.create_sinusoidal_positions +def create_sinusoidal_positions(num_pos, dim): + inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim)) + freqs = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32") + + emb = np.concatenate((freqs, freqs), axis=-1) + out = np.concatenate((np.sin(emb)[:, None, :], np.cos(emb)[:, None, :]), axis=-1) + return jnp.array(out[:, :, :num_pos]) + + +# Copied from transformers.models.llama.modeling_flax_llama.rotate_half +def rotate_half(tensor): + """Rotates half the hidden dims of the input.""" + rotate_half_tensor = jnp.concatenate( + (-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1 + ) + return rotate_half_tensor + + +class FlaxMistralAttention(nn.Module): + config: MistralConfig + 
dtype: jnp.dtype = jnp.float32 + + def setup(self): + config = self.config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.attention_softmax_in_fp32 = self.dtype is not jnp.float32 + self.rope_theta = config.rope_theta + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + self.q_proj = nn.Dense(self.num_heads * self.head_dim, use_bias=False, dtype=self.dtype) + self.k_proj = nn.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, dtype=self.dtype) + self.v_proj = nn.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, dtype=self.dtype) + self.o_proj = nn.Dense(self.hidden_size, use_bias=False, dtype=self.dtype) + casual_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool") + self.causal_mask = jnp.triu(casual_mask, k=-config.sliding_window) + self.rotary_emb = FlaxMistralRotaryEmbedding(config, dtype=self.dtype) + + def _split_heads(self, hidden_states, num_heads): + return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.hidden_size,)) + + @nn.compact + # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoSelfAttention._concatenate_to_cache + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slighly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. 
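+            # (For example, with max_length=6, cur_index=3 and one newly written token, the pad
+            # mask computed below is [1, 1, 1, 1, 0, 0]: only cache slots up to and including the
+            # slot just filled are attendable, while the still-empty tail of the cache stays masked.)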
+ pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + deterministic: bool = True, + output_attentions: bool = False, + init_cache: bool = False, + ) -> Tuple[jnp.ndarray, jnp.ndarray]: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = self._split_heads(query_states, self.num_heads) + key_states = self._split_heads(key_states, self.num_key_value_heads) + value_states = self._split_heads(value_states, self.num_key_value_heads) + + key_states, query_states = self.rotary_emb(key_states, query_states, position_ids) + query_length, key_length = query_states.shape[1], key_states.shape[1] + if self.has_variable("cache", "cached_key"): + mask_shift = self.variables["cache"]["cache_index"] + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_mask = lax.dynamic_slice( + self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) + ) + else: + causal_mask = self.causal_mask[:, :, :query_length, :key_length] + + batch_size = hidden_states.shape[0] + causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) + attention_mask = combine_masks(attention_mask, causal_mask) + + if self.has_variable("cache", "cached_key") or init_cache: + key_states, value_states, attention_mask = self._concatenate_to_cache( + key_states, value_states, query_states, attention_mask + ) + key_states = jnp.repeat(key_states, self.num_key_value_groups, axis=2) + value_states = jnp.repeat(value_states, self.num_key_value_groups, axis=2) + + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + + # usual dot product attention + attention_dtype = jnp.float32 if self.attention_softmax_in_fp32 else self.dtype + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + deterministic=deterministic, + dropout_rate=self.config.attention_dropout, + dtype=attention_dtype, + ) + + if self.attention_softmax_in_fp32: + attn_weights = attn_weights.astype(self.dtype) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = self._merge_heads(attn_output) + attn_output = self.o_proj(attn_output) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaDecoderLayer with Llama->Mistral +class FlaxMistralDecoderLayer(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.input_layernorm = FlaxMistralRMSNorm(self.config, dtype=self.dtype) + self.self_attn = FlaxMistralAttention(self.config, dtype=self.dtype) + self.post_attention_layernorm = FlaxMistralRMSNorm(self.config, dtype=self.dtype) + self.mlp = FlaxMistralMLP(self.config, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask=None, + position_ids=None, + deterministic: 
bool = True, + init_cache: bool = False, + output_attentions: bool = False, + ): + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + outputs = self.self_attn( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + ) + # residual connection + attn_output = outputs[0] + hidden_states = residual + attn_output + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + # residual connection + hidden_states = residual + hidden_states + + return (hidden_states,) + outputs[1:] + + +# Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel with GPTNeo->Mistral, GPT_NEO->MISTRAL, transformer->model +class FlaxMistralPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = MistralConfig + base_model_prefix = "model" + module_class: nn.Module = None + + def __init__( + self, + config: MistralConfig, + input_shape: Tuple = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + attention_mask = jnp.ones_like(input_ids) + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape) + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + def init_cache(self, batch_size, max_length): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. 
+ """ + # init input variables to retrieve cache + input_ids = jnp.ones((batch_size, max_length)) + attention_mask = jnp.ones_like(input_ids) + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + init_variables = self.module.init( + jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def __call__( + self, + input_ids, + attention_mask=None, + position_ids=None, + params: dict = None, + past_key_values: dict = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + batch_size, sequence_length = input_ids.shape + + if position_ids is None: + if past_key_values is not None: + raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.") + + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + if attention_mask is None: + attention_mask = jnp.ones((batch_size, sequence_length)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed down to ensure cache is used. 
It has to be made sure that cache is marked as mutable so that it can be changed by FlaxMistralAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + jnp.array(position_ids, dtype="i4"), + not train, + False, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + mutable=mutable, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past_key_values = outputs + outputs["past_key_values"] = unfreeze(past_key_values["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past_key_values = outputs + outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] + + return outputs + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaLayerCollection with Llama->Mistral +class FlaxMistralLayerCollection(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.blocks = [ + FlaxMistralDecoderLayer(self.config, dtype=self.dtype, name=str(i)) + for i in range(self.config.num_hidden_layers) + ] + + def __call__( + self, + hidden_states, + attention_mask=None, + position_ids=None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = False, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for block in self.blocks: + if output_hidden_states: + all_hidden_states += (hidden_states,) + layer_outputs = block( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + # this contains possible `None` values - `FlaxMistralModule` will filter them out + outputs = (hidden_states, all_hidden_states, all_attentions) + + return outputs + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaModule with Llama->Mistral +class FlaxMistralModule(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.hidden_size = self.config.hidden_size + embedding_init = jax.nn.initializers.normal(stddev=self.config.initializer_range) + self.embed_tokens = nn.Embed( + self.config.vocab_size, + self.hidden_size, + embedding_init=embedding_init, + dtype=self.dtype, + ) + self.layers = FlaxMistralLayerCollection(self.config, dtype=self.dtype) + self.norm = FlaxMistralRMSNorm(self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask=None, + position_ids=None, + deterministic=True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + input_embeds = self.embed_tokens(input_ids.astype("i4")) + + outputs = self.layers( + input_embeds, + position_ids=position_ids, + attention_mask=attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + all_hidden_states = outputs[1] + (hidden_states,) + 
outputs = (hidden_states, all_hidden_states) + outputs[2:] + else: + outputs = (hidden_states,) + outputs[1:] + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=outputs[1], + attentions=outputs[-1], + ) + + +@add_start_docstrings( + "The bare Mistral Model transformer outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class FlaxMistralModel(FlaxMistralPreTrainedModel): + module_class = FlaxMistralModule + + +append_call_sample_docstring( + FlaxMistralModel, + _CHECKPOINT_FOR_DOC, + FlaxBaseModelOutputWithPast, + _CONFIG_FOR_DOC, + real_checkpoint=_REAL_CHECKPOINT_FOR_DOC, +) + + +# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaForCausalLMModule with Llama->Mistral +class FlaxMistralForCausalLMModule(nn.Module): + config: MistralConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.model = FlaxMistralModule(self.config, dtype=self.dtype) + self.lm_head = nn.Dense( + self.config.vocab_size, + use_bias=False, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + + def __call__( + self, + input_ids, + attention_mask=None, + position_ids=None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + outputs = self.model( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + lm_logits = self.lm_head(hidden_states) + + if not return_dict: + return (lm_logits,) + outputs[1:] + + return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions) + + +@add_start_docstrings( + """ + The Mistral Model transformer with a language modeling head (linear layer) on top. + """, + MISTRAL_START_DOCSTRING, +) + +# Copied from transformers.models.gptj.modeling_flax_gptj.FlaxGPTJForCausalLM with GPTJ->Mistral +class FlaxMistralForCausalLM(FlaxMistralPreTrainedModel): + module_class = FlaxMistralForCausalLMModule + + def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): + # initializing the cache + batch_size, seq_length = input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since Mistral uses a causal mask, those positions are masked anyways. 
+ # Thus we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if attention_mask is not None: + position_ids = attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "attention_mask": extended_attention_mask, + "position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1 + return model_kwargs + + +append_call_sample_docstring( + FlaxMistralForCausalLM, + _CHECKPOINT_FOR_DOC, + FlaxCausalLMOutputWithCrossAttentions, + _CONFIG_FOR_DOC, + real_checkpoint=_REAL_CHECKPOINT_FOR_DOC, +) diff --git a/collie/models/mistral/modeling_mistral.py b/collie/models/mistral/modeling_mistral.py new file mode 100644 index 0000000..03fad65 --- /dev/null +++ b/collie/models/mistral/modeling_mistral.py @@ -0,0 +1,1473 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch Mistral model."""
+import inspect
+import math
+import warnings
+from typing import List, Optional, Tuple, Union
+import json
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_mistral import MistralConfig
+
+
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+
+    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "MistralConfig"
+
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral
+class MistralRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6, dtype=torch.bfloat16):
+        """
+        MistralRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        # Cast a local copy of the weight instead of re-wrapping `self.weight` in a new
+        # `nn.Parameter` on every forward pass, which would silently replace the registered parameter.
+        weight = self.weight.to(input_dtype)
+        ans = weight * hidden_states.to(input_dtype)
+
+        # Dump the layer-norm output for debugging.
+        hidden_states_output = ans.detach().cpu().tolist()
+        data_to_save = {"Layer Norm Output": hidden_states_output}
+        # Write the output to a JSON file.
+        with open('rms_output.json', 'w') as f:
+            json.dump(data_to_save, f, indent=4)
+        return ans
+
+
+# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral
+# TODO @Arthur no longer copied from LLama after static cache
+class MistralRotaryEmbedding(nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
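+        # (`inv_freq` has shape [dim // 2]; the cache built below stores `cos_cached` and
+        # `sin_cached` tables of shape [max_position_embeddings, dim], which forward() slices to
+        # the requested sequence length and casts to the input dtype.)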
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +# TODO @Arthur no longer copied from LLama after static cache +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class MistralMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False, dtype = torch.bfloat16) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False,dtype = torch.bfloat16) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False,dtype = torch.bfloat16) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + x = x.to(dtype=torch.bfloat16) + output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + # 打印MLP层输出 + mlp_output = output.detach().cpu().tolist() + data_to_save = {"MLP Output": mlp_output} + # 将输出写入 JSON 文件 + with open('mlp_output.json', 'w') as f: + json.dump(data_to_save, f, indent=4) + + return output + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class MistralAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: MistralConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False,dtype = torch.bfloat16) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False,dtype = torch.bfloat16) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False,dtype = torch.bfloat16) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False,dtype = torch.bfloat16) + + self.rotary_emb = MistralRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + # 打印注意力模块的输出 + # 准备数据以写入 JSON 文件 + attention_outputs = { + "Query states": query_states.detach().cpu().tolist(), + "Key states": key_states.detach().cpu().tolist(), + "Value states": value_states.detach().cpu().tolist() + } + # 将数据写入 JSON 文件 + with open("attention_outputs.json", "w") as f: + json.dump(attention_outputs, f, indent=4) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # 打印注意力模块的输出 + attention_result = { + "Output weights:": attn_output.detach().cpu().tolist(), + # "Attention weights:": attn_weights.detach().cpu().tolist(), + } + # 将数据写入 JSON 文件 + with open("attention_outputs.json", "w") as f: + json.dump(attention_result, f, indent=4) + + return attn_output, attn_weights, past_key_value + + +class MistralFlashAttention2(MistralAttention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
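+        # (Concretely, for q_len=2 and kv_len=4, bottom-right alignment lets the two queries see
+        # the first 3 and all 4 keys respectively, while a top-left-aligned mask would only let
+        # them see the first 1 and 2 keys, which is wrong whenever a cached prefix is present.)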
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states.to(torch.bfloat16)) + key_states = self.k_proj(hidden_states.to(torch.bfloat16)) + value_states = self.v_proj(hidden_states.to(torch.bfloat16)) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + # 打印注意力模块的输出 + # 准备数据以写入 JSON 文件 + attention_outputs = { + "Query states": query_states.detach().cpu().tolist(), + "Key states": key_states.detach().cpu().tolist(), + "Value states": value_states.detach().cpu().tolist() + } + # 将数据写入 JSON 文件 + with open("flash_attention_outputs.json", "w") as f: + json.dump(attention_outputs, f, indent=4) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + # 打印注意力模块的输出 + attention_result = { + "Output weights:": attn_output.detach().cpu().tolist(), + # "Attention weights:": attn_weights.detach().cpu().tolist(), + } + # 将数据写入 JSON 文件 + with open("flash_attention_outputs.json", "w") as f: + json.dump(attention_result, f, indent=4) + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
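+            # (Disabling `causal` for single-token queries is safe because the lone query is the
+            # most recent position and may attend to every cached key; the check only works around
+            # the top-left-aligned mask produced by flash_attn < 2.1 during decoding.)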
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
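+            # (For example, a left-padded mask [0, 0, 1, 1, 1] with query_length=2 is sliced to
+            # [1, 1], keeping only the mask entries aligned with the query tokens.)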
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+
+
+# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral
+# TODO @Arthur no longer copied from LLama after static cache
+class MistralSdpaAttention(MistralAttention):
+    """
+    Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `MistralAttention` as the weights of the module stay untouched. The only changes are on the forward pass, to adapt
+    to the SDPA API.
+    """
+
+    # Adapted from MistralAttention.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states.to(torch.bfloat16))
+        key_states = self.k_proj(hidden_states.to(torch.bfloat16))
+        value_states = self.v_proj(hidden_states.to(torch.bfloat16))
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        # Dump the attention module's intermediate tensors for inspection
+        # Gather the data to be written to a JSON file
+        attention_outputs = {
+            "Query states": query_states.detach().cpu().tolist(),
+            "Key states": key_states.detach().cpu().tolist(),
+            "Value states": value_states.detach().cpu().tolist()
+        }
+        # Write the data to a JSON file
+        with open("sdpa_attention_outputs.json", "w") as f:
+            json.dump(attention_outputs, f, indent=4)
+
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+            is_causal=self.is_causal and attention_mask is None and q_len > 1,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        # Dump the attention module's output for inspection
+        attention_result = {
+            "Output weights:": attn_output.detach().cpu().tolist(),
+            # "Attention weights:": attn_weights.detach().cpu().tolist(),
+        }
+        # Write the data to a JSON file (this overwrites the Q/K/V dump above)
+        with open("sdpa_attention_outputs.json", "w") as f:
+            json.dump(attention_result, f, indent=4)
+
+        return attn_output, None, past_key_value
+
+
+MISTRAL_ATTENTION_CLASSES = {
+    "eager": MistralAttention,
+    "flash_attention_2": MistralFlashAttention2,
+    "sdpa": MistralSdpaAttention,
+}
+
+
+class MistralDecoderLayer(nn.Module):
+    def __init__(self, config: MistralConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        # NOTE: force the SDPA attention implementation for every decoder layer,
+        # overriding whatever implementation the config originally requested.
+        config._attn_implementation = "sdpa"
+        self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+
+        self.mlp = MistralMLP(config)
+        self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps, dtype=torch.bfloat16)
+        self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps, dtype=torch.bfloat16)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead."
+            )
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +MISTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MistralConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class MistralPreTrainedModel(PreTrainedModel): + config_class = MistralConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. 
+ + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class MistralModel(MistralPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + def __init__(self, config: MistralConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx, dtype = torch.bfloat16) + self.layers = nn.ModuleList( + [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + config._attn_implementation = "sdpa" + self._attn_implementation = config._attn_implementation + self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps,dtype=torch.bfloat16) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+                )
+                use_cache = False
+
+        past_key_values_length = 0
+
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # Dump the embedding layer output for inspection
+        embeddings_output = inputs_embeds.detach().cpu().tolist()
+        data_to_save = {"Embeddings Output": embeddings_output}
+        # Write the output to a JSON file
+        with open('embeddings_output.json', 'w') as f:
+            json.dump(data_to_save, f, indent=4)
+
+        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
+            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
+            if is_padding_right:
+                raise ValueError(
+                    "You are attempting to perform batched generation with padding_side='right',"
+                    " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to "
+                    " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
+                )
+
+        if self._attn_implementation == "flash_attention_2":
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self._attn_implementation == "sdpa" and not output_attentions:
+            # output_attentions=True can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
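+            # NOTE: for inputs without padding this helper may return `None`; in that case
+            # MistralSdpaAttention relies on `is_causal=True` inside scaled_dot_product_attention
+            # instead of an explicitly materialized 4D mask.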
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class MistralForCausalLM(MistralPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = MistralModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False,dtype = torch.bfloat16) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for 
computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = hidden_states.to(dtype=torch.bfloat16) + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. 
when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForSequenceClassification(MistralPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = MistralModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False,dtype = torch.bfloat16) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == 
"single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) From b2ad2cfa73d1bc0d3cb442a50cca6ba3fcfd3a7f Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:36:09 +0800 Subject: [PATCH 04/16] Add safetensors --- collie/driver/io/file.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/collie/driver/io/file.py b/collie/driver/io/file.py index 0196de4..18427fb 100644 --- a/collie/driver/io/file.py +++ b/collie/driver/io/file.py @@ -4,13 +4,17 @@ import io import torch import shutil +from safetensors.torch import save_file, load_file class FileIODriver(IODriver): @staticmethod def load(path: str, mode: str): assert os.path.exists(path), f"File {path} does not exist." if 'b' in mode.lower(): - return torch.load(path, map_location=torch.device('cpu')) + if path.endswith(".safetensors"): + return load_file(path, device='cpu') + else: + return torch.load(path, map_location=torch.device('cpu')) else: with open(path, 'r') as f: return f.read() From b47c0fb4a4dfdee278598179309a90643b068ee8 Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:39:30 +0800 Subject: [PATCH 05/16] Delete collie/models/mistral/modeling_mistral.py --- collie/models/mistral/modeling_mistral.py | 1473 --------------------- 1 file changed, 1473 deletions(-) delete mode 100644 collie/models/mistral/modeling_mistral.py diff --git a/collie/models/mistral/modeling_mistral.py b/collie/models/mistral/modeling_mistral.py deleted file mode 100644 index 03fad65..0000000 --- a/collie/models/mistral/modeling_mistral.py +++ /dev/null @@ -1,1473 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" PyTorch Mistral model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union -import json - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_mistral import MistralConfig - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "MistralConfig" - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral -class MistralRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6, dtype=torch.bfloat16): - """ - MistralRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - self.weight = nn.Parameter(self.weight.to(input_dtype)) - ans = self.weight * hidden_states.to(input_dtype) - - # 打印层标准化的输出 - hidden_states_output = ans.detach().cpu().tolist() - data_to_save = {"Layer Norm Output": hidden_states_output} - # 将输出写入 JSON 文件 - with open('rms_output.json', 'w') as f: - json.dump(data_to_save, f, indent=4) - return self.weight * hidden_states.to(input_dtype) - - -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class MistralRotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -# TODO @Arthur no longer copied from LLama after static cache -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class MistralMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False, dtype = torch.bfloat16) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False,dtype = torch.bfloat16) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False,dtype = torch.bfloat16) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - x = x.to(dtype=torch.bfloat16) - output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - # 打印MLP层输出 - mlp_output = output.detach().cpu().tolist() - data_to_save = {"MLP Output": mlp_output} - # 将输出写入 JSON 文件 - with open('mlp_output.json', 'w') as f: - json.dump(data_to_save, f, indent=4) - - return output - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class MistralAttention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: MistralConfig, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False,dtype = torch.bfloat16) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False,dtype = torch.bfloat16) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False,dtype = torch.bfloat16) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False,dtype = torch.bfloat16) - - self.rotary_emb = MistralRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # 打印注意力模块的输出 - attention_result = { - "Output weights:": attn_output.detach().cpu().tolist(), - # "Attention weights:": attn_weights.detach().cpu().tolist(), - } - # 将数据写入 JSON 文件 - with open("attention_outputs.json", "w") as f: - json.dump(attention_result, f, indent=4) - - return attn_output, attn_weights, past_key_value - - -class MistralFlashAttention2(MistralAttention): - """ - Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states.to(torch.bfloat16)) - key_states = self.k_proj(hidden_states.to(torch.bfloat16)) - value_states = self.v_proj(hidden_states.to(torch.bfloat16)) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." 
- ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("flash_attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # 打印注意力模块的输出 - attention_result = { - "Output weights:": attn_output.detach().cpu().tolist(), - # "Attention weights:": attn_weights.detach().cpu().tolist(), - } - # 将数据写入 JSON 文件 - with open("flash_attention_outputs.json", "w") as f: - json.dump(attention_result, f, indent=4) - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
- causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class MistralSdpaAttention(MistralAttention): - """ - Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from MistralAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states.to(torch.bfloat16)) - key_states = self.k_proj(hidden_states.to(torch.bfloat16)) - value_states = self.v_proj(hidden_states.to(torch.bfloat16)) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("sdpa_attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, 
-# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral
-# TODO @Arthur no longer copied from LLama after static cache
-class MistralSdpaAttention(MistralAttention):
-    """
-    Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
-    `MistralAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt
-    to the SDPA API.
-    """
-
-    # Adapted from MistralAttention.forward
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        if output_attentions:
-            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
-            logger.warning_once(
-                "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
-                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-            )
-            return super().forward(
-                hidden_states=hidden_states,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_value=past_key_value,
-                output_attentions=output_attentions,
-                use_cache=use_cache,
-            )
-
-        bsz, q_len, _ = hidden_states.size()
-
-        query_states = self.q_proj(hidden_states.to(torch.bfloat16))
-        key_states = self.k_proj(hidden_states.to(torch.bfloat16))
-        value_states = self.v_proj(hidden_states.to(torch.bfloat16))
-
-        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
-        # Dump the projected query/key/value states of the attention module for debugging
-        # Prepare the data to be written to a JSON file
-        attention_outputs = {
-            "Query states": query_states.detach().cpu().tolist(),
-            "Key states": key_states.detach().cpu().tolist(),
-            "Value states": value_states.detach().cpu().tolist()
-        }
-        # Write the data to a JSON file
-        with open("sdpa_attention_outputs.json", "w") as f:
-            json.dump(attention_outputs, f, indent=4)
-
-
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
-
-        if past_key_value is not None:
-            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
-            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-        if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-                raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-                )
-
-        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
-        # Reference: https://github.com/pytorch/pytorch/issues/112577.
-        if query_states.device.type == "cuda" and attention_mask is not None:
-            query_states = query_states.contiguous()
-            key_states = key_states.contiguous()
-            value_states = value_states.contiguous()
-
-        attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=attention_mask,
-            dropout_p=self.attention_dropout if self.training else 0.0,
-            # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
-            is_causal=self.is_causal and attention_mask is None and q_len > 1,
-        )
-
-        attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
-
-        attn_output = self.o_proj(attn_output)
-
-        # Dump the attention module output for debugging
-        attention_result = {
-            "Output weights:": attn_output.detach().cpu().tolist(),
-            # "Attention weights:": attn_weights.detach().cpu().tolist(),
-        }
-        # Write the data to a separate JSON file so the query/key/value dump above is not overwritten
-        with open("sdpa_attention_result.json", "w") as f:
-            json.dump(attention_result, f, indent=4)
-
-        return attn_output, None, past_key_value
-
-
-MISTRAL_ATTENTION_CLASSES = {
-    "eager": MistralAttention,
-    "flash_attention_2": MistralFlashAttention2,
-    "sdpa": MistralSdpaAttention,
-}
-
-
-class MistralDecoderLayer(nn.Module):
-    def __init__(self, config: MistralConfig, layer_idx: int):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        # Force the SDPA attention implementation for this layer
-        config._attn_implementation = "sdpa"
-        self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
-
-        self.mlp = MistralMLP(config)
-        self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps, dtype=torch.bfloat16)
-        self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps, dtype=torch.bfloat16)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = False,
-        **kwargs,
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
-        if "padding_mask" in kwargs:
-            warnings.warn(
-                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure to use `attention_mask` instead."
-            )
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
-                `(batch, sequence_length)` where padding elements are indicated by 0.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            use_cache (`bool`, *optional*):
-                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
-                (see `past_key_values`).
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -MISTRAL_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`MistralConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class MistralPreTrainedModel(PreTrainedModel): - config_class = MistralConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["MistralDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -MISTRAL_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. 
- - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class MistralModel(MistralPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`MistralDecoderLayer`] - - Args: - config: MistralConfig - """ - - def __init__(self, config: MistralConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx, dtype = torch.bfloat16) - self.layers = nn.ModuleList( - [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - config._attn_implementation = "sdpa" - self._attn_implementation = config._attn_implementation - self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps,dtype=torch.bfloat16) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
-                )
-                use_cache = False
-
-        past_key_values_length = 0
-
-        if use_cache:
-            use_legacy_cache = not isinstance(past_key_values, Cache)
-            if use_legacy_cache:
-                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            past_key_values_length = past_key_values.get_usable_length(seq_length)
-
-        if position_ids is None:
-            device = input_ids.device if input_ids is not None else inputs_embeds.device
-            position_ids = torch.arange(
-                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
-            )
-            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
-        else:
-            position_ids = position_ids.view(-1, seq_length).long()
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-
-        # Dump the embedding layer output for debugging
-        embeddings_output = inputs_embeds.detach().cpu().tolist()
-        data_to_save = {"Embeddings Output": embeddings_output}
-        # Write the output to a JSON file
-        with open('embeddings_output.json', 'w') as f:
-            json.dump(data_to_save, f, indent=4)
-
-        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
-            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
-            if is_padding_right:
-                raise ValueError(
-                    "You are attempting to perform batched generation with padding_side='right'"
-                    " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to "
-                    " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
-                )
-
-        if self._attn_implementation == "flash_attention_2":
-            # 2d mask is passed through the layers
-            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
-        elif self._attn_implementation == "sdpa" and not output_attentions:
-            # output_attentions=True cannot be supported when using SDPA, and we fall back on
-            # the manual implementation that requires a 4D causal mask in all cases.
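-            # Roughly: the helper below expands the 2D padding mask into an additive
-            # (batch_size, 1, q_len, kv_len) causal mask (or returns None when SDPA can rely on its
-            # own is_causal fast path), so every decoder layer receives the same ready-made mask.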
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class MistralForCausalLM(MistralPreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = MistralModel(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False,dtype = torch.bfloat16) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for 
computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MistralForCausalLM - - >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - hidden_states = hidden_states.to(dtype=torch.bfloat16) - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Ensure tensors are on the same device - shift_labels = shift_labels.to(shift_logits.device) - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. 
when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The Mistral Model transformer with a sequence classification head on top (linear layer). - - [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - MISTRAL_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL -class MistralForSequenceClassification(MistralPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = MistralModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False,dtype = torch.bfloat16) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == 
"single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) From cb21a96886c1d63faae2e44663a06597879925da Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:39:39 +0800 Subject: [PATCH 06/16] Delete collie/models/mistral/modeling_flax_mistral.py --- .../models/mistral/modeling_flax_mistral.py | 741 ------------------ 1 file changed, 741 deletions(-) delete mode 100644 collie/models/mistral/modeling_flax_mistral.py diff --git a/collie/models/mistral/modeling_flax_mistral.py b/collie/models/mistral/modeling_flax_mistral.py deleted file mode 100644 index 0a837f4..0000000 --- a/collie/models/mistral/modeling_flax_mistral.py +++ /dev/null @@ -1,741 +0,0 @@ -# coding=utf-8 -# Copyright 2024 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Flax Mistral model.""" -from typing import Optional, Tuple - -import flax.linen as nn -import jax -import jax.numpy as jnp -import numpy as np -from flax.core.frozen_dict import FrozenDict, freeze, unfreeze -from flax.linen import combine_masks, make_causal_mask -from flax.linen.attention import dot_product_attention_weights -from flax.traverse_util import flatten_dict, unflatten_dict -from jax import lax - -from transformers.modeling_flax_outputs import ( - FlaxBaseModelOutput, - FlaxBaseModelOutputWithPast, - FlaxCausalLMOutput, - FlaxCausalLMOutputWithCrossAttentions, -) -from transformers.modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, logging -from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward -from .configuration_mistral import MistralConfig - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "MistralConfig" -_REAL_CHECKPOINT_FOR_DOC = "mistralai/Mistral-7B-v0.1" -_CHECKPOINT_FOR_DOC = "ksmcg/Mistral-tiny" - -MISTRAL_START_DOCSTRING = r""" - - This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a Flax Linen - [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a - regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. 
- - Finally, this model supports inherent JAX features such as: - - - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) - - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) - - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) - - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) - - Parameters: - config ([`MistralConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. - dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): - The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16`, or - `jax.numpy.bfloat16`. - - This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If - specified all the computation will be performed with the given `dtype`. - - **Note that this only specifies the dtype of the computation and does not influence the dtype of model - parameters.** - - If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and - [`~FlaxPreTrainedModel.to_bf16`]. -""" - -MISTRAL_INPUTS_DOCSTRING = r""" - Args: - input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`): - Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast - auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. 
- output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaRMSNorm with Llama->Mistral -class FlaxMistralRMSNorm(nn.Module): - config: MistralConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self): - self.epsilon = self.config.rms_norm_eps - self.weight = self.param("weight", lambda _, shape: jnp.ones(shape), self.config.hidden_size) - - def __call__(self, hidden_states): - variance = jnp.asarray(hidden_states, dtype=jnp.float32) - variance = jnp.power(variance, 2) - variance = variance.mean(-1, keepdims=True) - # use `jax.numpy.sqrt` as `jax.lax.rsqrt` does not match `torch.rsqrt` - hidden_states = hidden_states / jnp.sqrt(variance + self.epsilon) - - return self.weight * jnp.asarray(hidden_states, dtype=self.dtype) - - -# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaRotaryEmbedding with Llama->Mistral -class FlaxMistralRotaryEmbedding(nn.Module): - config: MistralConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self): - head_dim = self.config.hidden_size // self.config.num_attention_heads - self.sincos = create_sinusoidal_positions(self.config.max_position_embeddings, head_dim) - - def __call__(self, key, query, position_ids): - sincos = self.sincos[position_ids] - sin_pos, cos_pos = jnp.split(sincos, 2, axis=-1) - - key = apply_rotary_pos_emb(key, sin_pos, cos_pos) - query = apply_rotary_pos_emb(query, sin_pos, cos_pos) - - key = jnp.asarray(key, dtype=self.dtype) - query = jnp.asarray(query, dtype=self.dtype) - - return key, query - - -# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaMLP with Llama->Mistral -class FlaxMistralMLP(nn.Module): - config: MistralConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self): - embed_dim = self.config.hidden_size - inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * embed_dim - - kernel_init = jax.nn.initializers.normal(self.config.initializer_range) - self.act = ACT2FN[self.config.hidden_act] - - self.gate_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init) - self.down_proj = nn.Dense(embed_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init) - self.up_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init) - - def __call__(self, hidden_states): - up_proj_states = self.up_proj(hidden_states) - gate_states = self.act(self.gate_proj(hidden_states)) - - hidden_states = self.down_proj(up_proj_states * gate_states) - return hidden_states - - -# Copied from transformers.models.llama.modeling_flax_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(tensor, sin_pos, cos_pos): - return (tensor * cos_pos) + (rotate_half(tensor) * sin_pos) - - -# Copied from transformers.models.llama.modeling_flax_llama.create_sinusoidal_positions -def create_sinusoidal_positions(num_pos, dim): - inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim)) - freqs = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32") - - emb = np.concatenate((freqs, freqs), axis=-1) - out = np.concatenate((np.sin(emb)[:, None, :], np.cos(emb)[:, None, :]), axis=-1) - return jnp.array(out[:, :, :num_pos]) - - -# Copied from transformers.models.llama.modeling_flax_llama.rotate_half -def rotate_half(tensor): 
- """Rotates half the hidden dims of the input.""" - rotate_half_tensor = jnp.concatenate( - (-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1 - ) - return rotate_half_tensor - - -class FlaxMistralAttention(nn.Module): - config: MistralConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self): - config = self.config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.attention_softmax_in_fp32 = self.dtype is not jnp.float32 - self.rope_theta = config.rope_theta - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." - ) - self.q_proj = nn.Dense(self.num_heads * self.head_dim, use_bias=False, dtype=self.dtype) - self.k_proj = nn.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, dtype=self.dtype) - self.v_proj = nn.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, dtype=self.dtype) - self.o_proj = nn.Dense(self.hidden_size, use_bias=False, dtype=self.dtype) - casual_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool") - self.causal_mask = jnp.triu(casual_mask, k=-config.sliding_window) - self.rotary_emb = FlaxMistralRotaryEmbedding(config, dtype=self.dtype) - - def _split_heads(self, hidden_states, num_heads): - return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim)) - - def _merge_heads(self, hidden_states): - return hidden_states.reshape(hidden_states.shape[:2] + (self.hidden_size,)) - - @nn.compact - # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoSelfAttention._concatenate_to_cache - def _concatenate_to_cache(self, key, value, query, attention_mask): - """ - This function takes projected key, value states from a single input token and concatenates the states to cached - states from previous steps. This function is slighly adapted from the official Flax repository: - https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 - """ - # detect if we're initializing by absence of existing cache data. 
- is_initialized = self.has_variable("cache", "cached_key") - cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) - cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) - cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) - - if is_initialized: - *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape - # update key, value caches with our new 1d spatial slices - cur_index = cache_index.value - indices = (0,) * len(batch_dims) + (cur_index, 0, 0) - key = lax.dynamic_update_slice(cached_key.value, key, indices) - value = lax.dynamic_update_slice(cached_value.value, value, indices) - cached_key.value = key - cached_value.value = value - num_updated_cache_vectors = query.shape[1] - cache_index.value = cache_index.value + num_updated_cache_vectors - # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. - pad_mask = jnp.broadcast_to( - jnp.arange(max_length) < cur_index + num_updated_cache_vectors, - tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), - ) - attention_mask = combine_masks(pad_mask, attention_mask) - return key, value, attention_mask - - def __call__( - self, - hidden_states: jnp.ndarray, - attention_mask: Optional[jnp.ndarray] = None, - position_ids: Optional[jnp.ndarray] = None, - deterministic: bool = True, - output_attentions: bool = False, - init_cache: bool = False, - ) -> Tuple[jnp.ndarray, jnp.ndarray]: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = self._split_heads(query_states, self.num_heads) - key_states = self._split_heads(key_states, self.num_key_value_heads) - value_states = self._split_heads(value_states, self.num_key_value_heads) - - key_states, query_states = self.rotary_emb(key_states, query_states, position_ids) - query_length, key_length = query_states.shape[1], key_states.shape[1] - if self.has_variable("cache", "cached_key"): - mask_shift = self.variables["cache"]["cache_index"] - max_decoder_length = self.variables["cache"]["cached_key"].shape[1] - causal_mask = lax.dynamic_slice( - self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) - ) - else: - causal_mask = self.causal_mask[:, :, :query_length, :key_length] - - batch_size = hidden_states.shape[0] - causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) - attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) - attention_mask = combine_masks(attention_mask, causal_mask) - - if self.has_variable("cache", "cached_key") or init_cache: - key_states, value_states, attention_mask = self._concatenate_to_cache( - key_states, value_states, query_states, attention_mask - ) - key_states = jnp.repeat(key_states, self.num_key_value_groups, axis=2) - value_states = jnp.repeat(value_states, self.num_key_value_groups, axis=2) - - attention_bias = lax.select( - attention_mask > 0, - jnp.full(attention_mask.shape, 0.0).astype(self.dtype), - jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), - ) - - # usual dot product attention - attention_dtype = jnp.float32 if self.attention_softmax_in_fp32 else self.dtype - attn_weights = dot_product_attention_weights( - query_states, - key_states, - bias=attention_bias, - 
deterministic=deterministic, - dropout_rate=self.config.attention_dropout, - dtype=attention_dtype, - ) - - if self.attention_softmax_in_fp32: - attn_weights = attn_weights.astype(self.dtype) - - attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) - attn_output = self._merge_heads(attn_output) - attn_output = self.o_proj(attn_output) - - outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) - return outputs - - -# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaDecoderLayer with Llama->Mistral -class FlaxMistralDecoderLayer(nn.Module): - config: MistralConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self): - self.input_layernorm = FlaxMistralRMSNorm(self.config, dtype=self.dtype) - self.self_attn = FlaxMistralAttention(self.config, dtype=self.dtype) - self.post_attention_layernorm = FlaxMistralRMSNorm(self.config, dtype=self.dtype) - self.mlp = FlaxMistralMLP(self.config, dtype=self.dtype) - - def __call__( - self, - hidden_states, - attention_mask=None, - position_ids=None, - deterministic: bool = True, - init_cache: bool = False, - output_attentions: bool = False, - ): - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - outputs = self.self_attn( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - deterministic=deterministic, - init_cache=init_cache, - output_attentions=output_attentions, - ) - # residual connection - attn_output = outputs[0] - hidden_states = residual + attn_output - - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - # residual connection - hidden_states = residual + hidden_states - - return (hidden_states,) + outputs[1:] - - -# Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel with GPTNeo->Mistral, GPT_NEO->MISTRAL, transformer->model -class FlaxMistralPreTrainedModel(FlaxPreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - - config_class = MistralConfig - base_model_prefix = "model" - module_class: nn.Module = None - - def __init__( - self, - config: MistralConfig, - input_shape: Tuple = (1, 1), - seed: int = 0, - dtype: jnp.dtype = jnp.float32, - _do_init: bool = True, - **kwargs, - ): - module = self.module_class(config=config, dtype=dtype, **kwargs) - super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) - - def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: - # init input tensors - input_ids = jnp.zeros(input_shape, dtype="i4") - attention_mask = jnp.ones_like(input_ids) - position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape) - params_rng, dropout_rng = jax.random.split(rng) - rngs = {"params": params_rng, "dropout": dropout_rng} - - random_params = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)["params"] - - if params is not None: - random_params = flatten_dict(unfreeze(random_params)) - params = flatten_dict(unfreeze(params)) - for missing_key in self._missing_keys: - params[missing_key] = random_params[missing_key] - self._missing_keys = set() - return freeze(unflatten_dict(params)) - else: - return random_params - - def init_cache(self, batch_size, max_length): - r""" - Args: - batch_size (`int`): - batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. - max_length (`int`): - maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized - cache. - """ - # init input variables to retrieve cache - input_ids = jnp.ones((batch_size, max_length)) - attention_mask = jnp.ones_like(input_ids) - position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) - - init_variables = self.module.init( - jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True - ) - return unfreeze(init_variables["cache"]) - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def __call__( - self, - input_ids, - attention_mask=None, - position_ids=None, - params: dict = None, - past_key_values: dict = None, - dropout_rng: jax.random.PRNGKey = None, - train: bool = False, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.return_dict - - batch_size, sequence_length = input_ids.shape - - if position_ids is None: - if past_key_values is not None: - raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.") - - position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) - - if attention_mask is None: - attention_mask = jnp.ones((batch_size, sequence_length)) - - # Handle any PRNG if needed - rngs = {} - if dropout_rng is not None: - rngs["dropout"] = dropout_rng - - inputs = {"params": params or self.params} - - # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed down to ensure cache is used. 
It has to be made sure that cache is marked as mutable so that it can be changed by FlaxMistralAttention module - if past_key_values: - inputs["cache"] = past_key_values - mutable = ["cache"] - else: - mutable = False - - outputs = self.module.apply( - inputs, - jnp.array(input_ids, dtype="i4"), - jnp.array(attention_mask, dtype="i4"), - jnp.array(position_ids, dtype="i4"), - not train, - False, - output_attentions, - output_hidden_states, - return_dict, - rngs=rngs, - mutable=mutable, - ) - - # add updated cache to model output - if past_key_values is not None and return_dict: - outputs, past_key_values = outputs - outputs["past_key_values"] = unfreeze(past_key_values["cache"]) - return outputs - elif past_key_values is not None and not return_dict: - outputs, past_key_values = outputs - outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] - - return outputs - - -# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaLayerCollection with Llama->Mistral -class FlaxMistralLayerCollection(nn.Module): - config: MistralConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self): - self.blocks = [ - FlaxMistralDecoderLayer(self.config, dtype=self.dtype, name=str(i)) - for i in range(self.config.num_hidden_layers) - ] - - def __call__( - self, - hidden_states, - attention_mask=None, - position_ids=None, - deterministic: bool = True, - init_cache: bool = False, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = False, - ): - all_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - - for block in self.blocks: - if output_hidden_states: - all_hidden_states += (hidden_states,) - layer_outputs = block( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - deterministic=deterministic, - init_cache=init_cache, - output_attentions=output_attentions, - ) - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions += (layer_outputs[1],) - - # this contains possible `None` values - `FlaxMistralModule` will filter them out - outputs = (hidden_states, all_hidden_states, all_attentions) - - return outputs - - -# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaModule with Llama->Mistral -class FlaxMistralModule(nn.Module): - config: MistralConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self): - self.hidden_size = self.config.hidden_size - embedding_init = jax.nn.initializers.normal(stddev=self.config.initializer_range) - self.embed_tokens = nn.Embed( - self.config.vocab_size, - self.hidden_size, - embedding_init=embedding_init, - dtype=self.dtype, - ) - self.layers = FlaxMistralLayerCollection(self.config, dtype=self.dtype) - self.norm = FlaxMistralRMSNorm(self.config, dtype=self.dtype) - - def __call__( - self, - input_ids, - attention_mask=None, - position_ids=None, - deterministic=True, - init_cache: bool = False, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, - ): - input_embeds = self.embed_tokens(input_ids.astype("i4")) - - outputs = self.layers( - input_embeds, - position_ids=position_ids, - attention_mask=attention_mask, - deterministic=deterministic, - init_cache=init_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - hidden_states = self.norm(hidden_states) - - if output_hidden_states: - all_hidden_states = outputs[1] + (hidden_states,) - 
outputs = (hidden_states, all_hidden_states) + outputs[2:] - else: - outputs = (hidden_states,) + outputs[1:] - - if not return_dict: - return tuple(v for v in outputs if v is not None) - - return FlaxBaseModelOutput( - last_hidden_state=hidden_states, - hidden_states=outputs[1], - attentions=outputs[-1], - ) - - -@add_start_docstrings( - "The bare Mistral Model transformer outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class FlaxMistralModel(FlaxMistralPreTrainedModel): - module_class = FlaxMistralModule - - -append_call_sample_docstring( - FlaxMistralModel, - _CHECKPOINT_FOR_DOC, - FlaxBaseModelOutputWithPast, - _CONFIG_FOR_DOC, - real_checkpoint=_REAL_CHECKPOINT_FOR_DOC, -) - - -# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaForCausalLMModule with Llama->Mistral -class FlaxMistralForCausalLMModule(nn.Module): - config: MistralConfig - dtype: jnp.dtype = jnp.float32 - - def setup(self): - self.model = FlaxMistralModule(self.config, dtype=self.dtype) - self.lm_head = nn.Dense( - self.config.vocab_size, - use_bias=False, - dtype=self.dtype, - kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), - ) - - def __call__( - self, - input_ids, - attention_mask=None, - position_ids=None, - deterministic: bool = True, - init_cache: bool = False, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, - ): - outputs = self.model( - input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - deterministic=deterministic, - init_cache=init_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - lm_logits = self.lm_head(hidden_states) - - if not return_dict: - return (lm_logits,) + outputs[1:] - - return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions) - - -@add_start_docstrings( - """ - The Mistral Model transformer with a language modeling head (linear layer) on top. - """, - MISTRAL_START_DOCSTRING, -) - -# Copied from transformers.models.gptj.modeling_flax_gptj.FlaxGPTJForCausalLM with GPTJ->Mistral -class FlaxMistralForCausalLM(FlaxMistralPreTrainedModel): - module_class = FlaxMistralForCausalLMModule - - def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): - # initializing the cache - batch_size, seq_length = input_ids.shape - - past_key_values = self.init_cache(batch_size, max_length) - # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. - # But since Mistral uses a causal mask, those positions are masked anyways. 
- # Thus we can create a single static attention_mask here, which is more efficient for compilation - extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") - if attention_mask is not None: - position_ids = attention_mask.cumsum(axis=-1) - 1 - extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0)) - else: - position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) - - return { - "past_key_values": past_key_values, - "attention_mask": extended_attention_mask, - "position_ids": position_ids, - } - - def update_inputs_for_generation(self, model_outputs, model_kwargs): - model_kwargs["past_key_values"] = model_outputs.past_key_values - model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1 - return model_kwargs - - -append_call_sample_docstring( - FlaxMistralForCausalLM, - _CHECKPOINT_FOR_DOC, - FlaxCausalLMOutputWithCrossAttentions, - _CONFIG_FOR_DOC, - real_checkpoint=_REAL_CHECKPOINT_FOR_DOC, -) From 8696802c76fad80b0873a53e2943e42021402d8e Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:39:49 +0800 Subject: [PATCH 07/16] Delete collie/models/mistral/convert_mistral_weights_to_hf.py --- .../mistral/convert_mistral_weights_to_hf.py | 276 ------------------ 1 file changed, 276 deletions(-) delete mode 100644 collie/models/mistral/convert_mistral_weights_to_hf.py diff --git a/collie/models/mistral/convert_mistral_weights_to_hf.py b/collie/models/mistral/convert_mistral_weights_to_hf.py deleted file mode 100644 index 4ba6236..0000000 --- a/collie/models/mistral/convert_mistral_weights_to_hf.py +++ /dev/null @@ -1,276 +0,0 @@ -# Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import gc -import json -import os -import shutil -import warnings - -import torch - -from transformers import ( - LlamaTokenizer, - MistralConfig, - MistralForCausalLM, -) - - -try: - from transformers import LlamaTokenizerFast - - tokenizer_class = LlamaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. 
To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - tokenizer_class = LlamaTokenizer - -""" -Sample usage: - -``` -python src/transformers/models/mistral/convert_mistral_weights_to_hf.py \ - --input_dir /path/to/downloaded/mistral/weights --model_size 7B --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import MistralForCausalLM, LlamaTokenizer - -model = MistralForCausalLM.from_pretrained("/output/path") -tokenizer = LlamaTokenizer.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). -""" - -NUM_SHARDS = {"7B": 1} - - -def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): - return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) - - -def read_json(path): - with open(path, "r") as f: - return json.load(f) - - -def write_json(text, path): - with open(path, "w") as f: - json.dump(text, f) - - -def write_model(model_path, input_base_path, model_size, tokenizer_path=None, safe_serialization=True): - # for backward compatibility, before you needed the repo to be called `my_repo/model_size` - if not os.path.isfile(os.path.join(input_base_path, "params.json")): - input_base_path = os.path.join(input_base_path, model_size) - - os.makedirs(model_path, exist_ok=True) - tmp_model_path = os.path.join(model_path, "tmp") - os.makedirs(tmp_model_path, exist_ok=True) - - params = read_json(os.path.join(input_base_path, "params.json")) - num_shards = NUM_SHARDS[model_size] - - # For some reason this is a string in the params.json - sliding_window = int(params["sliding_window"]) - n_layers = params["n_layers"] - n_heads = params["n_heads"] - n_heads_per_shard = n_heads // num_shards - dim = params["dim"] - dims_per_head = dim // n_heads - base = params.get("rope_theta", 10000.0) - inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) - max_position_embeddings = 4096 * 8 - - if tokenizer_path is not None: - tokenizer = tokenizer_class(tokenizer_path) - tokenizer.save_pretrained(model_path) - vocab_size = tokenizer.vocab_size if tokenizer_path is not None else 32000 - - if "n_kv_heads" in params: - num_key_value_heads = params["n_kv_heads"] # for GQA / MQA - num_local_key_value_heads = num_key_value_heads // num_shards - key_value_dim = dims_per_head * num_local_key_value_heads - else: # compatibility with other checkpoints - num_key_value_heads = n_heads - num_local_key_value_heads = n_heads_per_shard - key_value_dim = dim - - # permute for sliced rotary - def permute(w, n_heads=n_heads, dim1=dim, dim2=dim): - return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) - - print(f"Fetching all parameters from the checkpoint at {input_base_path}.") - # Load weights - loaded = [ - torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu") - for i in range(num_shards) - ] - param_count = 0 - index_dict = {"weight_map": {}} - for layer_i in range(n_layers): - filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" - - # Sharded - # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share - # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is - # 
redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned. - - state_dict = { - f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ - f"layers.{layer_i}.attention_norm.weight" - ].clone(), - f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ - f"layers.{layer_i}.ffn_norm.weight" - ].clone(), - } - state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) - for i in range(num_shards) - ], - dim=0, - ).reshape(dim, dim) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( - num_local_key_value_heads, dims_per_head, dim - ) - for i in range(num_shards) - ], - dim=0, - ).reshape(key_value_dim, dim), - num_key_value_heads, - key_value_dim, - dim, - ) - state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(num_local_key_value_heads, dims_per_head, dim) - for i in range(num_shards) - ], - dim=0, - ).reshape(key_value_dim, dim) - - state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 - ) - state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 - ) - - state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq - for k, v in state_dict.items(): - index_dict["weight_map"][k] = filename - param_count += v.numel() - torch.save(state_dict, os.path.join(tmp_model_path, filename)) - - filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" - state_dict = { - "model.norm.weight": loaded[0]["norm.weight"], - "model.embed_tokens.weight": torch.cat([loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1), - "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), - } - - for k, v in state_dict.items(): - index_dict["weight_map"][k] = filename - param_count += v.numel() - torch.save(state_dict, os.path.join(tmp_model_path, filename)) - - # Write configs - index_dict["metadata"] = {"total_size": param_count * 2} - write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) - config = MistralConfig( - hidden_size=dim, - intermediate_size=params["hidden_dim"], - num_attention_heads=params["n_heads"], - num_hidden_layers=params["n_layers"], - rms_norm_eps=params["norm_eps"], - num_key_value_heads=num_key_value_heads, - vocab_size=vocab_size, - rope_theta=base, - max_position_embeddings=max_position_embeddings, - sliding_window=sliding_window, - ) - config.save_pretrained(tmp_model_path) - - # Make space so we can load the model properly now. 
- del state_dict - del loaded - gc.collect() - - print("Loading the checkpoint in a Mistral model.") - model = MistralForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True) - # Avoid saving this as part of the config. - del model.config._name_or_path - model.config.torch_dtype = torch.float16 - print("Saving in the Transformers format.") - model.save_pretrained(model_path, safe_serialization=safe_serialization) - shutil.rmtree(tmp_model_path) - - -def write_tokenizer(tokenizer_path, input_tokenizer_path): - # Initialize the tokenizer based on the `spm` model - print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") - tokenizer = tokenizer_class(input_tokenizer_path) - tokenizer.save_pretrained(tokenizer_path) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - help="Location of Mistral weights, which contains tokenizer.model and model folders", - ) - parser.add_argument( - "--model_size", - choices=["7B", "tokenizer_only"], - help="'f' models correspond to the finetuned versions, and are specific to the Mistral2 official release. For more details on Mistral2, checkout the original repo: https://huggingface.co/meta-mistral", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.") - args = parser.parse_args() - spm_path = os.path.join(args.input_dir, "tokenizer.model") - if args.model_size != "tokenizer_only": - write_model( - model_path=args.output_dir, - input_base_path=args.input_dir, - model_size=args.model_size, - safe_serialization=args.safe_serialization, - tokenizer_path=spm_path, - ) - else: - write_tokenizer(args.output_dir, spm_path) - - -if __name__ == "__main__": - main() From ea0c9b4c742f51d183afdbb63d6279f79f22c899 Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:39:58 +0800 Subject: [PATCH 08/16] Delete collie/models/mistral/configuration_mistral.py --- .../models/mistral/configuration_mistral.py | 152 ------------------ 1 file changed, 152 deletions(-) delete mode 100644 collie/models/mistral/configuration_mistral.py diff --git a/collie/models/mistral/configuration_mistral.py b/collie/models/mistral/configuration_mistral.py deleted file mode 100644 index 20ffba5..0000000 --- a/collie/models/mistral/configuration_mistral.py +++ /dev/null @@ -1,152 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Mistral model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", - "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json", -} - - -class MistralConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an - Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. - - [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`MistralModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 14336): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 8): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to `4096*32`): - The maximum sequence length that this model might ever be used with. Mistral's sliding window attention - allows sequence of up to 4096*32 tokens. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*): - The id of the padding token. - bos_token_id (`int`, *optional*, defaults to 1): - The id of the "beginning-of-sequence" token. 
- eos_token_id (`int`, *optional*, defaults to 2): - The id of the "end-of-sequence" token. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention window size. If not specified, will default to `4096`. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - - ```python - >>> from transformers import MistralModel, MistralConfig - - >>> # Initializing a Mistral 7B style configuration - >>> configuration = MistralConfig() - - >>> # Initializing a model from the Mistral 7B style configuration - >>> model = MistralModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "mistral" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - sliding_window=4096, - attention_dropout=0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) From 3c51c3b7c4d5443d8397da7baeda70b397d2fa5c Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:40:47 +0800 Subject: [PATCH 09/16] Delete collie/models/mistral/__init__.py --- collie/models/mistral/__init__.py | 82 ------------------------------- 1 file changed, 82 deletions(-) delete mode 100644 collie/models/mistral/__init__.py diff --git a/collie/models/mistral/__init__.py b/collie/models/mistral/__init__.py deleted file mode 100644 index c5fa66e..0000000 --- a/collie/models/mistral/__init__.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -from typing import TYPE_CHECKING - -from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_flax_available, is_torch_available - - -_import_structure = { - "configuration_mistral": ["MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP", "MistralConfig"], -} - - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_mistral"] = [ - "MistralForCausalLM", - "MistralModel", - "MistralPreTrainedModel", - "MistralForSequenceClassification", - ] - -try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_flax_mistral"] = [ - "FlaxMistralForCausalLM", - "FlaxMistralModel", - "FlaxMistralPreTrainedModel", - ] - - -if TYPE_CHECKING: - from .configuration_mistral import MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP, MistralConfig - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_mistral import ( - MistralForCausalLM, - MistralForSequenceClassification, - MistralModel, - MistralPreTrainedModel, - ) - - try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_flax_mistral import ( - FlaxMistralForCausalLM, - FlaxMistralModel, - FlaxMistralPreTrainedModel, - ) - - -else: - import sys - - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) From 96a36285bfd01d430531fbeb47898862a4a02284 Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:41:07 +0800 Subject: [PATCH 10/16] Delete collie/models/mistral/__pycache__ directory --- .../__pycache__/__init__.cpython-310.pyc | Bin 1210 -> 0 bytes .../configuration_mistral.cpython-310.pyc | Bin 6270 -> 0 bytes .../modeling_mistral.cpython-310.pyc | Bin 41165 -> 0 bytes 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 collie/models/mistral/__pycache__/__init__.cpython-310.pyc delete mode 100644 collie/models/mistral/__pycache__/configuration_mistral.cpython-310.pyc delete mode 100644 collie/models/mistral/__pycache__/modeling_mistral.cpython-310.pyc diff --git a/collie/models/mistral/__pycache__/__init__.cpython-310.pyc b/collie/models/mistral/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 0eeae894122b29f98b82399a2066890e840baa6a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1210 zcmZuw%Wm306dgaX&BMGBQqo1*O&4Aiy6Xm2MFbL5ASj7i;mydw1DR^ZCNrifSte3< zRsW&8{*t#{r-38r5B$rrZ=YTtncRuluQ~iWS$qv5;{G_zR@nfm`Jh2+}%0+QLzQ z{ER?gUV|y{jh_lKeX{`sKPC8YxA@I1zLoIP4S*E(!_W9xV$rmlrWrS*NDaB>XiH?majGA9OOuP$%;wSwK<9#+Msd zWgG?*Je@O7N<)fy#5@we>UYo1Pn$hBJZ+0-chEkBt&>5gdjy-Ot>f;;HtaWtvE5(t zTZxFUSY830khQ!y_eifF=T?|7hDf}(%7zS`GY<#IU*a#HI?vG6Tx<@th{rje;Bn$D zE>bCggXyD7M~nA5VvA3m>otoT_)`D*2C}_&Wv!e{j4n5IFU~Y@R3;BO6Q>_II+K3~ zX3LP3At%Bhwji8ff*=$xEi0tN_#vO6v9li=(PAbXGgT1dvKVc^Cn2MV@!CAXgfmUb z=rW{aFeI-8F`b1hg5q7w$I+Z2X3FB)2+Pa#KVDc_778*HWhjXdYka|_$H1e)V{A@I zc;S%*k}f5*G|hhjb|Bki_%o_AM8gQZz6>daDQX!|i&6c*N_;(8Oaot$MAt z`i?_FFxoCdhyxr!Seq?ktFc`0ot!WqijcFbs-^1>YRXUyMSsYtx}rYhG}Zh!*Uh4$ Rs+(AKgJX!Es%BX))xXwfO11z1 diff --git a/collie/models/mistral/__pycache__/configuration_mistral.cpython-310.pyc 
b/collie/models/mistral/__pycache__/configuration_mistral.cpython-310.pyc
deleted file mode 100644
index 0731f2e2d91d302df8b2d492eb9e55ec9352b625..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

[GIT binary patch payloads for the deleted collie/models/mistral/__pycache__/*.cpython-310.pyc files omitted]
z3Ewgo)!|wFy2dWExB2;bSj=B-x1uypJ@u4!f(yH$=p8iN20*uf2^+;@p}g9)495m_ z%t!hO0&1V-y2pC^$bqKz)?06cPBD%@@zLYgVfJ(&HaBFRkr<486V>T82)}ohiRXp= zCd7Yac;I6WUIobRyG4iv8kGgwiaZUUQZ0${>Z@Xo*mRygkjO$qs>mX$-w~ZFcR^4Y zSPRsDXAZ1+c%;P52dmHN*8;@`2z=B4cyI)8Iip*1CX76yOPqHUx9QTaiGt;k~ zj1A!4Pcy#Wdj4=zw6IZg>%G6viws- zrXEI&$sSZ~ z@16aVjj4WfUjsS|l=i|833v|Q7w~n-7zW^S%fpcejhhq}0MIhc}^w0MQp$!+?g{+?`<}6;eySL3v%d)88KzB|j%Em{LCnhQfl% zpxCmYcrbVb{|7~|HGtVs5BRh|?u9Qy#euwwUVQNt(mV=s^&q@>qK zOprVf80*_Wi}D*M`%I8~7M+kCFlS@s`V3CMdzzgT^dc2#lT0ra!xNpf)+va_!n|hj z6`6E~#HAo15Ku$}yyn}>!w=n2_kZ)tZpTFYNn;<_o^^zoFFKcs-4a}!AKYJjx@mQZm z0&Lq>Ft*Vh+YcQ!)GBhq7?t7jWgNrJslEwAivz?jY8;Tqx&l9eR01zyCrqM%r&uA1 z&s_NUUWh!fzDN<<#RAPF?|Z8lB5AW|92`&Y?*Y{Ok($>)g@Ua{vSFteTKgQUb(_KW zGxz}pgxl0j9#MHNA7F1+*#p||%3x{(a1{}MTZcyOQSg5DLXev*aM zc`>FNF%EYNSMQ*!+~S;U}wF9n;1&OzbeMtQ&}3DCASprV^Q+@8`%g0Yo2>FJpN||h*s8N z*RTmht$KvD++Im3Gko$yMKSIuA}(MaB|Ze(`Emaaf)IX|l^w=V-LJzAOxfsdEC&;| z-1a}{nc>+2P-+2QD)M5wMh|6(0Skp!u)r)WPt!J>!L&^u!?-O>0TVZ;jB7)|ZzcF0 z4t__RA*bRD2XOMJJw`ZrZbU$1IQyNR@Wbi4eZ`}CPq!$`DrZ;NAji+NW)_*y9ZxmU5FW2JF zp;?rmwtWyfMcDWXidUOB*!aME7D}8>i;*$dx~&)36M~$6U$O!_!Nv|N@R|Fo0HMj= z3hayy?kp5-g9b?=DOm_K_%I$Jfc&x%6f_Yu_a$EF(E%)L@n*b4E zm;~6*w!r@n0p?>_cyrQ(oWZNuAmBj*Ihf3+GYG$B@a)D|J9h`ZE;;>RDg-m30EUE# zPjnYw{E}KGI0zT_I37YRP4#0iETUGR+~SK`7-y-kb+XG9Zx3K)vz$eW~BI)I>j5z20Sr{K$uU}&j+2P0P&V=l<=8~VKGmV_N)&K>xa6X8n$ z!K(BHJnw~c1}JB|zQN@=H>&ZqgY(-jo;g!J1!4!zLBDq}D3T;RLMM;B2r%gQCA=>K zG?5homKl(5HWoj9a(+r@+{3r<_GISgWH;Hf+#<1dDau&XiMdFJEPLcikfg|WU zEx2x+0dPm&16w6*m>vl`ATGGwDk?{vR7kD(rIDwx8lA>Ivh8gq#QGsjc_YFk( zqg@PlXiw1vOqEc+UkOKcAdtKM;2K9EM-oNZO z^8*yGp`^u29cY*Vx3=bfm_NV(0xoYUwDI)}G|D)DFH#C-H3ngYY4^$<=9L@%Wdpq?Ep8;qkqeQbhfWWg!Gz7Xmuv( zCyD(uivJiJ>LOC)Q>9_bVg2kO-kGhaUkK9~k>>R+-tzkvYn zn#O)jm^#Tb$RkmL*H#^uc*F;u8;yMM#Qtex=YHxqbDGd;^d^wlL=f)@4Sd@w8ObuR zInL*HbqzyzCh*)2ALH3 zz@H2)iY{F~`^x1mgQ8PRA^?c+>_i4+pmxm-nI#03{QO7IquAu|6jB9#7)Q0&M7YDo zdN6{p1XzcVU4_4R!+I&7P4jrnEW)fSZ;cF&?9D$^ER5`(ni?skK2sS>eFotfb5*eM k?~LTE=geVQo4z|@Srg`k@EEB+4Byiq!FkF2!->lO3(fVAQ~&?~ From f644007ce5f4ec2875c55922d2634c7db6ab965d Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:41:22 +0800 Subject: [PATCH 11/16] Delete collie/models/mistral2 directory --- collie/models/mistral2/__init__.py | 2 - .../__pycache__/__init__.cpython-310.pyc | Bin 295 -> 0 bytes .../configuration_mistraltp.cpython-310.pyc | Bin 6283 -> 0 bytes .../__pycache__/model.cpython-310.pyc | Bin 49178 -> 0 bytes .../__pycache__/modeltp.cpython-310.pyc | Bin 52277 -> 0 bytes .../mistral2/configuration_mistraltp.py | 155 -- collie/models/mistral2/model.py | 2026 --------------- collie/models/mistral2/modelpp.py | 1922 -------------- collie/models/mistral2/modeltp.py | 2254 ----------------- 9 files changed, 6359 deletions(-) delete mode 100644 collie/models/mistral2/__init__.py delete mode 100644 collie/models/mistral2/__pycache__/__init__.cpython-310.pyc delete mode 100644 collie/models/mistral2/__pycache__/configuration_mistraltp.cpython-310.pyc delete mode 100644 collie/models/mistral2/__pycache__/model.cpython-310.pyc delete mode 100644 collie/models/mistral2/__pycache__/modeltp.cpython-310.pyc delete mode 100644 collie/models/mistral2/configuration_mistraltp.py delete mode 100644 collie/models/mistral2/model.py delete mode 100644 collie/models/mistral2/modelpp.py delete mode 100644 collie/models/mistral2/modeltp.py diff --git a/collie/models/mistral2/__init__.py b/collie/models/mistral2/__init__.py deleted file mode 100644 index 9dc3f79..0000000 --- 
a/collie/models/mistral2/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .modeltp import MistralForCausalLM -from .configuration_mistraltp import MistralConfig \ No newline at end of file diff --git a/collie/models/mistral2/__pycache__/__init__.cpython-310.pyc b/collie/models/mistral2/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 76a01ca4171928aebb54f37b4541ecbf0bd2731f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 295 zcmd1j<>g`kf)fuV(xQO$V-N=!FabFZKwK;XBvKes7;_kM8KW2(L2RZRrd;MIW+0n6 zm_d`}B_mLYCgUw3-^}8YqQo4x{37SX(&EG%A77v-FI3byKQApa-A|Jxiaj?!B{ip{ zpa^6~lz1{&qO>TnBr`uRJ{MvJP?i}eyON=Z1xSI3U(xzSsk!+jsk#~YxvBa&g_W6k z`p)@2KAEoiC8@B?0;HGr)ME}k^pZ=_OaFov{Ui2P^k5ih&js3|=u2pi;`G=BYjBvSiM}3dainiBma(TGk3WBzOvsP<`&8Bdh4a4MB)2lhLT0_(l z1M?q!GQYP_U93s&d-fi$IZ#-O?Ny)oo_o0UCaTt5EITxV<8@Z@hi@OblEayyU-fA= z$pyVHay`BLEeNjdn8_UWC4#FZXk?+Gs9ITlL-kn z4Ab_jE3GYAdKY!flf57H#ke}ItUhs$ zg6z}TH$f(WymE}jKHG1mx^HSFUUm$VBx9Mk`wV^tPTUrkB47>aIf(pV0@t4+#1=CR zm+kTt0Z7I`dbaXk962ux+F{^V*%mhdk``C{vOH5oTam;uCK_bE9vw9t&Q>~B zWYfQ?o(S7}o@wmrzBuG;wl3VDKF6+kZQ>xqzNRo0;{d>0URqdKsAN-V`e!sf1SSpu z4(RB|K#amFyGzLAaRRxav&$t7v_zx9C9n$wJ?Acc4HPAwk-QDM!k57K?m_ARvPmy2 zmE5BX%dfLGk`Tl8TinHT+a`1m=3Khrmar`Do^Hq27k>Psfeqgk$TlDD>25XF$I|sG zU;EY69(>BB!!640(^*}-zPx;**^LJekG2UP>&ZTCUXDS8yv z$)BnAc+Izl_!y~6{2TL!;%yWdwgDO!>^`wbYJ$yf++V58F>wk*i$^ORSU9vkTB$G( zi;(*Qrq>>=bjw_o{S4XJw=Hnq9+yE}iX2+yJjIupEp9$lpPaE6M3 zj==STthUd^#WyL%=21$jY;Lof`SGNeVmO_D0NU|ld5XN_Kf4|>Oys4X-F)nC$tP{ zp|9IR@#rZzXun+(<;2omD(<%4W0es*rtej90pR;wIRuS>6ejPfPSWv}5}mgxGAAu{ zk zI9VP1f&*Js_<92UTWXEj_S%M}tIh^p3U)ZyTn!01#w86D<|h3tgS$BeIExMy^k$ULdHGA_K$={ ziV0-AY8h3%h?{Fhwpfp?tae}}R^|igwYd&WfpMA?L6eM(9^(6512qw4L3nAHx4eg| zJuApeChVjq?DRT%`a?aWQGI*_zoDoo!lQOwRi2Y-4(aYGJ89FUTks22zmsWWxIen{ z)s=_hEs9pEpS7j8YwQXej7pTHGO=E-w-17r=c;$NZ{Lm&LAWF7wxfu%)AnT0rTPsz zbI*0(-BC#!>!R83;ZU3~w&Kd}IQG#eC~yb%z=1!t8Yh4E?02g>Eml9991w>Zh}$!D zI;8D(5BewAi)W>F)TUEz0y)wtKo&ZqkfWU`$YQ5xX`v>sc1FLzGd>yXj6s$`5ers!zUclwK_hnyCov;7n`ol9e~8LyL662W&Wxtgl*_L3*X7$7{;y}aEp@zr&$m2>*X$PuxH7%tJ$N8iYXSEIzorre|Kh8wrCPO` z{F)waAaKb_ywmf2U$W}Sw4hyWAIJ&}3jFyC5XCFST=r8MDddXz?0qp;8p-G4pTd9g zqvQWRyqr)-%stiArPGp0wxh=Y diff --git a/collie/models/mistral2/__pycache__/model.cpython-310.pyc b/collie/models/mistral2/__pycache__/model.cpython-310.pyc deleted file mode 100644 index ab53c9573dc702d9ab95ac9870bc99c46c10a54a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 49178 zcmd75d7K>AeIM9WU48fToR|RyR|5nI30P`8CU?{Km_1{Ko56BT-H?lI3J0RZcb1<+P#0XUZ7~mu=+C zm=SB_%LBoaf;_S6nMSUh6Td`#urX8~Y7Cc$8zbcr@k`c68)M}$xlYx$G{(#0jfwJv z+^6eX8{5j;8k6NoxzE(MH+GbFG4werZG3V}^G4-R!d!c@B<52lf<8b+~#4uPd zHI9^zG>(>!Hjb5#!EeYp?ig$LIfrf=3-{kNuExsmaQ3e~;5>NSD8JJlF274^;GxEM zl^>QE%=(G)3Gq8wJ}JMClpn$GNd4W7Q{__YU3?*C7oFU~%*}YY3ZFgpUij<{d@k8{+4~T7);?|DZQt{P zQMM6gzkL8<4g_Hwd&dHY^o%_Z$a_Td+-@@0fA*+&reND#ILpQH9M_#AU; zNO1x8$L;%Yf1lhhIyGm(S+wu}iFmn=XYa5dz_SN}w;T3@<)(9&{Z6NO)5Ozv*$?6A zLr(E#{H9TE+22)O>U#RHeF9HU=y&n_3f?$rKLWo;oM(`>LfCf)F`csC1ON9pEs4ps z&GWITM^`3_Pu{rDQnQzf=W4E}s`X-{WjpoK%N%IaQHShA-)_l!#&$e9GA3Ck@^H|eUt)&~{lR9_)F<#)U zg;95&p&#jS#8v2= zs_vqH@GXN>=31)a+DldBaN<2x*Ez?sJ+bU9Eql!ElT{RL{ItY=_FSKT@w{_o*=f!? 
diff --git a/collie/models/mistral2/__pycache__/modeltp.cpython-310.pyc b/collie/models/mistral2/__pycache__/modeltp.cpython-310.pyc
deleted file mode 100644
index f7c6a28cecdfc2502d5bbb914f4ff15a9b802990..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

diff --git a/collie/models/mistral2/configuration_mistraltp.py b/collie/models/mistral2/configuration_mistraltp.py
deleted file mode 100644
index ad6691b..0000000
--- a/collie/models/mistral2/configuration_mistraltp.py
+++ /dev/null
@@ -1,155 +0,0 @@
-# coding=utf-8
-# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Mistral model configuration""" - -from transformers.configuration_utils import PretrainedConfig -# from transformers.utils import logging -from collie.log.logger import logger - - -# logger = logging.get_logger(__name__) - -MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", - "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json", -} - - -class MistralConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an - Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. - - [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`MistralModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 14336): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 8): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to `4096*32`): - The maximum sequence length that this model might ever be used with. Mistral's sliding window attention - allows sequence of up to 4096*32 tokens. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
- rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*): - The id of the padding token. - bos_token_id (`int`, *optional*, defaults to 1): - The id of the "beginning-of-sequence" token. - eos_token_id (`int`, *optional*, defaults to 2): - The id of the "end-of-sequence" token. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention window size. If not specified, will default to `4096`. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - - ```python - >>> from transformers import MistralModel, MistralConfig - - >>> # Initializing a Mistral 7B style configuration - >>> configuration = MistralConfig() - - >>> # Initializing a model from the Mistral 7B style configuration - >>> model = MistralModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "mistral" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - sliding_window=4096, - attention_dropout=0.0, - attn_implementation="flash_attention_2", - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - # 调用父类的初始化函数,将一些公共参数传递给父类处理 - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/collie/models/mistral2/model.py b/collie/models/mistral2/model.py deleted file mode 100644 index 60d9553..0000000 --- a/collie/models/mistral2/model.py +++ /dev/null @@ -1,2026 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Mistral model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel, dtype_byte_size -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_mistraltp import Mistral2Config - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "Mistral2Config" - -#modified for collie -import torch.distributed as dist -import gc -import json -import os -from collections import OrderedDict -from megatron.core import parallel_state, tensor_parallel -from einops import rearrange -from deepspeed.pipe import LayerSpec, TiedLayerSpec - -from collie.config import CollieConfig -from collie.driver.io import IODriver -from collie.log.logger import logger -from collie.module import ( - ColumnParallelLinearWithoutBias, - ColumnParallelLMHead, - RowParallelLinearWithoutBias, -) -from collie.utils import concat_tensor, dict_as_params, env, progress -from collie.models.base import CollieModelForCausalLM -from collie.models.utils import ( - kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, - kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, -) - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral -class Mistral2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - MistralRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, 
hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - ans = self.weight * hidden_states.to(input_dtype) - # -------------------------------------------------------- - # # 将Tensor转换为列表 - # ans_list = ans.tolist() - # # 指定.json文件的路径 - # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/rms_ans.json' - - # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - # try: - # with open(file_path, 'r', encoding='utf-8') as file: - # results_list = json.load(file) - # except FileNotFoundError: - # results_list = [] - # # 将当前结果添加到列表中 - # results_list.append(ans_list) - # # 将更新后的列表写回.json文件 - # with open(file_path, 'w', encoding='utf-8') as file: - # json.dump(results_list, file, ensure_ascii=False, indent=4) - # file.write('\n') # 在文件末尾添加一个换行符 - # -------------------------------------------------------- - return ans - - -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class Mistral2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -# TODO @Arthur no longer copied from LLama after static cache -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. 
For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. - """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class Mistral2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - - self.up_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.gate_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.down_proj = RowParallelLinearWithoutBias( - self.intermediate_size, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Mistral2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." 
- ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." - ) - # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.q_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.k_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.v_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.o_proj = RowParallelLinearWithoutBias( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.rotary_emb = Mistral2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads_tp, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads_tp, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads_tp, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads_tp, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # 
-------------------------------------------------------- - # 将Tensor转换为列表 - ans_list = attn_output.tolist() - # 指定.json文件的路径 - file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/attn_output_1.json' - - # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - try: - with open(file_path, 'r', encoding='utf-8') as file: - results_list = json.load(file) - except FileNotFoundError: - results_list = [] - # 将当前结果添加到列表中 - results_list.append(ans_list) - # 将更新后的列表写回.json文件 - with open(file_path, 'w', encoding='utf-8') as file: - json.dump(results_list, file, ensure_ascii=False, indent=4) - file.write('\n\n\n') # 在文件末尾添加一个换行符 - # -------------------------------------------------------- - - - - return attn_output, attn_weights, past_key_value - - -class Mistral2FlashAttention2(Mistral2Attention): - """ - Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." 
- ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # -------------------------------------------------------- - # 将Tensor转换为列表 - ans_list = attn_output.tolist() - # 指定.json文件的路径 - file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/attn_output_1.json' - - # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - try: - with open(file_path, 'r', encoding='utf-8') as file: - results_list = json.load(file) - except FileNotFoundError: - results_list = [] - # 将当前结果添加到列表中 - results_list.append(ans_list) - # 将更新后的列表写回.json文件 - with open(file_path, 'w', encoding='utf-8') as file: - json.dump(results_list, file, ensure_ascii=False, indent=4) - file.write('\n\n\n') # 在文件末尾添加一个换行符 - # -------------------------------------------------------- - - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
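For reference, the unpad-then-repad flow described in the `_flash_attention_forward` docstring above relies on the bookkeeping returned by `_get_unpad_data`. The sketch below is illustrative only: the helper body is copied from this module, while the sample mask, variable names, and printed values are made up for the example.

```python
import torch
import torch.nn.functional as F

def _get_unpad_data(attention_mask):
    # Copied from this module: per-sequence lengths, flat indices of real tokens,
    # and cumulative sequence boundaries for the varlen flash-attention kernel.
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return indices, cu_seqlens, max_seqlen_in_batch

# Illustrative batch: two sequences of lengths 3 and 2, padded to length 4.
mask = torch.tensor([[1, 1, 1, 0],
                     [1, 1, 0, 0]])
indices, cu_seqlens, max_seqlen = _get_unpad_data(mask)
print(indices)     # tensor([0, 1, 2, 4, 5]) -> positions of the 5 real tokens in the flattened batch
print(cu_seqlens)  # tensor([0, 3, 5], dtype=torch.int32) -> offsets of each packed sequence
print(max_seqlen)  # 3
```

`indices` is what `index_first_axis` / `pad_input` use to pack and later re-pad the hidden states, and `cu_seqlens` / `max_seqlen` are what `flash_attn_varlen_func` consumes for the packed batch.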
- causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class Mistral2SdpaAttention(Mistral2Attention): - """ - Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from MistralAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, 
value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.tp_size)) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -MISTRAL_ATTENTION_CLASSES = { - "eager": Mistral2Attention, - "flash_attention_2": Mistral2FlashAttention2, - "sdpa": Mistral2SdpaAttention, -} - - -class MistralDecoderLayer(nn.Module): - def __init__(self, config: CollieConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - - self.mlp = Mistral2MLP(config) - self.input_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). 
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - # -------------------------------------------------------- - # # 将Tensor转换为列表 - # ans_list = [tensor.tolist() for tensor in outputs] - # # 指定.json文件的路径 - # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/decoder_outputs.json' - - # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - # try: - # with open(file_path, 'r', encoding='utf-8') as file: - # results_list = json.load(file) - # except FileNotFoundError: - # results_list = [] - # # 将当前结果添加到列表中 - # results_list.append(ans_list) - # # 将更新后的列表写回.json文件 - # with open(file_path, 'w', encoding='utf-8') as file: - # json.dump(results_list, file, ensure_ascii=False, indent=4) - # file.write('\n') # 在文件末尾添加一个换行符 - # -------------------------------------------------------- - - return outputs - - -MISTRAL_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`MistralConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class Mistral2PreTrainedModel(PreTrainedModel): - config_class = Mistral2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["MistralDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -MISTRAL_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
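As a minimal sketch of the cache contract described above (once a cache exists, only the newly generated token needs to be passed, and the model hands back the cache in the same format it was given), here is an illustrative prefill-then-decode loop. It uses the standard `transformers` API; the checkpoint name is simply reused from the usage example further below, and greedy argmax decoding is an arbitrary choice for the sketch.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint reused from the example docstring below; any causal LM with KV caching works.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")

input_ids = tokenizer("The quick brown fox", return_tensors="pt").input_ids

# Prefill: run the full prompt once and keep the returned cache.
out = model(input_ids=input_ids, use_cache=True)
past_key_values = out.past_key_values
next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)

# Decode: with a cache present, only the last (new) token is fed back in.
for _ in range(5):
    out = model(input_ids=next_token, past_key_values=past_key_values, use_cache=True)
    past_key_values = out.past_key_values          # same format that was passed in
    next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)

print(tokenizer.decode(next_token[0]))
```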
-""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class Mistral2Model(nn.Module): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] - - Args: - config: MistralConfig - """ - - def __init__(self, config: CollieConfig): - # super().__init__(config) - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - # self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - - # -------------------------------------------------------- - # # 将Tensor转换为列表 - # ans_list = inputs_embeds.tolist() - # # 指定.json文件的路径 - # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/inputs_embeds.json' - - # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - # try: - # with open(file_path, 'r', encoding='utf-8') as file: - # results_list = json.load(file) - # except FileNotFoundError: - # results_list = [] - # # 将当前结果添加到列表中 - # results_list.append(ans_list) - # # 将更新后的列表写回.json文件 - # with open(file_path, 'w', encoding='utf-8') as file: - # json.dump(results_list, file, ensure_ascii=False, indent=4) - # file.write('\n') # 在文件末尾添加一个换行符 - # # -------------------------------------------------------- - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] 
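The `_prepare_4d_causal_attention_mask(..., sliding_window=self.config.sliding_window)` call above combines the causal constraint with Mistral's sliding-window limit. The boolean sketch below only illustrates that masking pattern (the real helper also folds in padding and emits an additive float mask); the window size is chosen arbitrarily.

```python
import torch

def sliding_window_causal_mask(seq_len: int, window: int) -> torch.Tensor:
    """True where query position i may attend to key position j."""
    i = torch.arange(seq_len).unsqueeze(1)  # query positions, column vector
    j = torch.arange(seq_len).unsqueeze(0)  # key positions, row vector
    causal = j <= i                         # never attend to the future
    in_window = (i - j) < window            # never look further back than `window` tokens
    return causal & in_window

mask = sliding_window_causal_mask(seq_len=6, window=3)
# mask[5] is [False, False, False, True, True, True]: token 5 only sees tokens 3..5.
```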
- - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Mistral2ForCausalLM(CollieModelForCausalLM): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config:CollieConfig): - super().__init__(config) - self.model = Mistral2Model(config) - self.vocab_size = config.vocab_size - # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.lm_head = ColumnParallelLinearWithoutBias( - self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False - ) - # Initialize weights and apply final processing - # self.post_init() - # GenerationMixin 需要的额外参数 - self.config.is_decoder = True - if config.model_config.tie_word_embeddings: - self.lm_head.weight = self.embed_tokens.weight - self.main_input_name = "input_ids" - - def clean_cache(self): - self._clean_hidden_states([*self.model.layers, self.lm_head]) - self._set_use_cache(self.model.layers, False) - - def set_cache(self, use_cache): - self._set_use_cache(self.model.layers, use_cache) - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MistralForCausalLM - - >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" 
- >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Ensure tensors are on the same device - shift_labels = shift_labels.to(shift_logits.device) - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
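For the label handling in `forward` above: the logits at position `t` are scored against the token at position `t + 1`, which is why both tensors are shifted by one before the flattened cross-entropy. A tiny standalone sketch of that alignment (all sizes and the random tensors are arbitrary):

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size, batch, seq = 10, 2, 5
logits = torch.randn(batch, seq, vocab_size)
labels = torch.randint(0, vocab_size, (batch, seq))

# Position t's logits predict the token at position t + 1.
shift_logits = logits[..., :-1, :].contiguous()   # (batch, seq - 1, vocab)
shift_labels = labels[..., 1:].contiguous()       # (batch, seq - 1)

loss = CrossEntropyLoss()(
    shift_logits.view(-1, vocab_size),            # flatten to (batch * (seq - 1), vocab)
    shift_labels.view(-1),
)
print(loss)
```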
- if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - @classmethod - def pipeline_layers(cls, config: CollieConfig): - """ - Get layers of pipeline. - :return: list - """ - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - - if config.tie_word_embeddings: - output = TiedLayerSpec( - "embed_tokens", - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - else: - output = LayerSpec( - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - - return [("model", Mistral2Model.pipeline_layers(config)), ("lm_head", output)] - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - protocol: str = "file", # 指定加载state_dict时使用的协议 - **kwargs, - ): - """ - Load state_dict from ``path``. - The format of pretrained model should be the same as that of - `huggingface`. - :return: state_dict. Note that the state_dict should be processed - properly to match the current rank. 
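As background for the splitting loop below: under the Megatron-style convention used by the column/row parallel layers, column-parallel weights are sharded along the output dimension (dim 0) and row-parallel weights along the input dimension (dim 1). A standalone sketch, with plain variables standing in for `config.tp_size` and `env.tp_rank`:

```python
import torch

tp_size, tp_rank = 2, 0          # stand-ins for config.tp_size / env.tp_rank
full_weight = torch.randn(8, 4)  # e.g. a projection weight of shape (out_features, in_features)

# Column-parallel style (q/k/v_proj, gate/up_proj, embed_tokens, lm_head):
# shard the output dimension, i.e. chunk along dim 0.
col_shard = torch.chunk(full_weight, tp_size, dim=0)[tp_rank]   # shape (4, 4)

# Row-parallel style (o_proj, down_proj): shard the input dimension, dim 1.
row_shard = torch.chunk(full_weight, tp_size, dim=1)[tp_rank]   # shape (8, 2)

# Concatenating all ranks' shards along the same dim recovers the full weight,
# which is the round trip the save path relies on.
assert torch.equal(torch.cat(torch.chunk(full_weight, tp_size, dim=0), dim=0), full_weight)
```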
- """ - # 配置加载 - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - # IO驱动初始化 - io_driver = IODriver.from_protocol(protocol) - # 检查文件路径是否存在 - if not io_driver.exists(path): - raise FileNotFoundError(f"folder {path} not found.") - # 初始化存储和处理变量 - state_dict = OrderedDict() - weights = [] - parts = None # 变量用于存储模型分割的部分信息 - # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 - hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 - rank_order = range(dist.get_world_size()) - else: - # 不开启只进行一次循环 - rank_order = range(1) - # 权重文件加载和处理 - for rank in rank_order: - # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 - if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: - # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 - if env.is_pipeline: - # 保存的是 json 格式 - parts = env.pipeline_parts - if hasattr(config, "num_key_value_heads"): - # llama2 (transformers >= 4.31.0) - num_key_value_heads = config.num_key_value_heads - else: - num_key_value_heads = config.num_attention_heads - head_dim = config.hidden_size // config.num_attention_heads - # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 - if ( - io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) - and "COLLIE_PP_PARTS" in os.environ.keys() - ): - weight_map = json.loads( - io_driver.load( - os.path.join(path, "pytorch_model.bin.index.json"), mode="r" - ) - )["weight_map"] - # layers 表示自己需要的层 - layers = env.pipeline_layers_idx - # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) - weights.extend( - [ - value - for key, value in weight_map.items() - if len(key.split(".")) > 2 - and key.split(".")[2].isdigit() - and (int(key.split(".")[2]) + 1) in layers - ] - ) - # 去重 - weights = list(set(weights)) - # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm - if 0 in layers: - weights.append(weight_map["model.tok_embeddings.weight"]) - if max(parts) - 1 in layers: - weights.append(weight_map["output.weight"]) - if max(parts) - 2 in layers: - weights.append(weight_map["model.norm.weight"]) - else: - # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 - weights = [ - weight - for weight in io_driver.list(path) - if weight.endswith(".bin") - ] - with progress( - weights, - desc="Loading state dict", - total=len(weights), - disable=hide_progress, - ) as pbar: - for weight in pbar: - part_state_dict = io_driver.load( - os.path.join(path, weight), mode="rb" - ) - # for key in list(part_state_dict.keys()): - # if "attention.wqkv.weight" in key: - # # qkv_weights = part_state_dict.pop(key) - # qkv_weights = part_state_dict[key] - # print(qkv_weights.shape) - # (wq, wk, wv) = qkv_weights.split( - # [ - # config.hidden_size, - # config.num_key_value_heads * head_dim, - # config.num_key_value_heads * head_dim, - # ], - # dim=0, - # ) - # wq_name = key.replace("wqkv", "wq") - # wk_name = key.replace("wqkv", "wk") - # wv_name = key.replace("wqkv", "wv") - # part_state_dict[wq_name] = wq - # part_state_dict[wk_name] = wk - # part_state_dict[wv_name] = wv - state_dict.update(part_state_dict) - del part_state_dict - if parts is not None: - # 这一步是 pp 的复筛 - layers = env.pipeline_layers_idx - for key in list(state_dict.keys()): - if key.startswith("layers"): - layer = int(key.split(".")[1]) - if layer + 1 not in layers: - state_dict.pop(key) - # if key.endswith("tok_embeddings.weight"): - if 
key.endswith("embed_tokens.weight"): - if 0 not in layers: - state_dict.pop(key) - if key == "norm.weight": - if max(parts) - 2 not in layers: - state_dict.pop(key) - # if key.endswith("output.weight"): - if key.endswith("lm_head.weight"): - if max(parts) - 1 not in layers: - state_dict.pop(key) - # 根据用户配置的新的 tp size 进行分割 - for key in list(state_dict.keys()): - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - "o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - "down_proj.weight", - "embed_tokens.weight", - ] - col_split = any([key.endswith(filter) for filter in col_filter]) - - if col_split: - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - if dist.is_initialized() and process_exclusion: - # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 - dist.barrier() - return state_dict - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - protocol: str = "file", - ): - """ - Save state_dict to ``path``. - The format of saved state dict should be the same as that of - `huggingface`. - """ - io_driver = IODriver.from_protocol(protocol) - # gather to tp rank 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 pp_size 次循环 - rank_order = range(config.pp_size) - else: - # 不开启只进行一次循环 - rank_order = range(1) - dst = parallel_state.get_tensor_model_parallel_src_rank() - with progress( - rank_order, - desc="Saving model", - disable=int(os.environ.get("RANK", "0")) != 0, - ) as pbar: - for rank in pbar: - if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): - for key in sorted(list(state_dict.keys())): - tensor_list = None - if env.tp_rank == 0: - tensor_list = [ - torch.zeros_like(state_dict[key]) - .to(state_dict[key].dtype) - .cuda() - for _ in range(config.tp_size) - ] - dist.gather( - state_dict[key].cuda(), - dst=dst, - gather_list=tensor_list, - group=env.tp_group, - ) - if env.tp_rank == 0: - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - "o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - "down_proj.weight", - "embed_tokens.weight", - ] - col_split = any( - [key.endswith(filter) for filter in col_filter] - ) - - if col_split: - state_dict[key] = concat_tensor(tensor_list, dim=0) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - state_dict[key] = concat_tensor(tensor_list, dim=1) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - # 似乎不需要? 
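The saving path below reverses that sharding by gathering every rank's shard onto the tensor-parallel source rank and concatenating. A single-process sketch of the gather-then-concat pattern, using the `gloo` backend on CPU (the real code uses the TP group, CUDA tensors, and `concat_tensor`):

```python
import os
import torch
import torch.distributed as dist

# Single-process "group" purely to show the collective call pattern.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

shard = torch.randn(4, 4)                                        # this rank's slice of a weight
gather_list = [torch.zeros_like(shard) for _ in range(dist.get_world_size())]
dist.gather(shard, gather_list=gather_list, dst=0)               # only the dst rank fills gather_list

full_weight = torch.cat(gather_list, dim=0)                      # column-parallel: concat along dim 0
dist.destroy_process_group()
```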
- # state_dict_keys = state_dict.keys() - # for layer_id in range(config.num_layers): - # qkv_names = [None, None, None] - # for key in state_dict_keys: - # if f"layers.{layer_id}.attention.wq.weight" in key: - # qkv_names[0] = key - # elif f"layers.{layer_id}.attention.wk.weight" in key: - # qkv_names[1] = key - # elif f"layers.{layer_id}.attention.wv.weight" in key: - # qkv_names[2] = key - # qkv_name = qkv_names[0].replace("wq", "wqkv") - # state_dict[qkv_name] = torch.cat( - # [ - # state_dict.pop(qkv_names[0]), - # state_dict.pop(qkv_names[1]), - # state_dict.pop(qkv_names[2]), - # ], - # dim=0 - # ) - - if env.tp_rank == 0: - # Save gathered weights - if env.is_pipeline: - ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" - total_size = 0 - weight_map = {} - for name, weight in state_dict.items(): - weight_size = weight.numel() * dtype_byte_size( - weight.dtype - ) - weight_map[name] = ckpt_name - total_size += weight_size - index_dict = dict( - total_size=total_size, weight_map=weight_map - ) - index_dicts = [None for _ in range(env.pp_size)] - dist.gather_object( - index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group - ) - if env.pp_rank == 0: - total_size = 0 - weight_map = {} - for _index_dict in index_dicts: - total_size += _index_dict["total_size"] - weight_map.update(_index_dict["weight_map"]) - merged_dict = { - "metadata": {"total_size": total_size}, - "weight_map": weight_map, - } - io_driver.save( - json.dumps(merged_dict, indent=2, sort_keys=True) - + "\n", - os.path.join(path, "pytorch_model.bin.index.json"), - ) - - else: - ckpt_name = f"pytorch_model.bin" - ckpt_path = os.path.join(path, ckpt_name) - io_driver.save(state_dict, ckpt_path) - if dist.is_initialized() and process_exclusion: - dist.barrier() - if env.rank == 0: - config.save_pretrained(path, protocol=protocol) - dist.barrier() - - -@add_start_docstrings( - """ - The Mistral Model transformer with a sequence classification head on top (linear layer). - - [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
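A small sketch of the last-token pooling just described, using the same ONNX-friendly indexing trick as the classification forward pass below (`pad_token_id`, the toy batch, and the random logits are arbitrary):

```python
import torch

pad_token_id = 0
input_ids = torch.tensor([
    [5, 8, 3, 0, 0],   # two padding tokens -> last real token at index 2
    [7, 2, 9, 4, 6],   # no padding         -> last real token at index 4
])
logits = torch.randn(2, 5, 3)  # (batch, seq_len, num_labels)

# Index of the first pad token minus one; the modulo maps "no pad found"
# (argmax 0 - 1 = -1) onto the last position without reverse indexing.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]

pooled_logits = logits[torch.arange(input_ids.shape[0]), sequence_lengths]  # (batch, num_labels)
```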
- """, - MISTRAL_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL -class MistralForSequenceClassification(Mistral2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Mistral2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - 
loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/collie/models/mistral2/modelpp.py b/collie/models/mistral2/modelpp.py deleted file mode 100644 index 1180a10..0000000 --- a/collie/models/mistral2/modelpp.py +++ /dev/null @@ -1,1922 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Mistral model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel, dtype_byte_size -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_mistraltp import Mistral2Config - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "Mistral2Config" - -#modified for collie -import torch.distributed as dist -import gc -import json -import os -from collections import OrderedDict -from megatron.core import parallel_state, tensor_parallel -from einops import rearrange -from deepspeed.pipe import LayerSpec, TiedLayerSpec - -from collie.config import CollieConfig -from collie.driver.io import IODriver -from collie.log.logger import logger -from collie.module import ( - 
ColumnParallelLinearWithoutBias, - ColumnParallelLMHead, - RowParallelLinearWithoutBias, -) -from collie.utils import concat_tensor, dict_as_params, env, progress -from collie.models.base import CollieModelForCausalLM -from collie.models.utils import ( - kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, - kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, -) - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral -class Mistral2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - MistralRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class Mistral2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -# TODO @Arthur no longer copied from LLama after static cache -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
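A tiny end-to-end sketch of the rotary embedding machinery defined above (`head_dim`, `seq_len`, and the random tensors are arbitrary; shapes follow the `[batch, heads, seq_len, head_dim]` convention, i.e. `unsqueeze_dim=1`):

```python
import torch

head_dim, seq_len, base = 8, 4, 10000.0

# Inverse frequencies and the cos/sin cache, mirroring Mistral2RotaryEmbedding above.
inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))  # (head_dim / 2,)
t = torch.arange(seq_len).float()
freqs = torch.outer(t, inv_freq)                      # (seq_len, head_dim / 2)
emb = torch.cat((freqs, freqs), dim=-1)               # (seq_len, head_dim)
cos, sin = emb.cos(), emb.sin()

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

# q, k: (batch, num_heads, seq_len, head_dim); position_ids: (batch, seq_len)
q = torch.randn(1, 2, seq_len, head_dim)
k = torch.randn(1, 2, seq_len, head_dim)
position_ids = torch.arange(seq_len).unsqueeze(0)

cos_p = cos[position_ids].unsqueeze(1)                # (batch, 1, seq_len, head_dim)
sin_p = sin[position_ids].unsqueeze(1)
q_embed = (q * cos_p) + (rotate_half(q) * sin_p)
k_embed = (k * cos_p) + (rotate_half(k) * sin_p)

# RoPE only rotates each 2-D pair of dimensions, so per-head norms are preserved.
assert torch.allclose(q_embed.norm(dim=-1), q.norm(dim=-1), atol=1e-4)
```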
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class Mistral2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - - self.up_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.gate_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.down_proj = RowParallelLinearWithoutBias( - self.intermediate_size, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Mistral2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.q_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.k_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.v_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.o_proj = RowParallelLinearWithoutBias( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.rotary_emb = Mistral2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads_tp, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads_tp, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads_tp, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads_tp, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class Mistral2FlashAttention2(Mistral2Attention): - """ - Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
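For reference, the `rearrange` calls above split the fused projection output into heads using `head_dim` alone, which is what lets the same code run under tensor parallelism where only a slice of the heads exists on each rank. A minimal sketch of the equivalence with the commented-out `view` path, using hypothetical toy sizes:

```python
import torch
from einops import rearrange

bsz, q_len, num_heads, head_dim = 2, 5, 4, 8       # hypothetical toy sizes
x = torch.randn(bsz, q_len, num_heads * head_dim)  # fused projection output

# einops infers the local head count from head_dim alone
split_einops = rearrange(x, "b n (h d) -> b n h d", d=head_dim)
# the view() path needs num_heads known up front
split_view = x.view(bsz, q_len, num_heads, head_dim)

assert torch.equal(split_einops, split_view)
```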
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. 
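The cache-slicing branch above drops everything but the most recent `sliding_window - 1` key/value positions before the new token is appended. A small sketch of that negative-index slicing, with a hypothetical window size and a toy cache tensor:

```python
import torch

sliding_window = 4
# toy KV cache of shape (batch, num_kv_heads, seq_len, head_dim)
past_key = torch.arange(8).view(1, 1, 8, 1)

slicing_tokens = 1 - sliding_window            # -3
kept = past_key[:, :, slicing_tokens:, :]      # keep the last sliding_window - 1 positions

assert kept.shape[-2] == sliding_window - 1
print(kept.flatten().tolist())                 # [5, 6, 7]
```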
- - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class Mistral2SdpaAttention(Mistral2Attention): - """ - Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from MistralAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
- is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.tp_size)) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -MISTRAL_ATTENTION_CLASSES = { - "eager": Mistral2Attention, - "flash_attention_2": Mistral2FlashAttention2, - "sdpa": Mistral2SdpaAttention, -} - - -class MistralDecoderLayer(nn.Module): - def __init__(self, config: CollieConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - - self.mlp = Mistral2MLP(config) - self.input_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -MISTRAL_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. 
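The decoder layer above follows the usual pre-norm residual layout: normalize, run the sublayer, then add the untouched input back, once for attention and once for the MLP. A stripped-down sketch of that control flow; the sublayers here are hypothetical stand-ins, not the real attention or gated MLP:

```python
import torch
from torch import nn

class ToyPreNormBlock(nn.Module):
    def __init__(self, hidden_size: int):
        super().__init__()
        self.input_norm = nn.LayerNorm(hidden_size)       # stand-in for the RMSNorm layers
        self.post_attention_norm = nn.LayerNorm(hidden_size)
        self.attn = nn.Linear(hidden_size, hidden_size)   # stand-in for self-attention
        self.mlp = nn.Linear(hidden_size, hidden_size)    # stand-in for the gated MLP

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        residual = hidden_states
        hidden_states = residual + self.attn(self.input_norm(hidden_states))
        residual = hidden_states
        hidden_states = residual + self.mlp(self.post_attention_norm(hidden_states))
        return hidden_states

out = ToyPreNormBlock(16)(torch.randn(2, 5, 16))
assert out.shape == (2, 5, 16)
```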
- - Parameters: - config ([`MistralConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class Mistral2PreTrainedModel(PreTrainedModel): - config_class = Mistral2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["MistralDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -MISTRAL_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. 
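As the docstring notes, both a `Cache` instance and the legacy tuple format are accepted; the model converts between them with `DynamicCache.from_legacy_cache` and `to_legacy_cache`, which are used later in this file. A small round-trip sketch with hypothetical shapes:

```python
import torch
from transformers.cache_utils import DynamicCache

# legacy format: a tuple over layers of (key, value) tensors,
# each of shape (batch_size, num_heads, sequence_length, head_dim)
legacy = tuple(
    (torch.randn(1, 8, 4, 64), torch.randn(1, 8, 4, 64)) for _ in range(2)
)

cache = DynamicCache.from_legacy_cache(legacy)
assert cache.get_seq_length() == 4

roundtrip = cache.to_legacy_cache()
assert torch.equal(roundtrip[0][0], legacy[0][0])
```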
- - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class Mistral2Model(nn.Module): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] - - Args: - config: MistralConfig - """ - - def __init__(self, config: CollieConfig): - # super().__init__(config) - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - # self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if 
input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
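When `position_ids` is not supplied, the forward pass above builds it as a range offset by the number of already-cached tokens, so decode steps keep their absolute positions. A minimal sketch with hypothetical lengths:

```python
import torch

past_key_values_length = 6   # tokens already held in the KV cache (hypothetical)
seq_length = 2               # new tokens in this forward pass (hypothetical)

position_ids = torch.arange(
    past_key_values_length, seq_length + past_key_values_length, dtype=torch.long
).unsqueeze(0)

print(position_ids)          # tensor([[6, 7]])
```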
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Mistral2ForCausalLM(CollieModelForCausalLM): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config:CollieConfig): - super().__init__(config) - self.model = Mistral2Model(config) - self.vocab_size = config.vocab_size - # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.lm_head = ColumnParallelLinearWithoutBias( - self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False - ) - # Initialize weights and apply final processing - # self.post_init() - # GenerationMixin 需要的额外参数 - self.config.is_decoder = True - if config.model_config.tie_word_embeddings: - self.lm_head.weight = self.embed_tokens.weight - self.main_input_name = "input_ids" - - def clean_cache(self): - self._clean_hidden_states([*self.model.layers, self.lm_head]) - self._set_use_cache(self.model.layers, False) - - def set_cache(self, use_cache): - self._set_use_cache(self.model.layers, use_cache) - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = 
None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MistralForCausalLM - - >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Ensure tensors are on the same device - shift_labels = shift_labels.to(shift_logits.device) - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = 
past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - @classmethod - def pipeline_layers(cls, config: CollieConfig): - """ - Get layers of pipeline. - :return: list - """ - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - - if config.tie_word_embeddings: - output = TiedLayerSpec( - "embed_tokens", - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - else: - output = LayerSpec( - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - - return [("model", Mistral2Model.pipeline_layers(config)), ("lm_head", output)] - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - protocol: str = "file", # 指定加载state_dict时使用的协议 - **kwargs, - ): - """ - Load state_dict from ``path``. - The format of pretrained model should be the same as that of - `huggingface`. - :return: state_dict. Note that the state_dict should be processed - properly to match the current rank. 
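The generation helper above derives `position_ids` from a cumulative sum over the attention mask, which keeps positions contiguous for left-padded rows. A small sketch with a hypothetical left-padded batch:

```python
import torch

# hypothetical left-padded batch: 0 marks padding in the attention mask
attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])

position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)

print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
```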
- """ - # 配置加载 - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - # IO驱动初始化 - io_driver = IODriver.from_protocol(protocol) - # 检查文件路径是否存在 - if not io_driver.exists(path): - raise FileNotFoundError(f"folder {path} not found.") - # 初始化存储和处理变量 - state_dict = OrderedDict() - weights = [] - parts = None # 变量用于存储模型分割的部分信息 - # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 - hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 - rank_order = range(dist.get_world_size()) - else: - # 不开启只进行一次循环 - rank_order = range(1) - # 权重文件加载和处理 - for rank in rank_order: - # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 - if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: - # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 - if env.is_pipeline: - # 保存的是 json 格式 - parts = env.pipeline_parts - if hasattr(config, "num_key_value_heads"): - # llama2 (transformers >= 4.31.0) - num_key_value_heads = config.num_key_value_heads - else: - num_key_value_heads = config.num_attention_heads - head_dim = config.hidden_size // config.num_attention_heads - # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 - if ( - io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) - and "COLLIE_PP_PARTS" in os.environ.keys() - ): - weight_map = json.loads( - io_driver.load( - os.path.join(path, "pytorch_model.bin.index.json"), mode="r" - ) - )["weight_map"] - # layers 表示自己需要的层 - layers = env.pipeline_layers_idx - # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) - weights.extend( - [ - value - for key, value in weight_map.items() - if len(key.split(".")) > 2 - and key.split(".")[2].isdigit() - and (int(key.split(".")[2]) + 1) in layers - ] - ) - # 去重 - weights = list(set(weights)) - # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm - if 0 in layers: - weights.append(weight_map["model.tok_embeddings.weight"]) - if max(parts) - 1 in layers: - weights.append(weight_map["output.weight"]) - if max(parts) - 2 in layers: - weights.append(weight_map["model.norm.weight"]) - else: - # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 - weights = [ - weight - for weight in io_driver.list(path) - if weight.endswith(".bin") - ] - with progress( - weights, - desc="Loading state dict", - total=len(weights), - disable=hide_progress, - ) as pbar: - for weight in pbar: - part_state_dict = io_driver.load( - os.path.join(path, weight), mode="rb" - ) - # for key in list(part_state_dict.keys()): - # if "attention.wqkv.weight" in key: - # # qkv_weights = part_state_dict.pop(key) - # qkv_weights = part_state_dict[key] - # print(qkv_weights.shape) - # (wq, wk, wv) = qkv_weights.split( - # [ - # config.hidden_size, - # config.num_key_value_heads * head_dim, - # config.num_key_value_heads * head_dim, - # ], - # dim=0, - # ) - # wq_name = key.replace("wqkv", "wq") - # wk_name = key.replace("wqkv", "wk") - # wv_name = key.replace("wqkv", "wv") - # part_state_dict[wq_name] = wq - # part_state_dict[wk_name] = wk - # part_state_dict[wv_name] = wv - state_dict.update(part_state_dict) - del part_state_dict - if parts is not None: - # 这一步是 pp 的复筛 - layers = env.pipeline_layers_idx - for key in list(state_dict.keys()): - if key.startswith("layers"): - layer = int(key.split(".")[1]) - if layer + 1 not in layers: - state_dict.pop(key) - # if key.endswith("tok_embeddings.weight"): - if 
key.endswith("embed_tokens.weight"): - if 0 not in layers: - state_dict.pop(key) - if key == "norm.weight": - if max(parts) - 2 not in layers: - state_dict.pop(key) - # if key.endswith("output.weight"): - if key.endswith("lm_head.weight"): - if max(parts) - 1 not in layers: - state_dict.pop(key) - # 根据用户配置的新的 tp size 进行分割 - for key in list(state_dict.keys()): - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - "o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - "down_proj.weight", - "embed_tokens.weight", - ] - col_split = any([key.endswith(filter) for filter in col_filter]) - - if col_split: - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - if dist.is_initialized() and process_exclusion: - # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 - dist.barrier() - return state_dict - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - protocol: str = "file", - ): - """ - Save state_dict to ``path``. - The format of saved state dict should be the same as that of - `huggingface`. - """ - io_driver = IODriver.from_protocol(protocol) - # gather to tp rank 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 pp_size 次循环 - rank_order = range(config.pp_size) - else: - # 不开启只进行一次循环 - rank_order = range(1) - dst = parallel_state.get_tensor_model_parallel_src_rank() - with progress( - rank_order, - desc="Saving model", - disable=int(os.environ.get("RANK", "0")) != 0, - ) as pbar: - for rank in pbar: - if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): - for key in sorted(list(state_dict.keys())): - tensor_list = None - if env.tp_rank == 0: - tensor_list = [ - torch.zeros_like(state_dict[key]) - .to(state_dict[key].dtype) - .cuda() - for _ in range(config.tp_size) - ] - dist.gather( - state_dict[key].cuda(), - dst=dst, - gather_list=tensor_list, - group=env.tp_group, - ) - if env.tp_rank == 0: - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - "o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - "down_proj.weight", - "embed_tokens.weight", - ] - col_split = any( - [key.endswith(filter) for filter in col_filter] - ) - - if col_split: - state_dict[key] = concat_tensor(tensor_list, dim=0) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - state_dict[key] = concat_tensor(tensor_list, dim=1) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - # 似乎不需要? 
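The tensor-parallel resharding above chunks column-parallel weights along dim 0 and is meant to chunk the row-parallel `o_proj.weight` and `down_proj.weight` along dim 1 (note that the column filter list as written also matches those two keys, so they would take the dim-0 branch). A minimal round-trip sketch with hypothetical sizes and plain tensors in place of a real state dict:

```python
import torch

tp_size = 2
hidden_size, out_features = 8, 16                 # hypothetical sizes

q_proj = torch.randn(out_features, hidden_size)   # column-parallel: shard output rows (dim 0)
o_proj = torch.randn(hidden_size, out_features)   # row-parallel: shard input columns (dim 1)

q_shards = list(torch.chunk(q_proj, tp_size, dim=0))   # each (out_features // tp_size, hidden_size)
o_shards = list(torch.chunk(o_proj, tp_size, dim=1))   # each (hidden_size, out_features // tp_size)

# saving undoes the split by concatenating along the same dimension
assert torch.equal(torch.cat(q_shards, dim=0), q_proj)
assert torch.equal(torch.cat(o_shards, dim=1), o_proj)
```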
- # state_dict_keys = state_dict.keys() - # for layer_id in range(config.num_layers): - # qkv_names = [None, None, None] - # for key in state_dict_keys: - # if f"layers.{layer_id}.attention.wq.weight" in key: - # qkv_names[0] = key - # elif f"layers.{layer_id}.attention.wk.weight" in key: - # qkv_names[1] = key - # elif f"layers.{layer_id}.attention.wv.weight" in key: - # qkv_names[2] = key - # qkv_name = qkv_names[0].replace("wq", "wqkv") - # state_dict[qkv_name] = torch.cat( - # [ - # state_dict.pop(qkv_names[0]), - # state_dict.pop(qkv_names[1]), - # state_dict.pop(qkv_names[2]), - # ], - # dim=0 - # ) - - if env.tp_rank == 0: - # Save gathered weights - if env.is_pipeline: - ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" - total_size = 0 - weight_map = {} - for name, weight in state_dict.items(): - weight_size = weight.numel() * dtype_byte_size( - weight.dtype - ) - weight_map[name] = ckpt_name - total_size += weight_size - index_dict = dict( - total_size=total_size, weight_map=weight_map - ) - index_dicts = [None for _ in range(env.pp_size)] - dist.gather_object( - index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group - ) - if env.pp_rank == 0: - total_size = 0 - weight_map = {} - for _index_dict in index_dicts: - total_size += _index_dict["total_size"] - weight_map.update(_index_dict["weight_map"]) - merged_dict = { - "metadata": {"total_size": total_size}, - "weight_map": weight_map, - } - io_driver.save( - json.dumps(merged_dict, indent=2, sort_keys=True) - + "\n", - os.path.join(path, "pytorch_model.bin.index.json"), - ) - - else: - ckpt_name = f"pytorch_model.bin" - ckpt_path = os.path.join(path, ckpt_name) - io_driver.save(state_dict, ckpt_path) - if dist.is_initialized() and process_exclusion: - dist.barrier() - if env.rank == 0: - config.save_pretrained(path, protocol=protocol) - dist.barrier() - - -@add_start_docstrings( - """ - The Mistral Model transformer with a sequence classification head on top (linear layer). - - [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
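The classification head described above pools the hidden state at each row's last non-padding token. A small sketch of the index arithmetic used for that, with hypothetical token ids and `pad_token_id = 0`:

```python
import torch

pad_token_id = 0
input_ids = torch.tensor([[5, 6, 7, 0, 0],    # right-padded row
                          [3, 4, 8, 9, 2]])   # row with no padding

# index of the first pad token minus one; the modulo keeps the
# no-padding row in range instead of producing -1
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]

print(sequence_lengths)   # tensor([2, 4])
```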
- """, - MISTRAL_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL -class MistralForSequenceClassification(Mistral2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Mistral2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - 
loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/collie/models/mistral2/modeltp.py b/collie/models/mistral2/modeltp.py deleted file mode 100644 index e91037f..0000000 --- a/collie/models/mistral2/modeltp.py +++ /dev/null @@ -1,2254 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Mistral model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel, dtype_byte_size -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_mistraltp import MistralConfig - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "MistralConfig" - -#modified for collie -import torch.distributed as dist -import gc -import json -import os -from collections import OrderedDict -from megatron.core import parallel_state, tensor_parallel -from einops import rearrange -from deepspeed.pipe import LayerSpec, TiedLayerSpec - -from collie.config import CollieConfig -from collie.driver.io import IODriver -from collie.log.logger import logger -from collie.module import ( - 
ColumnParallelLinearWithoutBias, - ColumnParallelLMHead, - RowParallelLinearWithoutBias, -) -from collie.utils import concat_tensor, dict_as_params, env, progress -from collie.models.base import CollieModelForCausalLM -from collie.models.utils import ( - kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, - kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, -) - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral -class MistralRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - MistralRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - ans = self.weight * hidden_states.to(input_dtype) - - # # 打印层标准化的输出 - hidden_states_output = ans.detach().cpu().tolist() - data_to_save = {"Layer Norm Output": hidden_states_output} - # 将输出写入 JSON 文件 - with open('a_rms_output.json', 'w') as f: - json.dump(data_to_save, f, indent=4) - - return self.weight * hidden_states.to(input_dtype) - - -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class MistralRotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
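For reference, the RMS normalization above reduces to scaling each feature vector by the reciprocal root-mean-square of its entries and then applying the learned weight. A minimal functional sketch with hypothetical sizes (without the JSON debug dump):

```python
import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # compute the statistics in float32, then cast back, as the module above does
    variance = x.float().pow(2).mean(-1, keepdim=True)
    return weight * (x.float() * torch.rsqrt(variance + eps)).to(x.dtype)

x = torch.randn(2, 3, 8, dtype=torch.float16)
weight = torch.ones(8, dtype=torch.float16)
out = rms_norm(x, weight)
assert out.shape == x.shape and out.dtype == x.dtype
```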
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -# TODO @Arthur no longer copied from LLama after static cache -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
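# [Editorial sketch, not part of the original diff] How the rotary cos/sin cache built
# above is consumed by apply_rotary_pos_emb: gather cos/sin at each token's position
# and mix them into q (and k) with rotate_half. All sizes here are toy assumptions.
import torch

def rotate_half_demo(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

dim, seq_len, base = 8, 6, 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))   # (dim/2,)
t = torch.arange(seq_len).float()
emb = torch.cat([torch.outer(t, inv_freq)] * 2, dim=-1)              # (seq_len, dim)
cos, sin = emb.cos(), emb.sin()

q = torch.randn(1, 2, seq_len, dim)                                  # (bsz, heads, seq, head_dim)
position_ids = torch.arange(seq_len).unsqueeze(0)                    # (bsz, seq)
cos_p = cos[position_ids].unsqueeze(1)                               # unsqueeze_dim=1: broadcast over heads
sin_p = sin[position_ids].unsqueeze(1)
q_rot = (q * cos_p) + (rotate_half_demo(q) * sin_p)                  # rotated queries, same shape as q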
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class MistralMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - - self.up_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.gate_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.down_proj = RowParallelLinearWithoutBias( - self.intermediate_size, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - # 打印MLP层输出 - mlp_output = output.detach().cpu().tolist() - data_to_save = {"MLP Output": mlp_output} - # 将输出写入 JSON 文件 - with open('a_mlp_output.json', 'w') as f: - json.dump(data_to_save, f, indent=4) - - return output - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class MistralAttention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.q_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.k_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.v_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - # aaaa - self.o_proj = RowParallelLinearWithoutBias( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.rotary_emb = MistralRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, # 输入维度 [bsz, q_len, hidden_size] - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) # [bsz, q_len, num_heads * head_dim] - key_states = self.k_proj(hidden_states) # [bsz, q_len, num_key_value_heads * head_dim] - value_states = self.v_proj(hidden_states) # [bsz, q_len, num_key_value_heads * head_dim] - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_heads, head_dim] - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_key_value_heads, head_dim] - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_key_value_heads, head_dim] - ) - - query_states = query_states.transpose(1, 2) # [bsz, num_heads, q_len, head_dim] - key_states = key_states.transpose(1, 2) # [bsz, num_key_value_heads, q_len, head_dim] - value_states = value_states.transpose(1, 2) # [bsz, num_key_value_heads, q_len, head_dim] - - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("a_attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
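# [Editorial sketch, not part of the original diff] What the Column/RowParallel
# projections above amount to, emulated on a single process with plain tensors:
# a column-parallel linear shards its *output* features across tp_size ranks (so each
# rank holds num_heads / tp_size heads, matching the shape checks below), and the
# row-parallel o_proj shards its *input* features, so summing the per-rank partial
# outputs (the all-reduce) recovers the full projection. tp_size=2 and the sizes
# below are assumptions; this is not the Megatron API itself.
import torch
import torch.nn.functional as F

tp_size, hidden, heads, head_dim = 2, 16, 4, 4
x = torch.randn(1, 3, hidden)
w_q = torch.randn(heads * head_dim, hidden)     # full q_proj weight (out_features, in_features)
w_o = torch.randn(hidden, heads * head_dim)     # full o_proj weight

full = F.linear(F.linear(x, w_q), w_o)          # unsharded reference

shard = (heads // tp_size) * head_dim           # per-rank slice: num_heads / tp_size heads
partial_sum = torch.zeros_like(full)
for rank in range(tp_size):
    w_q_rank = w_q[rank * shard : (rank + 1) * shard]      # column-parallel: split output rows
    w_o_rank = w_o[:, rank * shard : (rank + 1) * shard]   # row-parallel: split input columns
    q_rank = F.linear(x, w_q_rank)                          # gather_output=False: stays sharded
    partial_sum += F.linear(q_rank, w_o_rank)               # summing partials plays the all-reduce

assert torch.allclose(full, partial_sum, atol=1e-4)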
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)) - - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # 打印注意力模块的输出 - attention_result = { - "Output weights:": attn_output.detach().cpu().tolist(), - # "Attention weights:": attn_weights.detach().cpu().tolist(), - } - # 将数据写入 JSON 文件 - with open("a_attention_outputs.json", "w") as f: - json.dump(attention_result, f, indent=4) - - return attn_output, attn_weights, past_key_value - - -class MistralFlashAttention2(MistralAttention): - """ - Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
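# [Editorial sketch, not part of the original diff] The eager attention math used in
# the forward pass above, reduced to a single head: scores = QK^T / sqrt(head_dim),
# add the additive causal mask, softmax in float32, then weight the values.
# Shapes below are toy assumptions.
import math
import torch

bsz, q_len, head_dim = 1, 4, 8
q = torch.randn(bsz, q_len, head_dim)
k = torch.randn(bsz, q_len, head_dim)
v = torch.randn(bsz, q_len, head_dim)

scores = q @ k.transpose(-2, -1) / math.sqrt(head_dim)                 # (bsz, q_len, q_len)
causal = torch.triu(torch.full((q_len, q_len), float("-inf")), diagonal=1)
weights = torch.softmax((scores + causal).float(), dim=-1).to(q.dtype)  # upcast softmax to fp32
attn_out = weights @ v                                                  # (bsz, q_len, head_dim)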
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("a_flash_attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." 
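# [Editorial sketch, not part of the original diff] The sliding-window cache trimming
# performed above, in isolation: once the cache has grown past `sliding_window`, only
# the last (sliding_window - 1) cached positions are kept, so cache plus the incoming
# token never exceeds the window. Sizes below are assumptions.
import torch

sliding_window = 4
past_key = torch.randn(1, 2, 10, 8)                      # (bsz, kv_heads, cached_len, head_dim)

slicing_tokens = 1 - sliding_window                      # == -(sliding_window - 1)
trimmed_key = past_key[:, :, slicing_tokens:, :].contiguous()
assert trimmed_key.shape[-2] == sliding_window - 1       # matches the shape check in the code above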
- ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # 打印注意力模块的输出 - attention_result = { - "Output weights:": attn_output.detach().cpu().tolist(), - # "Attention weights:": attn_weights.detach().cpu().tolist(), - } - # 将数据写入 JSON 文件 - with open("a_flash_attention_outputs.json", "w") as f: - json.dump(attention_result, f, indent=4) - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
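# [Editorial sketch, not part of the original diff] What the `_get_unpad_data` helper
# defined earlier in this file feeds the varlen flash-attention path described above:
# per-sequence lengths from the padding mask, flat indices of the real tokens, and
# cumulative sequence boundaries (cu_seqlens). The mask below is a toy assumption.
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])                       # 0 marks padding
seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)                # tensor([3, 5])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
# cu_seqlens == tensor([0, 3, 8]): sequence i occupies rows cu_seqlens[i]:cu_seqlens[i+1]
# of the unpadded (total_tokens, heads, head_dim) tensors passed to the varlen kernel.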
- causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
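# [Editorial sketch, not part of the original diff] The grouped-query expansion that
# `repeat_kv` performs before the attention calls above: each of the
# num_key_value_heads is repeated num_heads // num_key_value_heads times so K/V line
# up with the query heads. Head counts below are toy assumptions.
import torch

batch, kv_heads, n_rep, slen, head_dim = 1, 2, 4, 5, 8   # 2 kv heads serving 8 query heads
kv = torch.randn(batch, kv_heads, slen, head_dim)
expanded = kv[:, :, None, :, :].expand(batch, kv_heads, n_rep, slen, head_dim)
expanded = expanded.reshape(batch, kv_heads * n_rep, slen, head_dim)
assert expanded.shape == (1, 8, 5, 8)
assert torch.equal(expanded[:, 0], expanded[:, 1])        # repeats of the same kv head share values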
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class MistralSdpaAttention(MistralAttention): - """ - Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from MistralAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("a_sdpa_attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, 
key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.config.tp_size)) - - attn_output = self.o_proj(attn_output) - - # 打印注意力模块的输出 - attention_result = { - "Output weights:": attn_output.detach().cpu().tolist(), - # "Attention weights:": attn_weights.detach().cpu().tolist(), - } - # 将数据写入 JSON 文件 - with open("a_sdpa_attention_outputs.json", "w") as f: - json.dump(attention_result, f, indent=4) - - return attn_output, None, past_key_value - - -MISTRAL_ATTENTION_CLASSES = { - "eager": MistralAttention, - "flash_attention_2": MistralFlashAttention2, - "sdpa": MistralSdpaAttention, -} - - -class MistralDecoderLayer(nn.Module): - def __init__(self, config: CollieConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - config._attn_implementation = "sdpa" - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.config = config - self.mlp = MistralMLP(config) - self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.idx = layer_idx - # 务必保持变量名一致 - self.use_cache = self.config.model_config.use_cache - self.hidden_states = None - self.output_attentions = False - -class MistralDecoderLayer(nn.Module): - def __init__(self, config: CollieConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - config._attn_implementation = "sdpa" - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.config = config - self.mlp = MistralMLP(config) - self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.idx = layer_idx - # 务必保持变量名一致 - self.use_cache = self.config.model_config.use_cache - self.hidden_states = None - self.output_attentions = False - - def _forward( - self, - hidden_states: torch.Tensor, - attention_mask: 
Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - # output_attentions: Optional[bool] = False, - # use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - # if "padding_mask" in kwargs: - # warnings.warn( - # "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - # ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - # output_attentions=output_attentions, - # use_cache=use_cache, - **kwargs, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - # outputs = (hidden_states,) - - # if output_attentions: - # outputs += (self_attn_weights,) - - # if use_cache: - # outputs += (present_key_value,) - - return hidden_states, present_key_value - - def forward(self, inputs: dict): - layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) - - if self.config.checkpointing and self.training: - hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( - self._forward, - inputs["hidden_states"], - inputs.get("attention_mask", None), - inputs.get("position_ids", None), - layer_past, # inputs.get("past_key_values", None), - ) - else: - hidden_states, new_layer_past = self._forward( - inputs["hidden_states"], - inputs.get("attention_mask", None), - inputs.get("position_ids", None), - layer_past - ) # **inputs - inputs["hidden_states"] = hidden_states - - inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) - return inputs - - - # def _forward( - # self, - # hidden_states: torch.Tensor, - # attention_mask: Optional[torch.Tensor] = None, - # position_ids: Optional[torch.LongTensor] = None, - # past_key_value: Optional[Tuple[torch.Tensor]] = None, - # # output_attentions: Optional[bool] = False, - # # use_cache: Optional[bool] = False, - # **kwargs, - # ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - # # if "padding_mask" in kwargs: - # # warnings.warn( - # # "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - # # ) - # """ - # Args: - # hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - # attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - # `(batch, sequence_length)` where padding elements are indicated by 0. - # output_attentions (`bool`, *optional*): - # Whether or not to return the attentions tensors of all attention layers. See `attentions` under - # returned tensors for more detail. - # use_cache (`bool`, *optional*): - # If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - # (see `past_key_values`). - # past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - # """ - - # residual = hidden_states - - # hidden_states = self.input_layernorm(hidden_states) - - # # Self Attention - # hidden_states, self_attn_weights, present_key_value = self.self_attn( - # hidden_states=hidden_states, - # attention_mask=attention_mask, - # position_ids=position_ids, - # past_key_value=past_key_value, - # # output_attentions=output_attentions, - # # use_cache=use_cache, - # **kwargs, - # ) - # hidden_states = residual + hidden_states - - # # Fully Connected - # residual = hidden_states - # hidden_states = self.post_attention_layernorm(hidden_states) - # hidden_states = self.mlp(hidden_states) - # hidden_states = residual + hidden_states - - # # outputs = (hidden_states,) - - # # if output_attentions: - # # outputs += (self_attn_weights,) - - # # if use_cache: - # # outputs += (present_key_value,) - - # return hidden_states, present_key_value - - # def forward(self, inputs: dict): - # layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) - - # if self.config.checkpointing and self.training: - # hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( - # self._forward, - # inputs["hidden_states"], - # inputs.get("attention_mask", None), - # inputs.get("position_ids", None), - # layer_past, # inputs.get("past_key_values", None), - # ) - # else: - # hidden_states, new_layer_past = self._forward( - # inputs["hidden_states"], - # inputs.get("attention_mask", None), - # inputs.get("position_ids", None), - # layer_past - # ) # **inputs - # inputs["hidden_states"] = hidden_states - - # inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) - # return inputs - - # def forward( - # self, - # hidden_states: torch.Tensor, - # attention_mask: Optional[torch.Tensor] = None, - # position_ids: Optional[torch.LongTensor] = None, - # past_key_value: Optional[Tuple[torch.Tensor]] = None, - # output_attentions: Optional[bool] = False, - # use_cache: Optional[bool] = False, - # **kwargs, - # ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - # if "padding_mask" in kwargs: - # warnings.warn( - # "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - # ) - # """ - # Args: - # hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - # attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - # `(batch, sequence_length)` where padding elements are indicated by 0. - # output_attentions (`bool`, *optional*): - # Whether or not to return the attentions tensors of all attention layers. See `attentions` under - # returned tensors for more detail. 
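# [Editorial sketch, not part of the original diff] The pre-norm residual wiring of
# `MistralDecoderLayer._forward` above, with the attention and MLP calls replaced by
# stand-in callables so only the data flow is shown.
import torch

def decoder_block(hidden_states, norm1, attn, norm2, mlp):
    residual = hidden_states
    hidden_states = norm1(hidden_states)          # input_layernorm
    hidden_states = attn(hidden_states)           # self-attention (mask/cache omitted here)
    hidden_states = residual + hidden_states      # first residual connection

    residual = hidden_states
    hidden_states = norm2(hidden_states)          # post_attention_layernorm
    hidden_states = mlp(hidden_states)            # gated MLP
    return residual + hidden_states               # second residual connection

# e.g. decoder_block(torch.randn(1, 4, 8), *(lambda x: x,) * 4) keeps the input shape.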
- # use_cache (`bool`, *optional*): - # If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - # (see `past_key_values`). - # past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - # """ - - # residual = hidden_states - - # hidden_states = self.input_layernorm(hidden_states) - - # # Self Attention - # hidden_states, self_attn_weights, present_key_value = self.self_attn( - # hidden_states=hidden_states, - # attention_mask=attention_mask, - # position_ids=position_ids, - # past_key_value=past_key_value, - # output_attentions=output_attentions, - # use_cache=use_cache, - # **kwargs, - # ) - # hidden_states = residual + hidden_states - - # # Fully Connected - # residual = hidden_states - # hidden_states = self.post_attention_layernorm(hidden_states) - # hidden_states = self.mlp(hidden_states) - # hidden_states = residual + hidden_states - - # outputs = (hidden_states,) - - # if output_attentions: - # outputs += (self_attn_weights,) - - # if use_cache: - # outputs += (present_key_value,) - - # return outputs - - -MISTRAL_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`MistralConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class MistralPreTrainedModel(PreTrainedModel): - config_class = MistralConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["MistralDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -MISTRAL_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. 
- - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class MistralModel(nn.Module): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`MistralDecoderLayer`] - - Args: - config: MistralConfig - """ - - def __init__(self, config: CollieConfig): - # super().__init__(config) - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - # aaaa - # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.embed_tokens = tensor_parallel.VocabParallelEmbedding( - config.vocab_size, config.hidden_size, params_dtype=torch.float32 - ) - self.layers = nn.ModuleList( - [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - config._attn_implementation = "sdpa" - self._attn_implementation = config._attn_implementation - self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.gradient_checkpointing = False - # Initialize weights and apply final processing - # self.post_init() - - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - # aaaa - past_key_values: Optional[Tuple[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - # 打印嵌入层输出 - embeddings_output = inputs_embeds.detach().cpu().tolist() - data_to_save = {"Embeddings Output": embeddings_output} - # 将输出写入 JSON 文件 - with open('a_embeddings_output.json', 'w') as f: - json.dump(data_to_save, f, indent=4) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
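# [Editorial sketch, not part of the original diff] How the default position_ids are
# built above when a KV cache is present: positions continue from the number of
# already-cached tokens instead of restarting at zero. Lengths are toy assumptions.
import torch

past_key_values_length, seq_length = 6, 2                # 6 cached tokens, 2 new ones
position_ids = torch.arange(
    past_key_values_length, seq_length + past_key_values_length, dtype=torch.long
).unsqueeze(0).view(-1, seq_length)
# position_ids == tensor([[6, 7]]): the new tokens are embedded at absolute positions 6 and 7.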
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - inputs = { - "input_ids": input_ids, - "hidden_states": hidden_states, - "attention_mask": attention_mask, - "position_ids": position_ids, - "past_key_values": past_key_values, - "output_attentions": output_attentions, - "use_cache": use_cache, - } - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - # for decoder_layer in self.layers: - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - # all_hidden_states += (hidden_states,) - all_hidden_states += (inputs["hidden_states"],) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - # hidden_states, - # attention_mask, - # position_ids, - # past_key_values, - # output_attentions, - # use_cache, - inputs, - ) - else: - layer_outputs = decoder_layer( - # hidden_states, - # attention_mask=attention_mask, - # position_ids=position_ids, - # past_key_value=past_key_values, - # output_attentions=output_attentions, - # use_cache=use_cache, - inputs, - ) - inputs.update(layer_outputs) - - # hidden_states = layer_outputs[0] - hidden_states = inputs["hidden_states"] - - if use_cache: - # next_decoder_cache = layer_outputs[2 if output_attentions else 1] - next_decoder_cache = inputs["addition_info"][1 if output_attentions else 0] - - if output_attentions: - # all_self_attns += (layer_outputs[1],) - all_self_attns += (inputs["addition_info"][0],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - # past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - past_key_values=past_key_values, - ) - - @classmethod - def pipeline_layers(cls, config: CollieConfig): - """ - Get layers of pipeline. 
- :return: list - """ - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - - if config.tie_word_embeddings: - embed_tokens = TiedLayerSpec( - "embed_tokens", - dict_as_params(input_keys="input_ids", output_keys="hidden_states"), - tensor_parallel.VocabParallelEmbedding, - config.vocab_size, - config.hidden_size, - ) - else: - embed_tokens = LayerSpec( - dict_as_params(input_keys="input_ids", output_keys="hidden_states"), - tensor_parallel.VocabParallelEmbedding, - config.vocab_size, - config.hidden_size, - ) - - layers = [ - LayerSpec(MistralDecoderLayer, config, i) for i in range(config.num_hidden_layers) - ] - norm = LayerSpec( - dict_as_params(input_keys="hidden_states", output_keys="hidden_states"), - MistralRMSNorm, - hidden_size=config.hidden_size, - eps=config.rms_norm_eps, - ) - - return [ - ("embed_tokens", embed_tokens), - ("layers", layers), - ("norm", norm), - ] - -class MistralForCausalLM(CollieModelForCausalLM): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config:CollieConfig): - super().__init__(config) - self.model = MistralModel(config) - self.vocab_size = config.vocab_size - # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.lm_head = ColumnParallelLinearWithoutBias( - self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False - ) - # Initialize weights and apply final processing - # self.post_init() - # GenerationMixin 需要的额外参数 - self.config.is_decoder = True - if config.model_config.tie_word_embeddings: - self.lm_head.weight = self.embed_tokens.weight - self.main_input_name = "input_ids" - - def clean_cache(self): - self._clean_hidden_states([*self.model.layers, self.lm_head]) - self._set_use_cache(self.model.layers, False) - - def set_cache(self, use_cache): - self._set_use_cache(self.model.layers, use_cache) - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
- - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MistralForCausalLM - - >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Ensure tensors are on the same device - shift_labels = shift_labels.to(shift_logits.device) - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. 
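# [Editorial sketch, not part of the original diff] The shift-by-one language-model
# loss computed in `forward` above: position t's logits are scored against token t+1,
# so the last logit and the first label are dropped. The vocab size is an assumption.
import torch
from torch.nn import CrossEntropyLoss

vocab_size, seq_len = 11, 5
logits = torch.randn(1, seq_len, vocab_size)
labels = torch.randint(0, vocab_size, (1, seq_len))

shift_logits = logits[..., :-1, :].contiguous().view(-1, vocab_size)   # predictions for t = 0..seq_len-2
shift_labels = labels[..., 1:].contiguous().view(-1)                   # targets     for t = 1..seq_len-1
loss = CrossEntropyLoss()(shift_logits, shift_labels)
# Labels set to -100 would be ignored (the loss's default ignore_index), which is how
# padding/prompt tokens are masked out, as the docstring above notes.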
- - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - @classmethod - def pipeline_layers(cls, config: CollieConfig): - """ - Get layers of pipeline. - :return: list - """ - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - - if config.tie_word_embeddings: - output = TiedLayerSpec( - "embed_tokens", - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - else: - output = LayerSpec( - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - - return [("model", MistralModel.pipeline_layers(config)), ("lm_head", output)] - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - protocol: str = "file", # 指定加载state_dict时使用的协议 - **kwargs, - ): - """ - Load state_dict from ``path``. - The format of pretrained model should be the same as that of - `huggingface`. - :return: state_dict. Note that the state_dict should be processed - properly to match the current rank. 
- """ - # 配置加载 - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - # IO驱动初始化 - io_driver = IODriver.from_protocol(protocol) - # 检查文件路径是否存在 - if not io_driver.exists(path): - raise FileNotFoundError(f"folder {path} not found.") - # 初始化存储和处理变量 - state_dict = OrderedDict() - weights = [] - parts = None # 变量用于存储模型分割的部分信息 - # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 - hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 - rank_order = range(dist.get_world_size()) - else: - # 不开启只进行一次循环 - rank_order = range(1) - # 权重文件加载和处理 - for rank in rank_order: - # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 - if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: - # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 - if env.is_pipeline: - # 保存的是 json 格式 - parts = env.pipeline_parts - if hasattr(config, "num_key_value_heads"): - # llama2 (transformers >= 4.31.0) - num_key_value_heads = config.num_key_value_heads - else: - num_key_value_heads = config.num_attention_heads - head_dim = config.hidden_size // config.num_attention_heads - # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 - if ( - io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) - and "COLLIE_PP_PARTS" in os.environ.keys() - ): - weight_map = json.loads( - io_driver.load( - os.path.join(path, "pytorch_model.bin.index.json"), mode="r" - ) - )["weight_map"] - # layers 表示自己需要的层 - layers = env.pipeline_layers_idx - # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) - weights.extend( - [ - value - for key, value in weight_map.items() - if len(key.split(".")) > 2 - and key.split(".")[2].isdigit() - and (int(key.split(".")[2]) + 1) in layers - ] - ) - # 去重 - weights = list(set(weights)) - # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm - if 0 in layers: - weights.append(weight_map["model.embed_tokens.weight"]) - if max(parts) - 1 in layers: - weights.append(weight_map["lm_head.weight"]) - if max(parts) - 2 in layers: - weights.append(weight_map["model.norm.weight"]) - else: - # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 - weights = [ - weight - for weight in io_driver.list(path) - if weight.endswith(".bin") - ] - with progress( - weights, - desc="Loading state dict", - total=len(weights), - disable=hide_progress, - ) as pbar: - for weight in pbar: - part_state_dict = io_driver.load( - os.path.join(path, weight), mode="rb" - ) - # for key in list(part_state_dict.keys()): - # if "attention.wqkv.weight" in key: - # # qkv_weights = part_state_dict.pop(key) - # qkv_weights = part_state_dict[key] - # print(qkv_weights.shape) - # (wq, wk, wv) = qkv_weights.split( - # [ - # config.hidden_size, - # config.num_key_value_heads * head_dim, - # config.num_key_value_heads * head_dim, - # ], - # dim=0, - # ) - # wq_name = key.replace("wqkv", "wq") - # wk_name = key.replace("wqkv", "wk") - # wv_name = key.replace("wqkv", "wv") - # part_state_dict[wq_name] = wq - # part_state_dict[wk_name] = wk - # part_state_dict[wv_name] = wv - state_dict.update(part_state_dict) - del part_state_dict - if parts is not None: - # 这一步是 pp 的复筛 - layers = env.pipeline_layers_idx - for key in list(state_dict.keys()): - if key.startswith("layers"): - layer = int(key.split(".")[1]) - if layer + 1 not in layers: - state_dict.pop(key) - # if key.endswith("tok_embeddings.weight"): - if 
key.endswith("embed_tokens.weight"): - if 0 not in layers: - state_dict.pop(key) - if key == "norm.weight": - if max(parts) - 2 not in layers: - state_dict.pop(key) - # if key.endswith("output.weight"): - if key.endswith("lm_head.weight"): - if max(parts) - 1 not in layers: - state_dict.pop(key) - # 根据用户配置的新的 tp size 进行分割 - for key in list(state_dict.keys()): - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - #"o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - #"down_proj.weight", - "embed_tokens.weight", - ] - col_split = any([key.endswith(filter) for filter in col_filter]) - - if col_split: - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - if dist.is_initialized() and process_exclusion: - # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 - dist.barrier() - return state_dict - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - protocol: str = "file", - ): - """ - Save state_dict to ``path``. - The format of saved state dict should be the same as that of - `huggingface`. - """ - io_driver = IODriver.from_protocol(protocol) - # gather to tp rank 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 pp_size 次循环 - rank_order = range(config.pp_size) - else: - # 不开启只进行一次循环 - rank_order = range(1) - dst = parallel_state.get_tensor_model_parallel_src_rank() - with progress( - rank_order, - desc="Saving model", - disable=int(os.environ.get("RANK", "0")) != 0, - ) as pbar: - for rank in pbar: - if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): - for key in sorted(list(state_dict.keys())): - tensor_list = None - if env.tp_rank == 0: - tensor_list = [ - torch.zeros_like(state_dict[key]) - .to(state_dict[key].dtype) - .cuda() - for _ in range(config.tp_size) - ] - dist.gather( - state_dict[key].cuda(), - dst=dst, - gather_list=tensor_list, - group=env.tp_group, - ) - if env.tp_rank == 0: - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - #"o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - #"down_proj.weight", - "embed_tokens.weight", - ] - col_split = any( - [key.endswith(filter) for filter in col_filter] - ) - - if col_split: - state_dict[key] = concat_tensor(tensor_list, dim=0) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - state_dict[key] = concat_tensor(tensor_list, dim=1) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - # 似乎不需要? 
- # state_dict_keys = state_dict.keys() - # for layer_id in range(config.num_layers): - # qkv_names = [None, None, None] - # for key in state_dict_keys: - # if f"layers.{layer_id}.attention.wq.weight" in key: - # qkv_names[0] = key - # elif f"layers.{layer_id}.attention.wk.weight" in key: - # qkv_names[1] = key - # elif f"layers.{layer_id}.attention.wv.weight" in key: - # qkv_names[2] = key - # qkv_name = qkv_names[0].replace("wq", "wqkv") - # state_dict[qkv_name] = torch.cat( - # [ - # state_dict.pop(qkv_names[0]), - # state_dict.pop(qkv_names[1]), - # state_dict.pop(qkv_names[2]), - # ], - # dim=0 - # ) - - if env.tp_rank == 0: - # Save gathered weights - if env.is_pipeline: - ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" - total_size = 0 - weight_map = {} - for name, weight in state_dict.items(): - weight_size = weight.numel() * dtype_byte_size( - weight.dtype - ) - weight_map[name] = ckpt_name - total_size += weight_size - index_dict = dict( - total_size=total_size, weight_map=weight_map - ) - index_dicts = [None for _ in range(env.pp_size)] - dist.gather_object( - index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group - ) - if env.pp_rank == 0: - total_size = 0 - weight_map = {} - for _index_dict in index_dicts: - total_size += _index_dict["total_size"] - weight_map.update(_index_dict["weight_map"]) - merged_dict = { - "metadata": {"total_size": total_size}, - "weight_map": weight_map, - } - io_driver.save( - json.dumps(merged_dict, indent=2, sort_keys=True) - + "\n", - os.path.join(path, "pytorch_model.bin.index.json"), - ) - - else: - ckpt_name = f"pytorch_model.bin" - ckpt_path = os.path.join(path, ckpt_name) - io_driver.save(state_dict, ckpt_path) - if dist.is_initialized() and process_exclusion: - dist.barrier() - if env.rank == 0: - config.save_pretrained(path, protocol=protocol) - dist.barrier() - - -@add_start_docstrings( - """ - The Mistral Model transformer with a sequence classification head on top (linear layer). - - [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - MISTRAL_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL -class MistralForSequenceClassification(MistralPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = MistralModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - 
loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) From 7dd75f01f7583fb92559c900b26d1c3f0424469f Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:42:44 +0800 Subject: [PATCH 12/16] Add mistral --- collie/models/mistral/__init__.py | 2 + collie/models/mistral/model.py | 1919 +++++++++++++++++++++++++++++ 2 files changed, 1921 insertions(+) create mode 100644 collie/models/mistral/__init__.py create mode 100644 collie/models/mistral/model.py diff --git a/collie/models/mistral/__init__.py b/collie/models/mistral/__init__.py new file mode 100644 index 0000000..e998c29 --- /dev/null +++ b/collie/models/mistral/__init__.py @@ -0,0 +1,2 @@ +from .model import MistralForCausalLM +from .configuration_mistral import MistralConfig \ No newline at end of file diff --git a/collie/models/mistral/model.py b/collie/models/mistral/model.py new file mode 100644 index 0000000..a85d0c7 --- /dev/null +++ b/collie/models/mistral/model.py @@ -0,0 +1,1919 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
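+#
+# Illustrative usage sketch (a minimal, commented example; not taken from the
+# upstream Mistral sources). It shows how the `MistralForCausalLM` defined in
+# this file is expected to be driven through `CollieConfig`. The checkpoint name
+# matches the docstring example elsewhere in this patch, while the parallel
+# sizes are placeholder assumptions and `from_pretrained` is assumed to be the
+# loader inherited from `CollieModelForCausalLM`.
+#
+#     from collie.config import CollieConfig
+#     from collie.models.mistral import MistralForCausalLM
+#
+#     config = CollieConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
+#     config.tp_size = 2  # tensor-parallel degree (placeholder)
+#     config.pp_size = 2  # pipeline-parallel degree (placeholder)
+#     model = MistralForCausalLM.from_pretrained(
+#         "mistralai/Mistral-7B-v0.1", config=config
+#     )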
+""" PyTorch Mistral model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel, dtype_byte_size +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_mistral import MistralConfig + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MistralConfig" + +#modified for collie +import torch.distributed as dist +import gc +import json +import os +from collections import OrderedDict +from megatron.core import parallel_state, tensor_parallel +from einops import rearrange +from deepspeed.pipe import LayerSpec, TiedLayerSpec + +from collie.config import CollieConfig +from collie.driver.io import IODriver +from collie.log.logger import logger +from collie.module import ( + ColumnParallelLinearWithoutBias, + ColumnParallelLMHead, + RowParallelLinearWithoutBias, +) +from collie.utils import concat_tensor, dict_as_params, env, progress +from collie.models.base import CollieModelForCausalLM +from collie.models.utils import ( + kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, + kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, +) + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral +class MistralRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + MistralRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + ans = self.weight * hidden_states.to(input_dtype) + + return self.weight * hidden_states.to(input_dtype) + + +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class MistralRotaryEmbedding(nn.Module): + def __init__(self, 
dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +# TODO @Arthur no longer copied from LLama after static cache +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class MistralMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.up_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.gate_proj = ColumnParallelLinearWithoutBias( + self.hidden_size, + self.intermediate_size, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.down_proj = RowParallelLinearWithoutBias( + self.intermediate_size, + self.hidden_size, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return output + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class MistralAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+            )
+        self.q_proj = ColumnParallelLinearWithoutBias(
+            self.hidden_size,
+            self.num_heads * self.head_dim,
+            bias=False,
+            gather_output=False,
+            init_method=lambda x: x,
+        )
+        self.k_proj = ColumnParallelLinearWithoutBias(
+            self.hidden_size,
+            self.num_key_value_heads * self.head_dim,
+            bias=False,
+            gather_output=False,
+            init_method=lambda x: x,
+        )
+        self.v_proj = ColumnParallelLinearWithoutBias(
+            self.hidden_size,
+            self.num_key_value_heads * self.head_dim,
+            bias=False,
+            gather_output=False,
+            init_method=lambda x: x,
+        )
+        self.o_proj = RowParallelLinearWithoutBias(
+            self.num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            init_method=lambda x: x,
+        )
+
+        self.rotary_emb = MistralRotaryEmbedding(
+            self.head_dim,
+            max_position_embeddings=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,  # input shape: [bsz, q_len, hidden_size]
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)  # [bsz, q_len, num_heads * head_dim]
+        key_states = self.k_proj(hidden_states)  # [bsz, q_len, num_key_value_heads * head_dim]
+        value_states = self.v_proj(hidden_states)  # [bsz, q_len, num_key_value_heads * head_dim]
+
+        query_states, key_states, value_states = (
+            rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim),  # [bsz, q_len, num_heads, head_dim]
+            rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim),  # [bsz, q_len, num_key_value_heads, head_dim]
+            rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim),  # [bsz, q_len, num_key_value_heads, head_dim]
+        )
+
+        query_states = query_states.transpose(1, 2)  # [bsz, num_heads, q_len, head_dim]
+        key_states = key_states.transpose(1, 2)  # [bsz, num_key_value_heads, q_len, head_dim]
+        value_states = value_states.transpose(1, 2)  # [bsz, num_key_value_heads, q_len, head_dim]
+
+        if self.config.pp_size > 1:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)) + + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + return attn_output, attn_weights, past_key_value + + +class MistralFlashAttention2(MistralAttention): + """ + Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO @Arthur no longer copied from LLama after static cache +class MistralSdpaAttention(MistralAttention): + """ + Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MistralAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states, key_states, value_states = ( + rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), + rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), + ) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + if self.config.pp_size > 1: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
+            is_causal=self.is_causal and attention_mask is None and q_len > 1,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.config.tp_size))
+
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, None, past_key_value
+
+
+MISTRAL_ATTENTION_CLASSES = {
+    "eager": MistralAttention,
+    "flash_attention_2": MistralFlashAttention2,
+    "sdpa": MistralSdpaAttention,
+}
+
+
+class MistralDecoderLayer(nn.Module):
+    def __init__(self, config: CollieConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        config._attn_implementation = "sdpa"
+        self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+        self.config = config
+        self.mlp = MistralMLP(config)
+        self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.idx = layer_idx
+        # be sure to keep these variable names consistent
+        self.use_cache = self.config.model_config.use_cache
+        self.hidden_states = None
+        self.output_attentions = False
+
+    def _forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        # output_attentions: Optional[bool] = False,
+        # use_cache: Optional[bool] = False,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        # if "padding_mask" in kwargs:
+        #     warnings.warn(
+        #         "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+        #     )
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + # output_attentions=output_attentions, + # use_cache=use_cache, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states, present_key_value + + def forward(self, inputs: dict): + layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) + + if self.config.checkpointing and self.training: + hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( + self._forward, + inputs["hidden_states"], + inputs.get("attention_mask", None), + inputs.get("position_ids", None), + layer_past, # inputs.get("past_key_values", None), + ) + else: + hidden_states, new_layer_past = self._forward( + inputs["hidden_states"], + inputs.get("attention_mask", None), + inputs.get("position_ids", None), + layer_past + ) # **inputs + inputs["hidden_states"] = hidden_states + + inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) + return inputs + + +MISTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MistralConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class MistralPreTrainedModel(PreTrainedModel): + config_class = MistralConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MistralDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MISTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. 
See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare Mistral Model outputting raw hidden-states without any specific head on top.", + MISTRAL_START_DOCSTRING, +) +class MistralModel(nn.Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + def __init__(self, config: CollieConfig): + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = tensor_parallel.VocabParallelEmbedding( + config.vocab_size, config.hidden_size, params_dtype=torch.float32 + ) + self.layers = nn.ModuleList( + [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + config._attn_implementation = "sdpa" + self._attn_implementation = config._attn_implementation + self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + # aaaa + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + past_key_values_length = 0 + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + inputs = { + "input_ids": input_ids, + "hidden_states": hidden_states, + "attention_mask": attention_mask, + "position_ids": position_ids, + "past_key_values": past_key_values, + "output_attentions": output_attentions, + "use_cache": use_cache, + } + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + # for decoder_layer in self.layers: + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + # all_hidden_states += (hidden_states,) + all_hidden_states += (inputs["hidden_states"],) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + inputs, + ) + else: + layer_outputs = decoder_layer( + inputs, + ) + inputs.update(layer_outputs) + + # hidden_states = layer_outputs[0] + hidden_states = inputs["hidden_states"] + + if use_cache: + # next_decoder_cache = layer_outputs[2 if output_attentions else 1] + next_decoder_cache = inputs["addition_info"][1 if output_attentions else 0] + + if output_attentions: + # all_self_attns += (layer_outputs[1],) + all_self_attns += (inputs["addition_info"][0],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if 
use_legacy_cache else next_decoder_cache
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            past_key_values=next_cache,
+        )
+
+    @classmethod
+    def pipeline_layers(cls, config: CollieConfig):
+        """
+        Get layers of pipeline.
+        :return: list
+        """
+        if isinstance(config, str):
+            config = CollieConfig.from_pretrained(config)
+
+        if config.tie_word_embeddings:
+            embed_tokens = TiedLayerSpec(
+                "embed_tokens",
+                dict_as_params(input_keys="input_ids", output_keys="hidden_states"),
+                tensor_parallel.VocabParallelEmbedding,
+                config.vocab_size,
+                config.hidden_size,
+            )
+        else:
+            embed_tokens = LayerSpec(
+                dict_as_params(input_keys="input_ids", output_keys="hidden_states"),
+                tensor_parallel.VocabParallelEmbedding,
+                config.vocab_size,
+                config.hidden_size,
+            )
+
+        layers = [
+            LayerSpec(MistralDecoderLayer, config, i) for i in range(config.num_hidden_layers)
+        ]
+        norm = LayerSpec(
+            dict_as_params(input_keys="hidden_states", output_keys="hidden_states"),
+            MistralRMSNorm,
+            hidden_size=config.hidden_size,
+            eps=config.rms_norm_eps,
+        )
+
+        return [
+            ("embed_tokens", embed_tokens),
+            ("layers", layers),
+            ("norm", norm),
+        ]
+
+class MistralForCausalLM(CollieModelForCausalLM):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: CollieConfig):
+        super().__init__(config)
+        self.model = MistralModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = ColumnParallelLinearWithoutBias(
+            self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False
+        )
+        # Initialize weights and apply final processing
+        # Extra attributes required by GenerationMixin
+        self.config.is_decoder = True
+        if config.model_config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+        self.main_input_name = "input_ids"
+
+    def clean_cache(self):
+        self._clean_hidden_states([*self.model.layers, self.lm_head])
+        self._set_use_cache(self.model.layers, False)
+
+    def set_cache(self, use_cache):
+        self._set_use_cache(self.model.layers, use_cache)
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring).
Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Ensure tensors are on the same device + shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. 
+ elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + @classmethod + def pipeline_layers(cls, config: CollieConfig): + """ + Get layers of pipeline. + :return: list + """ + if isinstance(config, str): + config = CollieConfig.from_pretrained(config) + + if config.tie_word_embeddings: + output = TiedLayerSpec( + "embed_tokens", + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + else: + output = LayerSpec( + dict_as_params(input_keys="hidden_states", output_keys="logits"), + ColumnParallelLMHead, + config.hidden_size, + config.vocab_size, + bias=False, + ) + + return [("model", MistralModel.pipeline_layers(config)), ("lm_head", output)] + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + **kwargs, + ): + ... + + @staticmethod + def load_parallel_state_dict( + path: str, + config: Union[CollieConfig, str], + process_exclusion: bool = False, + protocol: str = "file", # 指定加载state_dict时使用的协议 + **kwargs, + ): + """ + Load state_dict from ``path``. + The format of pretrained model should be the same as that of + `huggingface`. + :return: state_dict. Note that the state_dict should be processed + properly to match the current rank. 
+        """
+        # Load the config
+        if isinstance(config, str):
+            config = CollieConfig.from_pretrained(config)
+        # Initialize the IO driver
+        io_driver = IODriver.from_protocol(protocol)
+        # Check that the path exists
+        if not io_driver.exists(path):
+            raise FileNotFoundError(f"folder {path} not found.")
+        # Initialize storage and bookkeeping variables
+        state_dict = OrderedDict()
+        weights = []
+        parts = None  # stores the pipeline partition of the model
+        # If process exclusion is enabled, every process shows a progress bar; otherwise only RANK 0 does
+        hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0
+        if dist.is_initialized() and process_exclusion:
+            # With process exclusion enabled, loop dist.get_world_size() times
+            rank_order = range(dist.get_world_size())
+        else:
+            # Otherwise loop only once
+            rank_order = range(1)
+        # Load and process the weight files
+        for rank in rank_order:
+            # With process exclusion, only the matching RANK enters this iteration; without it, every process does
+            if int(os.environ.get("RANK", "0")) == rank or not process_exclusion:
+                # The pipeline partition is stored in os.environ["COLLIE_PP_PARTS"], e.g. [0, 17, 35], half-open intervals
+                if env.is_pipeline:
+                    # stored as json
+                    parts = env.pipeline_parts
+                if hasattr(config, "num_key_value_heads"):
+                    # llama2 (transformers >= 4.31.0)
+                    num_key_value_heads = config.num_key_value_heads
+                else:
+                    num_key_value_heads = config.num_attention_heads
+                head_dim = config.hidden_size // config.num_attention_heads
+                # If pytorch_model.bin.index.json exists, each pp process can load only the weights it needs
+                if (
+                    io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json"))
+                    and "COLLIE_PP_PARTS" in os.environ.keys()
+                ):
+                    weight_map = json.loads(
+                        io_driver.load(
+                            os.path.join(path, "pytorch_model.bin.index.json"), mode="r"
+                        )
+                    )["weight_map"]
+                    # layers: the layers this rank needs
+                    layers = env.pipeline_layers_idx
+                    # Select keys like model.layers.0. Two conditions: 1. the key contains a layer number;
+                    # 2. that number plus one is in layers (the embedding occupies the first pipeline layer)
+                    weights.extend(
+                        [
+                            value
+                            for key, value in weight_map.items()
+                            if len(key.split(".")) > 2
+                            and key.split(".")[2].isdigit()
+                            and (int(key.split(".")[2]) + 1) in layers
+                        ]
+                    )
+                    # Deduplicate
+                    weights = list(set(weights))
+                    # Further filtering: layer 0 needs the embedding, the last layer needs lm_head,
+                    # and the second-to-last layer needs norm
+                    if 0 in layers:
+                        weights.append(weight_map["model.embed_tokens.weight"])
+                    if max(parts) - 1 in layers:
+                        weights.append(weight_map["lm_head.weight"])
+                    if max(parts) - 2 in layers:
+                        weights.append(weight_map["model.norm.weight"])
+                else:
+                    # Without pytorch_model.bin.index.json, load every weight file
+                    weights = [
+                        weight
+                        for weight in io_driver.list(path)
+                        if weight.endswith(".bin")
+                    ]
+                with progress(
+                    weights,
+                    desc="Loading state dict",
+                    total=len(weights),
+                    disable=hide_progress,
+                ) as pbar:
+                    for weight in pbar:
+                        part_state_dict = io_driver.load(
+                            os.path.join(path, weight), mode="rb"
+                        )
+                        state_dict.update(part_state_dict)
+                        del part_state_dict
+                if parts is not None:
+                    # Second filtering pass for pipeline parallelism
+                    layers = env.pipeline_layers_idx
+                    for key in list(state_dict.keys()):
+                        if key.startswith("layers"):
+                            layer = int(key.split(".")[1])
+                            if layer + 1 not in layers:
+                                state_dict.pop(key)
+                        # if key.endswith("tok_embeddings.weight"):
+                        if key.endswith("embed_tokens.weight"):
+                            if 0 not in layers:
+                                state_dict.pop(key)
+                        if key == "norm.weight":
+                            if max(parts) - 2 not in layers:
+                                state_dict.pop(key)
+                        # if key.endswith("output.weight"):
+                        if key.endswith("lm_head.weight"):
+                            if max(parts) - 1 not in layers:
+                                state_dict.pop(key)
+                # Split the weights according to the user-configured tp size
+                for key in list(state_dict.keys()):
+                    col_filter = [
+                        "q_proj.weight",
+                        "k_proj.weight",
+                        "v_proj.weight",
+                        "lm_head.weight",
+                        "gate_proj.weight",
+                        "up_proj.weight",
+                        "embed_tokens.weight",
+                    ]
+                    col_split = any([key.endswith(filter) for filter in col_filter])
+
+                    if col_split:
+                        tensor = (
+                            list(torch.chunk(state_dict[key], config.tp_size, dim=0))[
+                                env.tp_rank
+                            ]
+                            .detach()
+                            .clone()
+                        )
+                        del state_dict[key]
+                        if process_exclusion:
+                            # Reclaim CPU memory (very slow)
+                            gc.collect()
+                        state_dict[key] = tensor
+                    elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"):
+                        tensor = (
+                            list(torch.chunk(state_dict[key], config.tp_size, dim=1))[
+                                env.tp_rank
+                            ]
+                            .detach()
+                            .clone()
+                        )
+                        del state_dict[key]
+                        if process_exclusion:
+                            # Reclaim CPU memory (very slow)
+                            gc.collect()
+                        state_dict[key] = tensor
+            if dist.is_initialized() and process_exclusion:
+                # With process exclusion, processes that do not load weights in this iteration wait here
+                dist.barrier()
+        return state_dict
+
+    @staticmethod
+    def save_parallel_state_dict(
+        state_dict: dict,
+        path: str,
+        config: CollieConfig,
+        process_exclusion: bool = False,
+        **kwargs,
+    ):
+        ...
+
+    @staticmethod
+    def save_parallel_state_dict(
+        state_dict: dict,
+        path: str,
+        config: CollieConfig,
+        process_exclusion: bool = False,
+        protocol: str = "file",
+    ):
+        """
+        Save state_dict to ``path``.
+        The format of saved state dict should be the same as that of
+        `huggingface`.
+        """
+        io_driver = IODriver.from_protocol(protocol)
+        # gather to tp rank 0
+        if dist.is_initialized() and process_exclusion:
+            # With process exclusion enabled, loop pp_size times
+            rank_order = range(config.pp_size)
+        else:
+            # Otherwise loop only once
+            rank_order = range(1)
+        dst = parallel_state.get_tensor_model_parallel_src_rank()
+        with progress(
+            rank_order,
+            desc="Saving model",
+            disable=int(os.environ.get("RANK", "0")) != 0,
+        ) as pbar:
+            for rank in pbar:
+                if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion):
+                    for key in sorted(list(state_dict.keys())):
+                        tensor_list = None
+                        if env.tp_rank == 0:
+                            tensor_list = [
+                                torch.zeros_like(state_dict[key])
+                                .to(state_dict[key].dtype)
+                                .cuda()
+                                for _ in range(config.tp_size)
+                            ]
+                        dist.gather(
+                            state_dict[key].cuda(),
+                            dst=dst,
+                            gather_list=tensor_list,
+                            group=env.tp_group,
+                        )
+                        if env.tp_rank == 0:
+                            col_filter = [
+                                "q_proj.weight",
+                                "k_proj.weight",
+                                "v_proj.weight",
+                                "lm_head.weight",
+                                "gate_proj.weight",
+                                "up_proj.weight",
+                                "embed_tokens.weight",
+                            ]
+                            col_split = any(
+                                [key.endswith(filter) for filter in col_filter]
+                            )
+
+                            if col_split:
+                                state_dict[key] = concat_tensor(tensor_list, dim=0)
+
+                                if process_exclusion:
+                                    # Reclaim CPU memory (very slow)
+                                    gc.collect()
+
+                            elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"):
+                                state_dict[key] = concat_tensor(tensor_list, dim=1)
+
+                                if process_exclusion:
+                                    # Reclaim CPU memory (very slow)
+                                    gc.collect()
+
+                    if env.tp_rank == 0:
+                        # Save gathered weights
+                        if env.is_pipeline:
+                            ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin"
+                            total_size = 0
+                            weight_map = {}
+                            for name, weight in state_dict.items():
+                                weight_size = weight.numel() * dtype_byte_size(
+                                    weight.dtype
+                                )
+                                weight_map[name] = ckpt_name
+                                total_size += weight_size
+                            index_dict = dict(
+                                total_size=total_size, weight_map=weight_map
+                            )
+                            index_dicts = [None for _ in range(env.pp_size)]
+                            dist.gather_object(
+                                index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group
+                            )
+                            if env.pp_rank == 0:
+                                total_size = 0
+                                weight_map = {}
+                                for _index_dict in index_dicts:
+                                    total_size += _index_dict["total_size"]
+                                    weight_map.update(_index_dict["weight_map"])
+                                merged_dict = {
+                                    "metadata": {"total_size": total_size},
+                                    "weight_map": weight_map,
+                                }
+                                io_driver.save(
+                                    json.dumps(merged_dict, indent=2, sort_keys=True)
+                                    + "\n",
+                                    os.path.join(path, "pytorch_model.bin.index.json"),
+                                )
+
+                        else:
+                            ckpt_name = "pytorch_model.bin"
+                        
ckpt_path = os.path.join(path, ckpt_name) + io_driver.save(state_dict, ckpt_path) + if dist.is_initialized() and process_exclusion: + dist.barrier() + if env.rank == 0: + config.save_pretrained(path, protocol=protocol) + dist.barrier() + + +@add_start_docstrings( + """ + The Mistral Model transformer with a sequence classification head on top (linear layer). + + [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForSequenceClassification(MistralPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = MistralModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) From 507ca245b785e2588a2bb776c26ad2523557236e Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:43:23 +0800 Subject: [PATCH 13/16] Delete tests/models/mistral2 directory --- tests/models/mistral2/__init__.py | 2 - .../__pycache__/__init__.cpython-310.pyc | Bin 295 -> 0 bytes .../configuration_mistraltp.cpython-310.pyc | Bin 6283 -> 0 bytes .../__pycache__/model.cpython-310.pyc | Bin 49178 -> 0 bytes .../__pycache__/modeltp.cpython-310.pyc | Bin 52277 -> 0 bytes .../mistral2/configuration_mistraltp.py | 155 -- tests/models/mistral2/model.py | 2026 --------------- tests/models/mistral2/modelpp.py | 1922 -------------- tests/models/mistral2/modeltp.py | 2254 ----------------- 9 files changed, 6359 deletions(-) delete mode 100644 tests/models/mistral2/__init__.py delete mode 100644 tests/models/mistral2/__pycache__/__init__.cpython-310.pyc delete mode 100644 tests/models/mistral2/__pycache__/configuration_mistraltp.cpython-310.pyc 
delete mode 100644 tests/models/mistral2/__pycache__/model.cpython-310.pyc delete mode 100644 tests/models/mistral2/__pycache__/modeltp.cpython-310.pyc delete mode 100644 tests/models/mistral2/configuration_mistraltp.py delete mode 100644 tests/models/mistral2/model.py delete mode 100644 tests/models/mistral2/modelpp.py delete mode 100644 tests/models/mistral2/modeltp.py diff --git a/tests/models/mistral2/__init__.py b/tests/models/mistral2/__init__.py deleted file mode 100644 index 9dc3f79..0000000 --- a/tests/models/mistral2/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .modeltp import MistralForCausalLM -from .configuration_mistraltp import MistralConfig \ No newline at end of file diff --git a/tests/models/mistral2/__pycache__/__init__.cpython-310.pyc b/tests/models/mistral2/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 76a01ca4171928aebb54f37b4541ecbf0bd2731f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 295 zcmd1j<>g`kf)fuV(xQO$V-N=!FabFZKwK;XBvKes7;_kM8KW2(L2RZRrd;MIW+0n6 zm_d`}B_mLYCgUw3-^}8YqQo4x{37SX(&EG%A77v-FI3byKQApa-A|Jxiaj?!B{ip{ zpa^6~lz1{&qO>TnBr`uRJ{MvJP?i}eyON=Z1xSI3U(xzSsk!+jsk#~YxvBa&g_W6k z`p)@2KAEoiC8@B?0;HGr)ME}k^pZ=_OaFov{Ui2P^k5ih&js3|=u2pi;`G=BYjBvSiM}3dainiBma(TGk3WBzOvsP<`&8Bdh4a4MB)2lhLT0_(l z1M?q!GQYP_U93s&d-fi$IZ#-O?Ny)oo_o0UCaTt5EITxV<8@Z@hi@OblEayyU-fA= z$pyVHay`BLEeNjdn8_UWC4#FZXk?+Gs9ITlL-kn z4Ab_jE3GYAdKY!flf57H#ke}ItUhs$ zg6z}TH$f(WymE}jKHG1mx^HSFUUm$VBx9Mk`wV^tPTUrkB47>aIf(pV0@t4+#1=CR zm+kTt0Z7I`dbaXk962ux+F{^V*%mhdk``C{vOH5oTam;uCK_bE9vw9t&Q>~B zWYfQ?o(S7}o@wmrzBuG;wl3VDKF6+kZQ>xqzNRo0;{d>0URqdKsAN-V`e!sf1SSpu z4(RB|K#amFyGzLAaRRxav&$t7v_zx9C9n$wJ?Acc4HPAwk-QDM!k57K?m_ARvPmy2 zmE5BX%dfLGk`Tl8TinHT+a`1m=3Khrmar`Do^Hq27k>Psfeqgk$TlDD>25XF$I|sG zU;EY69(>BB!!640(^*}-zPx;**^LJekG2UP>&ZTCUXDS8yv z$)BnAc+Izl_!y~6{2TL!;%yWdwgDO!>^`wbYJ$yf++V58F>wk*i$^ORSU9vkTB$G( zi;(*Qrq>>=bjw_o{S4XJw=Hnq9+yE}iX2+yJjIupEp9$lpPaE6M3 zj==STthUd^#WyL%=21$jY;Lof`SGNeVmO_D0NU|ld5XN_Kf4|>Oys4X-F)nC$tP{ zp|9IR@#rZzXun+(<;2omD(<%4W0es*rtej90pR;wIRuS>6ejPfPSWv}5}mgxGAAu{ zk zI9VP1f&*Js_<92UTWXEj_S%M}tIh^p3U)ZyTn!01#w86D<|h3tgS$BeIExMy^k$ULdHGA_K$={ ziV0-AY8h3%h?{Fhwpfp?tae}}R^|igwYd&WfpMA?L6eM(9^(6512qw4L3nAHx4eg| zJuApeChVjq?DRT%`a?aWQGI*_zoDoo!lQOwRi2Y-4(aYGJ89FUTks22zmsWWxIen{ z)s=_hEs9pEpS7j8YwQXej7pTHGO=E-w-17r=c;$NZ{Lm&LAWF7wxfu%)AnT0rTPsz zbI*0(-BC#!>!R83;ZU3~w&Kd}IQG#eC~yb%z=1!t8Yh4E?02g>Eml9991w>Zh}$!D zI;8D(5BewAi)W>F)TUEz0y)wtKo&ZqkfWU`$YQ5xX`v>sc1FLzGd>yXj6s$`5ers!zUclwK_hnyCov;7n`ol9e~8LyL662W&Wxtgl*_L3*X7$7{;y}aEp@zr&$m2>*X$PuxH7%tJ$N8iYXSEIzorre|Kh8wrCPO` z{F)waAaKb_ywmf2U$W}Sw4hyWAIJ&}3jFyC5XCFST=r8MDddXz?0qp;8p-G4pTd9g zqvQWRyqr)-%stiArPGp0wxh=Y diff --git a/tests/models/mistral2/__pycache__/model.cpython-310.pyc b/tests/models/mistral2/__pycache__/model.cpython-310.pyc deleted file mode 100644 index ab53c9573dc702d9ab95ac9870bc99c46c10a54a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 49178 zcmd75d7K>AeIM9WU48fToR|RyR|5nI30P`8CU?{Km_1{Ko56BT-H?lI3J0RZcb1<+P#0XUZ7~mu=+C zm=SB_%LBoaf;_S6nMSUh6Td`#urX8~Y7Cc$8zbcr@k`c68)M}$xlYx$G{(#0jfwJv z+^6eX8{5j;8k6NoxzE(MH+GbFG4werZG3V}^G4-R!d!c@B<52lf<8b+~#4uPd zHI9^zG>(>!Hjb5#!EeYp?ig$LIfrf=3-{kNuExsmaQ3e~;5>NSD8JJlF274^;GxEM zl^>QE%=(G)3Gq8wJ}JMClpn$GNd4W7Q{__YU3?*C7oFU~%*}YY3ZFgpUij<{d@k8{+4~T7);?|DZQt{P zQMM6gzkL8<4g_Hwd&dHY^o%_Z$a_Td+-@@0fA*+&reND#ILpQH9M_#AU; zNO1x8$L;%Yf1lhhIyGm(S+wu}iFmn=XYa5dz_SN}w;T3@<)(9&{Z6NO)5Ozv*$?6A zLr(E#{H9TE+22)O>U#RHeF9HU=y&n_3f?$rKLWo;oM(`>LfCf)F`csC1ON9pEs4ps 
z&GWITM^`3_Pu{rDQnQzf=W4E}s`X-{WjpoK%N%IaQHShA-)_l!#&$e9GA3Ck@^H|eUt)&~{lR9_)F<#)U zg;95&p&#jS#8v2= zs_vqH@GXN>=31)a+DldBaN<2x*Ez?sJ+bU9Eql!ElT{RL{ItY=_FSKT@w{_o*=f!? zr|VVMt5o-yyW)CP+PRwOJ487SQh1dflnB+C0Y}M68wCs?B)^;WKBeHym|- z$(i*BF4P=5x<$bJ=~lg7bAmD%e){nz9#yrgj$$Wd<=`sr??$NQM)S!a-Lq)Ws$vgm zEqm{&Rb79qFTlC?IaS-=@l@;Drte_v49vEgvsJI6^Y0gIl&?~Cr63L0k2}q)etJo@ z&~`E^#&u?;E1hYnuwx>o9g9~n3NTREMWj~IFHj|QmfYVF_~2O-uIp*!*ZZ#Yv`ycY zr?c!E;U|HM6uzhNbw3EF9b3Q>>KO~BZQe3hc@=NR)~wqxJ8oOIOkS;;ZDTEQJGPd@ zb=*s>r4cIAHta<3UffRJHdn2-*|ys8xi}`J)bz`5;PaB{$L(6fPsorugZ4_J>O9Uq zO1iA6CXoq!o%Q4HGQxVC%KSvF>D_nSPc>W36-TxF6#6aa4nN+gUiYn<=QModj2}m@ z^pms84ZJ#)@C$wO%aH3_K^J#X-<3;M5AmkZb!)SZ>yILyihkUAnw?z^{M?sf>ix*u zo$?XoG+Lf>_;Rb^9I0QqQEMJK-8y@=_Sg~6aXt5l)G~hb-0LkJDV2i%S?Ray1IQbt zr5pZWW!~{BdT7~IugVs2hu{nu<3`??G)&VpOv8FzorC8rzC9l+xO4Bpg%bnO6a&GO z{8(%bM3fzWAp^3>vJ+Sst&hd5SUJI#@dv|6;;D1zr(3G=N4Ngu7uLS{#sBr>EJ(2E z!)x{y{!08_>}G7uxDE2kGuPstwU%fbxAE7GRg!4=SS4i}h~Hc}ku3(FAfX_gV$CgL zTB}{HqD8!-*D79khYWLOB6??fD(xrSbOjAl^C}fZD$&nym^K_tkm>~QtyUAw zn7roH<}Z8x5GJ!)6;pl1S#oQ2tZQ-Ka-I5|pSxVMZA^2V%Q0S;+?O!-R4TJv5)d~V zS)HZx0XR0kE|bezBSzL(+1giGr7%ykCIW>YRusRd@pY%+w2cKlYz)s_h~G9=4Gba6 zMs+PD*fbnktMN9Tt|i+Ta0{u4*llAhhH$ZkG{PEfQ|_&{g-T6Kzs%&+gK#cj6r~m& zX9)vqDx)5Pmv3UsNS$Dmw5l98@uk)^-)cD3rk_ym73Ha?8NmfOQwha><`0DR!U@{- zb20%{WTg7(ur~d8wdv|QiYf|YQ7>(EMC0b+B+*&N@Hb>wV=EK=RThMvGSm|Y`%;n( z3qrxERQzm(i{)~i*ZE3?i=KWWjiKfwpFvyAI`w+kUXqt*@Km=Pt9%w;X}NT)U<{>I z=m+o4CmJzF;TMgVtc)0r6pomro#KeGb9UO!fRs$g*h!)p@_wgmEl>kD9&223Y#VFy z%dvDU_VUnv&|9;e^Z4>Z2zB^Hu4#uH}bIIdIZ zD%M&Wvu~ztakMR%+he1#?XlGauCsQconVe{#SPBoImWk;Ul{NbYX!{VY5LnS{OfJ7*H(7puKY?GN1VC#0{ci+p`hIrB_X zRW2>h&0*}1U>f!+vn?0HwyBp*wZtTs8GM-2rtQpCm+SZ?xi*!QPOX?3b(PMwFtmP8 zI!n}rCFh?&2zLZd)<~L3BWWCE2aXqv>>Dc~DSiB@-iGds|FA3JeOXeY@xKBOY1|l> zM>I#QZE{()ja%_G3-<}k2bO1G8;@o^j}=oXD5=Bbo!6pyd7;dE~^JhVMJ8>y$p zrM}5UwGr;e@C_!G9Y(<%F(-{(#>&BHV(D*_b$w;SrK~eMORFqeY7n)InHIA(ztTc7 zj4T^%_WkBIw3Yaqw>a>U>wV|QfyJkl%fhwbpvd3HYnGx@Wc9tgWlvYn!EE<;3CLQd>Ih_B1Ji}Pu^Fp^dD?V%>)@QAwo`*711aSW;o)Z@08ZB53$ z-<(4=a#@qvE%LWfz4C-Z1G8WbLk68cZYOTBN|P%GIqE$w`sgEEVSo#z0TWl@X=EycNnRo6mvA4}J%mnS&O`{zji%rBZ zw$bOr&xrgmp4%zuO-8keIz6?t1eV7w>glKWWH7T9qZNnr6Ke(ht-4XHHtph~bEBxa z6t|QWdg_##cOQxyZ;+{|c%)ES_%L`eU<<-rdaXrNQgODWz}GFc zz%w-G-@M9;XIiT0T(363+#EvFfZCm{Hj9^>o-~(T$7VcBAU=`S9A+*LKAmhJ*VisP z&EmC|T4b8nz`+))#rJ>kuuR;Y+FEY9XnSYHsbEB)-oP)-94a1c1wx*Khl;@h=`y`z zQ=N_wmVJLu6#I)+HNV^-Y3_mzr?3QXqKXhbssbXLVijaHa&)-{Q4`{fUPm>#&xS>) zpyDsGL-v##E!Y>rW&f>$54enet81zVJ{p6zexrCvwW{_kL_4~+da@y-|BCMNrQTX^ zf=}y7R#Mn7cg6KNx?H{LNQq_cC_XC;qp%H!iUAdLsCXGd1J?M(68lf{P_Z^wl+5&` zyNGFP^T-&7yHT9%H)wly3buVz!A$VPv0|gT=(s)UyxpQj^+~Z)*pH={y|sD1)_`aV zEzl`fQJ1ABS+>VpRT2sq)m5ilePueX`}0#G(CV72W<*ACpnJNSnn5>fEU6kAZay5Q z8_s@3J(=6ZWeLj6ZaR(X^eKRfDqALeaG}N(-&j=NkNI8w031Kx-NXHXC_Ed-PhF|V zj0V{}A!YDqq-Az)~}2qz&~q;WvwK&xb25Kkhy_A>Sfg z3po83(%@RaQugrq2|pK-3+K*0*~6`pm81}ek-A(Q*D)m`1gd~PT>ynz0ZW1Wf6OB8%TyqK44JJ^bi-65Ak~* z->;F_ZsSsY0vA%XplP|F_g8cJ;d-@k$*!LKL%zn6TCtUTx@saK;7bqJTeH==dlDD= z6>vM&B}xhh=Bwy5ki&|UnVd1C5Rg1vZrPh=QITftWKaG0`pl_M!|yBf{|Pu3{?bwX zc^K78arL)w?GI}f9zta-xFUU3Yw({M=x}S0m5FFreJFSV^n1gxYaq3>IQL%qr zm~Sok8PtR$m+CMdWFYo!>iaD3vw;kuI_vpKIF-5POQt6JKa1x|NJ^nFNhb;Y`MbDq z?}r1f3;jK9m10J?WwbkI0Y$FJb9Z&o4u$=r zc<^O>1Bpf67}}%gr!K+q6HUnYm;9s_^!F5xgeV8E6z{{h!0?JaXRShnfC0K_ zssRob(A;1rzn?5+;D*1Ty%Qi<$@-uoz=kGnCstWWlt;j4EGB*wJ;hhPEK~ml5;XYd zY}zHoBOy+CZ3+2>@&W=obf~(6#qYp%P#|2k4i)t?_k@&cN*2zLniuVWTbyqd=Tr-1 zjI6nbB*o@(qXKHP0(s%GQ|YXkU`{(r%M=&0;H@Ix=m>x}dqW}@x$`s_u^(c6C!Om{ zpjiAAh&}QRkS4Gl)X(r?LJN*ZH^>hpj1-Ne!-cM^soCGD;*eTdy!t9`^-P~gqqiR= 
zWAO#}&Enhh84K>*pTxjlgIC{GhmP4JL(h|CJc#R zx!)E;y5Q^P3ig=2<%L9fP$XkR=p|eI;jsIk3N?Hy|5$4J9Aq@L!>l3Da#8E(=H4F# zEPYD<~ukSU*iWwj5KF6qXD7n_NhJ8iVpPkmf+nU_jCvt3cWdWlF4)#X#nInVrNy ztmNRE$1lDMe3BR60r(c+Tfi@57nMPqg2N%)4dWMI2mlct!xwQa#?>?42xLgOMyet% z#BcI5iJQhp4R4hG42^GYEQZxNB?8Zwos|`OEDFIqgduVeiH0GzY#1Wn6=K|*=%j!v z$gM!A3`mHAq_Wl9#@O*ixfTZP!CNfRq)-Gy(<_C?o0uV>FxG9j@{gq5ur3P&MMAJz zoQc|Kh7%Nc08BU_6~VPgmqS0eRzugqtR%e%`h%8oi}o_LxH6vwvo*L71Oq)!PK(bF z8~L?T@kzXkenR@Y=q@Xvps_w)t=Udr284H;r5;yDjwnRYP`a@4NGNZI9-`P-c0KTh zm?5u%pe0d$>4wZ-GHVqN%%c=DT`$Z$R9qPmo@a*hkc6V2IaS)6Yy`(;aA@Y}|Vt^dciT|ZzdgQMxCLmyRp44p=5mZ1VQJ&h~p`~PsP``)}>L4eJ zJkSuTC#jZ)1PiYKNGd3`Ued8bBt!JccGbAt>q$z7Qx2N8aFO*-l{zM4cdKq6wg#uP+BuX)>A zBg5J^LFgH_-cAJvx=2M z)6J`xuc65R1-**3K`Vk+2fSoE$Mr~5R&m_rd+r80ck#FxU(K!NR|ncTtO+1RG3R3q zl(i@mtbe(+Jb1;r$=H zhB3guXYl<{teLlB-jPokH-19<&c{|Ko{2S$Yq9pk+R^qHM~uY3HHd$l@h==}PqfGF zZT6(S{gwr~Z+z{zy@O#lNby&@Q*4i-{wx91?3{-D33xLuFGSAWQ2@S)cx$HHTPPsg z0qzbeInvu9@dHwWFqW(EgOvU1{iXXJ+<>ehfj!e%OqdX21PKPxsWfwE>{1aD`pZRo zs(AQhkvF)8#@Mj6?uAq2%ijX-=gyf6U=u}PT?ATgc@}szC~&GEhAvYDY*YF=>lA_E z62UU_^FXQi_+0S@=zkyv1JT``t3}Xe#p<%xIt&dlAy&YBL9jgwt^fp@5b=vgu7(ub zp(2Fu>nJsfZ)(*#$u4rQQTa4wrX#5qg^EOBfq^SQxWxGyqKqHFkH`RD#{3vt*%!^p zMU6XgAqQOMam5IxZJ^9md>szCj3thRqf=YOYjHyxpHgx$_Z zWKYFgl4TTnf(h$z+0lXl18{_4MR=X`Xz1A`iybuX+fonCP zssxZ_xdz~Ku(UyO8!iXU9^ry$d?6md;#TvDO~|-JF)JG8Sq~A3^ntlpTE0}T%@$8R z`8bvoDQkZjxvbj`fc~-o1_S^kp`x?K`VP&hUh7050KHk$!gx`XbYi-nIqI7%dl;WRh&F?-Fg`zn zL;}XA(8c(q)=a`R1^XQDVSzN`Ll#I7u)+cX(t)r8UMgUL0NuI;q;$Xng?E|-lDi%j zC=;+iB8me!t3`l$urvc*EKqn&)+W3r`%r+`j4V(K;fDeih`$WSU$Q`%9v0|RcxzZ# zppQX7NS0!BVXTh@+9E8_ux5e4@D$o;avOLaJ4@Wo+Bm>x6SreGfj0(wKp+*F8L&RW zBn_<&L$F6C39L_tN%AIJzvgXco-q}G&?alr`ZYIf44QD_}N<+Y=G9xlfqQyM@`@C(I_V z*=z0rd-D-{q)C~_{?)Bu;1e{yu(G z`qBU-G@15RFrnkOQmfnB+t-c>8+aV)--mBrvw_lD=w)m7x3?qR?ZOKF53qte@Wwk( zlAZR%>MpQ?543mLTgjLSE4bTxaP6J#-FTz8jxjUc(Qd{pUSi$1j zL+zcy3NrrfLHy&Zh=1X`+Qs&6dk43+2`jjJ?O_`!Zomqf?%D1X+uPea0#>j%t^PG9 z&EKN)+jRa79pZWQZi?UG)tBi|*Q)u8e~T->5GYS7T6wbalU&4RmjQ7C=%;^;33)TS z1so8C>Hy_0wSeQ=OapUB^J-XgvBWjNY!c-)H&>gjG5Q*DQA8I(G7czFk0^b4X z=8E8oHN)D3#!Fv9JsU+|Dt-vm47DD7vv{dGyD0JM3?rOxlpxmrItuBJQbgEWJFfco zc%uG*&VQiuAL;yOIA_!!^76mXVJ%IaP=Czpm*H^J!IBby!<+Jd>c8;~k)D2)R}ssn z{)D%iv3u%I`ScW>_t5!OI>PF04+xH+hq^)k7KYMK0V|F!rPLNs6e6fszl1ma0x{$P z|3l7^gtCPX^_9v?8Aq6MgCHKlg?kzd4(u zQ)G5jhEA4Fj!vEqDLL(&<<%UV--o};E*tWg*#tPtx8gG6-CXA0IQUG{0*?sJ&@^AS zlA2r1CWg#0vml&nK3>4pK69V36`HzM zylRrZn^$Y&@||NX{=p&kXDr$fZCNAYc&zk35%L1*r@Oik9l+2qNWpg6eZ&LVEa3|xHyWtm>-F!8F9jk(Vwg%-SHT4ct5wA3H1xolAq?sd%NfC6 zwTnzHQ{=tw;6L>J&DiF?FG z3!NoQA9{)?y5Jq*NoFv7Yp#a<-CfME4idn@S+r9uZL>ZyLVVhCkrd|}Fg_<-H% zSomU3Ky1Z5+#6U65f~vq!4!@H3z38U1?$r7jGG{EPl!P3T5%x?aiVCtLG2qv$;#sk z)CZt6A9(+VFMyyz!6E4<9OLGBBj3vLNaiwCodZiMaU5)b3G{g z4Knm0sOQlNOaoLLw!3s%F#3E~46uL@0UofVfD2$j1{@8_+S~DV4zdRm{_zd`l=8OT#iv4MIH10xA+xAQ|NVSZCWHHUb(BtD2R{tIKVt`f z7}yI7LBnn}w?g|ujSKWGaR7RtpCKp*o$?LRvg4qe&A9Hagalc4R_@xgqjpUj zL=s`DA+b7r278PzBh{|miv{K|Qlh9q-o_bZh*vpY4fBfBzv9?b%>U--MByUIeMd?c(8KE$ zE*e4rx)&=vBuQ9nNv%K?N-`L`>bUOO)g@Y(kd)6q`s67pQvzV`TPOz)z z>i4jspibKUuT)h4WTbWIz*l3krGy@OO}Z;sF4o!*dI zJ}I)#K!YkBD`MmnS;P}?;vR##> z4ZG7bk{_~qn(`(YJ+;3hoYWX&S;zir#Hk`W{Y7TxSLsk{+ArJu75WKnVu`>_z$$=LvEE2p;nL3E>0n`M5lr+pSXmH6G@XhiU(f9?30F$o_ zWeG6JnkzduTzyNyTBU^88Q74BMWSf7R(KP^t zq{sh4@lBdfkg?$p&xLmD0;%+IAf`Y|%p%&JPek$jb$EARBTn2l9V^(Y#2u>nGC(8p zm&_#XAyPV^*3%AlE-jGq$;g!kjif>4h%fwe@aImYeDFIU&v?f$d48!m#oi>i*q>Ce z2LUwc07{1aaC_dP#11;@ERC*p0Q~EA_pp*tf!at~_wF8Abg7sg7W@S$hPmwDq)!5P zX9Lz8UogG8qCU7HFxV*G9A9zTEKsZ(bwr!HK0 zZ2H3EPfS-%pFMT{{A1_+G16T9lbN5Ps!N{8HbPiQt1pQOB!ICRZiQMR8piSmg+bSR 
z1wH+(5CwISah)gX*N~BokQqPBj+Mv5Ec*Wlde#VFSSve%-rilmZyTMlp6FuyB)T}L z-5-Lp9-E#)?XLp7pm%kO<*Z&ZEPg8qW)$*lkTz%&z!ZRGkvBAigF8Ysgv01OpQEBW zp-zAh$;=tRf9Kjs^(ZCI!Cg)>fE^X=2@aWLM-8UV!b8DmYD}G@L&OqV_=6}!AArAq zL7^(fPeZ0#v$4eb!%>@7LImD0w2>~xVi)K?rC0BMBV95>{3x61e#9E;K+L#Rfc_!i z&Gd|rzmrZ|5L>W0p>2SMLcZfG+c#>`p6P2Hi*|s8l!<>+K24@C@@anV42_iq$TwN+ z7Wul&>d7#{qK)EG>a@#?a8{e#+#jC;#grG1I zS3Q`pXJ`7%fX=|c!|BUTpf?qRbh42PtF$jSj1pfY06xVt(eL=drCy$X_q`)2{F4T%%2XNWYxFtf0vmW2+I zO<|I%V70D$MUgqx!4OAb`#wU!EW9+Az!F5?m!5KuhuY{-+I zmM!r!7yE>{nh4jSm${1++@|gW+9(Th#e8$Gy_hA8%o0hf0?r=d&e?SO6+-dmYV$?H zWFtacYOEN1*!&W0h(!cT+nVM6d~RdJp4EszJ*4r6yzj|l*l#f}FmO-o&(3_Lub-Dl zbQ9ztuyet~QL7*X|L%S}V*VGttR`k~;=bq*&6z}9Vg^wkL{_?{ zaL(TTfVSfq^ym~;0Cy|=(@=WT$R>>Ly^#kMK{&Vdx@E519a7GH4OZ%jXBu`b`Xq$g znOa_t25sQeAEy1mz#=nx`CRR~zdOQ-5Ozd#QU#da*~O(6VEh=X{xImJiVFi=0IW)> zTz`ATU92r}*>z!fkcJQ|VvCt8fj?e}VvGo87<%>#9e+~I{+7C!dvi2;P-3u=wF!5u;V!} zQ5EiO;qIScUvk{$Jh*I<8-u+T;MeJM|8k_#@gO?gT0Bm^c;5}Q9OFJRfDk9Q7A!!i&K;| zXW||eFA5;$l3|E}FN-1qd!C%SGs_hTNV1t{j~*%>yEyZZrZ>VWI2-2~fpnjP=x5d zyYK!7j!j8tBV4s7wOW(ILX&%eej%EsKqPt2{Efa!VPBVVf`=NULsFdL+qdfNo4wWP z?_n$E+iSwCB5K2{wxm zATnR{)VE%)OE4=HrHKl?IMY|jov|Kc>qhJiK%6e3$bq5izu2Va=)vmEQgD7^a&T!u;E3| z+1=zq8)+CyaVF}VqI`GJ8-xjy5g|4TIjM39Js#R#Mch3}YCDO&R8|i}UDHni!F5QF zAi;H2utJNW5Kv)38)8Zk7Jjh_NkO=4t?tZMXK(Z*Dm4<+Uw_3$O(^u3=Gb)S1RINf zg^>}wse2mIoI~q_mzOC(wrMw8>pP9mJJi5(T|!6c$&k)tZ|osG7<+MPjdYQh-v_t?(1i0)&2tLO2qyo>;(EtSqi0%ooX$tPOYXD;*uY-Rs%7RwP&O+MI^P#YYANQBxgkm_cNl5LUDd?j>yBCPw zXzVo&5ghCiI+>eBX59w{Qh;h&$U(E0$M1mX_IO1t-vImyxE`cN&n`#^T*G2v-WkI0 zQ1Ck({Eh^_qt381;*5$85awQJqYwwGjR_)Z3q*kv{y->MlDb&=U!)nj5C@uO;-aES zBu;B+=Yt^`K+|h_veeCfFxYxkDn3RUSv*Piv(`7zo`A2B6nh0VI`W4?QR-+4AY)qWkdvziRqs0<6+iDnYD2^^T^P} z6wZ{2A#27>^GD1pucwa!xr>FQG-M!$Qt=cMBX#M*Xs@-p5#`rCdzK5|!)t#$reHd4?06*CJmN z5g^rQ4h5#HP}r>y$iX=F1dRi0nFh`hK!fp~P94=qPvam$!YTyn`!yDj0_ywOIFJu>wN_=bSmB4WC+}Ha*NUUODUs^~8QrA0H0lw66BECob~^Agp{T}z zBar`3wv&Nh97jiF#BX0a6Zl!cY2(~4gu5S@X!-T9ZAG6Z8t~b;8lbfRm_GfVgS@RG4@1~%o>&+HLU4G27{Xk80Eq2j;QwI1 z;Fg8su;OcDKrI?;TX4Kh4rasR%{2UiyR}hZ5`hwzcs~c#{kUK!69K00i^%naod(M9 zi^$^_=}U~djYBn{HQ5Sl1>1le%wjv+P4Fcl;3R^kPob-NSAS+gNdOE0#;8Pya;~B=gQeG#zz_JdbWmt~DrqE!k z2m->X9E?tv28pz+KEk0Z7Jw;TfGSI+GqTG^J%E`;Z_YoBtCg=Z-z4uz0XM*Df%E_^ z9Z2b+Y{lt?V3xGjTXfNqflw6(%!qShzv6!K{Q+-Uy%E{GU2ej{wHDb2GFr$`NjuBF zk*@8G;jVuWJ;Ah2$$yvGTQnHiSkd-Sxl-!RK(TnDxSxhz_RoZSu!4Y68nlx%X{o3l zMUg=S$o3#EI5?E<41UC}Wj`z?vTV?^qHzhqWEyk%<>sRABe{_w#z&?>C}uy1CGQF* zf^cDxgiN)-Dc-o+zo=v$X5C(2QYUwU!V=pv=JS)Wl>@!&8)}Up)Gm4>d<|!O4%fM_ zxtj}6Tu`)C_iTr^ncAnRoi8EZ>Lx>oMXy2fkKK!jfHHrGfvz&p@AK*)onNB!J@m`* zDh~%}J*x7xq$RNVDY#F9Iy;+Cd{`oD&Z|$+Q*fGG`!%8RF#X;`r$C4MgLMme2;%Rk zgq^)tT`K3t@=QOi7*op(Jcx5jyK{nLJ>}F-+7=3}n|&V>?zFm8)JN&Zy3lscq-tKE zA2++;xH5eh9rx>FQHuHF6w-9SP}sFEX5h+0tjO;41=(62lA{r{6-5~;x?zUVrF$C9 z#X+L*ittyiKZ2gYBj*G>s4LeW2ycbh^9ST7FxLmeYx0JO)B1XvK(n~v*!`C%=-2@4 z;=VB!A2Wxr`J6g&v6*Y;LFDP}=GbBbSQa*jLqiUb77(aI(WTx^96{k?Rypkn;3AqevE z=_jAQaK7^B6Q==+e0=)7x-$XhUjwm}MJ-{{Xn? z!JCFNM84SGa?8w&um7_juqPbYf3mkaTkty${(75Sjp1q%SA4ovIPvY`k868J6k;c? 
zw!vpt@Vh(sEe5}PoJnW9Gb}v$xV_iDi#$2F@*SCR`$2o3eK!oi?R0k8_t^Vi0Ig8o z4M~Vd73>4{)C<^8Sl;98${e)ci9_4>;>o0AIeQtKeaP8{lMW8sCE@b#VyMg!9Jane zIQuBxN!!P|!y?Ax&b}bDC9$RL`|SHAw!5R)`oc&|SseHtgcI4qJ)mbEiV()f&vcG` zzEh}cW9=mket8Rq{yB6E&Crv!<=&zIR4i~~a7p?o9$<_>6DH+29#ahFr}ge393lXe z&_Z5lMwmt>^R|5U_BQUr{AfeF5Z~^Rb8OdqH`}Y!{e`c_BL&L0Jo+Y z;#5#3DGUKc;k=05w-hl2f&EbrFX9|CTc3!;?G-pTQW9a*gAK@>0xph+zIO4yT&O$z z?-Z9~LSSe^16CR_6{W?zIPW9iad)+E&I4nRYewu4E@Nqz@J94V0D8}q)bu4uxnZ3k@ewS zA-%B`W-7akd35?wsVnq4W;!Yp>xgC2c}Eut@~-Xl#noq}?&&bE0E`L0FRN`|?f@qT zD2l3}6U-auR^%+&(a_byOHs%k>J0IB%KC5Vhp6Tb{}LO77stYM95NIeCb(Q@0WX23J3inN#HK zMSOZ?^7PS-5UE!GiorHa;jA>XFyb0V60*q}|AQPf=a-4Bkad+0i611gTx&A}gjFEj zAha^T<3Y;M3$0^nffA!5>Aolkn8U}89u+NNkHVCof9N)w^MoE~Jf6rKLPaXZlc+BF zHtgp@ZV|vbqY7n`{2b7Qfss85YFGejNJK#&foJ9*ir`a`76gYg1=ZhOq>YNug&KG& zJVr9g!(;k9)-G(~%*+h-He7BsqcA5=p44`M0q%;9f#QY@8awO(k{$|WB3%-?UyF*aFUk^C`6Zak6 zq}EE>YK)$;4}_611=xoqrdzfm0PU{(p5T-YVV?)7JRE}gA zH5`3dU0U-nhTV}&sTaA{4)=?sKF&2z_yM6-sBP578AWU)Ji)*+#;iFYv)e5+5;4_yIn>#AttsSHB3y&%2jvuqCZa5#7mD>Ze)4${dfQ)az{@ z$smG&r?76j-;GGe4rJT9GM6E2jLVDE#F8D4rtJIGSp4Lx z(Pp_GHU!feWredyFZyt)rqLZ)u1V(xiif6$D-UW?wMC!Q0%ec65Ql0=%o(hvn=pS% zV?6xAa0r%A3ulS1LYV|rk(g4(Hj+4=V8I0&D8Df8fUPB36%svz(Vi~DH%4<9%X}Cf zB9tgRV%AGFWU6VAsWyXjnlS4VyfFdeK1fBLFnl}R6*ts1Yh3hWSlTneyPp9mLe!#2cj76+7(`QsMP^slk}~Jd8kCHDfi~ zw4gvTcm+cbSr9N#!yq4N=(gi9KLJ%0;?9Bu$WCifd?aDk^5+N9JnEzDZ_;boIdPU# zq#o40m3?_C*Cm79$-W7S)WOUr3#K+P+L@!Yp{zK_LiGto{upAQ!74)GE7%c+4L7tm zib#bole9%BD#J$d!!c3L3u6{ceB6u!5ojp)WNO`yCD$Qet|!pG5#jF)vDo@CEss`$ zX%ai$euiV^-AFbK<3Bvd3%hh6HQbTh3u{+6gaOA{V*f9sl|$GIjGa8j4r2&1!ZeLs zVuXNQH>}rSCMdmfY$ML)P1EjS6v*W!@fDeW2^Szb&2|i`vmvMjO)u6tp`abtY9~=@ znOCefUE=|GnN4i-!aS}%sf zL;XCcA?G^f#L%tib3!;7B82D|w3SoY{l>FZE;-c()}8CR0nQ?z`T@lB71l6IXFk6- zw(^dRY6&_4h*5wBm8RWD^5Uq>jBvOr+b zlfeJO9Fbq<+{dFc(lGPX(W!S+HN7wPG#DF-s{&b0cL@$AI7245tVhdW5Vf$P7c8}l z8fy=Bb8Mi9ZkaIg$)hoGEGE%J&}EvKSgwQ*sc9KiId^QmyfY{iqmqgbpQA+Fph%t@rSjB020#405YC=ey{5@mX+( zpJj-D%t+X=yL#2XhTosE4H*Yb%qC^6;_)e3r;xs5W;%DtSI5&`2D-YWPJIcV@hv&! z1-sB&po5?*y*^vVmMffh$i@yo?90f3tZ-lBO-J0qjG~kNHZv{R|2MpnmAd*sISW5T*U&s}4qw+tVK(Zk z9*%Ks0}X*wma&|)ad@=5!{csG*c#%YuEY~~X>h7Z91NDSacEu}7>U+V4~LyE@Hjh* zHkoeXD0CXtC2Hb!Or69cchVcg-4ISwkj2Xzv{SrlXCH-?p^HBSTS$E|v~!E5`*a&t z_}xDsnnFEG9K~wd8^hwA1_@zf^lVwpgReIh#=*H83lqGWq%9}p9r@Y1hJ#ma@Z7fz z_1kt}#h{7a&ID=|2me;(Ug#rz64B z-wK|N>Zi>t503%PAhEWG?Z8x5Q{LS`)@yTwa%CH2a@GCZ-Y%yy zlwnPZwWjNZp~~H%5DN~LgKskkr)yVyI^lrwnC!{-{Kp{%w|D*|j?ME9$uhc2T69z| zqri%cAHe}@?~y$J{<`>6ZyEo;+bDiLvXJ^$BoAM~{X3Y4E04Bm zy>V|fiAg6~7|wFTy{yq1Yb!>mASA$XRx{XxR17{BAe{hZ1i-hfA6PATOd9|hn&t1D z0!J~1pi$&ZAqgHuGIM!(ejYLvnD++~B=U~do=!GO#pxF8#>!S7iID(ioxx60!8F_l zjfO8kBLLb1J%a24mdXgiV;&&mSnQ;{lA3GP39??v2}e4|bID56zV+0p>G%7_QB59` zdz7UT_bE7n8@(HdHSVuRPs3rP7$xZS1{jU~;XTPI&vl3lZ!$&&Ic>z_oUzxE*2=p#BH=b$mAZ(z2=2PFT{=|>Y($qp zK2Ra93~0tS)F`q{E5rH)!b=1Ho+>7t3wi(&QRJDeDOD_38Pd5}0-$INs>--yv0e3H zfVNAaUqH`_iGY4ugH(_M;r&YcLww! z$HA>cu`)EzVTBR15WZA^%%|)A&Jb;~bo;3PinKSZU3TEFk&+03&zRiMeQyUft84B* zGUow*kOb3*oh49q`~iIX`q$5+QdNWgA3>Drk9qYnoT&#i&-o>K9N?pW#j7b?`2zuo zsVN*k>0Adf>*}*1_A=-<7|Lw9e#&WHt*KU1{R&=2qad7LfF`$NFv3!~z@%J{*Jy0}H-IYFQ4h@Dm&quAE=t zk4CX*DxHiH3jiUa?I&<1Vgn~ka)f}Q#@zcFqdrCF7@hx{X_9#cLl&2Am1S}Gy1sc1Q9S64Qk9Xb4?oJ|I z+5y4g9FVUO;grQ4ujyLP1LI_jk0xPs7;lKVVOV%d0SJ#AZH2|XeqW^U{1Bd+&s&Dd zVCI|cCiFQAqU^$(sQ(jJqL9_Kl#t;44ZI<90?SN1@lpXL2E0S3MSh2M%Gnn3jzb$Q z1Dn0rBO)YXoZJXkM)#|Tj@D)cVVxl7j5QBus;OZdnqe*gO|zN=f*U$FUU9nzXfPU~ z13w+{4dAPV^GGbl!BOMjtg@T8&IPV=ioY(ICCZ^Wq?8Hbwr+uIHW}N)IICU#FprlOs_qP? 
z*&TKP+}MxF)gYF$&&eC$R=VE!T<482ZP17;?@s(ah=Z+BavNsi?BP#aHjcf92$ff( zp8@V2*!MyW=iH$tc9F}uxCQhHk2V=z+kHFc0WrO>=cX}6EhY7hL&zfz;)eOS!j<0u zCj@J__87R0Q5-#;c>#9HkrrA`_?9u$(p~LQNuBliN_z|#?G*0!frp7pO$`RE^FWx7 z_LvC1cuL$FwY3J;TcA;hv+!B?F`dI2FJ zSXf*O6~QU5kCb?wvr<5xS^3wABu&1)F`2sA{In67N;#B`1vv*I3g~)$7;s`UDg#k~gX1T#SI(&` zlFD*!0!dHMR|7#8YF%0&xs^sHbd*BVS?rYo$;eZ(UA;zsld-0lNCA#NsQYfjnFD4n zV__qLEzP$E=xn7kNJk>c1+_>@Q;pDj6b`7W#U-yID8;Civ73*mMWM`c0dLF0&nY6N zRzM1t>s&)#4D!131nfdwxejZhp9l)X3pv)+h=N}~V zuba@z?7$V#;lPet$=57MeEAfk{Ix8s6q}>8ux43s0R}gp2hu#a%8#ebT?l7^Vk2e7 zErnVxoCG3WZ!*l|7e|l@jYc^#jxDoz^qC2pZPx3cvT$T~UfwsKH*f8FHGhNmFJ(j2 z_cRnGxSlf3Oqpjy1`A8qm)ZU`Ity_8kqV%qc4*~PhK6Qs1Q!0c+0H*jXN8Uc#D9iY ze~-@3(fN5g1dXd7<<&M`abWv{mD5j5pLzVfl`~H~MF9D?2OXbdJpUXqLd#5ppR?M$ zr}_do{J4voPPGT|S= z_UjSoXxOiBgh5ZkNhWReUl!lCA2Z(mC=W?Fix6F9{b@T`r z#?KrOoMLKY8DRZ|U4ybLW;MH{Rk`#&ETp*!6Z$IL~V zjnD>}xGzT|Rfv%ic8l^cTbmy3O&fR^;FGwd0gwUfqA)&6ShGF?{D4!MFBRYW zr|^1!K01$`VL(99q6iY~jZUmvqAEIBfb|TVHmc7JNA_ApnsX8AB(z^d86&Bc&O2gu zr7HCb0?5`XW-!TO3k>$DL>9?C{FLl#y*bhXi!jgyvCTw2(N+!6c5x)Rg&E4m^K3n##wQZcH#S-ll-ei^DI=2a5y z$rg}dmtlQnCeT!in8?-WGpIFb3aQJgfewW#qSc^yeFgR zcSU?=F_{|Dv(rqH4G z%)qR9T1+AE=ok0^^^wCB44%_e0`6ccL0`jILKp%j6C50E4KoS)cPRKB4t__RL2d;J zwqkIL2w0Hvh-_g1P;zSA4~OffL#ju?i*equMeuIibZ=dNf&kiiZ7E&sqGRiVKAZ_) zPZQA4VB-St>KU>Rg0YX|7&cEa=BjW-Si#8jmybLNw{Tk!HrflIhv10pz2M_z4?X$3a9af7_lbcgoMD^M{#%&%eD4 z5H#JNfz4sd>I+PABPiFVm}O?%y$3}Iu*k^>i%k6|ItmT!x8pdgy%&p|;|V#?MB;=P zAPMr&$qs;nCHnVXbUaI*6>u(jBD@xaF;E5o9-SmY87Ie_1fl=~K=yzWP{n7^zO4>6 z?+W6&?4{Otuqe^V(A=`4Vn2Km`aBD8C1@5UEu0@eE_Z9#(Z0aE41$M7JnaE5wJ-n< z9lpRE;{-7dz11O`C%rHhdgK8vaMi=j zd+FCs$~Did;k%6aKY3#z0vr4BnbX1BCf>s}G&6jYdyN>H`{e)38xKS4xY}7rl2Yv< z;Iebjg3Fm~g4iBrn?Z_@;GXv`%=QF!ix1)aA)ZUW4c|%brpFYOuiio*lZlv<@wbi8 zo_aHLew(7PLn~Z;3;PjRHizLV;vY7~WkckEWls#Gt^;HTVC>G^hLdDz^bm_}$Ld@( z@fA;;C?3N?2dbebNiN?Ji2A@eU{(*G$6@7w^AZOM$aS!Ev$FV^S)c4l9dRGc62i1Q zk)IaJgpbkO*3yk&F=Uj}I8yQq=byhpN8m>v-E1C!zdoaW<-}>d*u(6WjlJ|j3gEc( z6%Y={5YN;Fwb8Tov#0@`qd~3UkO{Ve1U;$~e>)A4)v@)LXi#kdIwC3k$;NWst8Ms9 z{Q|N!wOwOUKF;i4pmPEH<#>RNgPX8ZJv^nYz zuvHI7W|NDg&5Z$QHUoxoOX{bT&(N5QMU z8tn?(d<8}E3BXkkpapRFXfNBT&l2qLlpX$&ZLCifjPq)c6@VBv-d`-O#;Vwy*ZVHK ze1#Q7mOdTJ52rI5?=Jqfc)R@A|4@vX@+ZVQ<*5Q2H3b@&wjR`oD4u7BlqOZ{-jd?> zZ$I0y5--Q%%6Yl|M8~&u3HA%2UlL0N`XtfajJrsO0y)SKvE8PMZ59wWQ4ggLpitZS zEK9l(0~n{9=^lTOue}qE?+@#?qPkbrF5ygZ+s}2~XbGbT?_S|MloUeFp_QQ0a(m(J z4>TOe>!3X@@dR>p7lZEOgQQbywjiFn8#m>G?cl^A+NCW~x2x`9u>E`zY%0dvvZ?q0 z{f3xI%_(UOY6&NJOwoHd@J4Y;x=7)cMXPN=i9J z2I`LlAv9Bu?IAECdYrG0_CC-t+{gFsr}GXv570RW$4{Qq=}*(APUn+!{wAFtr6aS3 z2wcCw+h3qFf(n&0w$yuh^%$J;L}=#V{Do6bUFfoMaGpU#o+WG#g)1`NyQunzwD;2w z>-wNtAm;upIM%2>pp0j=j)(w{lj;yeb=b^c6peg38RzwwnT3IgycnP01Wk1;zL&3->9U!iy<6`T9u8de$74>Hk|JmNjmk X(br=ZLVY*G^3Qw%zIpR&<3s-+!t&0O diff --git a/tests/models/mistral2/__pycache__/modeltp.cpython-310.pyc b/tests/models/mistral2/__pycache__/modeltp.cpython-310.pyc deleted file mode 100644 index f7c6a28cecdfc2502d5bbb914f4ff15a9b802990..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 52277 zcmd753wRvYeIL3rJNv`}SiC_HBv%wA5)=uNdfKucmMPJeO+m6q+wn^BVu=}m3+w|t zGayA|5zD4rD-R})+dOV#e+6SFvD`FinlDY_=H@!jo5a0MZf??>Ow+V!+M76a>f}1H zBT6#w@Ap45vpWkwlH<1DU0~0iIdk66|NFH1`|}BWe(fE%E&hwEM&eKT(EUl{;jQ?( zr;>?;ohT=4!#3+i*^pneY|3x4oWyUkZZ%TnR3lwZH!|f+BU{cII)1L4lX&??pP4Wc zjY7FUcrzexta`4|SMHOrRDG~fEEgL?<)OxKd04{I^^wMCc~tzF`dDMUJl>cnPsnq& zKH1n+-qn~YPswwxzPmA9o^I?Z?`iBU@0GB8eP5$gE;Vi{-z3j{_5F>T%QrU;ln*p+ zDc{n#wR~&iVEJI$8nx zQSL%Q~6Ee-&KFS@kIHF#=Fb!Zai6j zQo^R{XBubAXB+P+zo+ru@_QTaE5A>|ch|qA@&5Ar#XnsyH=Zg#)p)x6bmN)wGmQ_F 
zKY;K(_TJ|c<;vR=_CCAxe8Mg{earJ#lI1EwZnF0yWPcEH&c4|`fVd0x!}cxqt-rjFNfVU6m-0=PaQaotC6=836-jCWS#C<5p>23B&gr9U; zl9OwjXA(1ytWT7lxO}#y7S5L*tGS-4)=Q0+?bK(#$fY+oWBJx;g!|dYSG`)RS*`o2 zvumq$$4|YdiRT%^H=E6wq(AhwlaIWw=AA#?TB>>O>6YvI#ghs@k2F2iTD>eGna9pN zLJv}5Ow> z&1o(;C+k($tu59Z6_0^h4nCorv#MHaI<{o(58B@4Ri|?9vgcIX+PX87_D8F>U2(mt z@+x*~0exI+F1h|K+l5xVRz

6~9zfw!f?9Ru)nIc{aXTxxG@oSgqBo=P&{{$6qZe zr|LPXf)=CR=&jo;Cyx7>dTVJ31^T7JM zTbHDB)NWmJ-{Z=5lw&_!Tkufs0EWFV@CQJjP98~Bp`KEzsUG@l5{J&p0K zDo&8rn)kL^)%8bvB0TmEr)vAtPqr@YmmT{(f#0kSh==gr-F1zJAKXEu-azZY9|+y*po7IU%Y|Oi>9BnYYjgoE9w-+D~qo4 zxcMlVv8I|rCA2^9C*3u~^|+Jysan&!<90vOY&F*%)$%i#x7<4XWTX15Z`C}f;Txy? zBxa?bURY}&=}gKW=-FP1u5$rX+(my^&Q(3+o59qrEjX?}f_y6ab?0qEg9$CT4(dCt-Et%h@~e&KShdF*8C^y%6o$2`aN++)(q_|aSM?Z;+kgZ~AYwp-py zd~bDj^|C)$S#rFJURrk5t8zr#BAlWzZWN3u!!%99G^|(EV+dTpH}n=;vhNuVn8ft|T^%YXG-Ab2I5#o2j;O4S(%KC5?ekR5G@K?9KK2^QGVuFQ-&< zOW4$E7poW$ujI8#XYZ0#&PqgS=4P^f%3WK!S<-`=d;yn zViePtoZ8ZP&o5#xt5vblSDaP1R>!H9^exw^FZzAwYqpJDjypM)>#F-Aww_95fd>Ne z<`}Efblwfe#@A(Wd286n8|xGCHq3@qS}+kO{AA(b_awgVkHcvj%X-lmp1GX7W^5Q( zK$eZ(T266bxUe>oZM@w~x3S!oGZTqx#%KcZ63bb{HQJ^;TWxDIhlQ8Nw-1XiWvAMy znz51g3U=DgTrt0=xRLStmj~MEcE--yxvNPd@jkS|!rvA6o7ocy_t8s+fpIeu@5eV@ z0!A!9_eC~F-2>+=R%m9$aaOTt*N0?uN*tb&J~Y>dsuk66E7(zSCd@9ot)_2W`XZ+x zIyKGnqv_2Is(TUOn^?y(P*}OGD%WyywROq28cwz8r<8j^dFnkZ_$*_mZO6mj=qDFe z*Zj2CsuN(bTB}&_mXu@JYmL>Jtl|>!`@=Ei+2Z^A!BuA?+}98Ma`aTI?=qnOMZ1#{S#zEMoBkN3=vAlZze z9!H`V)11TriJVHs&sPW*tkvl+R4P2)^&6RZN=dtddAZ=!>*3s#O1&R%^(>u(ZvkJK zrP;)QQOv3^48fgGv^g9{ShP9hWpm)Bz|8@3h?|4m$GHPAIwjjg8ndF{M@?%1?793% z{K>(Hu+#tda7Y!GIY#7hN)7dcE+$2odB$Ax&6)XizL(5Aor)_K< zID3-tXRs`+%`B#Ru5EE+SvGekMiRR*?cvXJ>a)U};&|?3e#?dBelN8-fYU0=a65sj z=jN7lrT*$cIat(Fczn^&voc$2UaTxC=fWw>*gUow7ZVM$_Qj;1!C_Fr-o&|h5l4h1 zz-)d3$@Lh$89sl4=f-NwtqBn8g#F_B);R#DddjD%P;D0ZAx0nK0bx6f)wMc)375{KWeq4+Mtzvhl`ytW zCvzp5%2M;^5W^jYlQ+_4+DIG6F_Eq0fRVqkzAv1@@p0~Kb>T!pa)BK<>ax|X!y{vx zARK_vNVHAD54LeNxoP1!g{{x>3~X}x$=o2skZEHi7XfeBNo<2@#LL=NJFPdmHnvL} z;~b1~*3RG!w!Y1L_Zu1KEHDr0BpE2#zc9)&MtOFpz*_bxwSj8-`KWVk%y}M>m*9h1DZt6GZ{T5&oMd2O;0w%!_Z@{ z5ln2H>)B(}3yAmK___r+X=BNJKL^r#^^W)wjYyi?7npMs!-NzLSrf=jyxQ z%p~=8Z9J=j$?-NjJ128owlnn|s0KislzYC4O^oNP`j`Z%@5hgCK6?V++q+uL z2K#+VN~^H2Pa{AMkts~NQA7O@o)@@_haYwdn=aFd$uWVfU5U*^o9o)dPTqOKuHH6B zu@AHpSB!RYG%=CD%EpwFFe3`XYHnv_G8xq-`t+gIRbX~*iM^mMm!1f=)lzia5Kd&T z0E@@_YSS*QIG0Nr8FOdz0yjRSmfU+I2=YSd(7d1!vu6e6oS(V3bT-OCr<{#quj~-J zQ%WZ;mi!K}@doH-3COJ%jv0FzHIB}QPBw_TLpI+)CeZ;@v$v^K)I37sYU5vf6?o_ZM&~HEs=Z};Q zw*peu;UlHsbaYwXiJ8ts2-|)zmc_wRRV}SGh=RDFHjpHQl;|R4kFEe=s#FE~fEt~z zfuM#wqvYr&_o=W66?FVFoRG10V+4C*xSYRL;5+9rZ*@*^A#s0274#zKa z&v3s#iqFCEGZ!kdqXA3Qp57B?w2B@5az*cc1d|!+Ad(;^iR6nU4GbU81N-1~L`VyF}X z&{t9}3;=~)1@JkNW3IeTBd+Bj!dANT_%V@s{X`YoM~h=BLm z`|7QQYTbPR9-RV+j&+XY(Eg<=rVMD!g3J?#2CW73mGdoo#~P|1VP&G)sUO{1H}zu( z`wtBN44kvybXNaQnAM9(^;!7+A&u;VaE{|vNX+WT5kAx3LDWDH6#BP%UyuN>d&9A7 z0JJpC`Zk7V)>grMXf69W^n@c`b(Al1plvtxGa*wyACM5L3!a~bQ(0`jXlj7}C-Giw zgK+&8!reRJ0O7)H&l+aRGAxktOg_JCW_ErmX4gly8Ohk(4w3DXEQU}lNVfa}{MrPB zujz7|nAxNv39u%F+zt^JP-+r%B+JWio`a4w1k@}GqOJub8`ObbI#L_ZDkw>J#VAPv z!OpGcvGTeIM2-4tH-Sh!gX*ad(5b+g$*Z5H|7Ym@3pzhVXORxkc6FXkjm|Qi6*wWr zE0AqEn6LRxIck+LFVN{g`2ye*{Qs}`3hrBqNH_T#RgWKwP;NY*4z@sm z`8S!F?)`#Myh$-k)#3Q5Cb%-Iep<6;wu}~cA{j0A1on!=)7A!PfEyNw8`y1DOx4eo z5Bvd$@kfXp2X6QaJR$|)n^+vimE&|RwZSnbVFe+h2??`^r>*~<9Ajq*oZ+7*Ll@|(TK~v4^#N6wG@fjDX!lGRa;2nKRxZATob;)c%3|#vCzA9_2wT86 z_8ASHT<&<`sbw2Dt;X%7W7(FIa?-mJiy*>-Fc0F!^Zn(FopG?Ac+s z=5#)1%3mQ@uqW((i8Fxohzm?}P<%K~ARdr%3iusz`t4zSN1Or131i|{o_8fsE^ySv z0ejRQdp=bj6bfGvvt-gQ2DAU6kR`kRucV_N1FflclpO@HG#VT|+6SY^rAOV;L-l&; z{jIgqoa5LB0qg^+oh?1qvI$iS{0}^!t`xfiDn6ID<_ed4gZtbahg9zXopR6gOi(C+svp=g)4TjYQDES)90ApaJqQ zl5NvYtR>VG`5S`Crn5%rucJ%O~C zCqyMm?wS&00Ehj>n(g#dK+wI40#7&9hDkut_L(18{|d6=EN_~ zU6$=jHm%a3CA4C`E5ZD|rS)M!-{!duNi2q$GqXE`fNvDQI))i60M`0#y^HuYU_bRD zR>(IEk=-4Z{Y~d||9`if9tm?F({jtJv3yE|~9>=5c#5JMg2UQvK}MiZl60gK;J zWrhnSQNTtyJfhuCkDs55hEjc=(SMQIjK+_dCDmG6b)^P=o|t|G2e4Q5a{yVj5R5<6 
z$=5~`>T+e9=1rvy(eW5BR(WYbyv>Tg!UhrxZ$<8Ateb;HbA@OJ~SJ3eKZCW@IQsB0O8 zSJ!v#I90*%6Iz{mj_paFbJrJ$3xyy033Q#r*Db*Ti!=eChdc)otj%lICXp}mnt-{Q zR6T`wLMw$>fOWCvTvETzkk7;Ma}bhhg5KpiK3E0Vtd`bVYwnDtJMVWH?>RdE7o9jv z>8C3KZ_Ke1iFuVeT;k|#oYiqoX^`X;bNMH5C=~kvj^IP1c!Q&5MgG2YYYX!0kmASQ($p?9HkC{ zMb29Bmo?&Vf4i_eYy(uu{er1L&(v=MJ} z0xn`*Gw)%##PVdjzg@&pXFhFi3~dbK2t+yUq4sb)c_oD~^i7bvkFX)T_z81k1o=!M z7E2guk6g154?XDZMp$Bb`U;-l=Ue*wy*-Dmhd)n%>WN{v^~m^ zZ4&^v3E#ox{ks$GT)Vhvtt3BYsNHDG@OE#}vJsv3hZ3K(q=f`>45WFmZR$J$YIJgX z3vwE5W0ZB6q?dS6`q8!t`u?bQ^YVe^TiO6Dy*;%$~9K*tYFKdxI8z1ofEQ(js{;b&~W^oyNlXC!@~Xk05Q%JBre0 z!~VKrTuY+l#OAT~7-~7zPJPU9f5b*x>|IFFHBW&Yk8hcm<2%gD+=&G8`Hk4j978)T zk%gL?15FK*f;?y_^LSlBWKl?V^WDJ$3fT@MCc$JR;2lCifH^3EbP-|T99+C}_KthD zfo}+D&v%X$7K9uDivfn3oxeVMt^}^b`I0?TI{H9~5AcHpZrEA^OL*kT^}@S_&!SPh z_58a)vkv?PIiJF($!cfM3b1`w5> zKS2;yj$I6)!6PN`D7HX>q;u7(b%Jz6jH52oP@m4ES`r8q{RfH2SrFl#!;TA@&&Tm2 zq|z^9^G&QDh&KF^7DIADE^e*WZNiB_vng^^x}G~yItTCGi?Kg_IZ8<=&a&el~z`Ae}%TkY7|u{cASk|Df}4 z=-fi*^K^cn&LW+iXfYbe%B-$xLfXG&T;k&zs`>(b5get37a;WuV40luCGWiY9me<+ z96!Cb3LzXv{Srg|cRC_1lx|eL##$Y6LLg6~z7!MU!3~CdP-dD>c?zut(##pRwdjGP zug)=-DxG^JMF3kVQsdNL(D?<1CqeNL(xCcnJkCt$Q(veD(NV4bfYJYuj?Xmd74+Yd zD-eO7(sCjO1bKuquVq|aKhKk0uNS|dX&}K<7r>{Fd{^jINO6(!hU}eg61=RHiZ0SG zV2U;?!36XB{&IWt$3+a^hd!#22XuAP%R7NxMf{2qz{BM#(GzK%4@oXaPXXr^XR) zih!gw4Mu*!*pIiqY@N?Y)stu1{_O2?NYdh|Djh=H}*3othcy zVetEz3T2-mA$5oY#I>kMtwy(G@q>avhd~LBV8!(09z(0m5%BILRX_K-aczDd7Oe+B>`k45rbrzn{o?Mt^;--+gGGLRN%Es4N**FLGG z*hmt@mZl%DQG%5Kr2$POn(rOe`4TpJ?zZ7J83;B6sj)~dYlh!i(`&6kL>d@8&x}?R zOi~EYtkob{9-J0n8HYP;Ge!##UNf+HoEJ5()C5UNSi-{FUht5S5DQp~)wOf=+Cu4} zCmzKvEp6>>BM)EOfg**RDZw!fx}|W-aSDTHtB((1Mu212^sr64IJ7SZJiCUbO2u$w zPE=01L3{;|ArT4~j8TL1#HWq#2K?@sR}ux1laeH;$SMF@@&pp-$xRf%-`lNDfP)_* z)vZS(+qEnc+-N@47W!*IE}Zb^(AjF;uf)iK%kg- zL+eXYQgM0TibaUt{Tr{ed6PXX*r#Avz%2d|aG(9GHKh4Q5O1tE7sPqKlaD-&(G{X6 z@}5j=jEmIZD7Lo|U~d!A6WAadlkJJ1b<)Q6xU^LKi@mnXn&{hM zdmOmoF7(f>0d6?j-WB172L(4gB)H*0!42srjyMAh<45g@CTSjrH>QAR-L|p2y&E{# zkrfj?_g_%@(QCj%)Gxp{i}p`9hrDBKmpyrfF@8hZcpTv*EL+Pbki)c$-LCdDX46za z^guRU1tZ))#B$$Nm8neHCoWqa*ur0QCmz{~b|y{m8EYv5(3`_|a{+tc0~ z;AMN})W5?frT&P{zo+xZbf)MK9a8@h&idYJXR{4JtpI}C4&PCK!i4{c&Y#k`ht7Y7 z;|~OEkcwu5tp702w}myxT|pA5_xK4hA-e_PB$CIW5wY5WaM@10lp`85!f}k#uK{e4 zqGyYXwS^k9uTfZwB2S=kL%u0S6kl6iQegDxDjX$nS*q90DbN#VMS6N23Va}Z1G3c= za1r>a24XQd#t+SR%MV55QHuT47iTpeem36rqu?)w?6>)uBj6;0{{J29ss94!&+wIx zKLHx2LqT~!xa(4NEdjm$i|jjn@&?k98q7wS4x}c*UIhLHEfBO=j(w4@!l@l7A_QV< z4JwW>rgG|naU&g}gGI3T1@n}quB`_0S&<}P2{@=m51Ukt5HI+wQTDT?_W{r$|AHw? 
z=c)@UlCQ2X;`zobDAa$4M*1V9_QiY0RsR)lX5Os6gdg>nj0MGsRrTNa_|0N&>dVaK zHL*7JSB&xyoww2HU~IdCl@LruH<-$T3Hlibs$=#jH2}Cm$m{A?k=!4k0DFMq5CJ4~ zE~2V_jtV@LdTqr~f6c&Oq9cf)plX7&5swqJ&L08woR~lRwBj_0Q`-JGGmk=-R@VX@ zkPL_mQkfzvbC#J@;rNqV)^dn$5HZv^#QW&`TRMUrmfiq%s079mV)4sN_zIm@>3p5e z4LX$a(J~5v@)G5NfM&l1`BTGuJ*+W6Gy|Fe;LkTbx6B=6fkY$?hC=HwASOePfj&Vv zmC59V0Q0SYlM2xnl2aHMJjR*;_%2DGG%GDU>u#rv` zkS}moVz4H#FQ~|+zYgp(%}`>c1w%pwKhs!R`lV8yiVr=wXDDXg3-X3jVh}yBKrlA^ zjKV|g?IgY~#~4Rg!p*=nHgZk*ZS_Nb##l*iK$6BbL2P~ooL)%S0AvY>@gl7u(i=~w zHvs-Z21FCq3m^{N;&nTj-Ky2~Q+eN@lW7%{lOK-S59@&4-(z8X>`8-$0oJ0m4}!wA7nMCn+G6xl-| z_%;)(%QnCaYD8LFOk{tX3cU{;a!_tq2B0tK2zxur4RtzDON43d@r+;S=x$do#GY3C zLYJ~P-gP~#K-^Qa;47eR=2U?ZDMP4b*!n7Km>Uql!}g%|F!(3w2n4l{zG*l!qZ$I~fhS0*4$02}ZJiLM zE|V~X)XL9ypmPWyXw`T0PM_{8(w*I?JLIX;RDZ0in?n70q}?`nMPwR6_Jmex*H8wD zKpMa}P2X-*?TU#5)WcJ{1aX68Hc5ejyio+@7{Um0`s;f4We`+vJI9j*!2Pfc#!1;!ye}`ep#G`8?%Kl$1nO@>mel~vLg+<96h#ok z3X3M7mi|R^k^00|pu9%x8jLi7>puiPKVsiN?WIp3alsOv=i>u(7!&%rICh~3 zq)K=KPvE@_?9)fFAYp!vr=<4w=_5=YlyG$pVz3nKF~QmdonJ?4VS1H!#m}A!ao7OF zj#kWyUi>iDl+X+02z@hjdgeFh#Gi3~--iOgQL*yIB%swva}t~r(-HnMk|RTeal)ARmFJE) ztT0>p>;gn_`-CY{z$6{;M}s~;1A9Z!xx4;~oN~ddE=Vz8Uj+I$B2(RL&UQGHvvMdy zkE-QST+0UMyTdkr4F-x7_xEC=02toNK+#-w0KGWg0RrQoC8z>^CShRc=kd_BtE;rh zAtj%A_=$(e90}xVUx)I)RZM9N14%I@&NTNGtc7+;jR@%Tp(LKPAXE6H1)1bY7y)?^(8+JYh=7PL z@-9I|K+fB-#=e6$b~a%v9?0UBqJ2f{LP+>KMG4}J$L*S|)v}f8` z?;z40!gpwSMiZTZ$0jxpLomzO7y+?=qu9|3{bathg<3qXDb<}l$G!ioT65;1P6}+DK)xnU zYPx5m+JsG#uI(*OHrCbBIH&aEhlbj$2_WPR?MJwAIJ+Y+}U zH`o&?s>0d@>awBqQ%BDcq%5t%JCC!wS&0{a|9+&<5Dz?l_sw-ykLRFES zggtg<2JESdCAviLzhd+Yd?gg@h;&MVf(MW#IH{opoCHMku<{yx(Io zk=W>=wvN3KYU?*bXzfVV2xU~GBtrA-VTrSEr*UGr-vtk(F954qH_}!J-0(MjeIWOZ zQ9y?Q??nXfd>YsHt$3+acHm z2x|e^#y+eQKXHR*hLf~S#|rePr~zE)vQ(7vLv_iADG2DO)3m3jF=@`|Eowk41dU33 z3`h~kwggkIFdVW7aMK{A8n9*oB_I#TkC;=*({dg{`nH6~ z;Z-5`HoZ(K0jmE{wxdYUr2>xnnZ%1gjP65cLB{Y~P@PjR4tEKAaB=vf+--XIIDgk; zkDfXEx$lTdSAD^q7JpIs_Gmo6{bL6~9BHam}W-dvnONyN+zd#FK<4kB3)tSS=s0MZO4KK=^kK_7*&atO=oP&^S6 zNiX*WVSV&dW&kRAa2FxA1so&=hE2Z%RM{S|2cZNwAQ;IYFp?pu@Ee%S>H#!NJxJ#v zIy();kD@o!yBOm%oVgwa1ufL`Idn(NwilToBzmxm-4DQN!vr0GC;XB;fcXt;%^^T< z8=nABPiZEQH4KtPK+rHCbAojxsYF4UsX(&O>7)1Utk{OJs&7R#1s5sdvz7iHs#$5% zIibCmQ?xUu#r$aWMKc$G-0KE0IWOdKUMu~TC=O*wlop;cA1kOxuXYh+GZa)j& zSD-NJ+ozB%H>{KRy5A4SBcB$V56se01Pkh7%SlCS&{79! zf^azlssc4@p+dz2A7X(Jbh}2HDkuozGp|v}2ip3egeEp$J0{h95R*Fjws{}(%X`GB zK`EuxFRi47tvyZib&s;^DWKgmRKY8Mpy^!FM=-2ixZX@b-H8_IWaJgVJn?yFVP&-i zk#-j`Y!628#Uv!WXNtYxYtQ+< z9Ux06&hP75GrD;e?K8iQP&Z+KA7K;rBadtXV(Y^MYps_7^hd(n1qgFb=ki}0zpr{4 zab;D763@ioV*<}l&G*<4pNHL~lN=e@L&cu$0u9arM~AIqED|k=#f^`r(-M{5A(5A=&ROJ0rx*LIoxHl zNrS$nLr;YS+LHnc&p*?{)kPGGy=lZotQyO?$`Bbf1OYhwl5Hqxk2%*`py3;;)Wd#4RKO7}T+ioN9m9DGV1kij zxj!}kZ9VgRmXsZe>4}<>W6wVmihW>MqRL2%LfQY>{Q7&&qSSLdbAzpcHY3nk8h{Z! 
zd{w;^*c&+-$S|<_tQ1A;1Iuj-B1R}Q?bhn-0#<;s1e3qx0;U}vyPXCpm{o|gM5Y1LT)(EY*q$sRwEuze)B zU7ciIDCdaN0Ox`5^$ah$3=W#w4`fo#BA}nJ{c;szTnNbu)we|7vKUC!CT@v1lIMrC zd8B=4+q8X>_N0CX7gj^4;MmDQEH=QVCz(e$Z){Wy7T7)5M-_a7hL%E2IR!y7+Oy5m zDjCHjpe|3+kDu0N_O%UXi$;}cv0XKAb%D0ll%nzWil2KVw3F-i1#m4*bcC7;Y4jyF z5QS?%dJi1GUt0kWCiKstBVD4BFv1K35=>(dQ)rHvN=eqMmbrc)m{UFDH5YC2c84|cqPg)qLSc_2!EgxPEv_K)_E3SP*+i%kN21y)^bii zg%_V?XOXoA+Kt$s0$(xsE0~Hgs;7bb${LR&S(+-)ZCkRVLN&hyjp|6$#1WaE9q~~u ze!%4V7=rMlB)}dXD%Nm6x(Ff9bKLYVL(er3{1SOukd$jnm|ob-fc*q^A45$@>WT&y zK4P0z&Fd27;vN}RBLMMR=cu|v=w5VkpPGLNQl@8FA*xF9ASYVWeZogCXK}L#HZE*> zI6Gp6ZXX@iy>S>-DoDVSUt>F=6;oD?pQqkObCKK}btgUgu>0UMhj6LDQXSW>pi^#^ z@S_-*Pa^cZnjZ0gmEtr`#!N1+7`9!*iv+=oM)RR$CdkgFD8 z5{QWfUx9`!@Yw~PN05Y`f9m*=(urs0@72(T?tl{H0|Q|yJuSq#B|GHsVdAI6svZ0A zVQq4%ljgV9ZEhPc@jYU9hxJWNSo}V*Gv!$ac z6M|kxBvPP94YNEp)}9=-m~0hPd6wV?pe7iE)%4yjZVX3g{+U>zgzAvIwg{`iEoKcO z2UP?0vW87S?Bfm0H_XPT*a2e1ssqVZ)w8vWvxH--=iFl_-hATt?1?+>y!*tAOg0iN zVx`rZToxMG3s~*ZHU$vLbCxdmbPDIXtP{Kxz+b~O7^K&m?K`FF%=d7XhLVf4Sqq|r zhQ9wk8~SVP58+CnaAaV`Ss+>mlnXSF0 z8ly7Nq0h|sbaH2{hw(~Vx7F}fd}r+e?9hEDNsGZDs7j&vR^Df`4H2i8iW93jz>}ka zjI!MW=^i3^Dx&a+Y6-i56c;M!0BRsEaSoO}SAZd5eq-)3!UFH=>cof@r(?tP9uJqD zh+BGqXdNMx7?efAP;6UsW&L{n6}b;+^gqP0g+2Ou;W zm=a>itRC5C&@>I>z9rU{qa~r%v8Lg`i^xwG4Fa@Vyw!Bv(tI>Ig|6a)rY9(vtOzmu z%T1L>=;saR?FDWDu5(JM%xu?kLTu~{`UFV-8epWt5*_?0H; z1ffE9-C3$GT#mIqOGDxXs8p>S6Q4dUBwg55e(FA3- zXH`Kz9LGBXL2ecK{aA%`9pkx&@L)K>dMdi@!N)R4(3KMCQ?e4ejzPUo^&Ey0$gOj} zbO{7sD*CDK9<1U}mygl0;QGk()rdU0ZLj;JOKpi^fjt0U@U$K|8@u}7f~nLCB&X88*r># z8Qp`N_aHn`!6c%&fiBWo0V@Guf~q~+8I)*G!+^Y=%>4fxCI?81Zve&%H!Hmxbf7U# ztk-KtDEE)U^nV2F3DbiYB|35B>~+~rK}*7^9da4Jsm8<=Rf83+4+ag2&c})icxO=q zq7wq=TF*SguGY&p)6E7q6Pm2w#30B4sgK6TNT7RM>>;7xh)@il3exWKA1cy;I|)K z_Ja6m7`Gq30fY{c$8HZuIQ%f3TX2f_Ee5|s!S8VJJK_u>-LP=wVgHQAa&Z^yh%+jp zCnik!PCO4${qmUb=Ep%A+~pSnQj>Jf`ah9z>5_8O01ZzW4Ks0{!@wU7;R0F()w`#j z2Ztj@>LUc2B~VUnz$x4CP=M%2iE+w}&ivkxvU*a8(-KdpdNEcR38Z9~B{@DHhSQOT zd4X~&<_lsFzhdE+X7E!Um}T_i3;7q^$%UJq5Z>n%Hep(35ezRFzt4IDct3c>c$m8V z2HqEK$9IGbF@%iTkS3^%2|+S5hdbzCFAZ23sAZHiirZ7Y+NzF*Sr9`eo%{uBP zpJ;l?iL}r7_Eb#uFHxkHv)Hs_u@ql1JWO*7PCH`qa)Ql z8jfAsSidP&`gXCmECSA|llXom&zf+`LdzwnjruU_qV4_rDS_voejcFO7Te8R68AnB_Yj|lduErTo7hK1q!#2 zBELJKZX~}R6r#m~iVlAOtcWyNXe0> zR!@65h<1x1Zrj>O+bP&ZNh<~By5$wFn1uw@msG>+-yGomfndQ~n^1j%ub;MXVAlKl zgSu^?4neOD6?^!78pBwQQkfX=I5b$4v}~D zW%S{I^m>pA0`yQWlbmiqc;AH8SP1h&oE{gIHcOKCpa|%XwKK3ZGv<{bcn>uKC~>9Q z{ZJb>oBiHR-C;6U_QQTq3PKer*e5k$@ltLDgSb5`n0_Jy`!Ho!gJ`kpYaY0tdKx&Oe5XI zqQ!IY3NFX>ZbKMCAP6#f)glBjw|@5hpojw55hE{h9~EweYJpK35;vn=K`1@3&~Qf} zJw*sa#KApWW3~PWbJgf*D`1ww9t;?$07hb2aJwO0B^DUwyI$Txf&n%KQ=K?1fo{7SiOLIugbb$H?h2Gj_m>XR zj@!ZcP#-&pD6IkYMW(}SNuZ7yLsW$!9{Zi_#yWQNo};54LGsRJc2RhA;u60 z?RiGC>aa4YKEXiIZy*fwI>uB5CZnRLHnlTAA1#c?xcWH<&>SzraMZUUFMqlcuCzGM zxk8OY-ZZSai(0ztgG@1q8;-l{jmy>L?)5>GF|$X5S2A}xsxa(D^<8w>b9w?vM}9ZM z4l&m}C`$oRNle}N<0N-=#QtE^MLOpaO6jl4x*WLFCYHq#;%pDhF!?jPy$beK5&t3@6hN!7J+0GwHvB zHg3jxDWA*2)ap2Fkq(>IbCj@zf;|9s2;AeYPsu^lPSP*0hYGJ6(FxXI3_eEF#rVE# z37dEzrsVvR(lx`ZfBvk(4xY?;CO?VwuNx(9QrvgB(vAx2pogO2UQ3okepgiDNZ?{Xs!^ zNtk*&M1eH|B-F;*ssp_@r$cnRSMLhzy(b-jBG$HvS&^cu|{JsF}=EIhq zleml^rzCs`a+)P_>KTHbCcDv7Q{9R?oPNwWohSg$T0b7! 
zI9G#4X*p51YXgl?TVyYaR^P(%xR)qy50Ja$PDi01C6;vk!5(ogf5fXf72MHjcb&xQ zAaglJhX|~?%>16EBV|aN+I-xgGsh^zIB^X_a-r_({aNmT7{R~v%cfj6Ay2R5Gte)&VPd*tQSt37X~?5O>gCl)kXTRjo&s=Zc=&-i z=pMl!QKA=#R8cm0o~`jJ2a?%LTm6vs>7|6z>_%WGArdaZy8Oee1o# z`rDXXD-0B8VW2pjHUBQ1GryM3n_o@$nSYxv6#fQ!`m>XUV%`hLHuj--6hH27;cbVG z2LU>*ob{TNJmNl}=w(IQB>_>petQ7Vc`;9%fXJOTPh5bp<57h7gAxb&ky8XsZkU!c zbK_h8?7Qs=X9Nn;lg=1^$3ffMB|cgVoq~^VCxr^QTf*VDr=u8q;M;|ey}|Fk;I|a~ z-sDU-*>Ui()2;PZg*%lklu6^f>P$es~O z;Z4ro++q7o_HE97yqR(=XFqeZk2t$<-PcijRw#@&Ggj`H{pRiB*~gJ4YoF+jiyUuv z4g|3+$t`Q&Vc#ja-4f;26Gw8&+joWWgo=49Hp_ec@DjXJEp<}t-JE{x`aoUZSTw7} z?p}v<_%!B)CLIz9riOo@BnKJzuDdlsJOm1tmiskLAslA44nJQlBu~ibYx#T%Nzz9plSz1ohA^gE|OShbLNK7kBwP`>}Z${7d^s=T~_YW z;d3PX-qpD~6U4YXoWxi3ErEl|`LPTw!yg7oYB~A6(zyud1z|KRXIz7rsC1AC>mMno z3NFB8S~CtJ>8_Ez_NUQA3KoJ|6)~IC6}DTN6t#;R5%UpvXb;=RnoUBLHM*O!)B!!_$=+BV&XzE(XOxUJx6!-rFC|F3vStG(?x(oish>k=u zNOW9IRDfZHIMu&GIqIWy;_a8S=U`ZY!uAzl%QeK(fwgz+1oa`7@&l*^78&QrwsT^P zjS3*2PNs)Z5%m$ISw9p2CB6NyCCQdK!V%=OSN|Zahx*^(3+8~{PF9le~d?p?^3{WN)&w&(3y5d;`OyeuW*h&7}1E zI5gok8!5a@NVbI>NQ|(&$!Fv}1Yf>gTqaR@2t?-u?5GUW(h6*(47T&` zN81C7u)3x5+m(RQE6K@h&qrV_#b}RAV0qL2rq>S}F0he>v#tQe@B)lv=XHfZcz`@f~_?R-0wiJNBV4GnN3AbSjI8h7+`C^xP8x$Rki6BUFZK%Y)X5 z%oCzk7aaF~A+66mb-W|~{Ec+u)6GW@Hbe#|>TYfD=TVW73p*U+G27{JOxro=41Xt2 zrEN}(ZBGisP0VV$_*3w{h|3wd`f$3mmT-ZLP;oWM@8gIH0|;cKcab6gAqx{YQ6QCn zz_Ge4uj4OiFm3ye#w*e|%} zYq*0&w4E8}ReLwiZTFq`T8Rx9!vvoUWI5p%Jk(C&jo4Drzc6_Ql{L-3 zjNZX4W|wJ6qdA0Q9x8E6w~|m#M)8reKB~cw&|K07b|l{vbulo;IWGZ^I@tN-z|;o$JA0Jog1nb=`1|NQ zf*fcOmO3^SD4If%@2_>Hf{;ns0u#!Dm-=u`lna891sflg{J;?i*#Io9`?bCdH83Tk zH_)C1;qM8RJoIZC4yXj%BsfGXb>lZC-0T-c7q0IGb^ z7{(WGKzBz>Q0BQnfTbJOD=@^FjYz3ohv)2T79GP9h~1(pnq4*v4`5ESod7$#2mu1q zOLT;y+DXj@7B;eZ!D>TFiB?g$5yU`$VnbYtRL>VBdttxzll``dN%sP-dc`z{9f50R zz=i|r3XBAUmzaair2>E<=UMVZz^_zQgvwrs7%rj&cLfLw4c9+r&pFiwj-6+9|DR^; zk0IK>VF$By=5zZK>vwLmi-4U5vM_|qWnjCA&KxE3`oNw1$H)nDRHn-xvhaZ*5vCR= zhg- zsDP4FAI>XOKybsa@S#KW;A+5(`J<@z|DHZ>FufbZB(=*ii?rCBzI9Oq zUcbzC3F#>lU5?&8O?FD>TU4vZts1zQqXqKe0&5n2#^Iq+5uTH-kH%~_fjkQ+DCCni zC{QY2(%>_m3&x zq@JQ=(?-^Vl#ZPR>(0is9or}X#Wt45fkYcHA`0IW5oy#N^_hfuP}s7Cy!YF2H}&Sej~b8)rB$me(DKqXY>ku& z1&R13Z+{!DKsz#a`UL~|{uJWh3`8ALx)S3+JBz*mTp({hAG{rky&VqT{&w&dGDA$; z%=7BQ=V5qu^Cpf1OWnwLw?Niy%!4^lSa8eXzGCZp_5p1l#CAOjB)_Xg?cz1lo`7@~ zZhLO0AyqWQT)%DT!kGZSpDyhqBzWUm@bb(=jUsopNV#rJRTesAv< z>8-M-drkKXW0kvOAs4g+q0Jzk?p+D##DfIXRIJ`h-wsN&J^jNllL0~uuRRotnKo;VKD$_c<#w}ZSg#u)bljzXHxcz0~x zNuva93B3>V`NBnM?LOAi{hatqQs+O~l7HrP^Z$eG^4BX1rT>N0;VW@(-UPb0bj>PHxE6-FxpuV3IL=)sY4jez&3etF9eP>_T9M` zm=p!kmXa$mC3=*}{Q0$|C6G681r?;EqTq;0-KoZGX|9Dk5=5O(a-@=iu3)FAU>ojH ze?TaNqM%FzN(rh9NpofpUPT6S!%C;*^~_?cj#Iw0-Y3Z6Vhbj{X6JnC$%p3N(hbd7i~0Q_FSzA%$Rp5FT1U#KX~%-)2APOr1HcQl_ws0^6VMkfC+Wn z^b0zYpVyow=&!HFz#$$S*wHml{YiL)r+y2h1!+J7^9(oNV~t>^FaYr%44;XQ@P2Wa zc)}j1<1h^IVD$b4?qiUL)ZBO+mqR>eB7hqLM@hi0eoE{(_?en(*MOXg#Zp8TI4TM5 zzj^`50OtsEg-4eblqSpm9)NHzG0$`YQW&sOVcwWBaYd}P{@^yi)J|to7g!bX^4E9C zjG7gScmULgY`T9QHe-EAN1~8-@G{n`JRiY51~n4Zxo zgccp#k3xc508HwewWQ(k*1wajC(xEom;~Ei1q^XxeV9v$^g@}&Q0iLn0IyoM<1Roj zsRbH?pRsFlU9@kibGlnOGS^48Y3zDmSL0sHtoL7_jrHZAwd?&W@yCnZkBS|&UR>gu z=}t5N+Dg~6`Wr;sw%;h>@f89Eq51*#Da^%dL7!=0<-Z~Iivif3259fsp9 zwU0tufy{Yp%g zLadPJZ&hpsolhk-2=-?`6PU8`Q_??vKa_xE8p@6Kex3za1h?`nsf6_1=h>t`XO*p0 zmaUo8%%KbRMGsQh16JQ!bt3PWaej_|*~rxIG7qj)tY4XivT>be=zl=Ye@DpK$5Xxt zb8aTg5JT98q@-!cv%%0?GH*^prYsGTQx-1l?%;bz@1Z#%r%$L&{sH31plQ5#(8;TO1E8?yBoMot8*!dQ^A$#tE zXuBXI>W|?QMyT$kltk|xV2-U`z|p4XPr}$5#&9b(^0V9t1sln3i;e7*E=HZBC>gO2 zo`vq0kQ*;k5FLjUMeuOS4YhHG!Dg|!3@Me3G;jp4yXf=M!qUp%iUbJeMR)=P^aRsL ztt5fgCO>9;&v9TQS-~d`5cgOffJuEqXs6&GZ0CSaJkUlw_mo%khF&muV=G}lK(*Cx 
z$Nd`{dBlT07xWXJ{%8VFKg9Cv9EAB;r_n%|4+4YBV=q7w4rlZDHAv*QnQFq$U$uY{ zgz0|_ux1}%%|35ZV9f&aY4-)NCK!I|dvL!ouWZ1T2h0sd7L)3`y{XOJ0>z!eUOa6N z0Q>r1#5ai3?8hVpu#>J7Kh{YRmL0Tr4}R~liwFhtZX#h1{g8!g2hg|T8~G$OPoRA= zP)lqKq9>qxfnFE`M8a$PhNy=D?TMk~o30q6WDt_oS415L+QZOh7`X78zfQKQ9hi)2r zSzI$VWlDpX4nhL&+UOP-P3e;=Pn&AN&8bynqi<8X8|4MihDc?@FMB+9ydHStPrAS} ztMDp?mD8_o3z+V-J#7b=0+vhKIF9=ORzcAOH@o5?wWL~WtLkr2xh;Lct(Sn5`YPV) zDiUN;U!(I<6ei~r!6XwTybhS8t~18VaMlN6SjSO6dHgPW-8$M@JgWW@?;+=q7w}R; zEIdTF)P-BC0Ki0<8}o9y#NA_RvS>j)hdhVruz z>cky807Docn8sO5lzHR}d+p%qigh^O$gBU1_v=Hk1`+y3rUWczW);Ca^$K$_Sb#}~ zkeR;P_h#m}pF$p*(HUB=i%urBKwEw*8blHk>Nl7;!E6ZoDFNs*_NqklGpm6o&00FP z>Sx3Zoe?^`DA!Nx5m%goe%4!so&pDbh!IE?S7Y>z(;?8MEcyy?{1i1D>Plw8y^L9A z7X85tYMrBLW02&+90i)QI4J{w(Ozq;y83`lGt&t=dztf~p1KhpJ~O%PVJ^_xBvb97 zbBNA9I+96W(21PNYCnT-h69jlW!0;QRA4m7P^hJppnzq40WM3s-;^9DJ)YPO;N-{; zF>(Ocoi|9k1hTk9)A^IapeNlymMqKQ0DQm1)%#g=R=&Rg7DpC}2%0|#+W4y`7$Vc4 zl2iQv$_ZBb6$>)ce2+E$N*=dPm?JcqXjyQfV_-f9RffP<7|)t}5fAs|lEVm;4IrOR z36%#6+(q71#`oIzA|l^H;}Om9ACUfuh0;WD3J%!lIDb93rj zFvK&)sTuQ>`j7M;>=C&3hbxdew?n(qvNALw{N0=(KgSvL^K>YQrzNpJPv0-o`87Jf zPKVOj>YvfKi$1Pvf3R}$@wrovzP)nl@h2&%y{(Kkx_FC>HX%lBRrvRjBUrmM;JXk= zYX3VH@)LAOS682*LwH&yv*M8LEXVzmh?gha{p%4X7iw+gR7 zBK$!!rNhBFLtGth<_N`GFEf58+yp#7`m$o1sy>=6$M#temjaSUVCEH+u~4KXG;AB% zs?ip5Yq+>xWE;BeyOs`7bE6vEy&l?@i!396#D69*CwY=m^idx3&(HvjHmyc2GA$ZH zp(MevNCmUC31N}GkJ3Q^1p+}l$shp%E5R@#OnE0)8gB zj%@+h3WEJ-SSTgMk=8g!(G%I{!1S;bhSyN{jOU0)&>)eT8mKVl(S6Y}a}sM+s&$Da zOS#&7)AmJ6DK4*L0(B8uk6ndnLr7g2hK&c4-X^B-T|5ja3VxSm$9! zX+B^L3vI~N;$E~5Bt77Iv>b^3)3HE2p#80IXxIfpgFVDrc6ubNq?8%9JXX=bp#K6^ zYwR->Jh{IBPpBRt=nDvr&u5_|K_e##*dEr_g?Tk$2F6V?TmOI(WS#6XFEY&Imqtx^ zi3mdA>w~WVKiDjIg838ehuM=b28^FL19&$W{1$`Xq2PDe8Kh!Gpx8i#3m_ilVNnQx zWa7-YA1>D&msE_m%I%Vn93d$8t#Ge+J3+ zUoyh`>2%;V9wUo{*YfaCgO9d@3(3gn$BymZS#lpW&?3yAV_gJX6#4Bde3X(q#q-!| z5tRJ}c-HUgkb}41zaVP1jVw2NJ^awm?@4^s{~E8rP6Ff?gn#>Z?IZY&xTX+y$rQkO z?FZgUaogK4@ho{a&;&O&^ovL@loTQ9^8iJHaed(fVCG{1d|2e~f#LHVcx)Sk&6|U~ z&U+bN`JmHYGdWRVhawHGodx+I@DHV|^sdCXJZ-{~#4_tL2+SGzwELk5iEAMl3L#%y zip!-3%McVN*5b*q=n$m>-x%5m5MD+-ejNTm)Y4QRhms@p0|uH?L71gJ(@rlJkvA?@ zbUzjNL8nolfhweqYuW5Qq@s&Fvv~Vk-mr)*hdKW$1(<2g0(F23LFS?6Z6eg17XK|l zTIT#G-e@TD3_BRC>EJaJ>EH(khAG36@`(LqZ#<0Mmb}_|(2p|hA_SrP!0VQ4hDAhr zh+_uYL8?6z6Ox|bb(OdTa}r-%R=GT7_lb~np-PQp*`!POe8VEqGdLFz47sHty z2<*9j|7xpM2MQL*>6@i?Vtbd)zI_WnU?wU0~#sKlA0Am2la z8Bcq`9022hgnS93!d%|ik1t}%y>O!-`ZMmdqySq1jDX2r8P_hwqa99A@a1V*LOp^J zKv#Nf#2TH8bOfvH;F2A@QA|7J*lfZ!T7efiY>ynusIdoj#`PXW;`gwlh{|UZg`q6< z^h1r^Sf_mhxLslNZxnOJLLd-2<0&dEgK6EQ47sPi!a98kj-RF#Uzn(nHb|2MttId# z^SL|{-OJLs!&-w_Uu7H~)Bd=Qqpbo$2QRpM2xOn&{%a+ZSmrdRczLpG2D-CVTvc^Z za64BK{OWnFr9>t3jscGHEwL;F4MRIeBSboLx9k%Flt#xT@gM%hQFM2B=ZkPe~WQiWav$V_BD>5F94C?`Oka3dBluENzb zo)l;Gb|xFrV?`#fs-0T{SUHFLVxfFx{PW?xJ%yoyXw#>Bn^WRKr)#()l4epQ7_=I$7=?LTCLYKK?SD zK8D>z=Omr`;glyrI|66ULNun!jKEWjB6KW4dq`Z7_1?7&N4SNby-)YY1BjJye;AH6 zqA!}A#HKndggb7kMUd#AJYbZJLN=YGf7Hywa!Em~ns9?A4;Ei+zXiOVOp&*m%?$Ss zPiJq*=Z2@Jjt>_SA1pqc_#oUl^IN&)r(Ye;3>On0wARcaIRd^uY+2*xDeWJ%5Oag^ P`DZ?k(1Q7u@#6mn+3oyM diff --git a/tests/models/mistral2/configuration_mistraltp.py b/tests/models/mistral2/configuration_mistraltp.py deleted file mode 100644 index ad6691b..0000000 --- a/tests/models/mistral2/configuration_mistraltp.py +++ /dev/null @@ -1,155 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Mistral model configuration""" - -from transformers.configuration_utils import PretrainedConfig -# from transformers.utils import logging -from collie.log.logger import logger - - -# logger = logging.get_logger(__name__) - -MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", - "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json", -} - - -class MistralConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an - Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. - - [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`MistralModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 14336): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 8): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to `4096*32`): - The maximum sequence length that this model might ever be used with. Mistral's sliding window attention - allows sequence of up to 4096*32 tokens. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
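# Illustrative sketch of the mean-pooling conversion mentioned in the docstring above for
# turning a multi-head (MHA) key/value projection into a grouped-query (GQA) one. The
# function name, shapes and sizes are assumptions chosen for the example; this is not the
# conversion script used for any released checkpoint.
import torch

def meanpool_kv_heads(kv_proj: torch.Tensor, num_heads: int, num_kv_heads: int) -> torch.Tensor:
    # kv_proj: (num_heads * head_dim, hidden_size) -> (num_kv_heads * head_dim, hidden_size),
    # assuming the usual head-major weight layout so heads of one group are adjacent.
    out_dim, hidden_size = kv_proj.shape
    head_dim = out_dim // num_heads
    group = num_heads // num_kv_heads
    grouped = kv_proj.view(num_kv_heads, group, head_dim, hidden_size)
    return grouped.mean(dim=1).reshape(num_kv_heads * head_dim, hidden_size)

# 32 query heads pooled down to 8 key/value heads, Mistral-7B-sized dimensions.
pooled = meanpool_kv_heads(torch.randn(32 * 128, 4096), num_heads=32, num_kv_heads=8)
assert pooled.shape == (8 * 128, 4096)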
- rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*): - The id of the padding token. - bos_token_id (`int`, *optional*, defaults to 1): - The id of the "beginning-of-sequence" token. - eos_token_id (`int`, *optional*, defaults to 2): - The id of the "end-of-sequence" token. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention window size. If not specified, will default to `4096`. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - - ```python - >>> from transformers import MistralModel, MistralConfig - - >>> # Initializing a Mistral 7B style configuration - >>> configuration = MistralConfig() - - >>> # Initializing a model from the Mistral 7B style configuration - >>> model = MistralModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "mistral" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - sliding_window=4096, - attention_dropout=0.0, - attn_implementation="flash_attention_2", - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.sliding_window = sliding_window - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - # 调用父类的初始化函数,将一些公共参数传递给父类处理 - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/tests/models/mistral2/model.py b/tests/models/mistral2/model.py deleted file mode 100644 index 60d9553..0000000 --- a/tests/models/mistral2/model.py +++ /dev/null @@ -1,2026 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
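# Quick, framework-free illustration of the attention geometry that the modules in
# model.py derive from the configuration fields defined above. The tensor-parallel degree
# is a hypothetical value chosen for the example; it is not a field of this config class.
hidden_size = 4096
num_attention_heads = 32
num_key_value_heads = 8
tp_size = 2  # hypothetical tensor-parallel world size

head_dim = hidden_size // num_attention_heads                        # 128
num_key_value_groups = num_attention_heads // num_key_value_heads    # 4 query heads share each KV head
num_heads_per_tp_rank = num_attention_heads // tp_size               # 16 query heads per TP rank

# The same divisibility requirement is enforced again in Mistral2Attention.__init__ below.
assert head_dim * num_attention_heads == hidden_size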
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Mistral model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel, dtype_byte_size -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_mistraltp import Mistral2Config - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "Mistral2Config" - -#modified for collie -import torch.distributed as dist -import gc -import json -import os -from collections import OrderedDict -from megatron.core import parallel_state, tensor_parallel -from einops import rearrange -from deepspeed.pipe import LayerSpec, TiedLayerSpec - -from collie.config import CollieConfig -from collie.driver.io import IODriver -from collie.log.logger import logger -from collie.module import ( - ColumnParallelLinearWithoutBias, - ColumnParallelLMHead, - RowParallelLinearWithoutBias, -) -from collie.utils import concat_tensor, dict_as_params, env, progress -from collie.models.base import CollieModelForCausalLM -from collie.models.utils import ( - kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, - kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, -) - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral -class Mistral2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - MistralRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, 
hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - ans = self.weight * hidden_states.to(input_dtype) - # -------------------------------------------------------- - # # 将Tensor转换为列表 - # ans_list = ans.tolist() - # # 指定.json文件的路径 - # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/rms_ans.json' - - # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - # try: - # with open(file_path, 'r', encoding='utf-8') as file: - # results_list = json.load(file) - # except FileNotFoundError: - # results_list = [] - # # 将当前结果添加到列表中 - # results_list.append(ans_list) - # # 将更新后的列表写回.json文件 - # with open(file_path, 'w', encoding='utf-8') as file: - # json.dump(results_list, file, ensure_ascii=False, indent=4) - # file.write('\n') # 在文件末尾添加一个换行符 - # -------------------------------------------------------- - return ans - - -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class Mistral2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -# TODO @Arthur no longer copied from LLama after static cache -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. 
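# A self-contained numerical sketch of the rotary embedding machinery defined above: it
# rebuilds the cos/sin cache the same way Mistral2RotaryEmbedding does and rotates a toy
# query tensor. Sizes are toy values chosen only for the example.
import torch

dim, seq_len, base = 8, 4, 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
t = torch.arange(seq_len).float()
freqs = torch.outer(t, inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)          # (seq_len, dim)
cos, sin = emb.cos(), emb.sin()

q = torch.randn(1, 1, seq_len, dim)              # (batch, heads, seq_len, head_dim)
pos = torch.arange(seq_len).unsqueeze(0)         # (batch, seq_len)

def rotate_half_ref(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
    return torch.cat((-x2, x1), dim=-1)

q_rot = q * cos[pos].unsqueeze(1) + rotate_half_ref(q) * sin[pos].unsqueeze(1)
# Each pair of dimensions is rotated by an orthogonal 2D rotation, so the norm of q is unchanged.
assert torch.allclose(q_rot.norm(dim=-1), q.norm(dim=-1), atol=1e-5)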
For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. - """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class Mistral2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - - self.up_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.gate_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.down_proj = RowParallelLinearWithoutBias( - self.intermediate_size, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Mistral2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." 
- ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." - ) - # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.q_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.k_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.v_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.o_proj = RowParallelLinearWithoutBias( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.rotary_emb = Mistral2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads_tp, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads_tp, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads_tp, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads_tp, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # 
-------------------------------------------------------- - # 将Tensor转换为列表 - ans_list = attn_output.tolist() - # 指定.json文件的路径 - file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/attn_output_1.json' - - # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - try: - with open(file_path, 'r', encoding='utf-8') as file: - results_list = json.load(file) - except FileNotFoundError: - results_list = [] - # 将当前结果添加到列表中 - results_list.append(ans_list) - # 将更新后的列表写回.json文件 - with open(file_path, 'w', encoding='utf-8') as file: - json.dump(results_list, file, ensure_ascii=False, indent=4) - file.write('\n\n\n') # 在文件末尾添加一个换行符 - # -------------------------------------------------------- - - - - return attn_output, attn_weights, past_key_value - - -class Mistral2FlashAttention2(Mistral2Attention): - """ - Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." 
- ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # -------------------------------------------------------- - # 将Tensor转换为列表 - ans_list = attn_output.tolist() - # 指定.json文件的路径 - file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/attn_output_1.json' - - # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - try: - with open(file_path, 'r', encoding='utf-8') as file: - results_list = json.load(file) - except FileNotFoundError: - results_list = [] - # 将当前结果添加到列表中 - results_list.append(ans_list) - # 将更新后的列表写回.json文件 - with open(file_path, 'w', encoding='utf-8') as file: - json.dump(results_list, file, ensure_ascii=False, indent=4) - file.write('\n\n\n') # 在文件末尾添加一个换行符 - # -------------------------------------------------------- - - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
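# A toy illustration of the rolling KV-cache trimming done in Mistral2FlashAttention2.forward
# above: once the cached sequence is longer than the sliding window, only the most recent
# (sliding_window - 1) positions are kept before the new key/value states are appended.
# Shapes are toy values chosen for the example.
import torch

sliding_window = 4
past_key = torch.randn(1, 8, 10, 128)        # (batch, kv_heads, cached_len, head_dim)
slicing_tokens = 1 - sliding_window          # -3, same convention as the code above
trimmed = past_key[:, :, slicing_tokens:, :].contiguous()
assert trimmed.shape[-2] == sliding_window - 1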
- causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
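# A framework-free sketch of the unpadding bookkeeping that _get_unpad_data and this
# _upad_input method perform: from a 0/1 padding mask, derive the flat indices of real
# tokens, the cumulative sequence lengths, and the longest sequence, which is the layout
# the varlen flash-attention kernel expects. Mask values are toy data.
import torch
import torch.nn.functional as F

mask = torch.tensor([[1, 1, 1, 0],
                     [1, 1, 0, 0]], dtype=torch.int32)   # two sequences of length 3 and 2

seqlens_in_batch = mask.sum(dim=-1, dtype=torch.int32)   # tensor([3, 2])
indices = torch.nonzero(mask.flatten(), as_tuple=False).flatten()   # positions of real tokens
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))  # tensor([0, 3, 5])
max_seqlen_in_batch = int(seqlens_in_batch.max())        # 3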
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class Mistral2SdpaAttention(Mistral2Attention): - """ - Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from MistralAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, 
value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.tp_size)) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -MISTRAL_ATTENTION_CLASSES = { - "eager": Mistral2Attention, - "flash_attention_2": Mistral2FlashAttention2, - "sdpa": Mistral2SdpaAttention, -} - - -class MistralDecoderLayer(nn.Module): - def __init__(self, config: CollieConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - - self.mlp = Mistral2MLP(config) - self.input_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). 
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - # -------------------------------------------------------- - # # 将Tensor转换为列表 - # ans_list = [tensor.tolist() for tensor in outputs] - # # 指定.json文件的路径 - # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/decoder_outputs.json' - - # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - # try: - # with open(file_path, 'r', encoding='utf-8') as file: - # results_list = json.load(file) - # except FileNotFoundError: - # results_list = [] - # # 将当前结果添加到列表中 - # results_list.append(ans_list) - # # 将更新后的列表写回.json文件 - # with open(file_path, 'w', encoding='utf-8') as file: - # json.dump(results_list, file, ensure_ascii=False, indent=4) - # file.write('\n') # 在文件末尾添加一个换行符 - # -------------------------------------------------------- - - return outputs - - -MISTRAL_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`MistralConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class Mistral2PreTrainedModel(PreTrainedModel): - config_class = Mistral2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["MistralDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -MISTRAL_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
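# A minimal sketch of the two cache formats described above, using only cache calls that
# this file already relies on (DynamicCache from transformers >= 4.36). Tensor sizes are
# toy values chosen for the example.
import torch
from transformers.cache_utils import DynamicCache

# Legacy format: one (key, value) pair per layer, each of shape
# (batch, num_kv_heads, seq_len, head_dim).
legacy = ((torch.randn(1, 8, 5, 128), torch.randn(1, 8, 5, 128)),)

cache = DynamicCache.from_legacy_cache(legacy)      # same conversion Mistral2Model.forward performs
new_k = torch.randn(1, 8, 1, 128)                   # one decoding step for layer 0
new_v = torch.randn(1, 8, 1, 128)
k, v = cache.update(new_k, new_v, layer_idx=0)      # returns the concatenated key/value states
assert k.shape[-2] == 6 and cache.get_seq_length(0) == 6
round_tripped = cache.to_legacy_cache()             # back to the tuple-of-tuples format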
-""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class Mistral2Model(nn.Module): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] - - Args: - config: MistralConfig - """ - - def __init__(self, config: CollieConfig): - # super().__init__(config) - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - # self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - - # -------------------------------------------------------- - # # 将Tensor转换为列表 - # ans_list = inputs_embeds.tolist() - # # 指定.json文件的路径 - # file_path = '/remote-home/lqyin/CoLLiE/tests/models/mistraltp/inputs_embeds.json' - - # # 尝试打开现有的.json文件并读取内容,如果文件不存在则创建一个新的列表 - # try: - # with open(file_path, 'r', encoding='utf-8') as file: - # results_list = json.load(file) - # except FileNotFoundError: - # results_list = [] - # # 将当前结果添加到列表中 - # results_list.append(ans_list) - # # 将更新后的列表写回.json文件 - # with open(file_path, 'w', encoding='utf-8') as file: - # json.dump(results_list, file, ensure_ascii=False, indent=4) - # file.write('\n') # 在文件末尾添加一个换行符 - # # -------------------------------------------------------- - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] 
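For the eager path prepared earlier in this forward pass, each decoder layer receives an additive 4D causal mask restricted to `config.sliding_window`. The toy builder below only illustrates the shape and semantics of such a mask; it is not the `_prepare_4d_causal_attention_mask` helper itself, and the window size is an arbitrary example value:

# Toy additive causal mask with a sliding window (illustration only).
import torch

def toy_sliding_window_causal_mask(seq_len: int, window: int, dtype=torch.float32):
    # 0.0 where attention is allowed, a large negative value elsewhere.
    i = torch.arange(seq_len).unsqueeze(-1)  # query positions
    j = torch.arange(seq_len).unsqueeze(0)   # key positions
    # Causal AND within the window: each query sees itself and the previous window-1 keys.
    allowed = (j <= i) & (j > i - window)
    mask = torch.full((seq_len, seq_len), torch.finfo(dtype).min, dtype=dtype)
    return mask.masked_fill(allowed, 0.0)[None, None]  # (1, 1, seq_len, seq_len)

print(toy_sliding_window_causal_mask(5, window=3)[0, 0])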
- - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Mistral2ForCausalLM(CollieModelForCausalLM): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config:CollieConfig): - super().__init__(config) - self.model = Mistral2Model(config) - self.vocab_size = config.vocab_size - # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.lm_head = ColumnParallelLinearWithoutBias( - self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False - ) - # Initialize weights and apply final processing - # self.post_init() - # GenerationMixin 需要的额外参数 - self.config.is_decoder = True - if config.model_config.tie_word_embeddings: - self.lm_head.weight = self.embed_tokens.weight - self.main_input_name = "input_ids" - - def clean_cache(self): - self._clean_hidden_states([*self.model.layers, self.lm_head]) - self._set_use_cache(self.model.layers, False) - - def set_cache(self, use_cache): - self._set_use_cache(self.model.layers, use_cache) - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MistralForCausalLM - - >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" 
- >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Ensure tensors are on the same device - shift_labels = shift_labels.to(shift_logits.device) - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
- if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - @classmethod - def pipeline_layers(cls, config: CollieConfig): - """ - Get layers of pipeline. - :return: list - """ - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - - if config.tie_word_embeddings: - output = TiedLayerSpec( - "embed_tokens", - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - else: - output = LayerSpec( - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - - return [("model", Mistral2Model.pipeline_layers(config)), ("lm_head", output)] - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - protocol: str = "file", # 指定加载state_dict时使用的协议 - **kwargs, - ): - """ - Load state_dict from ``path``. - The format of pretrained model should be the same as that of - `huggingface`. - :return: state_dict. Note that the state_dict should be processed - properly to match the current rank. 
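The loader below keys off the standard sharded-checkpoint index, `pytorch_model.bin.index.json`, whose `weight_map` tells each pipeline stage which shard files it actually needs. A rough sketch of that structure and of the per-stage filtering; every name in it is illustrative:

# Rough sketch of the sharded-checkpoint index consumed below.
# All key names, file names and layer counts are illustrative.
index = {
    "metadata": {"total_size": 0},  # placeholder; real indices store the byte total
    "weight_map": {
        "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
        "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
        "model.layers.31.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
        "model.norm.weight": "pytorch_model-00002-of-00002.bin",
        "lm_head.weight": "pytorch_model-00002-of-00002.bin",
    },
}

# A pipeline stage that owns decoder layers 0..15 only needs the shard files
# holding those layers (plus embedding / norm / lm_head when it owns them).
owned_layers = set(range(0, 16))
needed_shards = {
    shard
    for name, shard in index["weight_map"].items()
    if name.startswith("model.layers.") and int(name.split(".")[2]) in owned_layers
}
print(sorted(needed_shards))  # ['pytorch_model-00001-of-00002.bin']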
- """ - # 配置加载 - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - # IO驱动初始化 - io_driver = IODriver.from_protocol(protocol) - # 检查文件路径是否存在 - if not io_driver.exists(path): - raise FileNotFoundError(f"folder {path} not found.") - # 初始化存储和处理变量 - state_dict = OrderedDict() - weights = [] - parts = None # 变量用于存储模型分割的部分信息 - # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 - hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 - rank_order = range(dist.get_world_size()) - else: - # 不开启只进行一次循环 - rank_order = range(1) - # 权重文件加载和处理 - for rank in rank_order: - # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 - if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: - # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 - if env.is_pipeline: - # 保存的是 json 格式 - parts = env.pipeline_parts - if hasattr(config, "num_key_value_heads"): - # llama2 (transformers >= 4.31.0) - num_key_value_heads = config.num_key_value_heads - else: - num_key_value_heads = config.num_attention_heads - head_dim = config.hidden_size // config.num_attention_heads - # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 - if ( - io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) - and "COLLIE_PP_PARTS" in os.environ.keys() - ): - weight_map = json.loads( - io_driver.load( - os.path.join(path, "pytorch_model.bin.index.json"), mode="r" - ) - )["weight_map"] - # layers 表示自己需要的层 - layers = env.pipeline_layers_idx - # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) - weights.extend( - [ - value - for key, value in weight_map.items() - if len(key.split(".")) > 2 - and key.split(".")[2].isdigit() - and (int(key.split(".")[2]) + 1) in layers - ] - ) - # 去重 - weights = list(set(weights)) - # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm - if 0 in layers: - weights.append(weight_map["model.tok_embeddings.weight"]) - if max(parts) - 1 in layers: - weights.append(weight_map["output.weight"]) - if max(parts) - 2 in layers: - weights.append(weight_map["model.norm.weight"]) - else: - # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 - weights = [ - weight - for weight in io_driver.list(path) - if weight.endswith(".bin") - ] - with progress( - weights, - desc="Loading state dict", - total=len(weights), - disable=hide_progress, - ) as pbar: - for weight in pbar: - part_state_dict = io_driver.load( - os.path.join(path, weight), mode="rb" - ) - # for key in list(part_state_dict.keys()): - # if "attention.wqkv.weight" in key: - # # qkv_weights = part_state_dict.pop(key) - # qkv_weights = part_state_dict[key] - # print(qkv_weights.shape) - # (wq, wk, wv) = qkv_weights.split( - # [ - # config.hidden_size, - # config.num_key_value_heads * head_dim, - # config.num_key_value_heads * head_dim, - # ], - # dim=0, - # ) - # wq_name = key.replace("wqkv", "wq") - # wk_name = key.replace("wqkv", "wk") - # wv_name = key.replace("wqkv", "wv") - # part_state_dict[wq_name] = wq - # part_state_dict[wk_name] = wk - # part_state_dict[wv_name] = wv - state_dict.update(part_state_dict) - del part_state_dict - if parts is not None: - # 这一步是 pp 的复筛 - layers = env.pipeline_layers_idx - for key in list(state_dict.keys()): - if key.startswith("layers"): - layer = int(key.split(".")[1]) - if layer + 1 not in layers: - state_dict.pop(key) - # if key.endswith("tok_embeddings.weight"): - if 
key.endswith("embed_tokens.weight"): - if 0 not in layers: - state_dict.pop(key) - if key == "norm.weight": - if max(parts) - 2 not in layers: - state_dict.pop(key) - # if key.endswith("output.weight"): - if key.endswith("lm_head.weight"): - if max(parts) - 1 not in layers: - state_dict.pop(key) - # 根据用户配置的新的 tp size 进行分割 - for key in list(state_dict.keys()): - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - "o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - "down_proj.weight", - "embed_tokens.weight", - ] - col_split = any([key.endswith(filter) for filter in col_filter]) - - if col_split: - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - if dist.is_initialized() and process_exclusion: - # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 - dist.barrier() - return state_dict - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - protocol: str = "file", - ): - """ - Save state_dict to ``path``. - The format of saved state dict should be the same as that of - `huggingface`. - """ - io_driver = IODriver.from_protocol(protocol) - # gather to tp rank 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 pp_size 次循环 - rank_order = range(config.pp_size) - else: - # 不开启只进行一次循环 - rank_order = range(1) - dst = parallel_state.get_tensor_model_parallel_src_rank() - with progress( - rank_order, - desc="Saving model", - disable=int(os.environ.get("RANK", "0")) != 0, - ) as pbar: - for rank in pbar: - if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): - for key in sorted(list(state_dict.keys())): - tensor_list = None - if env.tp_rank == 0: - tensor_list = [ - torch.zeros_like(state_dict[key]) - .to(state_dict[key].dtype) - .cuda() - for _ in range(config.tp_size) - ] - dist.gather( - state_dict[key].cuda(), - dst=dst, - gather_list=tensor_list, - group=env.tp_group, - ) - if env.tp_rank == 0: - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - "o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - "down_proj.weight", - "embed_tokens.weight", - ] - col_split = any( - [key.endswith(filter) for filter in col_filter] - ) - - if col_split: - state_dict[key] = concat_tensor(tensor_list, dim=0) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - state_dict[key] = concat_tensor(tensor_list, dim=1) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - # 似乎不需要? 
- # state_dict_keys = state_dict.keys() - # for layer_id in range(config.num_layers): - # qkv_names = [None, None, None] - # for key in state_dict_keys: - # if f"layers.{layer_id}.attention.wq.weight" in key: - # qkv_names[0] = key - # elif f"layers.{layer_id}.attention.wk.weight" in key: - # qkv_names[1] = key - # elif f"layers.{layer_id}.attention.wv.weight" in key: - # qkv_names[2] = key - # qkv_name = qkv_names[0].replace("wq", "wqkv") - # state_dict[qkv_name] = torch.cat( - # [ - # state_dict.pop(qkv_names[0]), - # state_dict.pop(qkv_names[1]), - # state_dict.pop(qkv_names[2]), - # ], - # dim=0 - # ) - - if env.tp_rank == 0: - # Save gathered weights - if env.is_pipeline: - ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" - total_size = 0 - weight_map = {} - for name, weight in state_dict.items(): - weight_size = weight.numel() * dtype_byte_size( - weight.dtype - ) - weight_map[name] = ckpt_name - total_size += weight_size - index_dict = dict( - total_size=total_size, weight_map=weight_map - ) - index_dicts = [None for _ in range(env.pp_size)] - dist.gather_object( - index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group - ) - if env.pp_rank == 0: - total_size = 0 - weight_map = {} - for _index_dict in index_dicts: - total_size += _index_dict["total_size"] - weight_map.update(_index_dict["weight_map"]) - merged_dict = { - "metadata": {"total_size": total_size}, - "weight_map": weight_map, - } - io_driver.save( - json.dumps(merged_dict, indent=2, sort_keys=True) - + "\n", - os.path.join(path, "pytorch_model.bin.index.json"), - ) - - else: - ckpt_name = f"pytorch_model.bin" - ckpt_path = os.path.join(path, ckpt_name) - io_driver.save(state_dict, ckpt_path) - if dist.is_initialized() and process_exclusion: - dist.barrier() - if env.rank == 0: - config.save_pretrained(path, protocol=protocol) - dist.barrier() - - -@add_start_docstrings( - """ - The Mistral Model transformer with a sequence classification head on top (linear layer). - - [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
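A small numeric sketch of the pooling rule described above, using the same argmax-minus-one-with-modulo arithmetic as the classification head below (the pad id and token ids are made up):

# Numeric sketch of the "last non-padding token" pooling described above.
import torch

pad_token_id = 0
input_ids = torch.tensor([
    [5, 6, 7, 0, 0],   # 3 real tokens -> pool position 2
    [9, 8, 4, 3, 2],   # no padding    -> pool position 4 (last column)
])

# Index of the first pad token minus one, wrapped with a modulo so the
# no-padding row maps to the last column.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]
print(sequence_lengths)  # tensor([2, 4])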
- """, - MISTRAL_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL -class MistralForSequenceClassification(Mistral2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Mistral2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - 
loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/tests/models/mistral2/modelpp.py b/tests/models/mistral2/modelpp.py deleted file mode 100644 index 1180a10..0000000 --- a/tests/models/mistral2/modelpp.py +++ /dev/null @@ -1,1922 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Mistral model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel, dtype_byte_size -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_mistraltp import Mistral2Config - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "Mistral2Config" - -#modified for collie -import torch.distributed as dist -import gc -import json -import os -from collections import OrderedDict -from megatron.core import parallel_state, tensor_parallel -from einops import rearrange -from deepspeed.pipe import LayerSpec, TiedLayerSpec - -from collie.config import CollieConfig -from collie.driver.io import IODriver -from collie.log.logger import logger -from collie.module import ( - 
ColumnParallelLinearWithoutBias, - ColumnParallelLMHead, - RowParallelLinearWithoutBias, -) -from collie.utils import concat_tensor, dict_as_params, env, progress -from collie.models.base import CollieModelForCausalLM -from collie.models.utils import ( - kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, - kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, -) - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral -class Mistral2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - MistralRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class Mistral2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -# TODO @Arthur no longer copied from LLama after static cache -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
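A shape-only sketch of the gather-and-broadcast described above, with illustrative dimensions and random values standing in for the cached cosine table:

# Shape-only sketch of the rotary broadcast described above (illustrative sizes).
import torch

batch, heads, seq_len, head_dim = 2, 32, 7, 128
q = torch.randn(batch, heads, seq_len, head_dim)
cos = torch.randn(4096, head_dim)              # stands in for cos_cached: (max_seq_len, head_dim)
position_ids = torch.arange(seq_len).expand(batch, seq_len)

picked = cos[position_ids]                     # (batch, seq_len, head_dim)
broadcastable = picked.unsqueeze(1)            # unsqueeze_dim=1 -> (batch, 1, seq_len, head_dim)
print((q * broadcastable).shape)               # torch.Size([2, 32, 7, 128])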
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class Mistral2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - - self.up_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.gate_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.down_proj = RowParallelLinearWithoutBias( - self.intermediate_size, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Mistral2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.q_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.k_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.v_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.o_proj = RowParallelLinearWithoutBias( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.rotary_emb = Mistral2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads_tp, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads_tp, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads_tp, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads_tp, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class Mistral2FlashAttention2(Mistral2Attention): - """ - Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.tp_size)).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. 
- - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class Mistral2SdpaAttention(Mistral2Attention): - """ - Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from MistralAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - self.num_heads_tp = query_states.shape[2] - self.tp_size = self.num_heads // self.num_heads_tp - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
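            # At this point query/key/value are laid out as (bsz, num_heads_tp, seq_len, head_dim), where
            # num_heads_tp is this tensor-parallel rank's share of the heads (num_heads / tp_size); when
            # `attn_mask` is None, `is_causal=True` lets SDPA construct the causal mask internally instead.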
- is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.tp_size)) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -MISTRAL_ATTENTION_CLASSES = { - "eager": Mistral2Attention, - "flash_attention_2": Mistral2FlashAttention2, - "sdpa": Mistral2SdpaAttention, -} - - -class MistralDecoderLayer(nn.Module): - def __init__(self, config: CollieConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - - self.mlp = Mistral2MLP(config) - self.input_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -MISTRAL_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. 
- - Parameters: - config ([`MistralConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class Mistral2PreTrainedModel(PreTrainedModel): - config_class = Mistral2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["MistralDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -MISTRAL_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. 
- - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class Mistral2Model(nn.Module): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`] - - Args: - config: MistralConfig - """ - - def __init__(self, config: CollieConfig): - # super().__init__(config) - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Mistral2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - # self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if 
input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
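            # For reference, a minimal self-contained sketch (toy sizes, float32, no padding assumed) of the
            # 4D additive mask these helpers return: shape (batch_size, 1, seq_length, kv_length), with 0.0
            # where attention is allowed and the dtype minimum where it is blocked:
            #   import torch
            #   neg = torch.finfo(torch.float32).min
            #   causal_4d = torch.triu(torch.full((3, 3), neg), diagonal=1)[None, None]  # (1, 1, 3, 3)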
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Mistral2ForCausalLM(CollieModelForCausalLM): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config:CollieConfig): - super().__init__(config) - self.model = Mistral2Model(config) - self.vocab_size = config.vocab_size - # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.lm_head = ColumnParallelLinearWithoutBias( - self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False - ) - # Initialize weights and apply final processing - # self.post_init() - # GenerationMixin 需要的额外参数 - self.config.is_decoder = True - if config.model_config.tie_word_embeddings: - self.lm_head.weight = self.embed_tokens.weight - self.main_input_name = "input_ids" - - def clean_cache(self): - self._clean_hidden_states([*self.model.layers, self.lm_head]) - self._set_use_cache(self.model.layers, False) - - def set_cache(self, use_cache): - self._set_use_cache(self.model.layers, use_cache) - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = 
None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MistralForCausalLM - - >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Ensure tensors are on the same device - shift_labels = shift_labels.to(shift_logits.device) - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = 
past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - @classmethod - def pipeline_layers(cls, config: CollieConfig): - """ - Get layers of pipeline. - :return: list - """ - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - - if config.tie_word_embeddings: - output = TiedLayerSpec( - "embed_tokens", - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - else: - output = LayerSpec( - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - - return [("model", Mistral2Model.pipeline_layers(config)), ("lm_head", output)] - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - protocol: str = "file", # 指定加载state_dict时使用的协议 - **kwargs, - ): - """ - Load state_dict from ``path``. - The format of pretrained model should be the same as that of - `huggingface`. - :return: state_dict. Note that the state_dict should be processed - properly to match the current rank. 
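        Concretely, each tensor-parallel rank keeps only its own `torch.chunk` slice of the parallelised
        projection, embedding and lm_head weights rather than the full tensors, so the returned dict
        already matches this rank's layer shapes.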
- """ - # 配置加载 - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - # IO驱动初始化 - io_driver = IODriver.from_protocol(protocol) - # 检查文件路径是否存在 - if not io_driver.exists(path): - raise FileNotFoundError(f"folder {path} not found.") - # 初始化存储和处理变量 - state_dict = OrderedDict() - weights = [] - parts = None # 变量用于存储模型分割的部分信息 - # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 - hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 - rank_order = range(dist.get_world_size()) - else: - # 不开启只进行一次循环 - rank_order = range(1) - # 权重文件加载和处理 - for rank in rank_order: - # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 - if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: - # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 - if env.is_pipeline: - # 保存的是 json 格式 - parts = env.pipeline_parts - if hasattr(config, "num_key_value_heads"): - # llama2 (transformers >= 4.31.0) - num_key_value_heads = config.num_key_value_heads - else: - num_key_value_heads = config.num_attention_heads - head_dim = config.hidden_size // config.num_attention_heads - # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 - if ( - io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) - and "COLLIE_PP_PARTS" in os.environ.keys() - ): - weight_map = json.loads( - io_driver.load( - os.path.join(path, "pytorch_model.bin.index.json"), mode="r" - ) - )["weight_map"] - # layers 表示自己需要的层 - layers = env.pipeline_layers_idx - # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) - weights.extend( - [ - value - for key, value in weight_map.items() - if len(key.split(".")) > 2 - and key.split(".")[2].isdigit() - and (int(key.split(".")[2]) + 1) in layers - ] - ) - # 去重 - weights = list(set(weights)) - # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm - if 0 in layers: - weights.append(weight_map["model.tok_embeddings.weight"]) - if max(parts) - 1 in layers: - weights.append(weight_map["output.weight"]) - if max(parts) - 2 in layers: - weights.append(weight_map["model.norm.weight"]) - else: - # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 - weights = [ - weight - for weight in io_driver.list(path) - if weight.endswith(".bin") - ] - with progress( - weights, - desc="Loading state dict", - total=len(weights), - disable=hide_progress, - ) as pbar: - for weight in pbar: - part_state_dict = io_driver.load( - os.path.join(path, weight), mode="rb" - ) - # for key in list(part_state_dict.keys()): - # if "attention.wqkv.weight" in key: - # # qkv_weights = part_state_dict.pop(key) - # qkv_weights = part_state_dict[key] - # print(qkv_weights.shape) - # (wq, wk, wv) = qkv_weights.split( - # [ - # config.hidden_size, - # config.num_key_value_heads * head_dim, - # config.num_key_value_heads * head_dim, - # ], - # dim=0, - # ) - # wq_name = key.replace("wqkv", "wq") - # wk_name = key.replace("wqkv", "wk") - # wv_name = key.replace("wqkv", "wv") - # part_state_dict[wq_name] = wq - # part_state_dict[wk_name] = wk - # part_state_dict[wv_name] = wv - state_dict.update(part_state_dict) - del part_state_dict - if parts is not None: - # 这一步是 pp 的复筛 - layers = env.pipeline_layers_idx - for key in list(state_dict.keys()): - if key.startswith("layers"): - layer = int(key.split(".")[1]) - if layer + 1 not in layers: - state_dict.pop(key) - # if key.endswith("tok_embeddings.weight"): - if 
key.endswith("embed_tokens.weight"): - if 0 not in layers: - state_dict.pop(key) - if key == "norm.weight": - if max(parts) - 2 not in layers: - state_dict.pop(key) - # if key.endswith("output.weight"): - if key.endswith("lm_head.weight"): - if max(parts) - 1 not in layers: - state_dict.pop(key) - # 根据用户配置的新的 tp size 进行分割 - for key in list(state_dict.keys()): - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - "o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - "down_proj.weight", - "embed_tokens.weight", - ] - col_split = any([key.endswith(filter) for filter in col_filter]) - - if col_split: - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - if dist.is_initialized() and process_exclusion: - # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 - dist.barrier() - return state_dict - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - protocol: str = "file", - ): - """ - Save state_dict to ``path``. - The format of saved state dict should be the same as that of - `huggingface`. - """ - io_driver = IODriver.from_protocol(protocol) - # gather to tp rank 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 pp_size 次循环 - rank_order = range(config.pp_size) - else: - # 不开启只进行一次循环 - rank_order = range(1) - dst = parallel_state.get_tensor_model_parallel_src_rank() - with progress( - rank_order, - desc="Saving model", - disable=int(os.environ.get("RANK", "0")) != 0, - ) as pbar: - for rank in pbar: - if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): - for key in sorted(list(state_dict.keys())): - tensor_list = None - if env.tp_rank == 0: - tensor_list = [ - torch.zeros_like(state_dict[key]) - .to(state_dict[key].dtype) - .cuda() - for _ in range(config.tp_size) - ] - dist.gather( - state_dict[key].cuda(), - dst=dst, - gather_list=tensor_list, - group=env.tp_group, - ) - if env.tp_rank == 0: - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - "o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - "down_proj.weight", - "embed_tokens.weight", - ] - col_split = any( - [key.endswith(filter) for filter in col_filter] - ) - - if col_split: - state_dict[key] = concat_tensor(tensor_list, dim=0) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - state_dict[key] = concat_tensor(tensor_list, dim=1) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - # 似乎不需要? 
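                            # A rough, self-contained illustration (hypothetical 6x4 weight, 2 ranks) of the
                            # round trip performed here: the per-rank shards produced at load time with
                            # torch.chunk are gathered and stitched back along the same dimension before saving:
                            #   import torch
                            #   full = torch.arange(24).reshape(6, 4).float()
                            #   shards = list(torch.chunk(full, 2, dim=0))           # per-rank slices, as in load_parallel_state_dict
                            #   assert torch.equal(torch.cat(shards, dim=0), full)   # reassembled full weight, as gathered above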
- # state_dict_keys = state_dict.keys() - # for layer_id in range(config.num_layers): - # qkv_names = [None, None, None] - # for key in state_dict_keys: - # if f"layers.{layer_id}.attention.wq.weight" in key: - # qkv_names[0] = key - # elif f"layers.{layer_id}.attention.wk.weight" in key: - # qkv_names[1] = key - # elif f"layers.{layer_id}.attention.wv.weight" in key: - # qkv_names[2] = key - # qkv_name = qkv_names[0].replace("wq", "wqkv") - # state_dict[qkv_name] = torch.cat( - # [ - # state_dict.pop(qkv_names[0]), - # state_dict.pop(qkv_names[1]), - # state_dict.pop(qkv_names[2]), - # ], - # dim=0 - # ) - - if env.tp_rank == 0: - # Save gathered weights - if env.is_pipeline: - ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" - total_size = 0 - weight_map = {} - for name, weight in state_dict.items(): - weight_size = weight.numel() * dtype_byte_size( - weight.dtype - ) - weight_map[name] = ckpt_name - total_size += weight_size - index_dict = dict( - total_size=total_size, weight_map=weight_map - ) - index_dicts = [None for _ in range(env.pp_size)] - dist.gather_object( - index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group - ) - if env.pp_rank == 0: - total_size = 0 - weight_map = {} - for _index_dict in index_dicts: - total_size += _index_dict["total_size"] - weight_map.update(_index_dict["weight_map"]) - merged_dict = { - "metadata": {"total_size": total_size}, - "weight_map": weight_map, - } - io_driver.save( - json.dumps(merged_dict, indent=2, sort_keys=True) - + "\n", - os.path.join(path, "pytorch_model.bin.index.json"), - ) - - else: - ckpt_name = f"pytorch_model.bin" - ckpt_path = os.path.join(path, ckpt_name) - io_driver.save(state_dict, ckpt_path) - if dist.is_initialized() and process_exclusion: - dist.barrier() - if env.rank == 0: - config.save_pretrained(path, protocol=protocol) - dist.barrier() - - -@add_start_docstrings( - """ - The Mistral Model transformer with a sequence classification head on top (linear layer). - - [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - MISTRAL_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL -class MistralForSequenceClassification(Mistral2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Mistral2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - 
loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/tests/models/mistral2/modeltp.py b/tests/models/mistral2/modeltp.py deleted file mode 100644 index e91037f..0000000 --- a/tests/models/mistral2/modeltp.py +++ /dev/null @@ -1,2254 +0,0 @@ -# coding=utf-8 -# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch Mistral model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel, dtype_byte_size -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_mistraltp import MistralConfig - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "MistralConfig" - -#modified for collie -import torch.distributed as dist -import gc -import json -import os -from collections import OrderedDict -from megatron.core import parallel_state, tensor_parallel -from einops import rearrange -from deepspeed.pipe import LayerSpec, TiedLayerSpec - -from collie.config import CollieConfig -from collie.driver.io import IODriver -from collie.log.logger import logger -from collie.module import ( - 
ColumnParallelLinearWithoutBias, - ColumnParallelLMHead, - RowParallelLinearWithoutBias, -) -from collie.utils import concat_tensor, dict_as_params, env, progress -from collie.models.base import CollieModelForCausalLM -from collie.models.utils import ( - kv_cache_to_inputs_for_layer, inputs_to_kv_cache_for_layer, - kv_cache_to_inputs_for_model, inputs_to_kv_cache_for_model, -) - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral -class MistralRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - MistralRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - ans = self.weight * hidden_states.to(input_dtype) - - # # 打印层标准化的输出 - hidden_states_output = ans.detach().cpu().tolist() - data_to_save = {"Layer Norm Output": hidden_states_output} - # 将输出写入 JSON 文件 - with open('a_rms_output.json', 'w') as f: - json.dump(data_to_save, f, indent=4) - - return self.weight * hidden_states.to(input_dtype) - - -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class MistralRotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
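        # Shape note: `inv_freq` is (dim // 2,); `_set_cos_sin_cache` below builds
        # freqs = outer(t, inv_freq) of shape (seq_len, dim // 2), duplicates it to (seq_len, dim),
        # and caches `cos_cached` / `sin_cached` with that (seq_len, dim) shape for later slicing.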
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -# TODO @Arthur no longer copied from LLama after static cache -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class MistralMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - - self.up_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.gate_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.intermediate_size, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.down_proj = RowParallelLinearWithoutBias( - self.intermediate_size, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - output = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - # 打印MLP层输出 - mlp_output = output.detach().cpu().tolist() - data_to_save = {"MLP Output": mlp_output} - # 将输出写入 JSON 文件 - with open('a_mlp_output.json', 'w') as f: - json.dump(data_to_save, f, indent=4) - - return output - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class MistralAttention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: CollieConfig, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.q_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.k_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - self.v_proj = ColumnParallelLinearWithoutBias( - self.hidden_size, - self.num_key_value_heads * self.head_dim, - bias=False, - gather_output=False, - init_method=lambda x: x, - ) - # aaaa - self.o_proj = RowParallelLinearWithoutBias( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - input_is_parallel=True, - init_method=lambda x: x, - ) - - self.rotary_emb = MistralRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, # 输入维度 [bsz, q_len, hidden_size] - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) # [bsz, q_len, num_heads * head_dim] - key_states = self.k_proj(hidden_states) # [bsz, q_len, num_key_value_heads * head_dim] - value_states = self.v_proj(hidden_states) # [bsz, q_len, num_key_value_heads * head_dim] - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_heads, head_dim] - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_key_value_heads, head_dim] - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), # [bsz, q_len, num_key_value_heads, head_dim] - ) - - query_states = query_states.transpose(1, 2) # [bsz, num_heads, q_len, head_dim] - key_states = key_states.transpose(1, 2) # [bsz, num_key_value_heads, q_len, head_dim] - value_states = value_states.transpose(1, 2) # [bsz, num_key_value_heads, q_len, head_dim] - - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("a_attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads/self.config.tp_size, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)) - - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # 打印注意力模块的输出 - attention_result = { - "Output weights:": attn_output.detach().cpu().tolist(), - # "Attention weights:": attn_weights.detach().cpu().tolist(), - } - # 将数据写入 JSON 文件 - with open("a_attention_outputs.json", "w") as f: - json.dump(attention_result, f, indent=4) - - return attn_output, attn_weights, past_key_value - - -class MistralFlashAttention2(MistralAttention): - """ - Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
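To keep the eager attention path above readable outside the Collie parallel layers: with gather_output=False the column-parallel q/k/v projections leave each tensor-parallel rank holding only num_heads // tp_size query heads (hence the tp_size-adjusted shape checks), and grouped-query attention expands the smaller key/value head set with repeat_kv before the matmul. A minimal stand-alone sketch of those two pieces in plain PyTorch, with toy shapes rather than the model's real config:

import torch

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # (bsz, num_kv_heads, seq, head_dim) -> (bsz, num_kv_heads * n_rep, seq, head_dim)
    bsz, num_kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(bsz, num_kv_heads, n_rep, seq_len, head_dim)
    return hidden_states.reshape(bsz, num_kv_heads * n_rep, seq_len, head_dim)

# Toy numbers: 32 query heads and 8 key/value heads sharded over tp_size = 4 ranks,
# so this rank holds 8 query heads and 2 key/value heads.
bsz, q_len, head_dim, tp_size = 2, 5, 64, 4
num_heads, num_kv_heads = 32 // tp_size, 8 // tp_size
q = torch.randn(bsz, num_heads, q_len, head_dim)
k = torch.randn(bsz, num_kv_heads, q_len, head_dim)
k = repeat_kv(k, num_heads // num_kv_heads)          # expand kv heads to match q's head count
attn_weights = q @ k.transpose(2, 3) / head_dim ** 0.5
assert attn_weights.shape == (bsz, num_heads, q_len, q_len)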
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("a_flash_attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." 
- ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, int(self.hidden_size/self.config.tp_size)).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - # 打印注意力模块的输出 - attention_result = { - "Output weights:": attn_output.detach().cpu().tolist(), - # "Attention weights:": attn_weights.detach().cpu().tolist(), - } - # 将数据写入 JSON 文件 - with open("a_flash_attention_outputs.json", "w") as f: - json.dump(attention_result, f, indent=4) - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
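The cache-trimming branch above (slicing_tokens = 1 - self.config.sliding_window) is what keeps the key/value cache bounded under sliding-window attention: before a new token is appended, only the most recent sliding_window - 1 positions are retained. A rough stand-alone illustration of that rolling-window bookkeeping, using plain tensors rather than the Cache object used here:

import torch

sliding_window = 4
bsz, num_heads, head_dim = 1, 2, 8

past_key = torch.randn(bsz, num_heads, 10, head_dim)     # cache that has grown past the window
if past_key.shape[-2] >= sliding_window:
    # keep the last (sliding_window - 1) positions; the incoming token fills the window
    past_key = past_key[:, :, 1 - sliding_window:, :].contiguous()

new_key = torch.randn(bsz, num_heads, 1, head_dim)
past_key = torch.cat([past_key, new_key], dim=-2)
assert past_key.shape[-2] == sliding_window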
- causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
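_upad_input packs the batch for flash_attn_varlen_func by dropping padding positions and describing the ragged result with cumulative sequence lengths. The sketch below shows roughly what the _get_unpad_data helper computes; treat it as an illustration rather than the exact library code:

import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])            # 1 = real token, 0 = padding
seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)      # tokens per sequence: [3, 5]
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = int(seqlens.max())
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))   # [0, 3, 8]

# A (batch, seq, ...) tensor is flattened and gathered at `indices` so that only the
# real tokens are handed to the varlen kernel; cu_seqlens marks the sequence boundaries.
hidden = torch.randn(2, 5, 16)
packed = hidden.reshape(-1, 16)[indices]
assert packed.shape[0] == int(cu_seqlens[-1]) == 8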
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache -class MistralSdpaAttention(MistralAttention): - """ - Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from MistralAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - # key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = ( - rearrange(query_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(key_states, "b n (h d) -> b n h d", d=self.head_dim), - rearrange(value_states, "b n (h d) -> b n h d", d=self.head_dim), - ) - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - # 打印注意力模块的输出 - # 准备数据以写入 JSON 文件 - attention_outputs = { - "Query states": query_states.detach().cpu().tolist(), - "Key states": key_states.detach().cpu().tolist(), - "Value states": value_states.detach().cpu().tolist() - } - # 将数据写入 JSON 文件 - with open("a_sdpa_attention_outputs.json", "w") as f: - json.dump(attention_outputs, f, indent=4) - - if self.config.pp_size > 1: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, 
key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, int(self.hidden_size/self.config.tp_size)) - - attn_output = self.o_proj(attn_output) - - # 打印注意力模块的输出 - attention_result = { - "Output weights:": attn_output.detach().cpu().tolist(), - # "Attention weights:": attn_weights.detach().cpu().tolist(), - } - # 将数据写入 JSON 文件 - with open("a_sdpa_attention_outputs.json", "w") as f: - json.dump(attention_result, f, indent=4) - - return attn_output, None, past_key_value - - -MISTRAL_ATTENTION_CLASSES = { - "eager": MistralAttention, - "flash_attention_2": MistralFlashAttention2, - "sdpa": MistralSdpaAttention, -} - - -class MistralDecoderLayer(nn.Module): - def __init__(self, config: CollieConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - config._attn_implementation = "sdpa" - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.config = config - self.mlp = MistralMLP(config) - self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.idx = layer_idx - # 务必保持变量名一致 - self.use_cache = self.config.model_config.use_cache - self.hidden_states = None - self.output_attentions = False - -class MistralDecoderLayer(nn.Module): - def __init__(self, config: CollieConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - config._attn_implementation = "sdpa" - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.config = config - self.mlp = MistralMLP(config) - self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.idx = layer_idx - # 务必保持变量名一致 - self.use_cache = self.config.model_config.use_cache - self.hidden_states = None - self.output_attentions = False - - def _forward( - self, - hidden_states: torch.Tensor, - attention_mask: 
Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - # output_attentions: Optional[bool] = False, - # use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - # if "padding_mask" in kwargs: - # warnings.warn( - # "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - # ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - # output_attentions=output_attentions, - # use_cache=use_cache, - **kwargs, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - # outputs = (hidden_states,) - - # if output_attentions: - # outputs += (self_attn_weights,) - - # if use_cache: - # outputs += (present_key_value,) - - return hidden_states, present_key_value - - def forward(self, inputs: dict): - layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) - - if self.config.checkpointing and self.training: - hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( - self._forward, - inputs["hidden_states"], - inputs.get("attention_mask", None), - inputs.get("position_ids", None), - layer_past, # inputs.get("past_key_values", None), - ) - else: - hidden_states, new_layer_past = self._forward( - inputs["hidden_states"], - inputs.get("attention_mask", None), - inputs.get("position_ids", None), - layer_past - ) # **inputs - inputs["hidden_states"] = hidden_states - - inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) - return inputs - - - # def _forward( - # self, - # hidden_states: torch.Tensor, - # attention_mask: Optional[torch.Tensor] = None, - # position_ids: Optional[torch.LongTensor] = None, - # past_key_value: Optional[Tuple[torch.Tensor]] = None, - # # output_attentions: Optional[bool] = False, - # # use_cache: Optional[bool] = False, - # **kwargs, - # ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - # # if "padding_mask" in kwargs: - # # warnings.warn( - # # "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - # # ) - # """ - # Args: - # hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - # attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - # `(batch, sequence_length)` where padding elements are indicated by 0. - # output_attentions (`bool`, *optional*): - # Whether or not to return the attentions tensors of all attention layers. See `attentions` under - # returned tensors for more detail. - # use_cache (`bool`, *optional*): - # If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - # (see `past_key_values`). - # past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - # """ - - # residual = hidden_states - - # hidden_states = self.input_layernorm(hidden_states) - - # # Self Attention - # hidden_states, self_attn_weights, present_key_value = self.self_attn( - # hidden_states=hidden_states, - # attention_mask=attention_mask, - # position_ids=position_ids, - # past_key_value=past_key_value, - # # output_attentions=output_attentions, - # # use_cache=use_cache, - # **kwargs, - # ) - # hidden_states = residual + hidden_states - - # # Fully Connected - # residual = hidden_states - # hidden_states = self.post_attention_layernorm(hidden_states) - # hidden_states = self.mlp(hidden_states) - # hidden_states = residual + hidden_states - - # # outputs = (hidden_states,) - - # # if output_attentions: - # # outputs += (self_attn_weights,) - - # # if use_cache: - # # outputs += (present_key_value,) - - # return hidden_states, present_key_value - - # def forward(self, inputs: dict): - # layer_past = inputs_to_kv_cache_for_layer(idx=self.idx, inputs=inputs) - - # if self.config.checkpointing and self.training: - # hidden_states, new_layer_past = torch.utils.checkpoint.checkpoint( - # self._forward, - # inputs["hidden_states"], - # inputs.get("attention_mask", None), - # inputs.get("position_ids", None), - # layer_past, # inputs.get("past_key_values", None), - # ) - # else: - # hidden_states, new_layer_past = self._forward( - # inputs["hidden_states"], - # inputs.get("attention_mask", None), - # inputs.get("position_ids", None), - # layer_past - # ) # **inputs - # inputs["hidden_states"] = hidden_states - - # inputs.update(kv_cache_to_inputs_for_layer(idx=self.idx, new_layer_past=new_layer_past)) - # return inputs - - # def forward( - # self, - # hidden_states: torch.Tensor, - # attention_mask: Optional[torch.Tensor] = None, - # position_ids: Optional[torch.LongTensor] = None, - # past_key_value: Optional[Tuple[torch.Tensor]] = None, - # output_attentions: Optional[bool] = False, - # use_cache: Optional[bool] = False, - # **kwargs, - # ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - # if "padding_mask" in kwargs: - # warnings.warn( - # "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - # ) - # """ - # Args: - # hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - # attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - # `(batch, sequence_length)` where padding elements are indicated by 0. - # output_attentions (`bool`, *optional*): - # Whether or not to return the attentions tensors of all attention layers. See `attentions` under - # returned tensors for more detail. 
- # use_cache (`bool`, *optional*): - # If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - # (see `past_key_values`). - # past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - # """ - - # residual = hidden_states - - # hidden_states = self.input_layernorm(hidden_states) - - # # Self Attention - # hidden_states, self_attn_weights, present_key_value = self.self_attn( - # hidden_states=hidden_states, - # attention_mask=attention_mask, - # position_ids=position_ids, - # past_key_value=past_key_value, - # output_attentions=output_attentions, - # use_cache=use_cache, - # **kwargs, - # ) - # hidden_states = residual + hidden_states - - # # Fully Connected - # residual = hidden_states - # hidden_states = self.post_attention_layernorm(hidden_states) - # hidden_states = self.mlp(hidden_states) - # hidden_states = residual + hidden_states - - # outputs = (hidden_states,) - - # if output_attentions: - # outputs += (self_attn_weights,) - - # if use_cache: - # outputs += (present_key_value,) - - # return outputs - - -MISTRAL_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`MistralConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class MistralPreTrainedModel(PreTrainedModel): - config_class = MistralConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["MistralDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -MISTRAL_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. 
- - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Mistral Model outputting raw hidden-states without any specific head on top.", - MISTRAL_START_DOCSTRING, -) -class MistralModel(nn.Module): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`MistralDecoderLayer`] - - Args: - config: MistralConfig - """ - - def __init__(self, config: CollieConfig): - # super().__init__(config) - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - # aaaa - # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.embed_tokens = tensor_parallel.VocabParallelEmbedding( - config.vocab_size, config.hidden_size, params_dtype=torch.float32 - ) - self.layers = nn.ModuleList( - [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - config._attn_implementation = "sdpa" - self._attn_implementation = config._attn_implementation - self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.gradient_checkpointing = False - # Initialize weights and apply final processing - # self.post_init() - - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - # aaaa - past_key_values: Optional[Tuple[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - # 打印嵌入层输出 - embeddings_output = inputs_embeds.detach().cpu().tolist() - data_to_save = {"Embeddings Output": embeddings_output} - # 将输出写入 JSON 文件 - with open('a_embeddings_output.json', 'w') as f: - json.dump(data_to_save, f, indent=4) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
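The position_ids handling above is what keeps rotary embeddings consistent during incremental decoding: when the cache already holds past_key_values_length tokens, the tokens fed in this step are numbered from that offset instead of from zero. A tiny self-contained sketch of the offset arithmetic:

import torch

def make_position_ids(seq_length: int, past_key_values_length: int, device="cpu") -> torch.Tensor:
    # positions for the tokens fed in this step, continuing after the cached ones
    position_ids = torch.arange(
        past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
    )
    return position_ids.unsqueeze(0)            # shape (1, seq_length), broadcast over the batch

# prefill: 6 prompt tokens, empty cache           -> positions 0..5
# decode:  1 new token, 6 tokens already cached   -> position 6
assert make_position_ids(6, 0).tolist() == [[0, 1, 2, 3, 4, 5]]
assert make_position_ids(1, 6).tolist() == [[6]]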
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - inputs = { - "input_ids": input_ids, - "hidden_states": hidden_states, - "attention_mask": attention_mask, - "position_ids": position_ids, - "past_key_values": past_key_values, - "output_attentions": output_attentions, - "use_cache": use_cache, - } - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - # for decoder_layer in self.layers: - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - # all_hidden_states += (hidden_states,) - all_hidden_states += (inputs["hidden_states"],) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - # hidden_states, - # attention_mask, - # position_ids, - # past_key_values, - # output_attentions, - # use_cache, - inputs, - ) - else: - layer_outputs = decoder_layer( - # hidden_states, - # attention_mask=attention_mask, - # position_ids=position_ids, - # past_key_value=past_key_values, - # output_attentions=output_attentions, - # use_cache=use_cache, - inputs, - ) - inputs.update(layer_outputs) - - # hidden_states = layer_outputs[0] - hidden_states = inputs["hidden_states"] - - if use_cache: - # next_decoder_cache = layer_outputs[2 if output_attentions else 1] - next_decoder_cache = inputs["addition_info"][1 if output_attentions else 0] - - if output_attentions: - # all_self_attns += (layer_outputs[1],) - all_self_attns += (inputs["addition_info"][0],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - # past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - past_key_values=past_key_values, - ) - - @classmethod - def pipeline_layers(cls, config: CollieConfig): - """ - Get layers of pipeline. 
- :return: list - """ - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - - if config.tie_word_embeddings: - embed_tokens = TiedLayerSpec( - "embed_tokens", - dict_as_params(input_keys="input_ids", output_keys="hidden_states"), - tensor_parallel.VocabParallelEmbedding, - config.vocab_size, - config.hidden_size, - ) - else: - embed_tokens = LayerSpec( - dict_as_params(input_keys="input_ids", output_keys="hidden_states"), - tensor_parallel.VocabParallelEmbedding, - config.vocab_size, - config.hidden_size, - ) - - layers = [ - LayerSpec(MistralDecoderLayer, config, i) for i in range(config.num_hidden_layers) - ] - norm = LayerSpec( - dict_as_params(input_keys="hidden_states", output_keys="hidden_states"), - MistralRMSNorm, - hidden_size=config.hidden_size, - eps=config.rms_norm_eps, - ) - - return [ - ("embed_tokens", embed_tokens), - ("layers", layers), - ("norm", norm), - ] - -class MistralForCausalLM(CollieModelForCausalLM): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config:CollieConfig): - super().__init__(config) - self.model = MistralModel(config) - self.vocab_size = config.vocab_size - # self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.lm_head = ColumnParallelLinearWithoutBias( - self.collie_config.hidden_size, self.collie_config.vocab_size, bias=False - ) - # Initialize weights and apply final processing - # self.post_init() - # GenerationMixin 需要的额外参数 - self.config.is_decoder = True - if config.model_config.tie_word_embeddings: - self.lm_head.weight = self.embed_tokens.weight - self.main_input_name = "input_ids" - - def clean_cache(self): - self._clean_hidden_states([*self.model.layers, self.lm_head]) - self._set_use_cache(self.model.layers, False) - - def set_cache(self, use_cache): - self._set_use_cache(self.model.layers, use_cache) - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
- - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MistralForCausalLM - - >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") - >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Ensure tensors are on the same device - shift_labels = shift_labels.to(shift_logits.device) - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. 
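The loss block above is the standard next-token objective: logits at position t are scored against the label at position t + 1, so both tensors are shifted by one before the flattened cross-entropy (with -100 as the ignore index). A compact, runnable restatement of just that step with a toy vocabulary:

import torch
from torch.nn import CrossEntropyLoss

vocab_size, bsz, seq_len = 11, 2, 6
logits = torch.randn(bsz, seq_len, vocab_size)
labels = torch.randint(0, vocab_size, (bsz, seq_len))
labels[0, :2] = -100                                    # e.g. prompt tokens excluded from the loss

shift_logits = logits[..., :-1, :].contiguous()          # predictions for positions 0..T-2
shift_labels = labels[..., 1:].contiguous()              # targets are the *next* tokens 1..T-1
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
print(float(loss))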
- - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - @classmethod - def pipeline_layers(cls, config: CollieConfig): - """ - Get layers of pipeline. - :return: list - """ - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - - if config.tie_word_embeddings: - output = TiedLayerSpec( - "embed_tokens", - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - else: - output = LayerSpec( - dict_as_params(input_keys="hidden_states", output_keys="logits"), - ColumnParallelLMHead, - config.hidden_size, - config.vocab_size, - bias=False, - ) - - return [("model", MistralModel.pipeline_layers(config)), ("lm_head", output)] - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def load_parallel_state_dict( - path: str, - config: Union[CollieConfig, str], - process_exclusion: bool = False, - protocol: str = "file", # 指定加载state_dict时使用的协议 - **kwargs, - ): - """ - Load state_dict from ``path``. - The format of pretrained model should be the same as that of - `huggingface`. - :return: state_dict. Note that the state_dict should be processed - properly to match the current rank. 
- """ - # 配置加载 - if isinstance(config, str): - config = CollieConfig.from_pretrained(config) - # IO驱动初始化 - io_driver = IODriver.from_protocol(protocol) - # 检查文件路径是否存在 - if not io_driver.exists(path): - raise FileNotFoundError(f"folder {path} not found.") - # 初始化存储和处理变量 - state_dict = OrderedDict() - weights = [] - parts = None # 变量用于存储模型分割的部分信息 - # 如果开启了进程互斥,那么每个进程都会显示进度条,否则只显示 RANK0 的 - hide_progress = not process_exclusion and int(os.environ.get("RANK", "0")) != 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 dist.get_world_size() 次循环 - rank_order = range(dist.get_world_size()) - else: - # 不开启只进行一次循环 - rank_order = range(1) - # 权重文件加载和处理 - for rank in rank_order: - # 如果开启了进程互斥,那么只有对应 RANK 的能进入循环;不开启进程互斥的话就都可以进 - if int(os.environ.get("RANK", "0")) == rank or not process_exclusion: - # PP 分层的方法保存在了 os.environ["COLLIE_PP_PARTS"], 格式类似于 [0, 17, 35], 左闭右开 - if env.is_pipeline: - # 保存的是 json 格式 - parts = env.pipeline_parts - if hasattr(config, "num_key_value_heads"): - # llama2 (transformers >= 4.31.0) - num_key_value_heads = config.num_key_value_heads - else: - num_key_value_heads = config.num_attention_heads - head_dim = config.hidden_size // config.num_attention_heads - # 如果存在 pytorch_model.bin.index.json 文件的话,此时不同的 pp 进程可以按需加载自己需要的权重 - if ( - io_driver.exists(os.path.join(path, "pytorch_model.bin.index.json")) - and "COLLIE_PP_PARTS" in os.environ.keys() - ): - weight_map = json.loads( - io_driver.load( - os.path.join(path, "pytorch_model.bin.index.json"), mode="r" - ) - )["weight_map"] - # layers 表示自己需要的层 - layers = env.pipeline_layers_idx - # 筛选出形似 model.layers.0 这样的层。包含两个条件:1. 有数字的层;2. 数字加一要在 layers 里面(因为最开始还有个 embedding 占一层) - weights.extend( - [ - value - for key, value in weight_map.items() - if len(key.split(".")) > 2 - and key.split(".")[2].isdigit() - and (int(key.split(".")[2]) + 1) in layers - ] - ) - # 去重 - weights = list(set(weights)) - # 继续筛选,如果有 0 层,那么就要加载 embedding;如果有最后一层,那么就要加载 lm_head;如果有倒数第二层,那么就要加载 norm - if 0 in layers: - weights.append(weight_map["model.embed_tokens.weight"]) - if max(parts) - 1 in layers: - weights.append(weight_map["lm_head.weight"]) - if max(parts) - 2 in layers: - weights.append(weight_map["model.norm.weight"]) - else: - # 如果没有 pytorch_model.bin.index.json 文件的话,那么就加载所有的权重 - weights = [ - weight - for weight in io_driver.list(path) - if weight.endswith(".bin") - ] - with progress( - weights, - desc="Loading state dict", - total=len(weights), - disable=hide_progress, - ) as pbar: - for weight in pbar: - part_state_dict = io_driver.load( - os.path.join(path, weight), mode="rb" - ) - # for key in list(part_state_dict.keys()): - # if "attention.wqkv.weight" in key: - # # qkv_weights = part_state_dict.pop(key) - # qkv_weights = part_state_dict[key] - # print(qkv_weights.shape) - # (wq, wk, wv) = qkv_weights.split( - # [ - # config.hidden_size, - # config.num_key_value_heads * head_dim, - # config.num_key_value_heads * head_dim, - # ], - # dim=0, - # ) - # wq_name = key.replace("wqkv", "wq") - # wk_name = key.replace("wqkv", "wk") - # wv_name = key.replace("wqkv", "wv") - # part_state_dict[wq_name] = wq - # part_state_dict[wk_name] = wk - # part_state_dict[wv_name] = wv - state_dict.update(part_state_dict) - del part_state_dict - if parts is not None: - # 这一步是 pp 的复筛 - layers = env.pipeline_layers_idx - for key in list(state_dict.keys()): - if key.startswith("layers"): - layer = int(key.split(".")[1]) - if layer + 1 not in layers: - state_dict.pop(key) - # if key.endswith("tok_embeddings.weight"): - if 
key.endswith("embed_tokens.weight"): - if 0 not in layers: - state_dict.pop(key) - if key == "norm.weight": - if max(parts) - 2 not in layers: - state_dict.pop(key) - # if key.endswith("output.weight"): - if key.endswith("lm_head.weight"): - if max(parts) - 1 not in layers: - state_dict.pop(key) - # 根据用户配置的新的 tp size 进行分割 - for key in list(state_dict.keys()): - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - #"o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - #"down_proj.weight", - "embed_tokens.weight", - ] - col_split = any([key.endswith(filter) for filter in col_filter]) - - if col_split: - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=0))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - tensor = ( - list(torch.chunk(state_dict[key], config.tp_size, dim=1))[ - env.tp_rank - ] - .detach() - .clone() - ) - del state_dict[key] - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - state_dict[key] = tensor - if dist.is_initialized() and process_exclusion: - # 如果选择了进程互斥,那么本次循环中不需要加载权重的进程需等待 - dist.barrier() - return state_dict - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - **kwargs, - ): - ... - - @staticmethod - def save_parallel_state_dict( - state_dict: dict, - path: str, - config: CollieConfig, - process_exclusion: bool = False, - protocol: str = "file", - ): - """ - Save state_dict to ``path``. - The format of saved state dict should be the same as that of - `huggingface`. - """ - io_driver = IODriver.from_protocol(protocol) - # gather to tp rank 0 - if dist.is_initialized() and process_exclusion: - # 如果启动了进程互斥,则要进行 pp_size 次循环 - rank_order = range(config.pp_size) - else: - # 不开启只进行一次循环 - rank_order = range(1) - dst = parallel_state.get_tensor_model_parallel_src_rank() - with progress( - rank_order, - desc="Saving model", - disable=int(os.environ.get("RANK", "0")) != 0, - ) as pbar: - for rank in pbar: - if env.dp_rank == 0 and (env.pp_rank == rank or not process_exclusion): - for key in sorted(list(state_dict.keys())): - tensor_list = None - if env.tp_rank == 0: - tensor_list = [ - torch.zeros_like(state_dict[key]) - .to(state_dict[key].dtype) - .cuda() - for _ in range(config.tp_size) - ] - dist.gather( - state_dict[key].cuda(), - dst=dst, - gather_list=tensor_list, - group=env.tp_group, - ) - if env.tp_rank == 0: - col_filter = [ - # "wq.weight", - # "wk.weight", - # "wv.weight", - # "wqkv.weight", - # "w1.weight", - # "w3.weight", - # "tok_embeddings.weight", - # "output.weight", - "q_proj.weight", - "k_proj.weight", - "v_proj.weight", - #"o_proj.weight", - "lm_head.weight", - "gate_proj.weight", - "up_proj.weight", - #"down_proj.weight", - "embed_tokens.weight", - ] - col_split = any( - [key.endswith(filter) for filter in col_filter] - ) - - if col_split: - state_dict[key] = concat_tensor(tensor_list, dim=0) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - - elif key.endswith("o_proj.weight") or key.endswith("down_proj.weight"): - state_dict[key] = concat_tensor(tensor_list, dim=1) - - if process_exclusion: - # CPU 内存回收(速度很慢) - gc.collect() - # 似乎不需要? 
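load_parallel_state_dict re-shards a full Hugging Face checkpoint for the configured tensor-parallel size: weights feeding column-parallel layers (q/k/v_proj, gate_proj, up_proj, embed_tokens, lm_head) are chunked along dim 0, the row-parallel o_proj and down_proj along dim 1, and each rank keeps only its own slice; save_parallel_state_dict reverses this with dist.gather plus concatenation. The same idea in isolation, with hypothetical shapes:

import torch

hidden_size, tp_size, tp_rank = 16, 4, 1

full_q_proj = torch.randn(hidden_size, hidden_size)      # column-parallel: split output rows
full_o_proj = torch.randn(hidden_size, hidden_size)      # row-parallel: split input columns

q_shard = torch.chunk(full_q_proj, tp_size, dim=0)[tp_rank].clone()
o_shard = torch.chunk(full_o_proj, tp_size, dim=1)[tp_rank].clone()

assert q_shard.shape == (hidden_size // tp_size, hidden_size)    # (4, 16)
assert o_shard.shape == (hidden_size, hidden_size // tp_size)    # (16, 4)
# Concatenating the shards back along the same dims recovers the full matrices,
# which is what the save path does after gathering them from all tp ranks.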
- # state_dict_keys = state_dict.keys() - # for layer_id in range(config.num_layers): - # qkv_names = [None, None, None] - # for key in state_dict_keys: - # if f"layers.{layer_id}.attention.wq.weight" in key: - # qkv_names[0] = key - # elif f"layers.{layer_id}.attention.wk.weight" in key: - # qkv_names[1] = key - # elif f"layers.{layer_id}.attention.wv.weight" in key: - # qkv_names[2] = key - # qkv_name = qkv_names[0].replace("wq", "wqkv") - # state_dict[qkv_name] = torch.cat( - # [ - # state_dict.pop(qkv_names[0]), - # state_dict.pop(qkv_names[1]), - # state_dict.pop(qkv_names[2]), - # ], - # dim=0 - # ) - - if env.tp_rank == 0: - # Save gathered weights - if env.is_pipeline: - ckpt_name = f"pytorch_model-{env.pp_rank + 1:05d}-of-{config.pp_size:05d}.bin" - total_size = 0 - weight_map = {} - for name, weight in state_dict.items(): - weight_size = weight.numel() * dtype_byte_size( - weight.dtype - ) - weight_map[name] = ckpt_name - total_size += weight_size - index_dict = dict( - total_size=total_size, weight_map=weight_map - ) - index_dicts = [None for _ in range(env.pp_size)] - dist.gather_object( - index_dict, index_dicts if env.pp_rank == 0 else None, group=env.pp_group - ) - if env.pp_rank == 0: - total_size = 0 - weight_map = {} - for _index_dict in index_dicts: - total_size += _index_dict["total_size"] - weight_map.update(_index_dict["weight_map"]) - merged_dict = { - "metadata": {"total_size": total_size}, - "weight_map": weight_map, - } - io_driver.save( - json.dumps(merged_dict, indent=2, sort_keys=True) - + "\n", - os.path.join(path, "pytorch_model.bin.index.json"), - ) - - else: - ckpt_name = f"pytorch_model.bin" - ckpt_path = os.path.join(path, ckpt_name) - io_driver.save(state_dict, ckpt_path) - if dist.is_initialized() and process_exclusion: - dist.barrier() - if env.rank == 0: - config.save_pretrained(path, protocol=protocol) - dist.barrier() - - -@add_start_docstrings( - """ - The Mistral Model transformer with a sequence classification head on top (linear layer). - - [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - MISTRAL_START_DOCSTRING, -) -# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL -class MistralForSequenceClassification(MistralPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = MistralModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - 
loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) From 0068f2395de5886ede219eef283e2be5c0c7bc75 Mon Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Mon, 6 May 2024 15:45:14 +0800 Subject: [PATCH 14/16] Add tests for mistral --- tests/models/mistral/test_generation.py | 37 +++++++++++++++++++++++++ tests/models/mistral/test_raw.py | 22 +++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 tests/models/mistral/test_generation.py create mode 100644 tests/models/mistral/test_raw.py diff --git a/tests/models/mistral/test_generation.py b/tests/models/mistral/test_generation.py new file mode 100644 index 0000000..ac06de8 --- /dev/null +++ b/tests/models/mistral/test_generation.py @@ -0,0 +1,37 @@ +import sys +sys.path.append("../../../") + +from transformers import AutoTokenizer, GenerationConfig + +from collie.models.mistral2 import MistralForCausalLM, MistralConfig +from collie import CollieConfig, env + +model_name_or_path = "mistralai/Mistral-7B-v0.1" + +tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) + +config = CollieConfig.from_pretrained(model_name_or_path) + +config.dp_size = 1 +config.tp_size = 2 +config.pp_size = 2 +# config.architectures = ["MistralForCausalLM"] +print("------------------------------") +model = MistralForCausalLM.from_pretrained(model_name_or_path, config=config).cuda() +model.eval() +print("------------------------------") +prompt = "Llama is a" +# prompt = "Q:What do we eat for tonight?A:" +inputs = tokenizer(prompt, return_tensors="pt") +print("inputs:") +print(inputs) + + +gen_config = GenerationConfig(max_new_tokens=256, early_stopping=False, eos_token_id=2) + +outs = model.generate(inputs["input_ids"].cuda(), generation_config=gen_config) +if env.local_rank == 0: + print("outs:") + print(outs) + print("last:") + print(tokenizer.decode(outs[0], skip_special_tokens=True)) \ No newline at end of file diff --git a/tests/models/mistral/test_raw.py b/tests/models/mistral/test_raw.py new file mode 100644 index 0000000..1f50b39 --- /dev/null +++ b/tests/models/mistral/test_raw.py @@ -0,0 +1,22 @@ +import sys + +import torch + +sys.path.append("../../../") + +from transformers import AutoTokenizer, GenerationConfig, AutoModelForCausalLM +from collie.models.mistral.modeling_mistral import MistralForCausalLM + +model_name_or_path = "mistralai/Mistral-7B-v0.1" +tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) +model = MistralForCausalLM.from_pretrained(model_name_or_path).cuda() +model.eval() +prompt = "Llama is a" +# prompt = "Q:What do we eat for tonight?A:" +inputs = tokenizer(prompt, return_tensors="pt") +print(inputs) +gen_config = GenerationConfig(max_new_tokens=256, early_stopping=False, eos_token_id=2) +outs = model.generate(inputs["input_ids"].cuda(), generation_config=gen_config) + +print(outs) +print(tokenizer.decode(outs[0], skip_special_tokens=True)) \ No newline at end of file From 2c3af7fc62ed102e0a9fd5f4dfee7191e9560431 Mon 
Sep 17 00:00:00 2001 From: LinqiY <100989140+LinqiY@users.noreply.github.com> Date: Thu, 16 May 2024 10:51:20 +0800 Subject: [PATCH 15/16] Add MistralConfig --- .../models/mistral/configuration_mistral.py | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 collie/models/mistral/configuration_mistral.py diff --git a/collie/models/mistral/configuration_mistral.py b/collie/models/mistral/configuration_mistral.py new file mode 100644 index 0000000..ad6691b --- /dev/null +++ b/collie/models/mistral/configuration_mistral.py @@ -0,0 +1,155 @@ +# coding=utf-8 +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Mistral model configuration""" + +from transformers.configuration_utils import PretrainedConfig +# from transformers.utils import logging +from collie.log.logger import logger + + +# logger = logging.get_logger(__name__) + +MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json", + "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json", +} + + +class MistralConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an + Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1. + + [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) + [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MistralModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 14336): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. 
When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to `4096*32`): + The maximum sequence length that this model might ever be used with. Mistral's sliding window attention + allows sequence of up to 4096*32 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention window size. If not specified, will default to `4096`. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+
+    ```python
+    >>> from transformers import MistralModel, MistralConfig
+
+    >>> # Initializing a Mistral 7B style configuration
+    >>> configuration = MistralConfig()
+
+    >>> # Initializing a model from the Mistral 7B style configuration
+    >>> model = MistralModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "mistral"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=4096 * 32,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        sliding_window=4096,
+        attention_dropout=0.0,
+        attn_implementation="flash_attention_2",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+
+        # Call the parent class's __init__ to handle the shared arguments
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )

From 86445698f75f94acdad180216993d2eaa1537954 Mon Sep 17 00:00:00 2001
From: LinqiY <100989140+LinqiY@users.noreply.github.com>
Date: Sat, 18 May 2024 09:27:21 +0800
Subject: [PATCH 16/16] Update __init__.py

---
 collie/models/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/collie/models/__init__.py b/collie/models/__init__.py
index 9a11a47..a60817e 100644
--- a/collie/models/__init__.py
+++ b/collie/models/__init__.py
@@ -6,3 +6,4 @@ from .chatglm2 import ChatGLM2ForCausalLM
 from .moss_moon import Moss003MoonForCausalLM
 from .internlm2 import InternLM2ForCausalLM
+from .mistral import MistralForCausalLM
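
A note on the tensor-parallel splitting rule used by `load_parallel_state_dict` and `save_parallel_state_dict` above: column-parallel weights (`q_proj`, `k_proj`, `v_proj`, `gate_proj`, `up_proj`, `embed_tokens`, `lm_head`) are sharded along dim 0, while row-parallel weights (`o_proj`, `down_proj`) are sharded along dim 1. The snippet below is a minimal standalone sketch of that rule; the tensor shapes, `tp_size`, and `tp_rank` values are assumptions chosen for illustration, not values taken from these patches.

```python
import torch

# Illustrative sizes only (assumed for this sketch, not taken from the patches).
hidden_size, intermediate_size = 4096, 14336
tp_size, tp_rank = 2, 0

state_dict = {
    "model.layers.0.self_attn.q_proj.weight": torch.empty(hidden_size, hidden_size),
    "model.layers.0.self_attn.o_proj.weight": torch.empty(hidden_size, hidden_size),
    "model.layers.0.mlp.up_proj.weight": torch.empty(intermediate_size, hidden_size),
    "model.layers.0.mlp.down_proj.weight": torch.empty(hidden_size, intermediate_size),
}

# Column-parallel weights are split along dim 0, row-parallel weights along dim 1,
# mirroring the col_filter logic in load_parallel_state_dict.
COL_SUFFIXES = (
    "q_proj.weight", "k_proj.weight", "v_proj.weight",
    "gate_proj.weight", "up_proj.weight",
    "embed_tokens.weight", "lm_head.weight",
)
ROW_SUFFIXES = ("o_proj.weight", "down_proj.weight")

for key, tensor in state_dict.items():
    if key.endswith(COL_SUFFIXES):
        shard = torch.chunk(tensor, tp_size, dim=0)[tp_rank].detach().clone()
    elif key.endswith(ROW_SUFFIXES):
        shard = torch.chunk(tensor, tp_size, dim=1)[tp_rank].detach().clone()
    else:
        shard = tensor
    # e.g. q_proj -> (2048, 4096), o_proj -> (4096, 2048) for tp_size=2
    print(key, tuple(shard.shape))
```

At save time the same rule runs in reverse: the shards gathered from every tp rank are concatenated back along dim 0 or dim 1 (via `concat_tensor`) before the merged tensors are written to the checkpoint files.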