From 6ef19d65010164a7cc8408663eb189b64f44d26a Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Fri, 17 Sep 2021 18:31:12 +0000 Subject: [PATCH] Add early support for more sregs --- ptx/lib/zluda_ptx_impl.bc | Bin 31224 -> 31940 bytes ptx/lib/zluda_ptx_impl.cl | 13 +++++ ptx/src/test/spirv_run/lanemask_lt.ptx | 25 ++++++++++ ptx/src/test/spirv_run/lanemask_lt.spvtxt | 45 +++++++++++++++++ ptx/src/test/spirv_run/mod.rs | 1 + ptx/src/translate.rs | 56 ++++++++-------------- 6 files changed, 103 insertions(+), 37 deletions(-) create mode 100644 ptx/src/test/spirv_run/lanemask_lt.ptx create mode 100644 ptx/src/test/spirv_run/lanemask_lt.spvtxt diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc index 175f4df8c13942df8c53e9608518b38a9e6eab69..7aa12c831a7188b41031fcbc65c662a774f9a9df 100644 GIT binary patch literal 31940 zcmeI54OClYy6>M&((C{sNegX5^WhHE7PYnkDoqM#Al1QI$Fx)D;@FV{2tvz82$Tk9 z^z5CaggVrTBYLr?FCR;D*5u1m_Wya`cfZ-$2~cbrJbqIezybg$m*|$?dCwoK9{95VF6GGb z4MffY&`JRAN&`qv&_V(LECB$L$$X?}M0mSMo@4kK;gMXsK%A}-{-m5%-I`GRv*h9_ zc}|*Mthgg7$I($@Kcvmvu*lmfFZx+R<%sxqX_V=<4(bge$Kg?2pLM-2`^SCT9-&?z z0f0Rez)b*gSkB?DnH__zrT()xCX_bW;)^RlnlB^*8w5{0BlwOs0je& z_cJbBx^(FhF)aWo>=AogC~p((ZL4;-*}bh|vMI}(rz4%!ZhMb+SGBiU;cbP}MVd3c zDi~HxX;sI1RmVVcT&o^uG>3~cu**sA$|C7%Z?oN-XYo4pUc1Gc*XMO8$Tlb0TJ8Q( z3F+t|n_&W|@k#YDS~I0pO%R&5htx-UQ;um>rwKK*`MhmeB;8H6iM@@b-aMLYb&_pa z-o_s9E}u6~?rqbNO<8QU))Ml8ED~gNiboD_wm+`;>e8i4{{j=>gQU}f4An74b%fBI z=}kE^q&}rpjrOXJ4XP%3Q-&GM@m}@YR@Et@hirYu>i}_+&ug#tI;sUur?=Towq=p6 zFg~ds?^VH5>bHl~$7uE8N!2i;J~pU1Y*n3}R8RG$oYSgD7)^kn1+?9}t6I?J^V(_B zSwim8d-JNjcD7S)xXLxBwW`;w>SLgpnpB_eRlUY&ju&Z;8C7S7SYuPYDW|mR^Svr4 zfV_jkPuW=!^oc+Pi;TnrMvp1q=Qj+R3j2!+2mA#dx1r0V>=qRC7wH53{C?B=PJaQM zr(~a-rC^nQCYOF&LBUaz@~At1)THk=8TuK+ph@Ww6dp9?AMoo(1xDFtX4y%QpQ2=w z0qMzrXnI=u(S-E8S^60a`11!$2H22)z@6VuUF&6Nj?3#uW;;FDp4$TfIVf#53`P+?eXBE~W^8)Tiv#q(kp* zw0rY{n^f>K+1%~5kI9#^OQDBsF_LW-vN<4{mIuEGk~)w%ddMbnyU%^}jHZ&d_l!vesavgJuI7 zeeAa9Fms7JEo2+Tey+2~cFNgrv7ZB?FXW^>Shf*JhmrlR1IT8Y4YnIeXFzmL#|F0; z$vRy!duVym&VcCGayaLHpKIlcEFw0)#aCNF}6|V7h);qksS!Ai>D8guxVA``9xB40|{rD3%cu zqCsEMeNpV?M%pV^N80=7r;+vo&!i-U+k5$_XhFEWN4C2Y!tA|!@5@C*n7xO7f2=2b z^d2j(Gl!4fi!bG6g^%9edzES7qi6X|q`e3JBhudIJ0k6+h$4@h@oGZC}M#kcoHuAiOP^+&ARjRm_S7vw9?1R_`3&5NU~@ZS_iF5TMi zS4XbN(@BwQ(p474WZ|kPCLfDp^8J^hn0#?bR85JMzT&=N~Cj5_aM|r%+AN;U~^b54lyk@Dt}uqfa0TKXLAnItIf}oWp-E zk}AT_?i<#R5Q4C?`?gU&P1bgG2ol3hKD(%qBElYmhyO>0bb0tl z*4%N35r$3o#$9{#zVMNJK^Mj3%_|~JzI!B!$sg~FVsb~S!xC;Xsj*nPD%@n#cSo27 zVJ4p&Pc-y}n|!}OOD+i;$>)x)oEi)t$(JvARHfk~d92STNDd##HA^B*KJ-QulTY6h z#pKP&ktTm$vye;=GkN{C8$X}~VblHCi?^yO!%d$5w+^=?+~hx&H2T6v@`K3?X-@b^ zzOe03ktl2=5C7sP`jPOFd}U`8lhdmsO)fqW#pH%pqL@6mv~e)pWX6{nQhm6|m&*OIt0@2k$iAjq{;H{Wk#BOWqTBp zwkRh5`NAWutnrywx%U4U8D=k0XI{SEeR0_n!OdpI2dl~?RF6&nRq z@De`s!Cs0t)Y*3c7>SuCx4)cHpTcI(H2L%2Z+@AO1Igt=@S^>zyhE0h@OJ)lVDb%u z9iS`PS(mjQ64+;Ep%3Ry_MVffO1B^w=2ngh*xMVO zg2GW}XS1FBk(g{P@!IWVn_bZ2^x7$J`shOGS)X)LC#%=VPE1Ru&C*=?A@y;sYD}vh zwrZ+$d)T`y0h6I$P;k&);1T4HPN%h4$RAac>)3lsD@(p z%f#MBh1b#TZS7`Ootu`Pn~;7!C_Sqal#j5tg+e!nRO43FVXf*AqdsgE{U0BD6C!jg z!Z6_1cbW7Zg8VN3nuG3wi>~~mg8V+ed)^p6$VU(i^By|ru+_n zexJYd)g*6gw|AGq+bm{pvpKq%Z71Z?vxCx;X6fH`nL}?EX{IKZj!mkM6Pl`$^!z?| zVZU4H@#{NHhEadPz>s0YpMO+PFj~^zC?;DhWMl9)VftTQlb*H6KA8|rQ?ipf_NLbt zW@)`#dO{~XsgwPELiEvubjmEfpv(N=td+f!H#WKStX6$Wt3ISvk7?DXwbxWpKhHmC zDjX0Py4;4*(t~z8xywnmiOIGS?=HJH{q=RS&veqW)6x?F&`JT^2LKLD31#Y#q$fIZ zN~lbx^N~m1+R!7-t9O>Ku4t%Rv&C9zYpB1=ZmTyn z)Y*47+t$?9w$$BdcQkCbRWwtMhKd{4uFhSpqjIuy3^!3#Ifk{lReAcgYjf9C=2T>7 zug$gIlxJI;t*^{6R8{1a=jLXw)ve39uBN`Zp{~Bh-q38TZ>Cs-->a#!*V-u48q*Co zP+NAR$Cdc0=enXP* z&Zu0HD|}62$~NRmgeOOJTOZ5y+s#KL{+)(giTLEGuJVb@OeXPg$KF5x=Fh+UWbgA2 zzcBi0=Z$A;>(ho>x3sx$Xi`=*wJIx`es{y8|9Z!dnv~blTdWl??P$6^^M4y6RIl}>oGxNd^Hc2Pvg>on+7lM(hi2I% zC7WUwzwAS^^n4%9kgZw4v(o8p46Vsh`+mQ^pU6MpF6{Cr>`;wqRpXPYaY{91Wj~*0 zG{aU^pjR~#T1doOatLmGVGcm94sewU!I$6e56();{`-8XL1kC_bJ?gDmu@E18` zf1_j{+ohjPOFsu0d8>ZE$uJ-&ROu$|(vuU?^8wM{CfNI%lR6n0VDHX8@8{Nh{}6kd zv%g3`>SsQco-<49^tO=t&k89`csCC zqLr(3@5`y-fg*>TpcDzZ&K1I;Y8Q;=h#B2J9Vmr|*c&$haIc7Z2KQ>7^>hkBTUt<~aMCAHQ#QzRjP%-*Eime16QIzu#;={=XZ)I6uD4KmN|I zzu*VY>$Um)?`zh}vf%ZziMB$D`#zlHb}0GD0uywzZ+HR-&ev>@H)xT3 ztH%TDx7(l5`t9~do4?)u2F>4Yf3W%6?a#RR+wE_D{vfXV%eOv%^ZRFf=Wl-hjO%#L z@1OA<&-ur{_>SlN{u$r#oZmm=Jf7e1dXW8@k5r*4iTNeysuEpHk`eY%f>-Y|8%4Bw zU5O+EgeGWz%R?#dl+*W!wDQ|hMU>#x`peJ3+NlBdxfc5zjD3A0xgGTE`;plX_WAPF z>x(Mpw!V0d{U^<_e^2q;j!%2}+}7`yWB(7$vH$t;|N8wQKI8A1q<(#I7Ho{U>()`0PKi z>&IvRiCsTF>pyn=`0PKi>&IvRiCsTF`%mop^W)F%Ke5LrKI4Dzm#`uD{7WaNSB%gv zE=}_h)vu>4a*JU|4&`gL;@eV%mdA_4p0_;22>l}aj() z`0PKi>&IvP$F3ir{U>()`0PKi>(BZ6kIVk^HU8tW|HrQX-(&p$G{^f-?E3N9e`42< z&;Ap;ethKGdG zPzvXplu#LZ-HQEXE~&zz|9YQd)-AA3xR1y>BGx6F_sdU{}LLo%o?*d%{x5kH6EqQn-s5TDq#u^z=&Mzq3En$-b_(Fe2}kgm;5^`-m*N z_`4GcE?M?kfq7g$zCc_SIGys-C*@~F+HCGMx{jHz(P0lo?8|)niN&?tF|ObTm^9>^ z=K@yQw_0X=XT-NzBurrAZBAQ-(v zmuHyIGRLyGm6T+IRaNrTs!b;f%**fGYWrd7uj;ma_<%(&DmSw)NOJwLW0@*s5*jIhPWESF zrSc399F-{OBXq(=i5}jgg$Ghf>A%pT(G({fr^V+}y6Mw&(thaxd`wFYNyq3awkTELH!Z9YD(LBHK z0-<84qYD)-4U=(kp+4a{X8D0dr3tH;72}JXt~E?nS88`cK6CwO>VWHJW|b#xEMX(F z`e51x*PV>+e40XV50mr!V!eACvvz86si2z4+rPx=-ofaPF6kCDFzY*)4!GNxfPCk}08=b$gy=HufzW@I23K znpigG`7QHZpYnp|4~%(Gso4Drvt?ST-#x-??NOEPev|orK;_(hgej%f-Mimoe$b~L z*gei{n^2GK{ws5zPjg}S8OAcGQM7-;luv8)?H8EJo@+|mzhtTc*Erk1VybDXyFJmh zy^k7bm$+&tsIhjLtKO%*(7w!NAJi(mYL{bLtM_KPntL)zy*Id80vS&4YL}DF?DiU5 zZGD*o-U8Q;Co;#px40hkEx+L1=%R_i{}DnsY-NlsUDark}qgN?@z{64F}-x0PPf?u=$$A8GTaSRptLmWp%&c2Ax)<8vm zisPur(}++!D)M5EqawF+92NPK97jby%5hZW|KK<(@~i}a8v!mW^1C^XirmF zXg#AMf0*N_$PaQH75N#Cqas(3q57!EZ{s*B@@9^sB7chGsK^5xM@9Y>$5D~55{LYt zBL4x$QIU6uL*t2x{AG@#BL9HnsK{lBp*pC@3pkF7yoTeb$RFo8Dsn%^QISt`92L1b zDdYzg`9_YTBHzVvROAOZj*9#(j-w*~isPurZ%7XLK}BB5aa82HIgX0_MUJB)|7&vS zu?ZFVLP?0DA~$gy75Pq%qayz`$5D~L$8l8T$tj`Zq9QNkI4bf6j-w*)JuzqarWmI4bfz97jd| z2acm6|Cr;b$W=>1$3;bcC&y8dKge-Zls=>V4%`54Dhk$=i@ROCs^LUmA)XL1}Bc>%{!k>9-xfKT>s(w0kO6R6Fqat^592I$|GW6bXROIZ(57}P_N(P@(f6)K{Wo2!(J1eba_GV`p z`^%Kd%B;-|b!84)t<`C(EURd?mfdfww>fGm%4+H>Vnkaj8h6$>Y&@eKwpyFDiEp&E zqLIzuUpwgge*4Z@_~sXlzWGI?Z+_7@-c7Adv+&I?8h!JNM&JCR(RXm2)Hak`Yxypd zs|BNLzLnVoe!=KDI5StXflHfLFnZ<{j04Rp7(EAPg|BB`!RVP+FnZ<{jGoyUY^t_8 zY?VCM##MvSGvCT=0>5DN%udQxZOmL7R}Dtbd@Bb%^9n}K?4(@P272ZdjGlP~&s-Z< zwSk_4XE*owQfZsb<4f#lbj`aqo53?0eFslk-oE)oqi=rEINbcA(Rc8~GXCjGp;cW)t`Yqi1$fu4)6rSSu^b zs&aE?af$!+88Ziw_c4Mk@OOoM1lRbi`rw~|eeln~Rm?vF`v|W4S@pp`1N-2gfqn4L zz&?UA7}`d@ev#l3=beCE@J_%ccqd>F!P%Ql54;nw2i^(T1MdXvAvklh>4A3w_P{#< zd*GdbJ+KqUZD{=060Zd8VKzr#llUiK5A4M8vV`XY?*#0DcLL4>?*#0Dow!-`z&inZ z;GKXy@J_%Uf}6n1{WWjxY#;ApW?&byItH6Hiwx|8oxIug!9N50;Gcoh!9N50U?*^P zeeln~KKN%~AN(`0kKmlo*k@xd5F?8MF7XL+q9UJ2O4Y>vPt@lU`W*oot13C{=K3D^Vg1e^!n z3D^TWakJ`ycLMgnI{|y(lTcn|E87|EX)RdJ+&2ZRS z*cA2;fvhgCX?EDyzawH>T@$rRFgR~*L~yjq){5qu7F(UQX-DKEd6UC-e_2IsL&c6@ zDQjKj{T21An^xD^>Y8inY%vmQt@WXLwat+cWo6c;CYz(V%34!9vuIs;MSb(?+J;D0 zT{Dw4l~)D3p2>F3WDc9Pvdm$-zox0#=2%^mw>I>LR_?E9=KY6KLVez7t*d1JRX16$ Oe2H&$1^Z@m0Qi4q^c4mG literal 31224 zcmeI5e_UJjz5m}KX-%>lX@z+RzP*9*DPzX3( zPJ#_~)Nksr!_Mmk*SYFLH`bw->#cV{uwxbbWv+9G*o78o?P|rY_RF=--p4sPZ-@C| z57)H+eBtpZFZ$&DexKL-^ZA@lPEG>knR}1jv>0Fk02E7gEAP1XzserH*nOvRaOFlp z`U22O0p3dnNQ~1$8~`i@0D?~4ojWMHO)O70JSAwBu38{T(TILr>{5IuF7K(tya9Ro zV!cFhdqR3~OTK+To4Rq4w^g3|R9xwx(*W&S;Yxq|dr&K)7sAg>sjdrDoI>Q0SQ!6zDF7w`K>7U@GnX!1x+Itu0trt@ zy!Di~*5z#~_cYnPjS`|Z&6}kooaG*Shqs~JTc_|g!l_)%=}r~&t0uIn(N5JUXpU*s zW3=XQt_B*ML_-?kD)-jey;&BoL+`a)yjfjdhk|Hw5{>1aNAd|r2T=zBpvK14qb|*a zRuvFv-sw{x=}a2cs!j>i(B$(rr4g=nqDkVdDfDK!h(;&Tl;*AJ@HY6oS#ocaj;Kvz zsx{^lkERhIb0sy);jJBe6klArbm>PB03Rfr60T5<(yAi@&FRjh(|zhGt!k)KHQK8R zbSCxFnq!^ncdV)@K?l*e-|GNLtw9JZ=XjjJa*lg?_@gS2K?;1as*-iC5vlh12+5zc&~LGR5f_u83Gx#23- zoYJZeS=FPUnHX1}>Qo(~HOF!_qbAkqKE~KYXVR2beZEr#M#wrS`h=MkVV4+Gu*f8M z)YNRw_W2FH=A7=_oF2ci*<)xkE8B&}?p*z_KfBw!q1A7M^OWq<0urp`IXU@J0a*pI zkA37>lWdZbxv~#>vfG5FZu5p7v$D%ShE`$DL65P!FK3_7Ff7~vc95T) zCaX-cb8>PTWFH2|lf&YV?*S~C5_Iiv5g*k10#>1dl*<#LV z_ZZrkYQ}w@o7;pgqF4M&Twnni#CNzhz-Jc!?Fy#7Z^OgTE1(2RhKY^(ZlU2>q4B6$ zdDN3VWY)Kv4c)Y%*Q{(7<{UI<@AK=2geKX!0@)}9_Fkl=u>s$RBx&_8TfA;W~LqzM3dJOGm!=PC?WZrPfYyc-zQly|TOFjr| zlr7D$C#iW0yd}2%H+xR`AYs3-Lzbl&V*V?0iSAamOfg>)rqK>PjzB&vINv8e9!x0| z0^FtmhzEe=W>dRdnjBZ0SFl;q8h?mVV}_`Ms5cQ!7NTxgJS}IxU`Z9o934b0MKtLM$FO)>$9}<*Dv$+Z zxU7P&e5-y|)`u2?t7ehaScoP$Q9mrcKryhEBC0I1FY8}05zgRoEwV;7(&1tPYJALg z;3(h{w_AuNiuqh+k!_bV-(o%wi!aCtJ6pB}2#1OJt^79zwfTs3WU?&{?n(aL%*HrPu~|eW&)tV1#$pzfn8|H zpA=V90CF&a1OQCXBhc+TzF@r^KmrtqfQcUi#g-SFl@b6E$k~S*IaGu6#p(Ve5kMSB zKmbhqdf0LI)VMefKmZ~avuBH3N-$ryyhbPh7eJheVR5}lF74C%duZlxAfOl~2#9-q z2|tKnFC*Muu{zw|hyN$s-thjUgiw309u+SLwYU2rPh5z-Ki~gqt{}wT&VL{62pzqr zi>nGkNAJLIv(iFGZ`b|G#i65T`Bk{RNB_&1#uyb z9KP0*WeRopO4+bj5;`)UOGiNvvS99cAy-L-_ELINxlj~3hQAY6xk3l0`27eb|3ewU zWOD?QZ`VXHX(uBrn|J8&^)R!+9=;Bni0}p0|4DE7!hS&$zG8PC6NWFvCms!ip8_MY z2qrH@Fga2X!Q|S6aFcnrb%w8>?ceGSTeq8xkB2YFH}(&Que6(&L|Ea!$_rn*mESK9 zUz4X2!q=p`D1ynHH4#ic9l_)Wzl~t>z|!zFx#a!4@HP4Q{SnsW-o)@V*}uIr{KQ#x zWB8ifboTM^6XzfQMG?}F6Xzv`YQhzI;@q^;qtb<*IHzlTLUHJcb1&)W4LxxVznx1e zLeK8&Hw+4dA!qkXcM=9m=-K_o-y)doQb#cPpAk%Maz-$zmeov$n#?eV|#O@8$M((3v&tAck`FxA;vqsu@~;t0x-UgAY0Zyd za_JM{CU>n|NGu99`TEM^rnu03^0|$w{7{pXZ7st1P?Ik%s-XlS55fKavVvS0I+7K) z5745J>E6`vgx(iAlCS6@n7ny)xXC{siD2@Fdn1^9IN4zdHJMP8N3IDqS^M2VdO?WE zm&W1^U7;pFFlvdVAtU+H(Q7AqLr3z}OU}p7bR^%{9>L`F+HjLgPDC)d@r?*34=$_e4K=yq;tEn9YVy^h0Ra(e zQnX7S2sQaZ=R!giI+Asn$6a9~`IB2z_Rx{M`%f)GGIS&#TOMw*_ScOzqb_EoO^FN6KeCF<%({Ht^!TndQf5b-ec8P0H@kNUm@%7u zJ$!T4fpq5LclLGoJ#X!(7XwJYa_D34wI2Zqd&SLuuvgCc%KMfxH)OtY#a>aeNl3Am z@WBt}QhZyL{b2x;V7AGJUQLpbO!jP(zxemuMiZ#GRA8^|KYPE!NCEjJV+yXTF02L7 zKkWp>t^qOq{J$gv+)Duv%snDaJC(9D{ z3a_Kx+t^O8IXg|B4UnJql4o?n;z8yXM({?4YRsxStW^!r>cdv?yFTU#bMR@ip~tUp zGwWM~*=_zC4|)=8LKLXLL&ffI7_OfNZEK>l5q+V@VbW@3EV$hi8LKvR~VlHKLW>Gmj_ z{rXn3VaRXn=`#%avyTdmL;2k`5~AKh)Uda-Qrg^^#!IqDB7XC1PkODYrU`YAc#-1-F^0&9-`5Wl5#AdMj0L zbJSLBtESdvtktd6QEBCMb@ti~H{Q6lqON><@!FDYRX1+2mfE&e-)Xm1o3~Zjx7XQj ztgNiBy20+)_K>Zlj&f`(xgm3H##$YfzAoKx6IGUO$jm6q(r0F7tS?P3S+_1T!+KMe zEpwf|G~G~El2x3Mu`W}$KK)x2)pgsdsw?c<>TK0@6l3sv6;<|18)d%HeEs#*-FdmW zWtH2kb*0<37gyQ{-YUFkEasdW6|mR)cE>g`{AwCm+3Um1F{ z^@cN*)rh-lhSFUP-7Zm_n z07y)5q!+!Ua>QHCWsa)nXpJFDLS)WTYPguy41+sBUHo+`@ z*~tR(e3y$R8q?Ub(&?=UuE|3CvwnTIAbX!Dr_CStuxdoB8XH%QQK|_m^Z695>9?we zJ5__hg(Uc+9AG7Lk9^3u&y)SEFuP5d)8R3;ne{%u!0sC}>qpFnc8`(S9{jlu*~gUZ zZ+7zBH2EpWi0|m1H5+<_IV#<_otzAi=ZD1~2bg>p)QM9^YZ1JjqMj zt0i*l7-eVyxm2LICE0Ds%!RU)W|QI$g)1qpn7Nq(0QZZj{kTu_tfy#=Z9OW+{?cOX zPo5Cleib`oTaSvdzqAsL8w#DLM*}2_<_d#eNWV#IIFYdbDYSme+H2?i!Cs8)wpq zEQ&iM?mvnI?KXLwHPZ@Nk`}>NuBEl>0EEnQmmNU;c!3$3nK$A9u=6$7<5_g#t9m@w zRqn6jD)(nfyUO#Ic9rMvD)-lMmHV5YKZxo6a@FVWD(^4z`)6#|%l!El+wr{W=igO+ ze3*ayi|u-u-#=qJp7YnAF&@uvcsC_Wex&_wS&3nCDu|b1>%hjl>-wPYgc&VxNCqwtj>QhM3lm@B}NS^&>3k znAVT*020&s5gssNT0g=Qq?p!^a00}%{`~lV{r(W6@gKOt`%l#RG1`Bk){oKt6SaPf z_MfQrW3>LG){oKt6SaPf_MfQrulV{8e**Ep+25l2pC5mAfBXFv-hZOjkJ0`UwSJ8D zpQ!a?wEsk{AEWgjwSJ7&f7JRh+JB_5@#$7cVDUOzVLKYIPx z>_5@#$7cVDUjK@(|CsEb(f!YlKeNAGyu$lW^!l;cf1=lq&HfX;er)!i==Ec>{-f8A z&H9gCKQ{YM^!l;cf1=l)AOEl4AHLc6Gru$?S=86kx4W5AINzc~%HZo(%r96;7Ull) zeTr?@z+*k95_~bt`J3sjPSnxQ^>t~|2JCm5_bL%BL^XzfU zq(8j1^IA~@-M4H_llkY@ivBY- zyi@b5{pNNyQ;KK%k}D(eeYsz9X>c5kj52S_Tyg85?=G<@ju|(b(^jH%Cqb(C%|&jD zj=1%SwzRxPykObKdVa0S2V+CiHg9WEADxz-tv}9bXZy#c-tjDCv{pz zT#~&>?SWBOLVr?^jDnC4Ze%jGu8WI!@}XAG;RslogKuw`=JE znbZ2YOED^IAOFI&+%501#?z{ooz8zDrG4UO^rm-HdM2KuH}6)Q$NB`h8^~~fL zZJAOlrv6G7@6qU|PSd4_G=)qaa81s1;Kb!oI4bhf97jdIOceBkihLu-QIXeh92NO<97jd|7ROPM&u|Oqxhy_d2Nk)IZ564lF`#Fw^e468^$kho!Kd8tzaU2zS1IJO3@8dWs z@;`DM75Nt&M@4>pV$csN@k#FNTD)LT_qayzk$5D|lAcMz6MZTT{SjSwia*Dj1 z`#&o3XE=_E{4mE+kzeFED)Ke5;BirrKge-Zk1XQAUViUk!Nrm6}gS$sK|fHaa81g;5aJsiyTKqe#7FRA5`Q; z97jdIo8zd+{TxR{{wc>%kzczcIR2=}3ptL8{0WYuB7dFZsL21uaa820rNQH(BEN&< zsK_7VI4bgAa~u_Ufa9ph7cL7P7ZrIP$5D~javT-;K8~Xz|1-x?k&6|<H1t10BOGQ4yaa81=a2yqR z!t!7pROG1~M@4SrI4bgcmIM1UpQy;UEeA*j_)-}zSp(qa;!!dFSuP$GB;tjGp;cW)k=Xqi1GPE^7nVGv8p=1~YV*t&BF<^~`Mt z*3#0VvW)aOT;hLy#_U1leGIk*{;seOc8$-e5B?e02mcIQ#r!j{4|d(psSo}c*a!a% z?1O&>_QB3za2xshMS@G5cLH|7I{};Eoq#>Cvp1I>cqd>Fyc4ho-U-+PJ9BgCfp-G- zz&inZ;GKXyFcZgZX#Cd_uLSI2E=ORJ_$Ocw%*64sgy#eA1nhx#0?q^P1nhyCxH?PQV`6OfiT;Gu4? WR+TdU%4@A(eVuJ>3G=RJfd2sj;aPhC diff --git a/ptx/lib/zluda_ptx_impl.cl b/ptx/lib/zluda_ptx_impl.cl index aca9327..d439795 100644 --- a/ptx/lib/zluda_ptx_impl.cl +++ b/ptx/lib/zluda_ptx_impl.cl @@ -296,6 +296,19 @@ atomic_add(atom_acq_rel_sys_shared_add_f64, memory_order_acq_rel, memory_order_a uint FUNC(activemask)() { return (uint)__builtin_amdgcn_uicmp(1, 0, 33); } + + uint FUNC(sreg_clock)() { + return (uint)__builtin_amdgcn_s_memtime(); + } + + // Taken from __ballot definition in hipamd/include/hip/amd_detail/amd_device_functions.h + // They return active threads, which I think is incorrect + extern __attribute__((const)) uint __ockl_lane_u32(); + uint FUNC(sreg_lanemask_lt)() { + uint lane_idx = __ockl_lane_u32(); + ulong mask = (1UL << lane_idx) - 1UL; + return (uint)mask; + } #endif void FUNC(__assertfail)( diff --git a/ptx/src/test/spirv_run/lanemask_lt.ptx b/ptx/src/test/spirv_run/lanemask_lt.ptx new file mode 100644 index 0000000..02b13ce --- /dev/null +++ b/ptx/src/test/spirv_run/lanemask_lt.ptx @@ -0,0 +1,25 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry lanemask_lt( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .b32 temp; + .reg .b32 temp2; + .reg .b32 less_lane; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.u32 temp, [in_addr]; + add.u32 temp2, temp, 1; + mov.u32 less_lane, %lanemask_lt; + add.u32 temp2, temp2, less_lane; + st.u32 [out_addr], temp2; + ret; +} diff --git a/ptx/src/test/spirv_run/lanemask_lt.spvtxt b/ptx/src/test/spirv_run/lanemask_lt.spvtxt new file mode 100644 index 0000000..0753c95 --- /dev/null +++ b/ptx/src/test/spirv_run/lanemask_lt.spvtxt @@ -0,0 +1,45 @@ + OpCapability GenericPointer + OpCapability Linkage + OpCapability Addresses + OpCapability Kernel + OpCapability Int8 + OpCapability Int16 + OpCapability Int64 + OpCapability Float16 + OpCapability Float64 + %18 = OpExtInstImport "OpenCL.std" + OpMemoryModel Physical64 OpenCL + OpEntryPoint Kernel %1 "activemask" + OpExecutionMode %1 ContractionOff + OpDecorate %15 LinkageAttributes "__zluda_ptx_impl__activemask" Import + %void = OpTypeVoid + %uint = OpTypeInt 32 0 + %21 = OpTypeFunction %uint + %ulong = OpTypeInt 64 0 + %23 = OpTypeFunction %void %ulong %ulong +%_ptr_Function_ulong = OpTypePointer Function %ulong +%_ptr_Function_uint = OpTypePointer Function %uint +%_ptr_Generic_uint = OpTypePointer Generic %uint + %15 = OpFunction %uint None %21 + OpFunctionEnd + %1 = OpFunction %void None %23 + %6 = OpFunctionParameter %ulong + %7 = OpFunctionParameter %ulong + %14 = OpLabel + %2 = OpVariable %_ptr_Function_ulong Function + %3 = OpVariable %_ptr_Function_ulong Function + %4 = OpVariable %_ptr_Function_ulong Function + %5 = OpVariable %_ptr_Function_uint Function + OpStore %2 %6 + OpStore %3 %7 + %8 = OpLoad %ulong %3 Aligned 8 + OpStore %4 %8 + %9 = OpFunctionCall %uint %15 + OpStore %5 %9 + %10 = OpLoad %ulong %4 + %11 = OpLoad %uint %5 + %12 = OpConvertUToPtr %_ptr_Generic_uint %10 + %13 = OpCopyObject %uint %11 + OpStore %12 %13 Aligned 4 + OpReturn + OpFunctionEnd diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index 0dcd0bb..d68cf17 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -210,6 +210,7 @@ test_ptx!(prmt, [0x70c507d6u32, 0x6fbd4b5cu32], [0x6fbdd65cu32]); test_ptx!(activemask, [0u32], [1u32]); test_ptx!(membar, [152731u32], [152731u32]); test_ptx!(func_ptr, [152731u64], [152732u64]); +test_ptx!(lanemask_lt, [187235u32], [187236u32]); struct DisplayError { err: T, diff --git a/ptx/src/translate.rs b/ptx/src/translate.rs index 39bd07e..15dcdd1 100644 --- a/ptx/src/translate.rs +++ b/ptx/src/translate.rs @@ -9,7 +9,9 @@ use rspirv::binary::{Assemble, Disassemble}; static ZLUDA_PTX_IMPL_INTEL: &'static [u8] = include_bytes!("../lib/zluda_ptx_impl.spv"); static ZLUDA_PTX_IMPL_AMD: &'static [u8] = include_bytes!("../lib/zluda_ptx_impl.bc"); -static ZLUDA_PTX_PREFIX: &'static str = "__zluda_ptx_impl__"; +const ZLUDA_PTX_PREFIX: &'static str = "__zluda_ptx_impl__"; +const ZLUDA_PTX_PREFIX_SREG_CLOCK: &'static str = "__zluda_ptx_impl__sreg_clock"; +const ZLUDA_PTX_PREFIX_SREG_LANEMASK_LT: &'static str = "__zluda_ptx_impl__sreg_lanemask_lt"; quick_error! { #[derive(Debug)] @@ -1015,25 +1017,6 @@ fn compute_denorm_information<'input>( .collect() } -fn emit_builtins( - builder: &mut dr::Builder, - map: &mut TypeWordMap, - id_defs: &GlobalStringIdResolver, -) { - for (reg, id) in id_defs.special_registers.builtins() { - let result_type = map.get_or_add( - builder, - SpirvType::pointer_to(reg.get_type(), spirv::StorageClass::Input), - ); - builder.variable(result_type, Some(id), spirv::StorageClass::Input, None); - builder.decorate( - id, - spirv::Decoration::BuiltIn, - [dr::Operand::BuiltIn(reg.get_builtin())].iter().cloned(), - ); - } -} - fn emit_function_header<'a>( builder: &mut dr::Builder, map: &mut TypeWordMap, @@ -4815,6 +4798,8 @@ enum PtxSpecialRegister { Ctaid64, Nctaid, Nctaid64, + Clock, + LanemaskLt, } impl PtxSpecialRegister { @@ -4824,6 +4809,8 @@ impl PtxSpecialRegister { "%ntid" => Some(Self::Ntid), "%ctaid" => Some(Self::Ctaid), "%nctaid" => Some(Self::Nctaid), + "%clock" => Some(Self::Clock), + "%lanemask_lt" => Some(Self::LanemaskLt), _ => None, } } @@ -4838,6 +4825,8 @@ impl PtxSpecialRegister { PtxSpecialRegister::Ctaid64 => ast::Type::Vector(ast::ScalarType::U64, 3), PtxSpecialRegister::Nctaid => ast::Type::Vector(ast::ScalarType::U32, 4), PtxSpecialRegister::Nctaid64 => ast::Type::Vector(ast::ScalarType::U64, 3), + PtxSpecialRegister::Clock => ast::Type::Scalar(ast::ScalarType::U32), + PtxSpecialRegister::LanemaskLt => ast::Type::Scalar(ast::ScalarType::U32), } } @@ -4846,7 +4835,9 @@ impl PtxSpecialRegister { PtxSpecialRegister::Tid | PtxSpecialRegister::Ntid | PtxSpecialRegister::Ctaid - | PtxSpecialRegister::Nctaid => ast::ScalarType::U32, + | PtxSpecialRegister::Nctaid + | PtxSpecialRegister::Clock + | PtxSpecialRegister::LanemaskLt => ast::ScalarType::U32, PtxSpecialRegister::Tid64 | PtxSpecialRegister::Ntid64 | PtxSpecialRegister::Ctaid64 @@ -4854,21 +4845,6 @@ impl PtxSpecialRegister { } } - fn get_builtin(self) -> spirv::BuiltIn { - match self { - PtxSpecialRegister::Tid | PtxSpecialRegister::Tid64 => { - spirv::BuiltIn::LocalInvocationId - } - PtxSpecialRegister::Ntid | PtxSpecialRegister::Ntid64 => { - spirv::BuiltIn::EnqueuedWorkgroupSize - } - PtxSpecialRegister::Ctaid | PtxSpecialRegister::Ctaid64 => spirv::BuiltIn::WorkgroupId, - PtxSpecialRegister::Nctaid | PtxSpecialRegister::Nctaid64 => { - spirv::BuiltIn::NumWorkgroups - } - } - } - fn get_opencl_fn_type(self) -> (&'static str, ast::ScalarType) { match self { PtxSpecialRegister::Tid | PtxSpecialRegister::Tid64 => { @@ -4883,6 +4859,10 @@ impl PtxSpecialRegister { PtxSpecialRegister::Nctaid | PtxSpecialRegister::Nctaid64 => { ("_Z14get_num_groupsj", ast::ScalarType::U64) } + PtxSpecialRegister::Clock => (ZLUDA_PTX_PREFIX_SREG_CLOCK, ast::ScalarType::U32), + PtxSpecialRegister::LanemaskLt => { + (ZLUDA_PTX_PREFIX_SREG_LANEMASK_LT, ast::ScalarType::U32) + } } } @@ -4899,7 +4879,9 @@ impl PtxSpecialRegister { PtxSpecialRegister::Tid64 | PtxSpecialRegister::Ntid64 | PtxSpecialRegister::Ctaid64 - | PtxSpecialRegister::Nctaid64 => None, + | PtxSpecialRegister::Nctaid64 + | PtxSpecialRegister::Clock => None, + PtxSpecialRegister::LanemaskLt => None, } } }