2
0

rsaz-avx2.pl 51 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982
  1. #! /usr/bin/env perl
  2. # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. # Copyright (c) 2012, Intel Corporation. All Rights Reserved.
  4. #
  5. # Licensed under the OpenSSL license (the "License"). You may not use
  6. # this file except in compliance with the License. You can obtain a copy
  7. # in the file LICENSE in the source distribution or at
  8. # https://www.openssl.org/source/license.html
  9. #
  10. # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
  11. # (1) Intel Corporation, Israel Development Center, Haifa, Israel
  12. # (2) University of Haifa, Israel
  13. #
  14. # References:
  15. # [1] S. Gueron, V. Krasnov: "Software Implementation of Modular
  16. # Exponentiation, Using Advanced Vector Instructions Architectures",
  17. # F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,
  18. # pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012
  19. # [2] S. Gueron: "Efficient Software Implementations of Modular
  20. # Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
  21. # [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE
  22. # Proceedings of 9th International Conference on Information Technology:
  23. # New Generations (ITNG 2012), pp.821-823 (2012)
  24. # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
  25. # resistant 1024-bit modular exponentiation, for optimizing RSA2048
  26. # on AVX2 capable x86_64 platforms",
  27. # http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest
  28. #
  29. # +13% improvement over original submission by <appro@openssl.org>
  30. #
  31. # rsa2048 sign/sec       OpenSSL 1.0.1   scalar(*)        this
  32. # 2.3GHz Haswell         621             765/+23%         1113/+79%
  33. # 2.3GHz Broadwell(**)   688             1200(***)/+74%   1120/+63%
  34. #
  35. # (*) if system doesn't support AVX2, for reference purposes;
  36. # (**) scaled to 2.3GHz to simplify comparison;
  37. # (***) scalar AD*X code is faster than AVX2 and is preferred code
  38. # path for Broadwell;
  39. $flavour = shift; # 1st argument: flavour, matched below against nasm/masm/mingw64 and passed on to the translator
  40. $output = shift; # 2nd argument: output file name
  41. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # a lone argument containing '.' is taken as the output file
  42. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  43. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's own path
  44. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or # look for the translator next to this script first ...
  45. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or # ... then in ../../perlasm/
  46. die "can't locate x86_64-xlate.pl";
  47. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` # probe: GNU assembler version as reported via \$CC
  48. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  49. $avx = ($1>=2.19) + ($1>=2.22); # 0, 1 or 2; the code below is generated only when $avx>1
  50. $addx = ($1>=2.23);
  51. }
  52. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && # probe: NASM (win64 only, if nothing detected yet)
  53. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  54. $avx = ($1>=2.09) + ($1>=2.10);
  55. $addx = ($1>=2.10);
  56. }
  57. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && # probe: ml64 (win64 only, if nothing detected yet)
  58. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  59. $avx = ($1>=10) + ($1>=11);
  60. $addx = ($1>=11);
  61. }
  62. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) { # probe: clang/LLVM as last resort
  63. my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
  64. $avx = ($ver>=3.0) + ($ver>=3.01);
  65. $addx = ($ver>=3.03);
  66. }
  67. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; # abort instead of writing to a handle that never opened
  68. *STDOUT = *OUT; # alias STDOUT to the pipe so that printed output goes through the translator
  69. if ($avx>1) {{{
  70. { # void AMS_WW(
  71. my $rp="%rdi"; # BN_ULONG *rp,
  72. my $ap="%rsi"; # const BN_ULONG *ap,
  73. my $np="%rdx"; # const BN_ULONG *np,
  74. my $n0="%ecx"; # const BN_ULONG n0,
  75. my $rep="%r8d"; # int repeat);
  76. # The registers that hold the accumulated redundant result
  77. # The AMM works on 1024 bit operands, and redundant word size is 29
  78. # Therefore: ceil(1024/29)/4 = 9
  79. my $ACC0="%ymm0";
  80. my $ACC1="%ymm1";
  81. my $ACC2="%ymm2";
  82. my $ACC3="%ymm3";
  83. my $ACC4="%ymm4";
  84. my $ACC5="%ymm5";
  85. my $ACC6="%ymm6";
  86. my $ACC7="%ymm7";
  87. my $ACC8="%ymm8";
  88. my $ACC9="%ymm9";
  89. # Registers that hold the broadcasted words of bp, currently used
  90. my $B1="%ymm10";
  91. my $B2="%ymm11";
  92. # Registers that hold the broadcasted words of Y, currently used
  93. my $Y1="%ymm12";
  94. my $Y2="%ymm13";
  95. # Helper registers
  96. my $TEMP1="%ymm14";
  97. my $AND_MASK="%ymm15";
  98. # alu registers that hold the first words of the ACC
  99. my $r0="%r9";
  100. my $r1="%r10";
  101. my $r2="%r11";
  102. my $r3="%r12";
  103. my $i="%r14d"; # loop counter
  104. my $tmp = "%r15";
  105. my $FrameSize=32*18+32*8; # place for A^2 and 2*A
  106. my $aap=$r0;
  107. my $tp0="%rbx";
  108. my $tp1=$r3;
  109. my $tpa=$tmp;
  110. $np="%r13"; # reassigned argument
  111. $code.=<<___;
  112. .text
  113. .globl rsaz_1024_sqr_avx2
  114. .type rsaz_1024_sqr_avx2,\@function,5
  115. .align 64
  116. rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
  117. .cfi_startproc
  118. lea (%rsp), %rax
  119. .cfi_def_cfa_register %rax
  120. push %rbx
  121. .cfi_push %rbx
  122. push %rbp
  123. .cfi_push %rbp
  124. push %r12
  125. .cfi_push %r12
  126. push %r13
  127. .cfi_push %r13
  128. push %r14
  129. .cfi_push %r14
  130. push %r15
  131. .cfi_push %r15
  132. vzeroupper
  133. ___
  134. $code.=<<___ if ($win64);
  135. lea -0xa8(%rsp),%rsp
  136. vmovaps %xmm6,-0xd8(%rax)
  137. vmovaps %xmm7,-0xc8(%rax)
  138. vmovaps %xmm8,-0xb8(%rax)
  139. vmovaps %xmm9,-0xa8(%rax)
  140. vmovaps %xmm10,-0x98(%rax)
  141. vmovaps %xmm11,-0x88(%rax)
  142. vmovaps %xmm12,-0x78(%rax)
  143. vmovaps %xmm13,-0x68(%rax)
  144. vmovaps %xmm14,-0x58(%rax)
  145. vmovaps %xmm15,-0x48(%rax)
  146. .Lsqr_1024_body:
  147. ___
  148. $code.=<<___;
  149. mov %rax,%rbp
  150. .cfi_def_cfa_register %rbp
  151. mov %rdx, $np # reassigned argument
  152. sub \$$FrameSize, %rsp
  153. mov $np, $tmp
  154. sub \$-128, $rp # size optimization
  155. sub \$-128, $ap
  156. sub \$-128, $np
  157. and \$4095, $tmp # see if $np crosses page
  158. add \$32*10, $tmp
  159. shr \$12, $tmp
  160. vpxor $ACC9,$ACC9,$ACC9
  161. jz .Lsqr_1024_no_n_copy
  162. # unaligned 256-bit load that crosses page boundary can
  163. # cause >2x performance degradation here, so if $np does
  164. # cross page boundary, copy it to stack and make sure stack
  165. # frame doesn't...
  166. sub \$32*10,%rsp
  167. vmovdqu 32*0-128($np), $ACC0
  168. and \$-2048, %rsp
  169. vmovdqu 32*1-128($np), $ACC1
  170. vmovdqu 32*2-128($np), $ACC2
  171. vmovdqu 32*3-128($np), $ACC3
  172. vmovdqu 32*4-128($np), $ACC4
  173. vmovdqu 32*5-128($np), $ACC5
  174. vmovdqu 32*6-128($np), $ACC6
  175. vmovdqu 32*7-128($np), $ACC7
  176. vmovdqu 32*8-128($np), $ACC8
  177. lea $FrameSize+128(%rsp),$np
  178. vmovdqu $ACC0, 32*0-128($np)
  179. vmovdqu $ACC1, 32*1-128($np)
  180. vmovdqu $ACC2, 32*2-128($np)
  181. vmovdqu $ACC3, 32*3-128($np)
  182. vmovdqu $ACC4, 32*4-128($np)
  183. vmovdqu $ACC5, 32*5-128($np)
  184. vmovdqu $ACC6, 32*6-128($np)
  185. vmovdqu $ACC7, 32*7-128($np)
  186. vmovdqu $ACC8, 32*8-128($np)
  187. vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
  188. .Lsqr_1024_no_n_copy:
  189. and \$-1024, %rsp
  190. vmovdqu 32*1-128($ap), $ACC1
  191. vmovdqu 32*2-128($ap), $ACC2
  192. vmovdqu 32*3-128($ap), $ACC3
  193. vmovdqu 32*4-128($ap), $ACC4
  194. vmovdqu 32*5-128($ap), $ACC5
  195. vmovdqu 32*6-128($ap), $ACC6
  196. vmovdqu 32*7-128($ap), $ACC7
  197. vmovdqu 32*8-128($ap), $ACC8
  198. lea 192(%rsp), $tp0 # 64+128=192
  199. vmovdqu .Land_mask(%rip), $AND_MASK
  200. jmp .LOOP_GRANDE_SQR_1024
  201. .align 32
  202. .LOOP_GRANDE_SQR_1024:
  203. lea 32*18+128(%rsp), $aap # size optimization
  204. lea 448(%rsp), $tp1 # 64+128+256=448
  205. # the squaring is performed as described in Variant B of
  206. # "Speeding up Big-Number Squaring", so start by calculating
  207. # the A*2=A+A vector
  208. vpaddq $ACC1, $ACC1, $ACC1
  209. vpbroadcastq 32*0-128($ap), $B1
  210. vpaddq $ACC2, $ACC2, $ACC2
  211. vmovdqa $ACC1, 32*0-128($aap)
  212. vpaddq $ACC3, $ACC3, $ACC3
  213. vmovdqa $ACC2, 32*1-128($aap)
  214. vpaddq $ACC4, $ACC4, $ACC4
  215. vmovdqa $ACC3, 32*2-128($aap)
  216. vpaddq $ACC5, $ACC5, $ACC5
  217. vmovdqa $ACC4, 32*3-128($aap)
  218. vpaddq $ACC6, $ACC6, $ACC6
  219. vmovdqa $ACC5, 32*4-128($aap)
  220. vpaddq $ACC7, $ACC7, $ACC7
  221. vmovdqa $ACC6, 32*5-128($aap)
  222. vpaddq $ACC8, $ACC8, $ACC8
  223. vmovdqa $ACC7, 32*6-128($aap)
  224. vpxor $ACC9, $ACC9, $ACC9
  225. vmovdqa $ACC8, 32*7-128($aap)
  226. vpmuludq 32*0-128($ap), $B1, $ACC0
  227. vpbroadcastq 32*1-128($ap), $B2
  228. vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
  229. vpmuludq $B1, $ACC1, $ACC1
  230. vmovdqu $ACC9, 32*10-448($tp1)
  231. vpmuludq $B1, $ACC2, $ACC2
  232. vmovdqu $ACC9, 32*11-448($tp1)
  233. vpmuludq $B1, $ACC3, $ACC3
  234. vmovdqu $ACC9, 32*12-448($tp1)
  235. vpmuludq $B1, $ACC4, $ACC4
  236. vmovdqu $ACC9, 32*13-448($tp1)
  237. vpmuludq $B1, $ACC5, $ACC5
  238. vmovdqu $ACC9, 32*14-448($tp1)
  239. vpmuludq $B1, $ACC6, $ACC6
  240. vmovdqu $ACC9, 32*15-448($tp1)
  241. vpmuludq $B1, $ACC7, $ACC7
  242. vmovdqu $ACC9, 32*16-448($tp1)
  243. vpmuludq $B1, $ACC8, $ACC8
  244. vpbroadcastq 32*2-128($ap), $B1
  245. vmovdqu $ACC9, 32*17-448($tp1)
  246. mov $ap, $tpa
  247. mov \$4, $i
  248. jmp .Lsqr_entry_1024
  249. ___
  250. $TEMP0=$Y1;
  251. $TEMP2=$Y2;
  252. $code.=<<___;
  253. .align 32
  254. .LOOP_SQR_1024:
  255. vpbroadcastq 32*1-128($tpa), $B2
  256. vpmuludq 32*0-128($ap), $B1, $ACC0
  257. vpaddq 32*0-192($tp0), $ACC0, $ACC0
  258. vpmuludq 32*0-128($aap), $B1, $ACC1
  259. vpaddq 32*1-192($tp0), $ACC1, $ACC1
  260. vpmuludq 32*1-128($aap), $B1, $ACC2
  261. vpaddq 32*2-192($tp0), $ACC2, $ACC2
  262. vpmuludq 32*2-128($aap), $B1, $ACC3
  263. vpaddq 32*3-192($tp0), $ACC3, $ACC3
  264. vpmuludq 32*3-128($aap), $B1, $ACC4
  265. vpaddq 32*4-192($tp0), $ACC4, $ACC4
  266. vpmuludq 32*4-128($aap), $B1, $ACC5
  267. vpaddq 32*5-192($tp0), $ACC5, $ACC5
  268. vpmuludq 32*5-128($aap), $B1, $ACC6
  269. vpaddq 32*6-192($tp0), $ACC6, $ACC6
  270. vpmuludq 32*6-128($aap), $B1, $ACC7
  271. vpaddq 32*7-192($tp0), $ACC7, $ACC7
  272. vpmuludq 32*7-128($aap), $B1, $ACC8
  273. vpbroadcastq 32*2-128($tpa), $B1
  274. vpaddq 32*8-192($tp0), $ACC8, $ACC8
  275. .Lsqr_entry_1024:
  276. vmovdqu $ACC0, 32*0-192($tp0)
  277. vmovdqu $ACC1, 32*1-192($tp0)
  278. vpmuludq 32*1-128($ap), $B2, $TEMP0
  279. vpaddq $TEMP0, $ACC2, $ACC2
  280. vpmuludq 32*1-128($aap), $B2, $TEMP1
  281. vpaddq $TEMP1, $ACC3, $ACC3
  282. vpmuludq 32*2-128($aap), $B2, $TEMP2
  283. vpaddq $TEMP2, $ACC4, $ACC4
  284. vpmuludq 32*3-128($aap), $B2, $TEMP0
  285. vpaddq $TEMP0, $ACC5, $ACC5
  286. vpmuludq 32*4-128($aap), $B2, $TEMP1
  287. vpaddq $TEMP1, $ACC6, $ACC6
  288. vpmuludq 32*5-128($aap), $B2, $TEMP2
  289. vpaddq $TEMP2, $ACC7, $ACC7
  290. vpmuludq 32*6-128($aap), $B2, $TEMP0
  291. vpaddq $TEMP0, $ACC8, $ACC8
  292. vpmuludq 32*7-128($aap), $B2, $ACC0
  293. vpbroadcastq 32*3-128($tpa), $B2
  294. vpaddq 32*9-192($tp0), $ACC0, $ACC0
  295. vmovdqu $ACC2, 32*2-192($tp0)
  296. vmovdqu $ACC3, 32*3-192($tp0)
  297. vpmuludq 32*2-128($ap), $B1, $TEMP2
  298. vpaddq $TEMP2, $ACC4, $ACC4
  299. vpmuludq 32*2-128($aap), $B1, $TEMP0
  300. vpaddq $TEMP0, $ACC5, $ACC5
  301. vpmuludq 32*3-128($aap), $B1, $TEMP1
  302. vpaddq $TEMP1, $ACC6, $ACC6
  303. vpmuludq 32*4-128($aap), $B1, $TEMP2
  304. vpaddq $TEMP2, $ACC7, $ACC7
  305. vpmuludq 32*5-128($aap), $B1, $TEMP0
  306. vpaddq $TEMP0, $ACC8, $ACC8
  307. vpmuludq 32*6-128($aap), $B1, $TEMP1
  308. vpaddq $TEMP1, $ACC0, $ACC0
  309. vpmuludq 32*7-128($aap), $B1, $ACC1
  310. vpbroadcastq 32*4-128($tpa), $B1
  311. vpaddq 32*10-448($tp1), $ACC1, $ACC1
  312. vmovdqu $ACC4, 32*4-192($tp0)
  313. vmovdqu $ACC5, 32*5-192($tp0)
  314. vpmuludq 32*3-128($ap), $B2, $TEMP0
  315. vpaddq $TEMP0, $ACC6, $ACC6
  316. vpmuludq 32*3-128($aap), $B2, $TEMP1
  317. vpaddq $TEMP1, $ACC7, $ACC7
  318. vpmuludq 32*4-128($aap), $B2, $TEMP2
  319. vpaddq $TEMP2, $ACC8, $ACC8
  320. vpmuludq 32*5-128($aap), $B2, $TEMP0
  321. vpaddq $TEMP0, $ACC0, $ACC0
  322. vpmuludq 32*6-128($aap), $B2, $TEMP1
  323. vpaddq $TEMP1, $ACC1, $ACC1
  324. vpmuludq 32*7-128($aap), $B2, $ACC2
  325. vpbroadcastq 32*5-128($tpa), $B2
  326. vpaddq 32*11-448($tp1), $ACC2, $ACC2
  327. vmovdqu $ACC6, 32*6-192($tp0)
  328. vmovdqu $ACC7, 32*7-192($tp0)
  329. vpmuludq 32*4-128($ap), $B1, $TEMP0
  330. vpaddq $TEMP0, $ACC8, $ACC8
  331. vpmuludq 32*4-128($aap), $B1, $TEMP1
  332. vpaddq $TEMP1, $ACC0, $ACC0
  333. vpmuludq 32*5-128($aap), $B1, $TEMP2
  334. vpaddq $TEMP2, $ACC1, $ACC1
  335. vpmuludq 32*6-128($aap), $B1, $TEMP0
  336. vpaddq $TEMP0, $ACC2, $ACC2
  337. vpmuludq 32*7-128($aap), $B1, $ACC3
  338. vpbroadcastq 32*6-128($tpa), $B1
  339. vpaddq 32*12-448($tp1), $ACC3, $ACC3
  340. vmovdqu $ACC8, 32*8-192($tp0)
  341. vmovdqu $ACC0, 32*9-192($tp0)
  342. lea 8($tp0), $tp0
  343. vpmuludq 32*5-128($ap), $B2, $TEMP2
  344. vpaddq $TEMP2, $ACC1, $ACC1
  345. vpmuludq 32*5-128($aap), $B2, $TEMP0
  346. vpaddq $TEMP0, $ACC2, $ACC2
  347. vpmuludq 32*6-128($aap), $B2, $TEMP1
  348. vpaddq $TEMP1, $ACC3, $ACC3
  349. vpmuludq 32*7-128($aap), $B2, $ACC4
  350. vpbroadcastq 32*7-128($tpa), $B2
  351. vpaddq 32*13-448($tp1), $ACC4, $ACC4
  352. vmovdqu $ACC1, 32*10-448($tp1)
  353. vmovdqu $ACC2, 32*11-448($tp1)
  354. vpmuludq 32*6-128($ap), $B1, $TEMP0
  355. vpaddq $TEMP0, $ACC3, $ACC3
  356. vpmuludq 32*6-128($aap), $B1, $TEMP1
  357. vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
  358. vpaddq $TEMP1, $ACC4, $ACC4
  359. vpmuludq 32*7-128($aap), $B1, $ACC5
  360. vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
  361. vpaddq 32*14-448($tp1), $ACC5, $ACC5
  362. vmovdqu $ACC3, 32*12-448($tp1)
  363. vmovdqu $ACC4, 32*13-448($tp1)
  364. lea 8($tpa), $tpa
  365. vpmuludq 32*7-128($ap), $B2, $TEMP0
  366. vpaddq $TEMP0, $ACC5, $ACC5
  367. vpmuludq 32*7-128($aap), $B2, $ACC6
  368. vpaddq 32*15-448($tp1), $ACC6, $ACC6
  369. vpmuludq 32*8-128($ap), $ACC0, $ACC7
  370. vmovdqu $ACC5, 32*14-448($tp1)
  371. vpaddq 32*16-448($tp1), $ACC7, $ACC7
  372. vmovdqu $ACC6, 32*15-448($tp1)
  373. vmovdqu $ACC7, 32*16-448($tp1)
  374. lea 8($tp1), $tp1
  375. dec $i
  376. jnz .LOOP_SQR_1024
  377. ___
  378. $ZERO = $ACC9;
  379. $TEMP0 = $B1;
  380. $TEMP2 = $B2;
  381. $TEMP3 = $Y1;
  382. $TEMP4 = $Y2;
  383. $code.=<<___;
  384. # we need to fix indices 32-39 to avoid overflow
  385. vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
  386. vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
  387. vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
  388. lea 192(%rsp), $tp0 # 64+128=192
  389. vpsrlq \$29, $ACC8, $TEMP1
  390. vpand $AND_MASK, $ACC8, $ACC8
  391. vpsrlq \$29, $ACC1, $TEMP2
  392. vpand $AND_MASK, $ACC1, $ACC1
  393. vpermq \$0x93, $TEMP1, $TEMP1
  394. vpxor $ZERO, $ZERO, $ZERO
  395. vpermq \$0x93, $TEMP2, $TEMP2
  396. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  397. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  398. vpaddq $TEMP0, $ACC8, $ACC8
  399. vpblendd \$3, $TEMP2, $ZERO, $TEMP2
  400. vpaddq $TEMP1, $ACC1, $ACC1
  401. vpaddq $TEMP2, $ACC2, $ACC2
  402. vmovdqu $ACC1, 32*9-192($tp0)
  403. vmovdqu $ACC2, 32*10-192($tp0)
  404. mov (%rsp), %rax
  405. mov 8(%rsp), $r1
  406. mov 16(%rsp), $r2
  407. mov 24(%rsp), $r3
  408. vmovdqu 32*1(%rsp), $ACC1
  409. vmovdqu 32*2-192($tp0), $ACC2
  410. vmovdqu 32*3-192($tp0), $ACC3
  411. vmovdqu 32*4-192($tp0), $ACC4
  412. vmovdqu 32*5-192($tp0), $ACC5
  413. vmovdqu 32*6-192($tp0), $ACC6
  414. vmovdqu 32*7-192($tp0), $ACC7
  415. mov %rax, $r0
  416. imull $n0, %eax
  417. and \$0x1fffffff, %eax
  418. vmovd %eax, $Y1
  419. mov %rax, %rdx
  420. imulq -128($np), %rax
  421. vpbroadcastq $Y1, $Y1
  422. add %rax, $r0
  423. mov %rdx, %rax
  424. imulq 8-128($np), %rax
  425. shr \$29, $r0
  426. add %rax, $r1
  427. mov %rdx, %rax
  428. imulq 16-128($np), %rax
  429. add $r0, $r1
  430. add %rax, $r2
  431. imulq 24-128($np), %rdx
  432. add %rdx, $r3
  433. mov $r1, %rax
  434. imull $n0, %eax
  435. and \$0x1fffffff, %eax
  436. mov \$9, $i
  437. jmp .LOOP_REDUCE_1024
  438. .align 32
  439. .LOOP_REDUCE_1024:
  440. vmovd %eax, $Y2
  441. vpbroadcastq $Y2, $Y2
  442. vpmuludq 32*1-128($np), $Y1, $TEMP0
  443. mov %rax, %rdx
  444. imulq -128($np), %rax
  445. vpaddq $TEMP0, $ACC1, $ACC1
  446. add %rax, $r1
  447. vpmuludq 32*2-128($np), $Y1, $TEMP1
  448. mov %rdx, %rax
  449. imulq 8-128($np), %rax
  450. vpaddq $TEMP1, $ACC2, $ACC2
  451. vpmuludq 32*3-128($np), $Y1, $TEMP2
  452. .byte 0x67
  453. add %rax, $r2
  454. .byte 0x67
  455. mov %rdx, %rax
  456. imulq 16-128($np), %rax
  457. shr \$29, $r1
  458. vpaddq $TEMP2, $ACC3, $ACC3
  459. vpmuludq 32*4-128($np), $Y1, $TEMP0
  460. add %rax, $r3
  461. add $r1, $r2
  462. vpaddq $TEMP0, $ACC4, $ACC4
  463. vpmuludq 32*5-128($np), $Y1, $TEMP1
  464. mov $r2, %rax
  465. imull $n0, %eax
  466. vpaddq $TEMP1, $ACC5, $ACC5
  467. vpmuludq 32*6-128($np), $Y1, $TEMP2
  468. and \$0x1fffffff, %eax
  469. vpaddq $TEMP2, $ACC6, $ACC6
  470. vpmuludq 32*7-128($np), $Y1, $TEMP0
  471. vpaddq $TEMP0, $ACC7, $ACC7
  472. vpmuludq 32*8-128($np), $Y1, $TEMP1
  473. vmovd %eax, $Y1
  474. #vmovdqu 32*1-8-128($np), $TEMP2 # moved below
  475. vpaddq $TEMP1, $ACC8, $ACC8
  476. #vmovdqu 32*2-8-128($np), $TEMP0 # moved below
  477. vpbroadcastq $Y1, $Y1
  478. vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
  479. vmovdqu 32*3-8-128($np), $TEMP1
  480. mov %rax, %rdx
  481. imulq -128($np), %rax
  482. vpaddq $TEMP2, $ACC1, $ACC1
  483. vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
  484. vmovdqu 32*4-8-128($np), $TEMP2
  485. add %rax, $r2
  486. mov %rdx, %rax
  487. imulq 8-128($np), %rax
  488. vpaddq $TEMP0, $ACC2, $ACC2
  489. add $r3, %rax
  490. shr \$29, $r2
  491. vpmuludq $Y2, $TEMP1, $TEMP1
  492. vmovdqu 32*5-8-128($np), $TEMP0
  493. add $r2, %rax
  494. vpaddq $TEMP1, $ACC3, $ACC3
  495. vpmuludq $Y2, $TEMP2, $TEMP2
  496. vmovdqu 32*6-8-128($np), $TEMP1
  497. .byte 0x67
  498. mov %rax, $r3
  499. imull $n0, %eax
  500. vpaddq $TEMP2, $ACC4, $ACC4
  501. vpmuludq $Y2, $TEMP0, $TEMP0
  502. .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
  503. and \$0x1fffffff, %eax
  504. vpaddq $TEMP0, $ACC5, $ACC5
  505. vpmuludq $Y2, $TEMP1, $TEMP1
  506. vmovdqu 32*8-8-128($np), $TEMP0
  507. vpaddq $TEMP1, $ACC6, $ACC6
  508. vpmuludq $Y2, $TEMP2, $TEMP2
  509. vmovdqu 32*9-8-128($np), $ACC9
  510. vmovd %eax, $ACC0 # borrow ACC0 for Y2
  511. imulq -128($np), %rax
  512. vpaddq $TEMP2, $ACC7, $ACC7
  513. vpmuludq $Y2, $TEMP0, $TEMP0
  514. vmovdqu 32*1-16-128($np), $TEMP1
  515. vpbroadcastq $ACC0, $ACC0
  516. vpaddq $TEMP0, $ACC8, $ACC8
  517. vpmuludq $Y2, $ACC9, $ACC9
  518. vmovdqu 32*2-16-128($np), $TEMP2
  519. add %rax, $r3
  520. ___
  521. ($ACC0,$Y2)=($Y2,$ACC0);
  522. $code.=<<___;
  523. vmovdqu 32*1-24-128($np), $ACC0
  524. vpmuludq $Y1, $TEMP1, $TEMP1
  525. vmovdqu 32*3-16-128($np), $TEMP0
  526. vpaddq $TEMP1, $ACC1, $ACC1
  527. vpmuludq $Y2, $ACC0, $ACC0
  528. vpmuludq $Y1, $TEMP2, $TEMP2
  529. .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
  530. vpaddq $ACC1, $ACC0, $ACC0
  531. vpaddq $TEMP2, $ACC2, $ACC2
  532. vpmuludq $Y1, $TEMP0, $TEMP0
  533. vmovdqu 32*5-16-128($np), $TEMP2
  534. .byte 0x67
  535. vmovq $ACC0, %rax
  536. vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
  537. vpaddq $TEMP0, $ACC3, $ACC3
  538. vpmuludq $Y1, $TEMP1, $TEMP1
  539. vmovdqu 32*6-16-128($np), $TEMP0
  540. vpaddq $TEMP1, $ACC4, $ACC4
  541. vpmuludq $Y1, $TEMP2, $TEMP2
  542. vmovdqu 32*7-16-128($np), $TEMP1
  543. vpaddq $TEMP2, $ACC5, $ACC5
  544. vpmuludq $Y1, $TEMP0, $TEMP0
  545. vmovdqu 32*8-16-128($np), $TEMP2
  546. vpaddq $TEMP0, $ACC6, $ACC6
  547. vpmuludq $Y1, $TEMP1, $TEMP1
  548. shr \$29, $r3
  549. vmovdqu 32*9-16-128($np), $TEMP0
  550. add $r3, %rax
  551. vpaddq $TEMP1, $ACC7, $ACC7
  552. vpmuludq $Y1, $TEMP2, $TEMP2
  553. #vmovdqu 32*2-24-128($np), $TEMP1 # moved below
  554. mov %rax, $r0
  555. imull $n0, %eax
  556. vpaddq $TEMP2, $ACC8, $ACC8
  557. vpmuludq $Y1, $TEMP0, $TEMP0
  558. and \$0x1fffffff, %eax
  559. vmovd %eax, $Y1
  560. vmovdqu 32*3-24-128($np), $TEMP2
  561. .byte 0x67
  562. vpaddq $TEMP0, $ACC9, $ACC9
  563. vpbroadcastq $Y1, $Y1
  564. vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
  565. vmovdqu 32*4-24-128($np), $TEMP0
  566. mov %rax, %rdx
  567. imulq -128($np), %rax
  568. mov 8(%rsp), $r1
  569. vpaddq $TEMP1, $ACC2, $ACC1
  570. vpmuludq $Y2, $TEMP2, $TEMP2
  571. vmovdqu 32*5-24-128($np), $TEMP1
  572. add %rax, $r0
  573. mov %rdx, %rax
  574. imulq 8-128($np), %rax
  575. .byte 0x67
  576. shr \$29, $r0
  577. mov 16(%rsp), $r2
  578. vpaddq $TEMP2, $ACC3, $ACC2
  579. vpmuludq $Y2, $TEMP0, $TEMP0
  580. vmovdqu 32*6-24-128($np), $TEMP2
  581. add %rax, $r1
  582. mov %rdx, %rax
  583. imulq 16-128($np), %rax
  584. vpaddq $TEMP0, $ACC4, $ACC3
  585. vpmuludq $Y2, $TEMP1, $TEMP1
  586. vmovdqu 32*7-24-128($np), $TEMP0
  587. imulq 24-128($np), %rdx # future $r3
  588. add %rax, $r2
  589. lea ($r0,$r1), %rax
  590. vpaddq $TEMP1, $ACC5, $ACC4
  591. vpmuludq $Y2, $TEMP2, $TEMP2
  592. vmovdqu 32*8-24-128($np), $TEMP1
  593. mov %rax, $r1
  594. imull $n0, %eax
  595. vpmuludq $Y2, $TEMP0, $TEMP0
  596. vpaddq $TEMP2, $ACC6, $ACC5
  597. vmovdqu 32*9-24-128($np), $TEMP2
  598. and \$0x1fffffff, %eax
  599. vpaddq $TEMP0, $ACC7, $ACC6
  600. vpmuludq $Y2, $TEMP1, $TEMP1
  601. add 24(%rsp), %rdx
  602. vpaddq $TEMP1, $ACC8, $ACC7
  603. vpmuludq $Y2, $TEMP2, $TEMP2
  604. vpaddq $TEMP2, $ACC9, $ACC8
  605. vmovq $r3, $ACC9
  606. mov %rdx, $r3
  607. dec $i
  608. jnz .LOOP_REDUCE_1024
  609. ___
  610. ($ACC0,$Y2)=($Y2,$ACC0);
  611. $code.=<<___;
  612. lea 448(%rsp), $tp1 # size optimization
  613. vpaddq $ACC9, $Y2, $ACC0
  614. vpxor $ZERO, $ZERO, $ZERO
  615. vpaddq 32*9-192($tp0), $ACC0, $ACC0
  616. vpaddq 32*10-448($tp1), $ACC1, $ACC1
  617. vpaddq 32*11-448($tp1), $ACC2, $ACC2
  618. vpaddq 32*12-448($tp1), $ACC3, $ACC3
  619. vpaddq 32*13-448($tp1), $ACC4, $ACC4
  620. vpaddq 32*14-448($tp1), $ACC5, $ACC5
  621. vpaddq 32*15-448($tp1), $ACC6, $ACC6
  622. vpaddq 32*16-448($tp1), $ACC7, $ACC7
  623. vpaddq 32*17-448($tp1), $ACC8, $ACC8
  624. vpsrlq \$29, $ACC0, $TEMP1
  625. vpand $AND_MASK, $ACC0, $ACC0
  626. vpsrlq \$29, $ACC1, $TEMP2
  627. vpand $AND_MASK, $ACC1, $ACC1
  628. vpsrlq \$29, $ACC2, $TEMP3
  629. vpermq \$0x93, $TEMP1, $TEMP1
  630. vpand $AND_MASK, $ACC2, $ACC2
  631. vpsrlq \$29, $ACC3, $TEMP4
  632. vpermq \$0x93, $TEMP2, $TEMP2
  633. vpand $AND_MASK, $ACC3, $ACC3
  634. vpermq \$0x93, $TEMP3, $TEMP3
  635. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  636. vpermq \$0x93, $TEMP4, $TEMP4
  637. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  638. vpaddq $TEMP0, $ACC0, $ACC0
  639. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  640. vpaddq $TEMP1, $ACC1, $ACC1
  641. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  642. vpaddq $TEMP2, $ACC2, $ACC2
  643. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  644. vpaddq $TEMP3, $ACC3, $ACC3
  645. vpaddq $TEMP4, $ACC4, $ACC4
  646. vpsrlq \$29, $ACC0, $TEMP1
  647. vpand $AND_MASK, $ACC0, $ACC0
  648. vpsrlq \$29, $ACC1, $TEMP2
  649. vpand $AND_MASK, $ACC1, $ACC1
  650. vpsrlq \$29, $ACC2, $TEMP3
  651. vpermq \$0x93, $TEMP1, $TEMP1
  652. vpand $AND_MASK, $ACC2, $ACC2
  653. vpsrlq \$29, $ACC3, $TEMP4
  654. vpermq \$0x93, $TEMP2, $TEMP2
  655. vpand $AND_MASK, $ACC3, $ACC3
  656. vpermq \$0x93, $TEMP3, $TEMP3
  657. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  658. vpermq \$0x93, $TEMP4, $TEMP4
  659. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  660. vpaddq $TEMP0, $ACC0, $ACC0
  661. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  662. vpaddq $TEMP1, $ACC1, $ACC1
  663. vmovdqu $ACC0, 32*0-128($rp)
  664. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  665. vpaddq $TEMP2, $ACC2, $ACC2
  666. vmovdqu $ACC1, 32*1-128($rp)
  667. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  668. vpaddq $TEMP3, $ACC3, $ACC3
  669. vmovdqu $ACC2, 32*2-128($rp)
  670. vpaddq $TEMP4, $ACC4, $ACC4
  671. vmovdqu $ACC3, 32*3-128($rp)
  672. ___
  673. $TEMP5=$ACC0;
  674. $code.=<<___;
  675. vpsrlq \$29, $ACC4, $TEMP1
  676. vpand $AND_MASK, $ACC4, $ACC4
  677. vpsrlq \$29, $ACC5, $TEMP2
  678. vpand $AND_MASK, $ACC5, $ACC5
  679. vpsrlq \$29, $ACC6, $TEMP3
  680. vpermq \$0x93, $TEMP1, $TEMP1
  681. vpand $AND_MASK, $ACC6, $ACC6
  682. vpsrlq \$29, $ACC7, $TEMP4
  683. vpermq \$0x93, $TEMP2, $TEMP2
  684. vpand $AND_MASK, $ACC7, $ACC7
  685. vpsrlq \$29, $ACC8, $TEMP5
  686. vpermq \$0x93, $TEMP3, $TEMP3
  687. vpand $AND_MASK, $ACC8, $ACC8
  688. vpermq \$0x93, $TEMP4, $TEMP4
  689. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  690. vpermq \$0x93, $TEMP5, $TEMP5
  691. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  692. vpaddq $TEMP0, $ACC4, $ACC4
  693. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  694. vpaddq $TEMP1, $ACC5, $ACC5
  695. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  696. vpaddq $TEMP2, $ACC6, $ACC6
  697. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  698. vpaddq $TEMP3, $ACC7, $ACC7
  699. vpaddq $TEMP4, $ACC8, $ACC8
  700. vpsrlq \$29, $ACC4, $TEMP1
  701. vpand $AND_MASK, $ACC4, $ACC4
  702. vpsrlq \$29, $ACC5, $TEMP2
  703. vpand $AND_MASK, $ACC5, $ACC5
  704. vpsrlq \$29, $ACC6, $TEMP3
  705. vpermq \$0x93, $TEMP1, $TEMP1
  706. vpand $AND_MASK, $ACC6, $ACC6
  707. vpsrlq \$29, $ACC7, $TEMP4
  708. vpermq \$0x93, $TEMP2, $TEMP2
  709. vpand $AND_MASK, $ACC7, $ACC7
  710. vpsrlq \$29, $ACC8, $TEMP5
  711. vpermq \$0x93, $TEMP3, $TEMP3
  712. vpand $AND_MASK, $ACC8, $ACC8
  713. vpermq \$0x93, $TEMP4, $TEMP4
  714. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  715. vpermq \$0x93, $TEMP5, $TEMP5
  716. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  717. vpaddq $TEMP0, $ACC4, $ACC4
  718. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  719. vpaddq $TEMP1, $ACC5, $ACC5
  720. vmovdqu $ACC4, 32*4-128($rp)
  721. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  722. vpaddq $TEMP2, $ACC6, $ACC6
  723. vmovdqu $ACC5, 32*5-128($rp)
  724. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  725. vpaddq $TEMP3, $ACC7, $ACC7
  726. vmovdqu $ACC6, 32*6-128($rp)
  727. vpaddq $TEMP4, $ACC8, $ACC8
  728. vmovdqu $ACC7, 32*7-128($rp)
  729. vmovdqu $ACC8, 32*8-128($rp)
  730. mov $rp, $ap
  731. dec $rep
  732. jne .LOOP_GRANDE_SQR_1024
  733. vzeroall
  734. mov %rbp, %rax
  735. .cfi_def_cfa_register %rax
  736. ___
  737. $code.=<<___ if ($win64);
  738. .Lsqr_1024_in_tail:
  739. movaps -0xd8(%rax),%xmm6
  740. movaps -0xc8(%rax),%xmm7
  741. movaps -0xb8(%rax),%xmm8
  742. movaps -0xa8(%rax),%xmm9
  743. movaps -0x98(%rax),%xmm10
  744. movaps -0x88(%rax),%xmm11
  745. movaps -0x78(%rax),%xmm12
  746. movaps -0x68(%rax),%xmm13
  747. movaps -0x58(%rax),%xmm14
  748. movaps -0x48(%rax),%xmm15
  749. ___
  750. $code.=<<___;
  751. mov -48(%rax),%r15
  752. .cfi_restore %r15
  753. mov -40(%rax),%r14
  754. .cfi_restore %r14
  755. mov -32(%rax),%r13
  756. .cfi_restore %r13
  757. mov -24(%rax),%r12
  758. .cfi_restore %r12
  759. mov -16(%rax),%rbp
  760. .cfi_restore %rbp
  761. mov -8(%rax),%rbx
  762. .cfi_restore %rbx
  763. lea (%rax),%rsp # restore %rsp
  764. .cfi_def_cfa_register %rsp
  765. .Lsqr_1024_epilogue:
  766. ret
  767. .cfi_endproc
  768. .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
  769. ___
  770. }
  771. { # void AMM_WW(
  772. my $rp="%rdi"; # BN_ULONG *rp,
  773. my $ap="%rsi"; # const BN_ULONG *ap,
  774. my $bp="%rdx"; # const BN_ULONG *bp,
  775. my $np="%rcx"; # const BN_ULONG *np,
  776. my $n0="%r8d"; # unsigned int n0);
  # Register assignment for the code emitted as rsaz_1024_mul_avx2 further down
  # in this scope. Operands are arrays of 29-bit digits, one digit per 64-bit
  # word, so one %ymm register carries four digits.
  777. # The registers that hold the accumulated redundant result
  778. # The AMM works on 1024-bit operands, and the redundant digit size is 29 bits
  779. # Therefore: ceil(1024/29)/4 = 9 registers (36 digits, four per register)
  780. my $ACC0="%ymm0";
  781. my $ACC1="%ymm1";
  782. my $ACC2="%ymm2";
  783. my $ACC3="%ymm3";
  784. my $ACC4="%ymm4";
  785. my $ACC5="%ymm5";
  786. my $ACC6="%ymm6";
  787. my $ACC7="%ymm7";
  788. my $ACC8="%ymm8";
  789. my $ACC9="%ymm9"; # tenth register: carry scratch and extra top accumulator inside the loop
  790. # Registers that hold the broadcasted words of multiplier, currently used
  791. my $Bi="%ymm10"; # current word of bp in all four lanes
  792. my $Yi="%ymm11"; # current 29-bit factor derived from n0 (multiplies np) in all four lanes
  793. # Helper registers
  794. my $TEMP0=$ACC0; # alias of %ymm0: in the loop the first four digits are kept in $r0-$r3 instead
  795. my $TEMP1="%ymm12";
  796. my $TEMP2="%ymm13";
  797. my $ZERO="%ymm14"; # never written after the initial vzeroall
  798. my $AND_MASK="%ymm15"; # loaded from .Land_mask (0x1fffffff in every lane)
  799. # alu registers that hold the first words of the ACC
  800. my $r0="%r9";
  801. my $r1="%r10";
  802. my $r2="%r11";
  803. my $r3="%r12";
  804. my $i="%r14d"; # loop counter
  805. my $tmp="%r15";
  806. $bp="%r13"; # reassigned argument
  807. $code.=<<___; # entry point: remember the incoming stack pointer and spill callee-saved GPRs
  808. .globl rsaz_1024_mul_avx2
  809. .type rsaz_1024_mul_avx2,\@function,5
  810. .align 64
  811. rsaz_1024_mul_avx2:
  812. .cfi_startproc
  813. lea (%rsp), %rax # %rax = %rsp at entry; saved registers are addressed relative to it
  814. .cfi_def_cfa_register %rax
  815. push %rbx # ends up at -8(%rax)
  816. .cfi_push %rbx
  817. push %rbp # -16(%rax)
  818. .cfi_push %rbp
  819. push %r12 # -24(%rax)
  820. .cfi_push %r12
  821. push %r13 # -32(%rax)
  822. .cfi_push %r13
  823. push %r14 # -40(%rax)
  824. .cfi_push %r14
  825. push %r15 # -48(%rax)
  826. .cfi_push %r15
  827. ___
  828. $code.=<<___ if ($win64); # Win64 only: save %xmm6-%xmm15 below the pushed GPRs
  829. vzeroupper
  830. lea -0xa8(%rsp),%rsp # six pushes (0x30) + 0xa8 = 0xd8, the lowest save slot below
  831. vmovaps %xmm6,-0xd8(%rax)
  832. vmovaps %xmm7,-0xc8(%rax)
  833. vmovaps %xmm8,-0xb8(%rax)
  834. vmovaps %xmm9,-0xa8(%rax)
  835. vmovaps %xmm10,-0x98(%rax)
  836. vmovaps %xmm11,-0x88(%rax)
  837. vmovaps %xmm12,-0x78(%rax)
  838. vmovaps %xmm13,-0x68(%rax)
  839. vmovaps %xmm14,-0x58(%rax)
  840. vmovaps %xmm15,-0x48(%rax)
  841. .Lmul_1024_body:
  842. ___
  843. $code.=<<___; # operand placement checks, optional copy of np to the stack, then the main loop
  844. mov %rax,%rbp # %rbp = %rsp at entry, read back at the end of the function
  845. .cfi_def_cfa_register %rbp
  846. vzeroall # every %ymm register, accumulators and $ZERO included, starts at zero
  847. mov %rdx, $bp # reassigned argument
  848. sub \$64,%rsp
  849. # unaligned 256-bit load that crosses page boundary can
  850. # cause severe performance degradation here, so if $ap does
  851. # cross page boundary, swap it with $bp [meaning that caller
  852. # is advised to lay down $ap and $bp next to each other, so
  853. # that only one can cross page boundary].
  854. .byte 0x67,0x67
  855. mov $ap, $tmp
  856. and \$4095, $tmp # offset of $ap inside its 4096-byte page
  857. add \$32*10, $tmp
  858. shr \$12, $tmp # ZF is clear when offset + 32*10 >= 4096
  859. mov $ap, $tmp # mov leaves the flags produced by shr untouched
  860. cmovnz $bp, $ap
  861. cmovnz $tmp, $bp # together the two cmovnz swap $ap and $bp
  862. mov $np, $tmp
  863. sub \$-128,$ap # size optimization: pointers are advanced by 128 and used as N-128(reg)
  864. sub \$-128,$np
  865. sub \$-128,$rp
  866. and \$4095, $tmp # see if $np crosses page
  867. add \$32*10, $tmp
  868. .byte 0x67,0x67
  869. shr \$12, $tmp
  870. jz .Lmul_1024_no_n_copy
  871. # unaligned 256-bit load that crosses page boundary can
  872. # cause severe performance degradation here, so if $np does
  873. # cross page boundary, copy it to stack and make sure stack
  874. # frame doesn't...
  875. sub \$32*10,%rsp
  876. vmovdqu 32*0-128($np), $ACC0
  877. and \$-512, %rsp # 512-byte alignment keeps the copy made below inside one page
  878. vmovdqu 32*1-128($np), $ACC1
  879. vmovdqu 32*2-128($np), $ACC2
  880. vmovdqu 32*3-128($np), $ACC3
  881. vmovdqu 32*4-128($np), $ACC4
  882. vmovdqu 32*5-128($np), $ACC5
  883. vmovdqu 32*6-128($np), $ACC6
  884. vmovdqu 32*7-128($np), $ACC7
  885. vmovdqu 32*8-128($np), $ACC8
  886. lea 64+128(%rsp),$np # $np now refers to the stack copy, which starts at 64(%rsp)
  887. vmovdqu $ACC0, 32*0-128($np)
  888. vpxor $ACC0, $ACC0, $ACC0 # each register is cleared again right after it is stored
  889. vmovdqu $ACC1, 32*1-128($np)
  890. vpxor $ACC1, $ACC1, $ACC1
  891. vmovdqu $ACC2, 32*2-128($np)
  892. vpxor $ACC2, $ACC2, $ACC2
  893. vmovdqu $ACC3, 32*3-128($np)
  894. vpxor $ACC3, $ACC3, $ACC3
  895. vmovdqu $ACC4, 32*4-128($np)
  896. vpxor $ACC4, $ACC4, $ACC4
  897. vmovdqu $ACC5, 32*5-128($np)
  898. vpxor $ACC5, $ACC5, $ACC5
  899. vmovdqu $ACC6, 32*6-128($np)
  900. vpxor $ACC6, $ACC6, $ACC6
  901. vmovdqu $ACC7, 32*7-128($np)
  902. vpxor $ACC7, $ACC7, $ACC7
  903. vmovdqu $ACC8, 32*8-128($np)
  904. vmovdqa $ACC0, $ACC8 # $ACC0 is zero at this point, so this clears $ACC8
  905. vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
  906. .Lmul_1024_no_n_copy:
  907. and \$-64,%rsp # 64-byte align the scratch slot at (%rsp)
  908. mov ($bp), %rbx # %rbx = first multiplier word for the scalar stream
  909. vpbroadcastq ($bp), $Bi # same word in all four lanes for the vector stream
  910. vmovdqu $ACC0, (%rsp) # clear top of stack
  911. xor $r0, $r0
  912. .byte 0x67
  913. xor $r1, $r1
  914. xor $r2, $r2
  915. xor $r3, $r3
  916. vmovdqu .Land_mask(%rip), $AND_MASK
  917. mov \$9, $i # nine iterations, each consuming four words of the multiplier
  918. vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
  919. jmp .Loop_mul_1024
  920. .align 32
  921. .Loop_mul_1024:
  # ---- multiplier word 0 of this iteration --------------------------------
  # Scalar stream: $r0-$r3 carry the four lowest digits; the three upper
  # ones are reloaded from the scratch slot at (%rsp).
  922. vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
  923. mov %rbx, %rax
  924. imulq -128($ap), %rax
  925. add $r0, %rax # %rax = b*a[0] + $r0
  926. mov %rbx, $r1
  927. imulq 8-128($ap), $r1
  928. add 8(%rsp), $r1
  929. mov %rax, $r0
  930. imull $n0, %eax
  931. and \$0x1fffffff, %eax # %eax = low 29 bits of (lowest digit * n0)
  932. mov %rbx, $r2
  933. imulq 16-128($ap), $r2
  934. add 16(%rsp), $r2
  935. mov %rbx, $r3
  936. imulq 24-128($ap), $r3
  937. add 24(%rsp), $r3
  938. vpmuludq 32*1-128($ap),$Bi,$TEMP0
  939. vmovd %eax, $Yi
  940. vpaddq $TEMP0,$ACC1,$ACC1
  941. vpmuludq 32*2-128($ap),$Bi,$TEMP1
  942. vpbroadcastq $Yi, $Yi # 29-bit factor replicated to all four lanes
  943. vpaddq $TEMP1,$ACC2,$ACC2
  944. vpmuludq 32*3-128($ap),$Bi,$TEMP2
  945. vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
  946. vpaddq $TEMP2,$ACC3,$ACC3
  947. vpmuludq 32*4-128($ap),$Bi,$TEMP0
  948. vpaddq $TEMP0,$ACC4,$ACC4
  949. vpmuludq 32*5-128($ap),$Bi,$TEMP1
  950. vpaddq $TEMP1,$ACC5,$ACC5
  951. vpmuludq 32*6-128($ap),$Bi,$TEMP2
  952. vpaddq $TEMP2,$ACC6,$ACC6
  953. vpmuludq 32*7-128($ap),$Bi,$TEMP0
  954. vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
  955. vpaddq $TEMP0,$ACC7,$ACC7
  956. vpmuludq 32*8-128($ap),$Bi,$TEMP1
  957. vpbroadcastq 8($bp), $Bi # next multiplier word
  958. vpaddq $TEMP1,$ACC8,$ACC8
  959. mov %rax,%rdx
  960. imulq -128($np),%rax
  961. add %rax,$r0
  962. mov %rdx,%rax
  963. imulq 8-128($np),%rax
  964. add %rax,$r1
  965. mov %rdx,%rax
  966. imulq 16-128($np),%rax
  967. add %rax,$r2
  968. shr \$29, $r0 # what is left of digit 0 is the carry into digit 1
  969. imulq 24-128($np),%rdx
  970. add %rdx,$r3
  971. add $r0, $r1
  972. vpmuludq 32*1-128($np),$Yi,$TEMP2
  973. vmovq $Bi, %rbx # scalar copy of the next multiplier word
  974. vpaddq $TEMP2,$ACC1,$ACC1
  975. vpmuludq 32*2-128($np),$Yi,$TEMP0
  976. vpaddq $TEMP0,$ACC2,$ACC2
  977. vpmuludq 32*3-128($np),$Yi,$TEMP1
  978. vpaddq $TEMP1,$ACC3,$ACC3
  979. vpmuludq 32*4-128($np),$Yi,$TEMP2
  980. vpaddq $TEMP2,$ACC4,$ACC4
  981. vpmuludq 32*5-128($np),$Yi,$TEMP0
  982. vpaddq $TEMP0,$ACC5,$ACC5
  983. vpmuludq 32*6-128($np),$Yi,$TEMP1
  984. vpaddq $TEMP1,$ACC6,$ACC6
  985. vpmuludq 32*7-128($np),$Yi,$TEMP2
  986. vpblendd \$3, $ZERO, $ACC9, $TEMP1 # correct $ACC3
  987. vpaddq $TEMP2,$ACC7,$ACC7
  988. vpmuludq 32*8-128($np),$Yi,$TEMP0
  989. vpaddq $TEMP1, $ACC3, $ACC3 # correct $ACC3
  990. vpaddq $TEMP0,$ACC8,$ACC8
  # ---- multiplier word 1 --------------------------------------------------
  # Vector operands are read at a -8 byte displacement, so the product of
  # operand digit k-1 is accumulated into digit k.
  991. mov %rbx, %rax
  992. imulq -128($ap),%rax
  993. add %rax,$r1
  994. vmovdqu -8+32*1-128($ap),$TEMP1
  995. mov %rbx, %rax
  996. imulq 8-128($ap),%rax
  997. add %rax,$r2
  998. vmovdqu -8+32*2-128($ap),$TEMP2
  999. mov $r1, %rax
  1000. vpblendd \$0xfc, $ZERO, $ACC9, $ACC9 # correct $ACC3
  1001. imull $n0, %eax
  1002. vpaddq $ACC9,$ACC4,$ACC4 # correct $ACC3
  1003. and \$0x1fffffff, %eax
  1004. imulq 16-128($ap),%rbx
  1005. add %rbx,$r3
  1006. vpmuludq $Bi,$TEMP1,$TEMP1
  1007. vmovd %eax, $Yi
  1008. vmovdqu -8+32*3-128($ap),$TEMP0
  1009. vpaddq $TEMP1,$ACC1,$ACC1
  1010. vpmuludq $Bi,$TEMP2,$TEMP2
  1011. vpbroadcastq $Yi, $Yi
  1012. vmovdqu -8+32*4-128($ap),$TEMP1
  1013. vpaddq $TEMP2,$ACC2,$ACC2
  1014. vpmuludq $Bi,$TEMP0,$TEMP0
  1015. vmovdqu -8+32*5-128($ap),$TEMP2
  1016. vpaddq $TEMP0,$ACC3,$ACC3
  1017. vpmuludq $Bi,$TEMP1,$TEMP1
  1018. vmovdqu -8+32*6-128($ap),$TEMP0
  1019. vpaddq $TEMP1,$ACC4,$ACC4
  1020. vpmuludq $Bi,$TEMP2,$TEMP2
  1021. vmovdqu -8+32*7-128($ap),$TEMP1
  1022. vpaddq $TEMP2,$ACC5,$ACC5
  1023. vpmuludq $Bi,$TEMP0,$TEMP0
  1024. vmovdqu -8+32*8-128($ap),$TEMP2
  1025. vpaddq $TEMP0,$ACC6,$ACC6
  1026. vpmuludq $Bi,$TEMP1,$TEMP1
  1027. vmovdqu -8+32*9-128($ap),$ACC9
  1028. vpaddq $TEMP1,$ACC7,$ACC7
  1029. vpmuludq $Bi,$TEMP2,$TEMP2
  1030. vpaddq $TEMP2,$ACC8,$ACC8
  1031. vpmuludq $Bi,$ACC9,$ACC9 # from here on $ACC9 serves as the tenth accumulator
  1032. vpbroadcastq 16($bp), $Bi
  1033. mov %rax,%rdx
  1034. imulq -128($np),%rax
  1035. add %rax,$r1
  1036. vmovdqu -8+32*1-128($np),$TEMP0
  1037. mov %rdx,%rax
  1038. imulq 8-128($np),%rax
  1039. add %rax,$r2
  1040. vmovdqu -8+32*2-128($np),$TEMP1
  1041. shr \$29, $r1
  1042. imulq 16-128($np),%rdx
  1043. add %rdx,$r3
  1044. add $r1, $r2
  1045. vpmuludq $Yi,$TEMP0,$TEMP0
  1046. vmovq $Bi, %rbx
  1047. vmovdqu -8+32*3-128($np),$TEMP2
  1048. vpaddq $TEMP0,$ACC1,$ACC1
  1049. vpmuludq $Yi,$TEMP1,$TEMP1
  1050. vmovdqu -8+32*4-128($np),$TEMP0
  1051. vpaddq $TEMP1,$ACC2,$ACC2
  1052. vpmuludq $Yi,$TEMP2,$TEMP2
  1053. vmovdqu -8+32*5-128($np),$TEMP1
  1054. vpaddq $TEMP2,$ACC3,$ACC3
  1055. vpmuludq $Yi,$TEMP0,$TEMP0
  1056. vmovdqu -8+32*6-128($np),$TEMP2
  1057. vpaddq $TEMP0,$ACC4,$ACC4
  1058. vpmuludq $Yi,$TEMP1,$TEMP1
  1059. vmovdqu -8+32*7-128($np),$TEMP0
  1060. vpaddq $TEMP1,$ACC5,$ACC5
  1061. vpmuludq $Yi,$TEMP2,$TEMP2
  1062. vmovdqu -8+32*8-128($np),$TEMP1
  1063. vpaddq $TEMP2,$ACC6,$ACC6
  1064. vpmuludq $Yi,$TEMP0,$TEMP0
  1065. vmovdqu -8+32*9-128($np),$TEMP2
  1066. vpaddq $TEMP0,$ACC7,$ACC7
  1067. vpmuludq $Yi,$TEMP1,$TEMP1
  1068. vpaddq $TEMP1,$ACC8,$ACC8
  1069. vpmuludq $Yi,$TEMP2,$TEMP2
  1070. vpaddq $TEMP2,$ACC9,$ACC9
  # ---- multiplier word 2: vector operands at a -16 byte displacement ------
  1071. vmovdqu -16+32*1-128($ap),$TEMP0
  1072. mov %rbx,%rax
  1073. imulq -128($ap),%rax
  1074. add $r2,%rax
  1075. vmovdqu -16+32*2-128($ap),$TEMP1
  1076. mov %rax,$r2
  1077. imull $n0, %eax
  1078. and \$0x1fffffff, %eax
  1079. imulq 8-128($ap),%rbx
  1080. add %rbx,$r3
  1081. vpmuludq $Bi,$TEMP0,$TEMP0
  1082. vmovd %eax, $Yi
  1083. vmovdqu -16+32*3-128($ap),$TEMP2
  1084. vpaddq $TEMP0,$ACC1,$ACC1
  1085. vpmuludq $Bi,$TEMP1,$TEMP1
  1086. vpbroadcastq $Yi, $Yi
  1087. vmovdqu -16+32*4-128($ap),$TEMP0
  1088. vpaddq $TEMP1,$ACC2,$ACC2
  1089. vpmuludq $Bi,$TEMP2,$TEMP2
  1090. vmovdqu -16+32*5-128($ap),$TEMP1
  1091. vpaddq $TEMP2,$ACC3,$ACC3
  1092. vpmuludq $Bi,$TEMP0,$TEMP0
  1093. vmovdqu -16+32*6-128($ap),$TEMP2
  1094. vpaddq $TEMP0,$ACC4,$ACC4
  1095. vpmuludq $Bi,$TEMP1,$TEMP1
  1096. vmovdqu -16+32*7-128($ap),$TEMP0
  1097. vpaddq $TEMP1,$ACC5,$ACC5
  1098. vpmuludq $Bi,$TEMP2,$TEMP2
  1099. vmovdqu -16+32*8-128($ap),$TEMP1
  1100. vpaddq $TEMP2,$ACC6,$ACC6
  1101. vpmuludq $Bi,$TEMP0,$TEMP0
  1102. vmovdqu -16+32*9-128($ap),$TEMP2
  1103. vpaddq $TEMP0,$ACC7,$ACC7
  1104. vpmuludq $Bi,$TEMP1,$TEMP1
  1105. vpaddq $TEMP1,$ACC8,$ACC8
  1106. vpmuludq $Bi,$TEMP2,$TEMP2
  1107. vpbroadcastq 24($bp), $Bi
  1108. vpaddq $TEMP2,$ACC9,$ACC9
  1109. vmovdqu -16+32*1-128($np),$TEMP0
  1110. mov %rax,%rdx
  1111. imulq -128($np),%rax
  1112. add %rax,$r2
  1113. vmovdqu -16+32*2-128($np),$TEMP1
  1114. imulq 8-128($np),%rdx
  1115. add %rdx,$r3
  1116. shr \$29, $r2
  1117. vpmuludq $Yi,$TEMP0,$TEMP0
  1118. vmovq $Bi, %rbx
  1119. vmovdqu -16+32*3-128($np),$TEMP2
  1120. vpaddq $TEMP0,$ACC1,$ACC1
  1121. vpmuludq $Yi,$TEMP1,$TEMP1
  1122. vmovdqu -16+32*4-128($np),$TEMP0
  1123. vpaddq $TEMP1,$ACC2,$ACC2
  1124. vpmuludq $Yi,$TEMP2,$TEMP2
  1125. vmovdqu -16+32*5-128($np),$TEMP1
  1126. vpaddq $TEMP2,$ACC3,$ACC3
  1127. vpmuludq $Yi,$TEMP0,$TEMP0
  1128. vmovdqu -16+32*6-128($np),$TEMP2
  1129. vpaddq $TEMP0,$ACC4,$ACC4
  1130. vpmuludq $Yi,$TEMP1,$TEMP1
  1131. vmovdqu -16+32*7-128($np),$TEMP0
  1132. vpaddq $TEMP1,$ACC5,$ACC5
  1133. vpmuludq $Yi,$TEMP2,$TEMP2
  1134. vmovdqu -16+32*8-128($np),$TEMP1
  1135. vpaddq $TEMP2,$ACC6,$ACC6
  1136. vpmuludq $Yi,$TEMP0,$TEMP0
  1137. vmovdqu -16+32*9-128($np),$TEMP2
  1138. vpaddq $TEMP0,$ACC7,$ACC7
  1139. vpmuludq $Yi,$TEMP1,$TEMP1
  1140. vmovdqu -24+32*1-128($ap),$TEMP0
  1141. vpaddq $TEMP1,$ACC8,$ACC8
  1142. vpmuludq $Yi,$TEMP2,$TEMP2
  1143. vmovdqu -24+32*2-128($ap),$TEMP1
  1144. vpaddq $TEMP2,$ACC9,$ACC9
  # ---- multiplier word 3: vector operands at a -24 byte displacement ------
  # (the first two operand loads for this word were already issued above)
  1145. add $r2, $r3 # carry left over from word 2
  1146. imulq -128($ap),%rbx
  1147. add %rbx,$r3
  1148. mov $r3, %rax
  1149. imull $n0, %eax
  1150. and \$0x1fffffff, %eax
  1151. vpmuludq $Bi,$TEMP0,$TEMP0
  1152. vmovd %eax, $Yi
  1153. vmovdqu -24+32*3-128($ap),$TEMP2
  1154. vpaddq $TEMP0,$ACC1,$ACC1
  1155. vpmuludq $Bi,$TEMP1,$TEMP1
  1156. vpbroadcastq $Yi, $Yi
  1157. vmovdqu -24+32*4-128($ap),$TEMP0
  1158. vpaddq $TEMP1,$ACC2,$ACC2
  1159. vpmuludq $Bi,$TEMP2,$TEMP2
  1160. vmovdqu -24+32*5-128($ap),$TEMP1
  1161. vpaddq $TEMP2,$ACC3,$ACC3
  1162. vpmuludq $Bi,$TEMP0,$TEMP0
  1163. vmovdqu -24+32*6-128($ap),$TEMP2
  1164. vpaddq $TEMP0,$ACC4,$ACC4
  1165. vpmuludq $Bi,$TEMP1,$TEMP1
  1166. vmovdqu -24+32*7-128($ap),$TEMP0
  1167. vpaddq $TEMP1,$ACC5,$ACC5
  1168. vpmuludq $Bi,$TEMP2,$TEMP2
  1169. vmovdqu -24+32*8-128($ap),$TEMP1
  1170. vpaddq $TEMP2,$ACC6,$ACC6
  1171. vpmuludq $Bi,$TEMP0,$TEMP0
  1172. vmovdqu -24+32*9-128($ap),$TEMP2
  1173. vpaddq $TEMP0,$ACC7,$ACC7
  1174. vpmuludq $Bi,$TEMP1,$TEMP1
  1175. vpaddq $TEMP1,$ACC8,$ACC8
  1176. vpmuludq $Bi,$TEMP2,$TEMP2
  1177. vpbroadcastq 32($bp), $Bi # first multiplier word of the next iteration
  1178. vpaddq $TEMP2,$ACC9,$ACC9
  1179. add \$32, $bp # $bp++
  1180. vmovdqu -24+32*1-128($np),$TEMP0
  1181. imulq -128($np),%rax
  1182. add %rax,$r3
  1183. shr \$29, $r3 # carry out of the fourth digit handled this iteration
  1184. vmovdqu -24+32*2-128($np),$TEMP1
  1185. vpmuludq $Yi,$TEMP0,$TEMP0
  1186. vmovq $Bi, %rbx
  1187. vmovdqu -24+32*3-128($np),$TEMP2
  # The final additions write each sum one register lower ($ACC1 into $ACC0
  # ... $ACC9 into $ACC8), moving the accumulator down by four digits.
  1188. vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
  1189. vpmuludq $Yi,$TEMP1,$TEMP1
  1190. vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
  1191. vpaddq $TEMP1,$ACC2,$ACC1
  1192. vmovdqu -24+32*4-128($np),$TEMP0
  1193. vpmuludq $Yi,$TEMP2,$TEMP2
  1194. vmovdqu -24+32*5-128($np),$TEMP1
  1195. vpaddq $TEMP2,$ACC3,$ACC2
  1196. vpmuludq $Yi,$TEMP0,$TEMP0
  1197. vmovdqu -24+32*6-128($np),$TEMP2
  1198. vpaddq $TEMP0,$ACC4,$ACC3
  1199. vpmuludq $Yi,$TEMP1,$TEMP1
  1200. vmovdqu -24+32*7-128($np),$TEMP0
  1201. vpaddq $TEMP1,$ACC5,$ACC4
  1202. vpmuludq $Yi,$TEMP2,$TEMP2
  1203. vmovdqu -24+32*8-128($np),$TEMP1
  1204. vpaddq $TEMP2,$ACC6,$ACC5
  1205. vpmuludq $Yi,$TEMP0,$TEMP0
  1206. vmovdqu -24+32*9-128($np),$TEMP2
  1207. mov $r3, $r0
  1208. vpaddq $TEMP0,$ACC7,$ACC6
  1209. vpmuludq $Yi,$TEMP1,$TEMP1
  1210. add (%rsp), $r0 # $r0 = carry + lowest digit of the new $ACC0
  1211. vpaddq $TEMP1,$ACC8,$ACC7
  1212. vpmuludq $Yi,$TEMP2,$TEMP2
  1213. vmovq $r3, $TEMP1 # carry kept in lane 0 of $TEMP1; read after the final iteration
  1214. vpaddq $TEMP2,$ACC9,$ACC8
  1215. dec $i
  1216. jnz .Loop_mul_1024
  1217. ___
  1218. # (*) Original implementation was correcting ACC1-ACC3 for overflow
  1219. # after 7 loop runs, or after 28 iterations, or 56 additions.
  1220. # But as we underutilize resources, it's possible to correct in
  1221. # each iteration with marginal performance loss. But then, as
  1222. # we do it in each iteration, we can correct fewer digits, and
  1223. # avoid performance penalties completely.
  1224. $TEMP0 = $ACC9; # the loop is over: %ymm9, %ymm10 and %ymm11 are reused as temporaries
  1225. $TEMP3 = $Bi;
  1226. $TEMP4 = $Yi;
  1227. $code.=<<___; # two carry-propagation passes over $ACC0-$ACC3, then store them
  1228. vpaddq (%rsp), $TEMP1, $ACC0 # digits saved at (%rsp) plus the carry left in $TEMP1 by the loop
  # Pass 1: split every lane into its low 29 bits and the bits above them,
  # then add those upper bits to the next higher digit.
  1229. vpsrlq \$29, $ACC0, $TEMP1
  1230. vpand $AND_MASK, $ACC0, $ACC0
  1231. vpsrlq \$29, $ACC1, $TEMP2
  1232. vpand $AND_MASK, $ACC1, $ACC1
  1233. vpsrlq \$29, $ACC2, $TEMP3
  1234. vpermq \$0x93, $TEMP1, $TEMP1 # rotate lanes up by one; the top lane wraps to lane 0
  1235. vpand $AND_MASK, $ACC2, $ACC2
  1236. vpsrlq \$29, $ACC3, $TEMP4
  1237. vpermq \$0x93, $TEMP2, $TEMP2
  1238. vpand $AND_MASK, $ACC3, $ACC3
  1239. vpblendd \$3, $ZERO, $TEMP1, $TEMP0 # carries for $ACC0: lane 0 cleared, lanes 1-3 kept
  1240. vpermq \$0x93, $TEMP3, $TEMP3
  1241. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 # lane 0 takes the wrapped top-lane carry of the register below
  1242. vpermq \$0x93, $TEMP4, $TEMP4
  1243. vpaddq $TEMP0, $ACC0, $ACC0
  1244. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  1245. vpaddq $TEMP1, $ACC1, $ACC1
  1246. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  1247. vpaddq $TEMP2, $ACC2, $ACC2
  1248. vpblendd \$3, $TEMP4, $ZERO, $TEMP4 # only the top-lane carry of $ACC3 is passed on to $ACC4
  1249. vpaddq $TEMP3, $ACC3, $ACC3
  1250. vpaddq $TEMP4, $ACC4, $ACC4
  # Pass 2: the same sequence once more.
  1251. vpsrlq \$29, $ACC0, $TEMP1
  1252. vpand $AND_MASK, $ACC0, $ACC0
  1253. vpsrlq \$29, $ACC1, $TEMP2
  1254. vpand $AND_MASK, $ACC1, $ACC1
  1255. vpsrlq \$29, $ACC2, $TEMP3
  1256. vpermq \$0x93, $TEMP1, $TEMP1
  1257. vpand $AND_MASK, $ACC2, $ACC2
  1258. vpsrlq \$29, $ACC3, $TEMP4
  1259. vpermq \$0x93, $TEMP2, $TEMP2
  1260. vpand $AND_MASK, $ACC3, $ACC3
  1261. vpermq \$0x93, $TEMP3, $TEMP3
  1262. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  1263. vpermq \$0x93, $TEMP4, $TEMP4
  1264. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  1265. vpaddq $TEMP0, $ACC0, $ACC0
  1266. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  1267. vpaddq $TEMP1, $ACC1, $ACC1
  1268. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  1269. vpaddq $TEMP2, $ACC2, $ACC2
  1270. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  1271. vpaddq $TEMP3, $ACC3, $ACC3
  1272. vpaddq $TEMP4, $ACC4, $ACC4
  1273. vmovdqu $ACC0, 0-128($rp) # result digits 0-15
  1274. vmovdqu $ACC1, 32-128($rp)
  1275. vmovdqu $ACC2, 64-128($rp)
  1276. vmovdqu $ACC3, 96-128($rp)
  1277. ___
  1278. $TEMP5=$ACC0; # %ymm0 is reused here as a fifth temporary
  1279. $code.=<<___; # two carry-propagation passes over $ACC4-$ACC8, then store them
  # Pass 1: each lane is split into its low 29 bits and the bits above
  # them; the upper bits are added to the next higher digit.
  1280. vpsrlq \$29, $ACC4, $TEMP1
  1281. vpand $AND_MASK, $ACC4, $ACC4
  1282. vpsrlq \$29, $ACC5, $TEMP2
  1283. vpand $AND_MASK, $ACC5, $ACC5
  1284. vpsrlq \$29, $ACC6, $TEMP3
  1285. vpermq \$0x93, $TEMP1, $TEMP1 # rotate lanes up by one; the top lane wraps to lane 0
  1286. vpand $AND_MASK, $ACC6, $ACC6
  1287. vpsrlq \$29, $ACC7, $TEMP4
  1288. vpermq \$0x93, $TEMP2, $TEMP2
  1289. vpand $AND_MASK, $ACC7, $ACC7
  1290. vpsrlq \$29, $ACC8, $TEMP5
  1291. vpermq \$0x93, $TEMP3, $TEMP3
  1292. vpand $AND_MASK, $ACC8, $ACC8
  1293. vpermq \$0x93, $TEMP4, $TEMP4
  1294. vpblendd \$3, $ZERO, $TEMP1, $TEMP0 # carries for $ACC4: lane 0 cleared, lanes 1-3 kept
  1295. vpermq \$0x93, $TEMP5, $TEMP5
  1296. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 # lane 0 takes the wrapped top-lane carry of the register below
  1297. vpaddq $TEMP0, $ACC4, $ACC4
  1298. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  1299. vpaddq $TEMP1, $ACC5, $ACC5
  1300. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  1301. vpaddq $TEMP2, $ACC6, $ACC6
  1302. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 # the wrapped top-lane carry of $ACC8 itself is not used
  1303. vpaddq $TEMP3, $ACC7, $ACC7
  1304. vpaddq $TEMP4, $ACC8, $ACC8
  # Pass 2: the same sequence once more.
  1305. vpsrlq \$29, $ACC4, $TEMP1
  1306. vpand $AND_MASK, $ACC4, $ACC4
  1307. vpsrlq \$29, $ACC5, $TEMP2
  1308. vpand $AND_MASK, $ACC5, $ACC5
  1309. vpsrlq \$29, $ACC6, $TEMP3
  1310. vpermq \$0x93, $TEMP1, $TEMP1
  1311. vpand $AND_MASK, $ACC6, $ACC6
  1312. vpsrlq \$29, $ACC7, $TEMP4
  1313. vpermq \$0x93, $TEMP2, $TEMP2
  1314. vpand $AND_MASK, $ACC7, $ACC7
  1315. vpsrlq \$29, $ACC8, $TEMP5
  1316. vpermq \$0x93, $TEMP3, $TEMP3
  1317. vpand $AND_MASK, $ACC8, $ACC8
  1318. vpermq \$0x93, $TEMP4, $TEMP4
  1319. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  1320. vpermq \$0x93, $TEMP5, $TEMP5
  1321. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  1322. vpaddq $TEMP0, $ACC4, $ACC4
  1323. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  1324. vpaddq $TEMP1, $ACC5, $ACC5
  1325. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  1326. vpaddq $TEMP2, $ACC6, $ACC6
  1327. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  1328. vpaddq $TEMP3, $ACC7, $ACC7
  1329. vpaddq $TEMP4, $ACC8, $ACC8
  1330. vmovdqu $ACC4, 128-128($rp) # result digits 16-35
  1331. vmovdqu $ACC5, 160-128($rp)
  1332. vmovdqu $ACC6, 192-128($rp)
  1333. vmovdqu $ACC7, 224-128($rp)
  1334. vmovdqu $ACC8, 256-128($rp)
  1335. vzeroupper
  1336. mov %rbp, %rax # %rbp has held the entry %rsp since the start of the function body
  1337. .cfi_def_cfa_register %rax
  1338. ___
  1339. $code.=<<___ if ($win64); # Win64 only: reload %xmm6-%xmm15 from the slots written in the prologue
  1340. .Lmul_1024_in_tail:
  1341. movaps -0xd8(%rax),%xmm6
  1342. movaps -0xc8(%rax),%xmm7
  1343. movaps -0xb8(%rax),%xmm8
  1344. movaps -0xa8(%rax),%xmm9
  1345. movaps -0x98(%rax),%xmm10
  1346. movaps -0x88(%rax),%xmm11
  1347. movaps -0x78(%rax),%xmm12
  1348. movaps -0x68(%rax),%xmm13
  1349. movaps -0x58(%rax),%xmm14
  1350. movaps -0x48(%rax),%xmm15
  1351. ___
  1352. $code.=<<___; # reload the six GPRs pushed on entry (offsets are relative to the entry %rsp in %rax)
  1353. mov -48(%rax),%r15
  1354. .cfi_restore %r15
  1355. mov -40(%rax),%r14
  1356. .cfi_restore %r14
  1357. mov -32(%rax),%r13
  1358. .cfi_restore %r13
  1359. mov -24(%rax),%r12
  1360. .cfi_restore %r12
  1361. mov -16(%rax),%rbp
  1362. .cfi_restore %rbp
  1363. mov -8(%rax),%rbx
  1364. .cfi_restore %rbx
  1365. lea (%rax),%rsp # restore %rsp
  1366. .cfi_def_cfa_register %rsp
  1367. .Lmul_1024_epilogue:
  1368. ret
  1369. .cfi_endproc
  1370. .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
  1371. ___
  1372. }
  1373. { # generators for the two converters between 29-bit digits and packed 64-bit words
  1374. my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi"); # first two argument registers of the target ABI
  1375. my @T = map("%r$_",(8..11)); # rotating pool of scratch registers %r8-%r11
  1376. $code.=<<___;
  1377. .globl rsaz_1024_red2norm_avx2
  1378. .type rsaz_1024_red2norm_avx2,\@abi-omnipotent
  1379. .align 32
  1380. rsaz_1024_red2norm_avx2:
  1381. .cfi_startproc
  1382. sub \$-128,$inp # size optimization
  1383. xor %rax,%rax # %rax accumulates the 64-bit output word being assembled
  1384. ___
  # red2norm: $j indexes input digits (one per 64-bit word), $i indexes the
  # sixteen 64-bit output words. The code is fully unrolled at generation time.
  # NOTE(review): shift counts below can exceed 63 or be negative as written;
  # presumably they are reduced modulo 64 by post-processing outside this chunk.
  1385. for ($j=0,$i=0; $i<16; $i++) {
  1386. my $k=0; # number of digits loaded for this output word
  1387. while (29*$j<64*($i+1)) { # load data till boundary
  1388. $code.=" mov `8*$j-128`($inp), @T[0]\n";
  1389. $j++; $k++; push(@T,shift(@T)); # rotate, so loaded digits sit at @T[-$k..-1]
  1390. }
  1391. $l=$k;
  1392. while ($k>1) { # shift loaded data but last value
  1393. $code.=" shl \$`29*($j-$k)`,@T[-$k]\n";
  1394. $k--;
  1395. }
  1396. $code.=<<___; # shift last value
  1397. mov @T[-1], @T[0]
  1398. shl \$`29*($j-1)`, @T[-1]
  1399. shr \$`-29*($j-1)`, @T[0]
  1400. ___
  1401. while ($l) { # accumulate all values
  1402. $code.=" add @T[-$l], %rax\n";
  1403. $l--;
  1404. }
  1405. $code.=<<___; # store the finished word and seed %rax for the next one
  1406. adc \$0, @T[0] # consume eventual carry
  1407. mov %rax, 8*$i($out)
  1408. mov @T[0], %rax
  1409. ___
  1410. push(@T,shift(@T));
  1411. }
  1412. $code.=<<___;
  1413. ret
  1414. .cfi_endproc
  1415. .size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
  1416. .globl rsaz_1024_norm2red_avx2
  1417. .type rsaz_1024_norm2red_avx2,\@abi-omnipotent
  1418. .align 32
  1419. rsaz_1024_norm2red_avx2:
  1420. .cfi_startproc
  1421. sub \$-128,$out # size optimization
  1422. mov ($inp),@T[0]
  1423. mov \$0x1fffffff,%eax
  1424. ___
  # norm2red: $i indexes the sixteen 64-bit input words, $j the 29-bit output
  # digits. @T[0] holds the current input word and @T[1] the following one.
  1425. for ($j=0,$i=0; $i<16; $i++) {
  1426. $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15);
  1427. $code.=" xor @T[1],@T[1]\n" if ($i==15); # no word follows the last one: use zero
  1428. my $k=1;
  1429. while (29*($j+1)<64*($i+1)) { # digits that end before the end of input word $i
  1430. $code.=<<___;
  1431. mov @T[0],@T[-$k]
  1432. shr \$`29*$j`,@T[-$k]
  1433. and %rax,@T[-$k] # &0x1fffffff
  1434. mov @T[-$k],`8*$j-128`($out)
  1435. ___
  1436. $j++; $k++;
  1437. }
  1438. $code.=<<___; # next digit is extracted with shrd from the register pair
  1439. shrd \$`29*$j`,@T[1],@T[0]
  1440. and %rax,@T[0]
  1441. mov @T[0],`8*$j-128`($out)
  1442. ___
  1443. $j++;
  1444. push(@T,shift(@T)); # the following word becomes the current one
  1445. }
  1446. $code.=<<___; # four more output words are written after the last digit
  1447. mov @T[0],`8*$j-128`($out) # zero
  1448. mov @T[0],`8*($j+1)-128`($out)
  1449. mov @T[0],`8*($j+2)-128`($out)
  1450. mov @T[0],`8*($j+3)-128`($out)
  1451. ret
  1452. .cfi_endproc
  1453. .size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
  1454. ___
  1455. }
  1456. { # generators for the table store (scatter5) and masked table load (gather5)
  1457. my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); # first three argument registers of the target ABI
  1458. $code.=<<___;
  1459. .globl rsaz_1024_scatter5_avx2
  1460. .type rsaz_1024_scatter5_avx2,\@abi-omnipotent
  1461. .align 32
  1462. rsaz_1024_scatter5_avx2:
  1463. .cfi_startproc
  1464. vzeroupper
  1465. vmovdqu .Lscatter_permd(%rip),%ymm5
  1466. shl \$4,$power # each table entry occupies 16 bytes per row
  1467. lea ($out,$power),$out # $out = table base + 16*power
  1468. mov \$9,%eax # nine rows, one per 32-byte input block
  1469. jmp .Loop_scatter_1024
  1470. .align 32
  1471. .Loop_scatter_1024:
  1472. vmovdqu ($inp),%ymm0
  1473. lea 32($inp),$inp
  1474. vpermd %ymm0,%ymm5,%ymm0 # dwords 0,2,4,6 are packed into the low 128 bits
  1475. vmovdqu %xmm0,($out) # only those 16 bytes are stored
  1476. lea 16*32($out),$out # advance by one 512-byte row
  1477. dec %eax
  1478. jnz .Loop_scatter_1024
  1479. vzeroupper
  1480. ret
  1481. .cfi_endproc
  1482. .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
  1483. .globl rsaz_1024_gather5_avx2
  1484. .type rsaz_1024_gather5_avx2,\@abi-omnipotent
  1485. .align 32
  1486. rsaz_1024_gather5_avx2:
  1487. .cfi_startproc
  1488. vzeroupper
  1489. mov %rsp,%r11 # %r11 = %rsp at entry, restored before ret
  1490. .cfi_def_cfa_register %r11
  1491. ___
  1492. $code.=<<___ if ($win64); # Win64 only: save %xmm6-%xmm15 using fixed instruction encodings
  1493. lea -0x88(%rsp),%rax
  1494. .LSEH_begin_rsaz_1024_gather5:
  1495. # I can't trust assembler to use specific encoding:-(
  1496. .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp
  1497. .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax)
  1498. .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax)
  1499. .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax)
  1500. .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax)
  1501. .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax)
  1502. .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax)
  1503. .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax)
  1504. .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax)
  1505. .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax)
  1506. .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax)
  1507. ___
  1508. $code.=<<___; # build sixteen compare masks, then read all nine table rows through them
  1509. lea -0x100(%rsp),%rsp # room for the eight masks that are kept in memory
  1510. and \$-32, %rsp # 32-byte alignment, as the vmovdqa stores below require
  1511. lea .Linc(%rip), %r10
  1512. lea -128(%rsp),%rax # control u-op density
  1513. vmovd $power, %xmm4
  1514. vmovdqa (%r10),%ymm0 # dwords 0,0,0,0,1,1,1,1
  1515. vmovdqa 32(%r10),%ymm1 # dwords 2,2,2,2,3,3,3,3
  1516. vmovdqa 64(%r10),%ymm5 # increment of 4 in every dword
  1517. vpbroadcastd %xmm4,%ymm4 # requested index in every dword
  # Each vpaddd produces the index pair four entries further on and each
  # vpcmpeqd turns an index pair into an all-ones/all-zeroes mask. Masks
  # 0-7 are stored on the stack, masks 8-15 stay in %ymm8-%ymm15.
  1518. vpaddd %ymm5, %ymm0, %ymm2
  1519. vpcmpeqd %ymm4, %ymm0, %ymm0
  1520. vpaddd %ymm5, %ymm1, %ymm3
  1521. vpcmpeqd %ymm4, %ymm1, %ymm1
  1522. vmovdqa %ymm0, 32*0+128(%rax)
  1523. vpaddd %ymm5, %ymm2, %ymm0
  1524. vpcmpeqd %ymm4, %ymm2, %ymm2
  1525. vmovdqa %ymm1, 32*1+128(%rax)
  1526. vpaddd %ymm5, %ymm3, %ymm1
  1527. vpcmpeqd %ymm4, %ymm3, %ymm3
  1528. vmovdqa %ymm2, 32*2+128(%rax)
  1529. vpaddd %ymm5, %ymm0, %ymm2
  1530. vpcmpeqd %ymm4, %ymm0, %ymm0
  1531. vmovdqa %ymm3, 32*3+128(%rax)
  1532. vpaddd %ymm5, %ymm1, %ymm3
  1533. vpcmpeqd %ymm4, %ymm1, %ymm1
  1534. vmovdqa %ymm0, 32*4+128(%rax)
  1535. vpaddd %ymm5, %ymm2, %ymm8
  1536. vpcmpeqd %ymm4, %ymm2, %ymm2
  1537. vmovdqa %ymm1, 32*5+128(%rax)
  1538. vpaddd %ymm5, %ymm3, %ymm9
  1539. vpcmpeqd %ymm4, %ymm3, %ymm3
  1540. vmovdqa %ymm2, 32*6+128(%rax)
  1541. vpaddd %ymm5, %ymm8, %ymm10
  1542. vpcmpeqd %ymm4, %ymm8, %ymm8
  1543. vmovdqa %ymm3, 32*7+128(%rax)
  1544. vpaddd %ymm5, %ymm9, %ymm11
  1545. vpcmpeqd %ymm4, %ymm9, %ymm9
  1546. vpaddd %ymm5, %ymm10, %ymm12
  1547. vpcmpeqd %ymm4, %ymm10, %ymm10
  1548. vpaddd %ymm5, %ymm11, %ymm13
  1549. vpcmpeqd %ymm4, %ymm11, %ymm11
  1550. vpaddd %ymm5, %ymm12, %ymm14
  1551. vpcmpeqd %ymm4, %ymm12, %ymm12
  1552. vpaddd %ymm5, %ymm13, %ymm15
  1553. vpcmpeqd %ymm4, %ymm13, %ymm13
  1554. vpcmpeqd %ymm4, %ymm14, %ymm14
  1555. vpcmpeqd %ymm4, %ymm15, %ymm15
  1556. vmovdqa -32(%r10),%ymm7 # .Lgather_permd
  1557. lea 128($inp), $inp
  1558. mov \$9,$power # the index register is reused as the row counter
  # Every iteration reads one whole 512-byte row and ANDs it with the
  # masks, so the addresses accessed do not depend on the requested index.
  1559. .Loop_gather_1024:
  1560. vmovdqa 32*0-128($inp), %ymm0
  1561. vmovdqa 32*1-128($inp), %ymm1
  1562. vmovdqa 32*2-128($inp), %ymm2
  1563. vmovdqa 32*3-128($inp), %ymm3
  1564. vpand 32*0+128(%rax), %ymm0, %ymm0
  1565. vpand 32*1+128(%rax), %ymm1, %ymm1
  1566. vpand 32*2+128(%rax), %ymm2, %ymm2
  1567. vpor %ymm0, %ymm1, %ymm4
  1568. vpand 32*3+128(%rax), %ymm3, %ymm3
  1569. vmovdqa 32*4-128($inp), %ymm0
  1570. vmovdqa 32*5-128($inp), %ymm1
  1571. vpor %ymm2, %ymm3, %ymm5
  1572. vmovdqa 32*6-128($inp), %ymm2
  1573. vmovdqa 32*7-128($inp), %ymm3
  1574. vpand 32*4+128(%rax), %ymm0, %ymm0
  1575. vpand 32*5+128(%rax), %ymm1, %ymm1
  1576. vpand 32*6+128(%rax), %ymm2, %ymm2
  1577. vpor %ymm0, %ymm4, %ymm4
  1578. vpand 32*7+128(%rax), %ymm3, %ymm3
  1579. vpand 32*8-128($inp), %ymm8, %ymm0
  1580. vpor %ymm1, %ymm5, %ymm5
  1581. vpand 32*9-128($inp), %ymm9, %ymm1
  1582. vpor %ymm2, %ymm4, %ymm4
  1583. vpand 32*10-128($inp),%ymm10, %ymm2
  1584. vpor %ymm3, %ymm5, %ymm5
  1585. vpand 32*11-128($inp),%ymm11, %ymm3
  1586. vpor %ymm0, %ymm4, %ymm4
  1587. vpand 32*12-128($inp),%ymm12, %ymm0
  1588. vpor %ymm1, %ymm5, %ymm5
  1589. vpand 32*13-128($inp),%ymm13, %ymm1
  1590. vpor %ymm2, %ymm4, %ymm4
  1591. vpand 32*14-128($inp),%ymm14, %ymm2
  1592. vpor %ymm3, %ymm5, %ymm5
  1593. vpand 32*15-128($inp),%ymm15, %ymm3
  1594. lea 32*16($inp), $inp # next 512-byte row
  1595. vpor %ymm0, %ymm4, %ymm4
  1596. vpor %ymm1, %ymm5, %ymm5
  1597. vpor %ymm2, %ymm4, %ymm4
  1598. vpor %ymm3, %ymm5, %ymm5
  1599. vpor %ymm5, %ymm4, %ymm4
  1600. vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared
  1601. vpor %xmm4, %xmm5, %xmm5 # fold both 16-byte halves into one entry
  1602. vpermd %ymm5,%ymm7,%ymm5 # spread the four dwords out, pairing each with zero dword 7
  1603. vmovdqu %ymm5,($out)
  1604. lea 32($out),$out
  1605. dec $power
  1606. jnz .Loop_gather_1024
  1607. vpxor %ymm0,%ymm0,%ymm0
  1608. vmovdqu %ymm0,($out) # one more 32-byte block of zeroes after the nine gathered ones
  1609. vzeroupper
  1610. ___
  1611. $code.=<<___ if ($win64); # Win64 only: reload %xmm6-%xmm15, addressed from the entry %rsp in %r11
  1612. movaps -0xa8(%r11),%xmm6
  1613. movaps -0x98(%r11),%xmm7
  1614. movaps -0x88(%r11),%xmm8
  1615. movaps -0x78(%r11),%xmm9
  1616. movaps -0x68(%r11),%xmm10
  1617. movaps -0x58(%r11),%xmm11
  1618. movaps -0x48(%r11),%xmm12
  1619. movaps -0x38(%r11),%xmm13
  1620. movaps -0x28(%r11),%xmm14
  1621. movaps -0x18(%r11),%xmm15
  1622. ___
  1623. $code.=<<___;
  1624. lea (%r11),%rsp # drop the mask area and, on Win64, the save area
  1625. .cfi_def_cfa_register %rsp
  1626. ret
  1627. .cfi_endproc
  1628. .LSEH_end_rsaz_1024_gather5:
  1629. .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
  1630. ___
  1631. }
  1632. $code.=<<___; # capability probe: returns 0 or 1 in %eax
  1633. .extern OPENSSL_ia32cap_P
  1634. .globl rsaz_avx2_eligible
  1635. .type rsaz_avx2_eligible,\@abi-omnipotent
  1636. .align 32
  1637. rsaz_avx2_eligible:
  1638. mov OPENSSL_ia32cap_P+8(%rip),%eax # 32-bit word at byte offset 8 of the capability vector
  1639. ___
  1640. $code.=<<___ if ($addx); # emitted only when the generator was run with \$addx set
  1641. mov \$`1<<8|1<<19`,%ecx
  1642. mov \$0,%edx
  1643. and %eax,%ecx
  1644. cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X
  1645. cmove %edx,%eax # both bits set: %eax is replaced by 0, so the result below is 0
  1646. ___
  1647. $code.=<<___; # result and the read-only constants used by the routines above
  1648. and \$`1<<5`,%eax # isolate bit 5 (presumably the AVX2 feature flag; confirm against the capability layout)
  1649. shr \$5,%eax # return it as 0 or 1
  1650. ret
  1651. .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
  1652. .align 64
  1653. .Land_mask:
  1654. .quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
  1655. .Lscatter_permd:
  1656. .long 0,2,4,6,7,7,7,7
  1657. .Lgather_permd:
  1658. .long 0,7,1,7,2,7,3,7
  1659. .Linc:
  1660. .long 0,0,0,0, 1,1,1,1
  1661. .long 2,2,2,2, 3,3,3,3
  1662. .long 4,4,4,4, 4,4,4,4
  1663. .align 64
  1664. ___
  # Win64-only section: everything up to the closing brace is appended to $code
  # only when $win64 is set.  It consists of
  #   * rsaz_se_handler, the exception handler named in the .xdata records of
  #     the sqr and mul routines;
  #   * three .pdata entries (begin, end and info RVAs) for the sqr, mul and
  #     gather5 routines;
  #   * the three .xdata records those entries point to.
  #
  # What rsaz_se_handler does, in order:
  #   1. Saves rsi, rdi, rbx, rbp, r12-r15 and the flags, then reserves 64
  #      bytes of stack; the four stack-passed arguments of the call in step 6
  #      are later stored at 32..56(%rsp).
  #   2. Loads context->Rax into rax and context->Rip into rbx.  HandlerData[0]
  #      and HandlerData[1] are image-relative, so disp->ImageBase is added
  #      before each comparison.  If Rip is below the first label or at/above
  #      the second, control goes straight to .Lcommon_seh_tail and nothing is
  #      recovered from the interrupted frame.
  #   3. Otherwise a base address is selected: context->Rbp when Rip is below
  #      HandlerData[2] (the "in tail" label), context->Rax when it is not
  #      (that is the cmovc).
  #   4. r15, r14, r13, r12, rbp and rbx are read from -48..-8 relative to that
  #      base and written into the context; 20 quadwords (ten xmm registers)
  #      starting at base-0xd8 are copied to offset 512 of the context.
  #   5. Common tail: context->Rsp is set to rax, context->Rdi and context->Rsi
  #      are reloaded from 8(%rax) and 16(%rax), and the whole context (154
  #      quadwords) is copied into disp->ContextRecord.
  #   6. RtlVirtualUnwind is called with the arguments listed in the inline
  #      comments; the handler then returns 1 (ExceptionContinueSearch).
  # The two ".long 0xa548f3fc" lines are a hand-encoded "cld; rep movsq".
  #
  # In .xdata, the sqr and mul records start with ".byte 9,0,0,0", followed by
  # the handler RVA and three label RVAs (body, epilogue, in_tail); those three
  # are the HandlerData[0..2] values read in steps 2 and 3.  The gather5 record
  # has no handler RVA: it is a list of unwind codes, each explained by the
  # comment on its line.
  # NOTE(review): the meaning of the header bytes (9,0,0,0 and 0x01,0x36,0x17,
  # 0x0b) follows the Windows x64 UNWIND_INFO layout - check them against that
  # specification before changing any prologue these records describe.
  1665. if ($win64) {
  # Register names for the four handler arguments.  Only $context and $disp
  # are referenced in the code below; $rec and $frame are not.
  1666. $rec="%rcx";
  1667. $frame="%rdx";
  1668. $context="%r8";
  1669. $disp="%r9";
  1670. $code.=<<___
  1671. .extern __imp_RtlVirtualUnwind
  1672. .type rsaz_se_handler,\@abi-omnipotent
  1673. .align 16
  1674. rsaz_se_handler:
  1675. push %rsi
  1676. push %rdi
  1677. push %rbx
  1678. push %rbp
  1679. push %r12
  1680. push %r13
  1681. push %r14
  1682. push %r15
  1683. pushfq
  1684. sub \$64,%rsp
  1685. mov 120($context),%rax # pull context->Rax
  1686. mov 248($context),%rbx # pull context->Rip
  1687. mov 8($disp),%rsi # disp->ImageBase
  1688. mov 56($disp),%r11 # disp->HandlerData
  1689. mov 0(%r11),%r10d # HandlerData[0]
  1690. lea (%rsi,%r10),%r10 # prologue label
  1691. cmp %r10,%rbx # context->Rip<prologue label
  1692. jb .Lcommon_seh_tail
  1693. mov 4(%r11),%r10d # HandlerData[1]
  1694. lea (%rsi,%r10),%r10 # epilogue label
  1695. cmp %r10,%rbx # context->Rip>=epilogue label
  1696. jae .Lcommon_seh_tail
  1697. mov 160($context),%rbp # pull context->Rbp
  1698. mov 8(%r11),%r10d # HandlerData[2]
  1699. lea (%rsi,%r10),%r10 # "in tail" label
  1700. cmp %r10,%rbx # context->Rip>="in tail" label
  1701. cmovc %rbp,%rax
  1702. mov -48(%rax),%r15
  1703. mov -40(%rax),%r14
  1704. mov -32(%rax),%r13
  1705. mov -24(%rax),%r12
  1706. mov -16(%rax),%rbp
  1707. mov -8(%rax),%rbx
  1708. mov %r15,240($context)
  1709. mov %r14,232($context)
  1710. mov %r13,224($context)
  1711. mov %r12,216($context)
  1712. mov %rbp,160($context)
  1713. mov %rbx,144($context)
  1714. lea -0xd8(%rax),%rsi # %xmm save area
  1715. lea 512($context),%rdi # & context.Xmm6
  1716. mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
  1717. .long 0xa548f3fc # cld; rep movsq
  1718. .Lcommon_seh_tail:
  1719. mov 8(%rax),%rdi
  1720. mov 16(%rax),%rsi
  1721. mov %rax,152($context) # restore context->Rsp
  1722. mov %rsi,168($context) # restore context->Rsi
  1723. mov %rdi,176($context) # restore context->Rdi
  1724. mov 40($disp),%rdi # disp->ContextRecord
  1725. mov $context,%rsi # context
  1726. mov \$154,%ecx # sizeof(CONTEXT)
  1727. .long 0xa548f3fc # cld; rep movsq
  1728. mov $disp,%rsi
  1729. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1730. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1731. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1732. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1733. mov 40(%rsi),%r10 # disp->ContextRecord
  1734. lea 56(%rsi),%r11 # &disp->HandlerData
  1735. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1736. mov %r10,32(%rsp) # arg5
  1737. mov %r11,40(%rsp) # arg6
  1738. mov %r12,48(%rsp) # arg7
  1739. mov %rcx,56(%rsp) # arg8, (NULL)
  1740. call *__imp_RtlVirtualUnwind(%rip)
  1741. mov \$1,%eax # ExceptionContinueSearch
  1742. add \$64,%rsp
  1743. popfq
  1744. pop %r15
  1745. pop %r14
  1746. pop %r13
  1747. pop %r12
  1748. pop %rbp
  1749. pop %rbx
  1750. pop %rdi
  1751. pop %rsi
  1752. ret
  1753. .size rsaz_se_handler,.-rsaz_se_handler
  1754. .section .pdata
  1755. .align 4
  1756. .rva .LSEH_begin_rsaz_1024_sqr_avx2
  1757. .rva .LSEH_end_rsaz_1024_sqr_avx2
  1758. .rva .LSEH_info_rsaz_1024_sqr_avx2
  1759. .rva .LSEH_begin_rsaz_1024_mul_avx2
  1760. .rva .LSEH_end_rsaz_1024_mul_avx2
  1761. .rva .LSEH_info_rsaz_1024_mul_avx2
  1762. .rva .LSEH_begin_rsaz_1024_gather5
  1763. .rva .LSEH_end_rsaz_1024_gather5
  1764. .rva .LSEH_info_rsaz_1024_gather5
  1765. .section .xdata
  1766. .align 8
  1767. .LSEH_info_rsaz_1024_sqr_avx2:
  1768. .byte 9,0,0,0
  1769. .rva rsaz_se_handler
  1770. .rva .Lsqr_1024_body,.Lsqr_1024_epilogue,.Lsqr_1024_in_tail
  1771. .long 0
  1772. .LSEH_info_rsaz_1024_mul_avx2:
  1773. .byte 9,0,0,0
  1774. .rva rsaz_se_handler
  1775. .rva .Lmul_1024_body,.Lmul_1024_epilogue,.Lmul_1024_in_tail
  1776. .long 0
  1777. .LSEH_info_rsaz_1024_gather5:
  1778. .byte 0x01,0x36,0x17,0x0b
  1779. .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
  1780. .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
  1781. .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
  1782. .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
  1783. .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
  1784. .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
  1785. .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
  1786. .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
  1787. .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
  1788. .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
  1789. .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8
  1790. .byte 0x00,0xb3,0x00,0x00 # set_frame r11
  1791. ___
  1792. }
  # Final pass: walk the text accumulated in $code line by line, fix each line
  # up and write it to STDOUT followed by a newline.
  for (split("\n",$code)) {
      # Replace every `...` span with the value of the Perl expression inside it.
      s/\`([^\`]*)\`/eval($1)/ge;

      # The rewrites below are alternatives: evaluation stops at the first one
      # that matches, so a line receives at most one of them.
      s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge                # immediate shift count, reduced modulo 64
          || s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go       # vmovd/vmovq: %ymmN becomes %xmmN
          || s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go      # vmovdqu: the spelling %x%ymmN becomes %xmmN
          || s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go     # vpinsrq/vpinsrd: %ymmN becomes %xmmN
          || s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go     # vpextrq/vpextrd: %ymmN becomes %xmmN
          || s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;    # vpbroadcastq/d: a leading %ymmN operand becomes %xmmN

      print $_,"\n";
  }
  1803. }}} else {{{
  # Fallback branch (labelled "assembler is too old" on the print line): the
  # text below is printed directly instead of being appended to $code, so it
  # does not go through the line-by-line fix-up loop of the other branch.
  #
  # It provides every symbol the full implementation exports:
  #   * rsaz_avx2_eligible clears %eax and returns, i.e. it always answers 0;
  #   * the six rsaz_1024_* entry points are all labels on one shared stub
  #     whose first instruction is ud2 (emitted as the bytes 0x0f,0x0b).
  # Only rsaz_1024_sqr_avx2 gets .type and .size directives; the other five
  # names are plain labels at the same address.
  # NOTE(review): \@abi-omnipotent is not an assembler keyword, so this output
  # is presumably read by a translator that pairs .type, label and .size in
  # the order written - verify before reordering any of these lines.
  1804. print <<___; # assembler is too old
  1805. .text
  1806. .globl rsaz_avx2_eligible
  1807. .type rsaz_avx2_eligible,\@abi-omnipotent
  1808. rsaz_avx2_eligible:
  1809. xor %eax,%eax
  1810. ret
  1811. .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
  1812. .globl rsaz_1024_sqr_avx2
  1813. .globl rsaz_1024_mul_avx2
  1814. .globl rsaz_1024_norm2red_avx2
  1815. .globl rsaz_1024_red2norm_avx2
  1816. .globl rsaz_1024_scatter5_avx2
  1817. .globl rsaz_1024_gather5_avx2
  1818. .type rsaz_1024_sqr_avx2,\@abi-omnipotent
  1819. rsaz_1024_sqr_avx2:
  1820. rsaz_1024_mul_avx2:
  1821. rsaz_1024_norm2red_avx2:
  1822. rsaz_1024_red2norm_avx2:
  1823. rsaz_1024_scatter5_avx2:
  1824. rsaz_1024_gather5_avx2:
  1825. .byte 0x0f,0x0b # ud2
  1826. ret
  1827. .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
  1828. ___
  1829. }}}
  # Both branches above write their result to STDOUT.  Closing it can fail when
  # pending output cannot be written; stop with the system error text instead
  # of letting that pass unnoticed.
  close(STDOUT) or die("error closing STDOUT: $!");