#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2012, Intel Corporation. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
#
# References:
# [1] S. Gueron, "Efficient Software Implementations of Modular
#     Exponentiation", http://eprint.iacr.org/2011/239
# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".
#     IEEE Proceedings of 9th International Conference on Information
#     Technology: New Generations (ITNG 2012), 821-823 (2012).
# [3] S. Gueron, "Efficient Software Implementations of Modular Exponentiation",
#     Journal of Cryptographic Engineering 2:31-43 (2012).
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
#     resistant 512-bit and 1024-bit modular exponentiation for optimizing
#     RSA1024 and RSA2048 on x86_64 platforms",
#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest
#
# While original submission covers 512- and 1024-bit exponentiation,
# this module is limited to 512-bit version only (and as such
# accelerates RSA1024 sign). This is because improvement for longer
# keys is not high enough to justify the effort, highest measured
# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
# for the moment of this writing!] Nor does this module implement
# "monolithic" complete exponentiation jumbo-subroutine, but adheres
# to more modular mixture of C and assembly. And it's optimized even
# for processors other than Intel Core family (see table below for
# improvement coefficients).
# <appro@openssl.org>
#
# RSA1024 sign/sec   this/original   |this/rsax(*)   this/fips(*)
# -------------------+--------------------------------------------
# Opteron            +13%            |+5%            +20%
# Bulldozer          -0%             |-1%            +10%
# P4                 +11%            |+7%            +8%
# Westmere           +5%             |+14%           +17%
# Sandy Bridge       +2%             |+12%           +29%
# Ivy Bridge         +1%             |+11%           +35%
# Haswell(**)        -0%             |+12%           +39%
# Atom               +13%            |+11%           +4%
# VIA Nano           +70%            |+9%            +25%
#
# (*)  rsax engine and fips numbers are presented for reference
#      purposes;
# (**) MULX was attempted, but found to give only marginal improvement;
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
    $addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
    $addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
    my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
    $addx = ($ver>=3.03);
}
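
# Note: $addx only gates *emission* of the ADCX/ADOX/MULX code paths; it is
# set when the assembler is new enough to encode those instructions (gas
# 2.23, nasm 2.10, ml64 12, llvm 3.3). The choice between the paths is
# still made at run time by the generated code, which tests the BMI2
# (MULX) and ADX (ADCX/ADOX) bits of OPENSSL_ia32cap_P -- these are the
# 0x80100 compares that appear in each entry point below.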
($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
{
my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
$code.=<<___;
.text
.extern OPENSSL_ia32cap_P
.globl rsaz_512_sqr
.type rsaz_512_sqr,\@function,5
.align 32
rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
.cfi_startproc
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
subq \$128+24, %rsp
.cfi_adjust_cfa_offset 128+24
.Lsqr_body:
movq $mod, %xmm1 # common off-load
movq ($inp), %rdx
movq 8($inp), %rax
movq $n0, 128(%rsp)
___
$code.=<<___ if ($addx);
movl \$0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl \$0x80100,%r11d # check for MULX and ADO/CX
je .Loop_sqrx
___
$code.=<<___;
jmp .Loop_sqr
.align 32
.Loop_sqr:
movl $times,128+8(%rsp)
#first iteration
movq %rdx, %rbx # 0($inp)
mov %rax, %rbp # 8($inp)
mulq %rdx
movq %rax, %r8
movq 16($inp), %rax
movq %rdx, %r9
mulq %rbx
addq %rax, %r9
movq 24($inp), %rax
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
addq %rax, %r10
movq 32($inp), %rax
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
addq %rax, %r11
movq 40($inp), %rax
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r12
movq 48($inp), %rax
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r13
movq 56($inp), %rax
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
addq %rax, %r14
movq %rbx, %rax
adcq \$0, %rdx
xorq %rcx,%rcx # rcx:r8 = r8 << 1
addq %r8, %r8
movq %rdx, %r15
adcq \$0, %rcx
mulq %rax
addq %r8, %rdx
adcq \$0, %rcx
movq %rax, (%rsp)
movq %rdx, 8(%rsp)
#second iteration
movq 16($inp), %rax
mulq %rbp
addq %rax, %r10
movq 24($inp), %rax
movq %rdx, %rbx
adcq \$0, %rbx
mulq %rbp
addq %rax, %r11
movq 32($inp), %rax
adcq \$0, %rdx
addq %rbx, %r11
movq %rdx, %rbx
adcq \$0, %rbx
mulq %rbp
addq %rax, %r12
movq 40($inp), %rax
adcq \$0, %rdx
addq %rbx, %r12
movq %rdx, %rbx
adcq \$0, %rbx
mulq %rbp
addq %rax, %r13
movq 48($inp), %rax
adcq \$0, %rdx
addq %rbx, %r13
movq %rdx, %rbx
adcq \$0, %rbx
mulq %rbp
addq %rax, %r14
movq 56($inp), %rax
adcq \$0, %rdx
addq %rbx, %r14
movq %rdx, %rbx
adcq \$0, %rbx
mulq %rbp
addq %rax, %r15
movq %rbp, %rax
adcq \$0, %rdx
addq %rbx, %r15
adcq \$0, %rdx
xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
addq %r9, %r9
movq %rdx, %r8
adcq %r10, %r10
adcq \$0, %rbx
mulq %rax
# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
addq %rcx, %rax
movq 16($inp), %rbp
addq %rax, %r9
movq 24($inp), %rax
adcq %rdx, %r10
adcq \$0, %rbx
movq %r9, 16(%rsp)
movq %r10, 24(%rsp)
#third iteration
mulq %rbp
addq %rax, %r12
movq 32($inp), %rax
movq %rdx, %rcx
adcq \$0, %rcx
mulq %rbp
addq %rax, %r13
movq 40($inp), %rax
adcq \$0, %rdx
addq %rcx, %r13
movq %rdx, %rcx
adcq \$0, %rcx
mulq %rbp
addq %rax, %r14
movq 48($inp), %rax
adcq \$0, %rdx
addq %rcx, %r14
movq %rdx, %rcx
adcq \$0, %rcx
mulq %rbp
addq %rax, %r15
movq 56($inp), %rax
adcq \$0, %rdx
addq %rcx, %r15
movq %rdx, %rcx
adcq \$0, %rcx
mulq %rbp
addq %rax, %r8
movq %rbp, %rax
adcq \$0, %rdx
addq %rcx, %r8
adcq \$0, %rdx
xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
addq %r11, %r11
movq %rdx, %r9
adcq %r12, %r12
adcq \$0, %rcx
mulq %rax
# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
addq %rbx, %rax
movq 24($inp), %r10
addq %rax, %r11
movq 32($inp), %rax
adcq %rdx, %r12
adcq \$0, %rcx
movq %r11, 32(%rsp)
movq %r12, 40(%rsp)
#fourth iteration
mov %rax, %r11 # 32($inp)
mulq %r10
addq %rax, %r14
movq 40($inp), %rax
movq %rdx, %rbx
adcq \$0, %rbx
mov %rax, %r12 # 40($inp)
mulq %r10
addq %rax, %r15
movq 48($inp), %rax
adcq \$0, %rdx
addq %rbx, %r15
movq %rdx, %rbx
adcq \$0, %rbx
mov %rax, %rbp # 48($inp)
mulq %r10
addq %rax, %r8
movq 56($inp), %rax
adcq \$0, %rdx
addq %rbx, %r8
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r10
addq %rax, %r9
movq %r10, %rax
adcq \$0, %rdx
addq %rbx, %r9
adcq \$0, %rdx
xorq %rbx, %rbx # rbx:r13:r14 = r13:r14 << 1
addq %r13, %r13
movq %rdx, %r10
adcq %r14, %r14
adcq \$0, %rbx
mulq %rax
# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
addq %rcx, %rax
addq %rax, %r13
movq %r12, %rax # 40($inp)
adcq %rdx, %r14
adcq \$0, %rbx
movq %r13, 48(%rsp)
movq %r14, 56(%rsp)
#fifth iteration
mulq %r11
addq %rax, %r8
movq %rbp, %rax # 48($inp)
movq %rdx, %rcx
adcq \$0, %rcx
mulq %r11
addq %rax, %r9
movq 56($inp), %rax
adcq \$0, %rdx
addq %rcx, %r9
movq %rdx, %rcx
adcq \$0, %rcx
mov %rax, %r14 # 56($inp)
mulq %r11
addq %rax, %r10
movq %r11, %rax
adcq \$0, %rdx
addq %rcx, %r10
adcq \$0, %rdx
xorq %rcx, %rcx # rcx:r8:r15 = r8:r15 << 1
addq %r15, %r15
movq %rdx, %r11
adcq %r8, %r8
adcq \$0, %rcx
mulq %rax
# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
addq %rbx, %rax
addq %rax, %r15
movq %rbp, %rax # 48($inp)
adcq %rdx, %r8
adcq \$0, %rcx
movq %r15, 64(%rsp)
movq %r8, 72(%rsp)
#sixth iteration
mulq %r12
addq %rax, %r10
movq %r14, %rax # 56($inp)
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r12
addq %rax, %r11
movq %r12, %rax
adcq \$0, %rdx
addq %rbx, %r11
adcq \$0, %rdx
xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
addq %r9, %r9
movq %rdx, %r12
adcq %r10, %r10
adcq \$0, %rbx
mulq %rax
# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
addq %rcx, %rax
addq %rax, %r9
movq %r14, %rax # 56($inp)
adcq %rdx, %r10
adcq \$0, %rbx
movq %r9, 80(%rsp)
movq %r10, 88(%rsp)
#seventh iteration
mulq %rbp
addq %rax, %r12
movq %rbp, %rax
adcq \$0, %rdx
xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
addq %r11, %r11
movq %rdx, %r13
adcq %r12, %r12
adcq \$0, %rcx
mulq %rax
# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
addq %rbx, %rax
addq %rax, %r11
movq %r14, %rax # 56($inp)
adcq %rdx, %r12
adcq \$0, %rcx
movq %r11, 96(%rsp)
movq %r12, 104(%rsp)
#eighth iteration
xorq %rbx, %rbx # rbx:r13 = r13 << 1
addq %r13, %r13
adcq \$0, %rbx
mulq %rax
# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
addq %rcx, %rax
addq %r13, %rax
adcq %rbx, %rdx
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
movq %xmm1, %rbp
movq %rax, 112(%rsp)
movq %rdx, 120(%rsp)
call __rsaz_512_reduce
addq 64(%rsp), %r8
adcq 72(%rsp), %r9
adcq 80(%rsp), %r10
adcq 88(%rsp), %r11
adcq 96(%rsp), %r12
adcq 104(%rsp), %r13
adcq 112(%rsp), %r14
adcq 120(%rsp), %r15
sbbq %rcx, %rcx
call __rsaz_512_subtract
movq %r8, %rdx
movq %r9, %rax
movl 128+8(%rsp), $times
movq $out, $inp
decl $times
jnz .Loop_sqr
___
if ($addx) {
$code.=<<___;
jmp .Lsqr_tail
.align 32
.Loop_sqrx:
movl $times,128+8(%rsp)
movq $out, %xmm0 # off-load
#first iteration
mulx %rax, %r8, %r9
mov %rax, %rbx
mulx 16($inp), %rcx, %r10
xor %rbp, %rbp # cf=0, of=0
mulx 24($inp), %rax, %r11
adcx %rcx, %r9
.byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($inp), %rcx, %r12
adcx %rax, %r10
.byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 # mulx 40($inp), %rax, %r13
adcx %rcx, %r11
mulx 48($inp), %rcx, %r14
adcx %rax, %r12
adcx %rcx, %r13
mulx 56($inp), %rax, %r15
adcx %rax, %r14
adcx %rbp, %r15 # %rbp is 0
mulx %rdx, %rax, $out
mov %rbx, %rdx # 8($inp)
xor %rcx, %rcx
adox %r8, %r8
adcx $out, %r8
adox %rbp, %rcx
adcx %rbp, %rcx
mov %rax, (%rsp)
mov %r8, 8(%rsp)
#second iteration
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 # mulx 16($inp), %rax, %rbx
adox %rax, %r10
adcx %rbx, %r11
mulx 24($inp), $out, %r8
adox $out, %r11
.byte 0x66
adcx %r8, %r12
mulx 32($inp), %rax, %rbx
adox %rax, %r12
adcx %rbx, %r13
mulx 40($inp), $out, %r8
adox $out, %r13
adcx %r8, %r14
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
adox %rax, %r14
adcx %rbx, %r15
.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
adox $out, %r15
adcx %rbp, %r8
mulx %rdx, %rax, $out
adox %rbp, %r8
.byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 # mov 16($inp), %rdx
xor %rbx, %rbx
adox %r9, %r9
# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
adcx %rcx, %rax
adox %r10, %r10
adcx %rax, %r9
adox %rbp, %rbx
adcx $out, %r10
adcx %rbp, %rbx
mov %r9, 16(%rsp)
.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
#third iteration
mulx 24($inp), $out, %r9
adox $out, %r12
adcx %r9, %r13
mulx 32($inp), %rax, %rcx
adox %rax, %r13
adcx %rcx, %r14
.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r9
adox $out, %r14
adcx %r9, %r15
.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
adox %rax, %r15
adcx %rcx, %r8
mulx 56($inp), $out, %r9
adox $out, %r8
adcx %rbp, %r9
mulx %rdx, %rax, $out
adox %rbp, %r9
mov 24($inp), %rdx
xor %rcx, %rcx
adox %r11, %r11
# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
adcx %rbx, %rax
adox %r12, %r12
adcx %rax, %r11
adox %rbp, %rcx
adcx $out, %r12
adcx %rbp, %rcx
mov %r11, 32(%rsp)
mov %r12, 40(%rsp)
#fourth iteration
mulx 32($inp), %rax, %rbx
adox %rax, %r14
adcx %rbx, %r15
mulx 40($inp), $out, %r10
adox $out, %r15
adcx %r10, %r8
mulx 48($inp), %rax, %rbx
adox %rax, %r8
adcx %rbx, %r9
mulx 56($inp), $out, %r10
adox $out, %r9
adcx %rbp, %r10
mulx %rdx, %rax, $out
adox %rbp, %r10
mov 32($inp), %rdx
xor %rbx, %rbx
adox %r13, %r13
# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
adcx %rcx, %rax
adox %r14, %r14
adcx %rax, %r13
adox %rbp, %rbx
adcx $out, %r14
adcx %rbp, %rbx
mov %r13, 48(%rsp)
mov %r14, 56(%rsp)
#fifth iteration
mulx 40($inp), $out, %r11
adox $out, %r8
adcx %r11, %r9
mulx 48($inp), %rax, %rcx
adox %rax, %r9
adcx %rcx, %r10
mulx 56($inp), $out, %r11
adox $out, %r10
adcx %rbp, %r11
mulx %rdx, %rax, $out
mov 40($inp), %rdx
adox %rbp, %r11
xor %rcx, %rcx
adox %r15, %r15
# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
adcx %rbx, %rax
adox %r8, %r8
adcx %rax, %r15
adox %rbp, %rcx
adcx $out, %r8
adcx %rbp, %rcx
mov %r15, 64(%rsp)
mov %r8, 72(%rsp)
#sixth iteration
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
adox %rax, %r10
adcx %rbx, %r11
.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
adox $out, %r11
adcx %rbp, %r12
mulx %rdx, %rax, $out
adox %rbp, %r12
mov 48($inp), %rdx
xor %rbx, %rbx
adox %r9, %r9
# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
adcx %rcx, %rax
adox %r10, %r10
adcx %rax, %r9
adcx $out, %r10
adox %rbp, %rbx
adcx %rbp, %rbx
mov %r9, 80(%rsp)
mov %r10, 88(%rsp)
#seventh iteration
.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
adox %rax, %r12
adox %rbp, %r13
mulx %rdx, %rax, $out
xor %rcx, %rcx
mov 56($inp), %rdx
adox %r11, %r11
# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
adcx %rbx, %rax
adox %r12, %r12
adcx %rax, %r11
adox %rbp, %rcx
adcx $out, %r12
adcx %rbp, %rcx
.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
#eighth iteration
mulx %rdx, %rax, %rdx
xor %rbx, %rbx
adox %r13, %r13
# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
adcx %rcx, %rax
adox %rbp, %rbx
adcx %r13, %rax
adcx %rdx, %rbx
movq %xmm0, $out
movq %xmm1, %rbp
movq 128(%rsp), %rdx # pull $n0
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
movq %rax, 112(%rsp)
movq %rbx, 120(%rsp)
call __rsaz_512_reducex
addq 64(%rsp), %r8
adcq 72(%rsp), %r9
adcq 80(%rsp), %r10
adcq 88(%rsp), %r11
adcq 96(%rsp), %r12
adcq 104(%rsp), %r13
adcq 112(%rsp), %r14
adcq 120(%rsp), %r15
sbbq %rcx, %rcx
call __rsaz_512_subtract
movq %r8, %rdx
movq %r9, %rax
movl 128+8(%rsp), $times
movq $out, $inp
decl $times
jnz .Loop_sqrx
.Lsqr_tail:
___
}
$code.=<<___;
leaq 128+24+48(%rsp), %rax
.cfi_def_cfa %rax,8
movq -48(%rax), %r15
.cfi_restore %r15
movq -40(%rax), %r14
.cfi_restore %r14
movq -32(%rax), %r13
.cfi_restore %r13
movq -24(%rax), %r12
.cfi_restore %r12
movq -16(%rax), %rbp
.cfi_restore %rbp
movq -8(%rax), %rbx
.cfi_restore %rbx
leaq (%rax), %rsp
.cfi_def_cfa_register %rsp
.Lsqr_epilogue:
ret
.cfi_endproc
.size rsaz_512_sqr,.-rsaz_512_sqr
___
}
{
my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$code.=<<___;
.globl rsaz_512_mul
.type rsaz_512_mul,\@function,5
.align 32
rsaz_512_mul:
.cfi_startproc
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
subq \$128+24, %rsp
.cfi_adjust_cfa_offset 128+24
.Lmul_body:
movq $out, %xmm0 # off-load arguments
movq $mod, %xmm1
movq $n0, 128(%rsp)
___
$code.=<<___ if ($addx);
movl \$0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl \$0x80100,%r11d # check for MULX and ADO/CX
je .Lmulx
___
$code.=<<___;
movq ($bp), %rbx # pass b[0]
movq $bp, %rbp # pass argument
call __rsaz_512_mul
movq %xmm0, $out
movq %xmm1, %rbp
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reduce
___
$code.=<<___ if ($addx);
jmp .Lmul_tail
.align 32
.Lmulx:
movq $bp, %rbp # pass argument
movq ($bp), %rdx # pass b[0]
call __rsaz_512_mulx
movq %xmm0, $out
movq %xmm1, %rbp
movq 128(%rsp), %rdx # pull $n0
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reducex
.Lmul_tail:
___
$code.=<<___;
addq 64(%rsp), %r8
adcq 72(%rsp), %r9
adcq 80(%rsp), %r10
adcq 88(%rsp), %r11
adcq 96(%rsp), %r12
adcq 104(%rsp), %r13
adcq 112(%rsp), %r14
adcq 120(%rsp), %r15
sbbq %rcx, %rcx
call __rsaz_512_subtract
leaq 128+24+48(%rsp), %rax
.cfi_def_cfa %rax,8
movq -48(%rax), %r15
.cfi_restore %r15
movq -40(%rax), %r14
.cfi_restore %r14
movq -32(%rax), %r13
.cfi_restore %r13
movq -24(%rax), %r12
.cfi_restore %r12
movq -16(%rax), %rbp
.cfi_restore %rbp
movq -8(%rax), %rbx
.cfi_restore %rbx
leaq (%rax), %rsp
.cfi_def_cfa_register %rsp
.Lmul_epilogue:
ret
.cfi_endproc
.size rsaz_512_mul,.-rsaz_512_mul
___
}
{
my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl rsaz_512_mul_gather4
.type rsaz_512_mul_gather4,\@function,6
.align 32
rsaz_512_mul_gather4:
.cfi_startproc
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
subq \$`128+24+($win64?0xb0:0)`, %rsp
.cfi_adjust_cfa_offset `128+24+($win64?0xb0:0)`
___
$code.=<<___ if ($win64);
movaps %xmm6,0xa0(%rsp)
movaps %xmm7,0xb0(%rsp)
movaps %xmm8,0xc0(%rsp)
movaps %xmm9,0xd0(%rsp)
movaps %xmm10,0xe0(%rsp)
movaps %xmm11,0xf0(%rsp)
movaps %xmm12,0x100(%rsp)
movaps %xmm13,0x110(%rsp)
movaps %xmm14,0x120(%rsp)
movaps %xmm15,0x130(%rsp)
___
$code.=<<___;
.Lmul_gather4_body:
movd $pwr,%xmm8
movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
pshufd \$0,%xmm8,%xmm8 # broadcast $power
movdqa %xmm1,%xmm7
movdqa %xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
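# The loops below leave compare masks in %xmm0..%xmm7: .Linc seeds %xmm0
# with counters {0,0,1,1} and %xmm1 with the increment {2,2,2,2} (32-bit
# lanes), so after the paddd/pcmpeqd chain %xmmN holds the result of
# comparing indices 2N and 2N+1 against the broadcast $power. Worked
# example: for $power == 5 only the upper 64-bit half of %xmm2 (index 5)
# becomes all-ones, and the pand/por cascade further down extracts that
# single 64-bit table entry out of 16 candidates without any
# data-dependent load address -- a constant-time gather.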
for($i=0;$i<4;$i++) {
$code.=<<___;
paddd %xmm`$i`,%xmm`$i+1`
pcmpeqd %xmm8,%xmm`$i`
movdqa %xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
paddd %xmm`$i`,%xmm`$i+1`
pcmpeqd %xmm8,%xmm`$i`
___
}
$code.=<<___;
pcmpeqd %xmm8,%xmm7
movdqa 16*0($bp),%xmm8
movdqa 16*1($bp),%xmm9
movdqa 16*2($bp),%xmm10
movdqa 16*3($bp),%xmm11
pand %xmm0,%xmm8
movdqa 16*4($bp),%xmm12
pand %xmm1,%xmm9
movdqa 16*5($bp),%xmm13
pand %xmm2,%xmm10
movdqa 16*6($bp),%xmm14
pand %xmm3,%xmm11
movdqa 16*7($bp),%xmm15
leaq 128($bp), %rbp
pand %xmm4,%xmm12
pand %xmm5,%xmm13
pand %xmm6,%xmm14
pand %xmm7,%xmm15
por %xmm10,%xmm8
por %xmm11,%xmm9
por %xmm12,%xmm8
por %xmm13,%xmm9
por %xmm14,%xmm8
por %xmm15,%xmm9
por %xmm9,%xmm8
pshufd \$0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
___
$code.=<<___ if ($addx);
movl \$0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl \$0x80100,%r11d # check for MULX and ADO/CX
je .Lmulx_gather
___
$code.=<<___;
movq %xmm8,%rbx
movq $n0, 128(%rsp) # off-load arguments
movq $out, 128+8(%rsp)
movq $mod, 128+16(%rsp)
movq ($ap), %rax
movq 8($ap), %rcx
mulq %rbx # 0 iteration
movq %rax, (%rsp)
movq %rcx, %rax
movq %rdx, %r8
mulq %rbx
addq %rax, %r8
movq 16($ap), %rax
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
addq %rax, %r9
movq 24($ap), %rax
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
addq %rax, %r10
movq 32($ap), %rax
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
addq %rax, %r11
movq 40($ap), %rax
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r12
movq 48($ap), %rax
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r13
movq 56($ap), %rax
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
addq %rax, %r14
movq ($ap), %rax
movq %rdx, %r15
adcq \$0, %r15
leaq 8(%rsp), %rdi
movl \$7, %ecx
jmp .Loop_mul_gather
.align 32
.Loop_mul_gather:
movdqa 16*0(%rbp),%xmm8
movdqa 16*1(%rbp),%xmm9
movdqa 16*2(%rbp),%xmm10
movdqa 16*3(%rbp),%xmm11
pand %xmm0,%xmm8
movdqa 16*4(%rbp),%xmm12
pand %xmm1,%xmm9
movdqa 16*5(%rbp),%xmm13
pand %xmm2,%xmm10
movdqa 16*6(%rbp),%xmm14
pand %xmm3,%xmm11
movdqa 16*7(%rbp),%xmm15
leaq 128(%rbp), %rbp
pand %xmm4,%xmm12
pand %xmm5,%xmm13
pand %xmm6,%xmm14
pand %xmm7,%xmm15
por %xmm10,%xmm8
por %xmm11,%xmm9
por %xmm12,%xmm8
por %xmm13,%xmm9
por %xmm14,%xmm8
por %xmm15,%xmm9
por %xmm9,%xmm8
pshufd \$0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
movq %xmm8,%rbx
mulq %rbx
addq %rax, %r8
movq 8($ap), %rax
movq %r8, (%rdi)
movq %rdx, %r8
adcq \$0, %r8
mulq %rbx
addq %rax, %r9
movq 16($ap), %rax
adcq \$0, %rdx
addq %r9, %r8
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
addq %rax, %r10
movq 24($ap), %rax
adcq \$0, %rdx
addq %r10, %r9
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
addq %rax, %r11
movq 32($ap), %rax
adcq \$0, %rdx
addq %r11, %r10
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
addq %rax, %r12
movq 40($ap), %rax
adcq \$0, %rdx
addq %r12, %r11
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r13
movq 48($ap), %rax
adcq \$0, %rdx
addq %r13, %r12
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r14
movq 56($ap), %rax
adcq \$0, %rdx
addq %r14, %r13
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
addq %rax, %r15
movq ($ap), %rax
adcq \$0, %rdx
addq %r15, %r14
movq %rdx, %r15
adcq \$0, %r15
leaq 8(%rdi), %rdi
decl %ecx
jnz .Loop_mul_gather
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, 32(%rdi)
movq %r13, 40(%rdi)
movq %r14, 48(%rdi)
movq %r15, 56(%rdi)
movq 128+8(%rsp), $out
movq 128+16(%rsp), %rbp
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reduce
___
$code.=<<___ if ($addx);
jmp .Lmul_gather_tail
.align 32
.Lmulx_gather:
movq %xmm8,%rdx
mov $n0, 128(%rsp) # off-load arguments
mov $out, 128+8(%rsp)
mov $mod, 128+16(%rsp)
mulx ($ap), %rbx, %r8 # 0 iteration
mov %rbx, (%rsp)
xor %edi, %edi # cf=0, of=0
mulx 8($ap), %rax, %r9
mulx 16($ap), %rbx, %r10
adcx %rax, %r8
mulx 24($ap), %rax, %r11
adcx %rbx, %r9
mulx 32($ap), %rbx, %r12
adcx %rax, %r10
mulx 40($ap), %rax, %r13
adcx %rbx, %r11
mulx 48($ap), %rbx, %r14
adcx %rax, %r12
mulx 56($ap), %rax, %r15
adcx %rbx, %r13
adcx %rax, %r14
.byte 0x67
mov %r8, %rbx
adcx %rdi, %r15 # %rdi is 0
mov \$-7, %rcx
jmp .Loop_mulx_gather
.align 32
.Loop_mulx_gather:
movdqa 16*0(%rbp),%xmm8
movdqa 16*1(%rbp),%xmm9
movdqa 16*2(%rbp),%xmm10
movdqa 16*3(%rbp),%xmm11
pand %xmm0,%xmm8
movdqa 16*4(%rbp),%xmm12
pand %xmm1,%xmm9
movdqa 16*5(%rbp),%xmm13
pand %xmm2,%xmm10
movdqa 16*6(%rbp),%xmm14
pand %xmm3,%xmm11
movdqa 16*7(%rbp),%xmm15
leaq 128(%rbp), %rbp
pand %xmm4,%xmm12
pand %xmm5,%xmm13
pand %xmm6,%xmm14
pand %xmm7,%xmm15
por %xmm10,%xmm8
por %xmm11,%xmm9
por %xmm12,%xmm8
por %xmm13,%xmm9
por %xmm14,%xmm8
por %xmm15,%xmm9
por %xmm9,%xmm8
pshufd \$0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
movq %xmm8,%rdx
.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
adcx %rax, %rbx
adox %r9, %r8
mulx 8($ap), %rax, %r9
adcx %rax, %r8
adox %r10, %r9
mulx 16($ap), %rax, %r10
adcx %rax, %r9
adox %r11, %r10
.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
adcx %rax, %r10
adox %r12, %r11
mulx 32($ap), %rax, %r12
adcx %rax, %r11
adox %r13, %r12
mulx 40($ap), %rax, %r13
adcx %rax, %r12
adox %r14, %r13
.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
adcx %rax, %r13
.byte 0x67
adox %r15, %r14
mulx 56($ap), %rax, %r15
mov %rbx, 64(%rsp,%rcx,8)
adcx %rax, %r14
adox %rdi, %r15
mov %r8, %rbx
adcx %rdi, %r15 # cf=0
inc %rcx # of=0
jnz .Loop_mulx_gather
mov %r8, 64(%rsp)
mov %r9, 64+8(%rsp)
mov %r10, 64+16(%rsp)
mov %r11, 64+24(%rsp)
mov %r12, 64+32(%rsp)
mov %r13, 64+40(%rsp)
mov %r14, 64+48(%rsp)
mov %r15, 64+56(%rsp)
mov 128(%rsp), %rdx # pull arguments
mov 128+8(%rsp), $out
mov 128+16(%rsp), %rbp
mov (%rsp), %r8
mov 8(%rsp), %r9
mov 16(%rsp), %r10
mov 24(%rsp), %r11
mov 32(%rsp), %r12
mov 40(%rsp), %r13
mov 48(%rsp), %r14
mov 56(%rsp), %r15
call __rsaz_512_reducex
.Lmul_gather_tail:
___
$code.=<<___;
addq 64(%rsp), %r8
adcq 72(%rsp), %r9
adcq 80(%rsp), %r10
adcq 88(%rsp), %r11
adcq 96(%rsp), %r12
adcq 104(%rsp), %r13
adcq 112(%rsp), %r14
adcq 120(%rsp), %r15
sbbq %rcx, %rcx
call __rsaz_512_subtract
leaq 128+24+48(%rsp), %rax
___
$code.=<<___ if ($win64);
movaps 0xa0-0xc8(%rax),%xmm6
movaps 0xb0-0xc8(%rax),%xmm7
movaps 0xc0-0xc8(%rax),%xmm8
movaps 0xd0-0xc8(%rax),%xmm9
movaps 0xe0-0xc8(%rax),%xmm10
movaps 0xf0-0xc8(%rax),%xmm11
movaps 0x100-0xc8(%rax),%xmm12
movaps 0x110-0xc8(%rax),%xmm13
movaps 0x120-0xc8(%rax),%xmm14
movaps 0x130-0xc8(%rax),%xmm15
lea 0xb0(%rax),%rax
___
$code.=<<___;
.cfi_def_cfa %rax,8
movq -48(%rax), %r15
.cfi_restore %r15
movq -40(%rax), %r14
.cfi_restore %r14
movq -32(%rax), %r13
.cfi_restore %r13
movq -24(%rax), %r12
.cfi_restore %r12
movq -16(%rax), %rbp
.cfi_restore %rbp
movq -8(%rax), %rbx
.cfi_restore %rbx
leaq (%rax), %rsp
.cfi_def_cfa_register %rsp
.Lmul_gather4_epilogue:
ret
.cfi_endproc
.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
___
}
{
my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl rsaz_512_mul_scatter4
.type rsaz_512_mul_scatter4,\@function,6
.align 32
rsaz_512_mul_scatter4:
.cfi_startproc
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
mov $pwr, $pwr
subq \$128+24, %rsp
.cfi_adjust_cfa_offset 128+24
.Lmul_scatter4_body:
leaq ($tbl,$pwr,8), $tbl
movq $out, %xmm0 # off-load arguments
movq $mod, %xmm1
movq $tbl, %xmm2
movq $n0, 128(%rsp)
movq $out, %rbp
___
$code.=<<___ if ($addx);
movl \$0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl \$0x80100,%r11d # check for MULX and ADO/CX
je .Lmulx_scatter
___
$code.=<<___;
movq ($out),%rbx # pass b[0]
call __rsaz_512_mul
movq %xmm0, $out
movq %xmm1, %rbp
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reduce
___
$code.=<<___ if ($addx);
jmp .Lmul_scatter_tail
.align 32
.Lmulx_scatter:
movq ($out), %rdx # pass b[0]
call __rsaz_512_mulx
movq %xmm0, $out
movq %xmm1, %rbp
movq 128(%rsp), %rdx # pull $n0
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reducex
.Lmul_scatter_tail:
___
$code.=<<___;
addq 64(%rsp), %r8
adcq 72(%rsp), %r9
adcq 80(%rsp), %r10
adcq 88(%rsp), %r11
adcq 96(%rsp), %r12
adcq 104(%rsp), %r13
adcq 112(%rsp), %r14
adcq 120(%rsp), %r15
movq %xmm2, $inp
sbbq %rcx, %rcx
call __rsaz_512_subtract
movq %r8, 128*0($inp) # scatter
movq %r9, 128*1($inp)
movq %r10, 128*2($inp)
movq %r11, 128*3($inp)
movq %r12, 128*4($inp)
movq %r13, 128*5($inp)
movq %r14, 128*6($inp)
movq %r15, 128*7($inp)
leaq 128+24+48(%rsp), %rax
.cfi_def_cfa %rax,8
movq -48(%rax), %r15
.cfi_restore %r15
movq -40(%rax), %r14
.cfi_restore %r14
movq -32(%rax), %r13
.cfi_restore %r13
movq -24(%rax), %r12
.cfi_restore %r12
movq -16(%rax), %rbp
.cfi_restore %rbp
movq -8(%rax), %rbx
.cfi_restore %rbx
leaq (%rax), %rsp
.cfi_def_cfa_register %rsp
.Lmul_scatter4_epilogue:
ret
.cfi_endproc
.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
___
}
{
my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
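# rsaz_512_mul_by_one in effect multiplies by 1 and Montgomery-reduces:
# the temporary's upper half is zeroed and the input becomes the lower
# half, so a single __rsaz_512_reduce(x) pass yields inp * 2^-512 mod
# `mod` -- typically used to convert a result out of Montgomery form.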
$code.=<<___;
.globl rsaz_512_mul_by_one
.type rsaz_512_mul_by_one,\@function,4
.align 32
rsaz_512_mul_by_one:
.cfi_startproc
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
subq \$128+24, %rsp
.cfi_adjust_cfa_offset 128+24
.Lmul_by_one_body:
___
$code.=<<___ if ($addx);
movl OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___;
movq $mod, %rbp # reassign argument
movq $n0, 128(%rsp)
movq ($inp), %r8
pxor %xmm0, %xmm0
movq 8($inp), %r9
movq 16($inp), %r10
movq 24($inp), %r11
movq 32($inp), %r12
movq 40($inp), %r13
movq 48($inp), %r14
movq 56($inp), %r15
movdqa %xmm0, (%rsp)
movdqa %xmm0, 16(%rsp)
movdqa %xmm0, 32(%rsp)
movdqa %xmm0, 48(%rsp)
movdqa %xmm0, 64(%rsp)
movdqa %xmm0, 80(%rsp)
movdqa %xmm0, 96(%rsp)
___
$code.=<<___ if ($addx);
andl \$0x80100,%eax
cmpl \$0x80100,%eax # check for MULX and ADO/CX
je .Lby_one_callx
___
$code.=<<___;
call __rsaz_512_reduce
___
$code.=<<___ if ($addx);
jmp .Lby_one_tail
.align 32
.Lby_one_callx:
movq 128(%rsp), %rdx # pull $n0
call __rsaz_512_reducex
.Lby_one_tail:
___
$code.=<<___;
movq %r8, ($out)
movq %r9, 8($out)
movq %r10, 16($out)
movq %r11, 24($out)
movq %r12, 32($out)
movq %r13, 40($out)
movq %r14, 48($out)
movq %r15, 56($out)
leaq 128+24+48(%rsp), %rax
.cfi_def_cfa %rax,8
movq -48(%rax), %r15
.cfi_restore %r15
movq -40(%rax), %r14
.cfi_restore %r14
movq -32(%rax), %r13
.cfi_restore %r13
movq -24(%rax), %r12
.cfi_restore %r12
movq -16(%rax), %rbp
.cfi_restore %rbp
movq -8(%rax), %rbx
.cfi_restore %rbx
leaq (%rax), %rsp
.cfi_def_cfa_register %rsp
.Lmul_by_one_epilogue:
ret
.cfi_endproc
.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
___
}
{ # __rsaz_512_reduce
#
# input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
# output: %r8-%r15
# clobbers: everything except %rbp and %rdi
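#
# This is a word-by-word Montgomery reduction: each of the 8 iterations
# computes m = T[0]*n0 mod 2^64, adds m*mod to T so the bottom limb
# cancels (the negq below realizes that known-zero limb as a carry), and
# shifts T down one limb; the net effect is T * 2^-512 mod `mod`. Note
# that n0, stored at 128(%rsp) by the callers, is addressed as
# 128+8(%rsp) here because the `call` pushed a return address.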
$code.=<<___;
.type __rsaz_512_reduce,\@abi-omnipotent
.align 32
__rsaz_512_reduce:
.cfi_startproc
movq %r8, %rbx
imulq 128+8(%rsp), %rbx
movq 0(%rbp), %rax
movl \$8, %ecx
jmp .Lreduction_loop
.align 32
.Lreduction_loop:
mulq %rbx
movq 8(%rbp), %rax
negq %r8
movq %rdx, %r8
adcq \$0, %r8
mulq %rbx
addq %rax, %r9
movq 16(%rbp), %rax
adcq \$0, %rdx
addq %r9, %r8
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
addq %rax, %r10
movq 24(%rbp), %rax
adcq \$0, %rdx
addq %r10, %r9
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
addq %rax, %r11
movq 32(%rbp), %rax
adcq \$0, %rdx
addq %r11, %r10
movq 128+8(%rsp), %rsi
#movq %rdx, %r11
#adcq \$0, %r11
adcq \$0, %rdx
movq %rdx, %r11
mulq %rbx
addq %rax, %r12
movq 40(%rbp), %rax
adcq \$0, %rdx
imulq %r8, %rsi
addq %r12, %r11
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r13
movq 48(%rbp), %rax
adcq \$0, %rdx
addq %r13, %r12
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r14
movq 56(%rbp), %rax
adcq \$0, %rdx
addq %r14, %r13
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
movq %rsi, %rbx
addq %rax, %r15
movq 0(%rbp), %rax
adcq \$0, %rdx
addq %r15, %r14
movq %rdx, %r15
adcq \$0, %r15
decl %ecx
jne .Lreduction_loop
ret
.cfi_endproc
.size __rsaz_512_reduce,.-__rsaz_512_reduce
___
}
if ($addx) {
# __rsaz_512_reducex
#
# input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
# output: %r8-%r15
# clobbers: everything except %rbp and %rdi
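#
# Same reduction as above, but n0 arrives pre-loaded in %rdx (the callers
# do `movq 128(%rsp), %rdx` before the call, hence the commented-out pull
# below) and the multiplications use MULX with the ADCX/ADOX dual carry
# chains, keeping CF and OF as two independent carries instead of
# serializing on a single flag.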
$code.=<<___;
.type __rsaz_512_reducex,\@abi-omnipotent
.align 32
__rsaz_512_reducex:
.cfi_startproc
#movq 128+8(%rsp), %rdx # pull $n0
imulq %r8, %rdx
xorq %rsi, %rsi # cf=0,of=0
movl \$8, %ecx
jmp .Lreduction_loopx
.align 32
.Lreduction_loopx:
mov %r8, %rbx
mulx 0(%rbp), %rax, %r8
adcx %rbx, %rax
adox %r9, %r8
mulx 8(%rbp), %rax, %r9
adcx %rax, %r8
adox %r10, %r9
mulx 16(%rbp), %rbx, %r10
adcx %rbx, %r9
adox %r11, %r10
mulx 24(%rbp), %rbx, %r11
adcx %rbx, %r10
adox %r12, %r11
.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
mov %rdx, %rax
mov %r8, %rdx
adcx %rbx, %r11
adox %r13, %r12
mulx 128+8(%rsp), %rbx, %rdx
mov %rax, %rdx
mulx 40(%rbp), %rax, %r13
adcx %rax, %r12
adox %r14, %r13
.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
adcx %rax, %r13
adox %r15, %r14
mulx 56(%rbp), %rax, %r15
mov %rbx, %rdx
adcx %rax, %r14
adox %rsi, %r15 # %rsi is 0
adcx %rsi, %r15 # cf=0
decl %ecx # of=0
jne .Lreduction_loopx
ret
.cfi_endproc
.size __rsaz_512_reducex,.-__rsaz_512_reducex
___
}
{ # __rsaz_512_subtract
# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
# output:
# clobbers: everything but %rdi, %rsi and %rbp
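#
# %rcx is an all-zeros/all-ones mask (the callers produce it with
# `sbbq %rcx, %rcx` after adding the product's upper half). The neg/not
# sequence below forms (-mod) & mask limb by limb, so the final addition
# subtracts mod exactly once when the mask is -1 and adds zero otherwise:
# a branch-free, constant-time conditional subtraction.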
$code.=<<___;
.type __rsaz_512_subtract,\@abi-omnipotent
.align 32
__rsaz_512_subtract:
.cfi_startproc
movq %r8, ($out)
movq %r9, 8($out)
movq %r10, 16($out)
movq %r11, 24($out)
movq %r12, 32($out)
movq %r13, 40($out)
movq %r14, 48($out)
movq %r15, 56($out)
movq 0($mod), %r8
movq 8($mod), %r9
negq %r8
notq %r9
andq %rcx, %r8
movq 16($mod), %r10
andq %rcx, %r9
notq %r10
movq 24($mod), %r11
andq %rcx, %r10
notq %r11
movq 32($mod), %r12
andq %rcx, %r11
notq %r12
movq 40($mod), %r13
andq %rcx, %r12
notq %r13
movq 48($mod), %r14
andq %rcx, %r13
notq %r14
movq 56($mod), %r15
andq %rcx, %r14
notq %r15
andq %rcx, %r15
addq ($out), %r8
adcq 8($out), %r9
adcq 16($out), %r10
adcq 24($out), %r11
adcq 32($out), %r12
adcq 40($out), %r13
adcq 48($out), %r14
adcq 56($out), %r15
movq %r8, ($out)
movq %r9, 8($out)
movq %r10, 16($out)
movq %r11, 24($out)
movq %r12, 32($out)
movq %r13, 40($out)
movq %r14, 48($out)
movq %r15, 56($out)
ret
.cfi_endproc
.size __rsaz_512_subtract,.-__rsaz_512_subtract
___
}
{ # __rsaz_512_mul
#
# input: %rsi - ap, %rbp - bp
# output:
# clobbers: everything
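#
# Straightforward 8x8-limb schoolbook multiplication: the unrolled head
# multiplies ap[] by b[0], then .Loop_mul folds in one b[i] per
# iteration, retiring one finished low limb to the stack each pass, so
# the 1024-bit product lands in the caller's 128-byte scratch area
# (addressed as 8(%rsp) here, past the pushed return address).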
my ($ap,$bp) = ("%rsi","%rbp");
$code.=<<___;
.type __rsaz_512_mul,\@abi-omnipotent
.align 32
__rsaz_512_mul:
.cfi_startproc
leaq 8(%rsp), %rdi
movq ($ap), %rax
mulq %rbx
movq %rax, (%rdi)
movq 8($ap), %rax
movq %rdx, %r8
mulq %rbx
addq %rax, %r8
movq 16($ap), %rax
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
addq %rax, %r9
movq 24($ap), %rax
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
addq %rax, %r10
movq 32($ap), %rax
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
addq %rax, %r11
movq 40($ap), %rax
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r12
movq 48($ap), %rax
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r13
movq 56($ap), %rax
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
addq %rax, %r14
movq ($ap), %rax
movq %rdx, %r15
adcq \$0, %r15
leaq 8($bp), $bp
leaq 8(%rdi), %rdi
movl \$7, %ecx
jmp .Loop_mul
.align 32
.Loop_mul:
movq ($bp), %rbx
mulq %rbx
addq %rax, %r8
movq 8($ap), %rax
movq %r8, (%rdi)
movq %rdx, %r8
adcq \$0, %r8
mulq %rbx
addq %rax, %r9
movq 16($ap), %rax
adcq \$0, %rdx
addq %r9, %r8
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
addq %rax, %r10
movq 24($ap), %rax
adcq \$0, %rdx
addq %r10, %r9
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
addq %rax, %r11
movq 32($ap), %rax
adcq \$0, %rdx
addq %r11, %r10
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
addq %rax, %r12
movq 40($ap), %rax
adcq \$0, %rdx
addq %r12, %r11
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r13
movq 48($ap), %rax
adcq \$0, %rdx
addq %r13, %r12
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r14
movq 56($ap), %rax
adcq \$0, %rdx
addq %r14, %r13
movq %rdx, %r14
leaq 8($bp), $bp
adcq \$0, %r14
mulq %rbx
addq %rax, %r15
movq ($ap), %rax
adcq \$0, %rdx
addq %r15, %r14
movq %rdx, %r15
adcq \$0, %r15
leaq 8(%rdi), %rdi
decl %ecx
jnz .Loop_mul
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, 32(%rdi)
movq %r13, 40(%rdi)
movq %r14, 48(%rdi)
movq %r15, 56(%rdi)
ret
.cfi_endproc
.size __rsaz_512_mul,.-__rsaz_512_mul
___
}
if ($addx) {
# __rsaz_512_mulx
#
# input: %rsi - ap, %rbp - bp
# output:
# clobbers: everything
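#
# MULX counterpart of __rsaz_512_mul: the b[0] pass uses plain ADC since
# there is nothing to interleave yet, then the remaining seven b[i]
# passes (six .Loop_mulx iterations plus one unrolled tail) run on the
# ADCX/ADOX dual carry chains, with $zero (%rdi) held at 0 to flush both
# carry flags at the end of every pass.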
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
$code.=<<___;
.type	__rsaz_512_mulx,\@abi-omnipotent
.align	32
__rsaz_512_mulx:
.cfi_startproc
	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
	mov	\$-6, %rcx

	mulx	8($ap), %rax, %r9
	movq	%rbx, 8(%rsp)

	mulx	16($ap), %rbx, %r10
	adc	%rax, %r8

	mulx	24($ap), %rax, %r11
	adc	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adc	%rax, %r10

	mulx	40($ap), %rax, %r13
	adc	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adc	%rax, %r12

	mulx	56($ap), %rax, %r15
	mov	8($bp), %rdx
	adc	%rbx, %r13
	adc	%rax, %r14
	adc	\$0, %r15

	xor	$zero, $zero		# cf=0,of=0
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	movq	64($bp,%rcx,8), %rdx
	movq	%rbx, 8+64-8(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx

	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15

	mov	%rbx, 8+64-8(%rsp)
	mov	%r8, 8+64(%rsp)
	mov	%r9, 8+64+8(%rsp)
	mov	%r10, 8+64+16(%rsp)
	mov	%r11, 8+64+24(%rsp)
	mov	%r12, 8+64+32(%rsp)
	mov	%r13, 8+64+40(%rsp)
	mov	%r14, 8+64+48(%rsp)
	mov	%r15, 8+64+56(%rsp)

	ret
.cfi_endproc
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
___
}
{
my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
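# rsaz_512_scatter4 and rsaz_512_gather4 address a table of 16 512-bit
# entries stored interleaved: word j of entry i lives at byte offset
# j*128 + i*8 (i.e. qword index 16*j + i). Scatter therefore starts at
# $out + $power*8 and writes its 8 qwords with a 128-byte stride, while
# gather reads whole 128-byte rows and masks away everything but entry
# $power, so the memory access pattern does not depend on $power.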
$code.=<<___;
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,\@abi-omnipotent
.align	16
rsaz_512_scatter4:
.cfi_startproc
	leaq	($out,$power,8), $out
	movl	\$8, %r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	($inp), %rax
	leaq	8($inp), $inp
	movq	%rax, ($out)
	leaq	128($out), $out
	decl	%r9d
	jnz	.Loop_scatter
	ret
.cfi_endproc
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

.globl	rsaz_512_gather4
.type	rsaz_512_gather4,\@abi-omnipotent
.align	16
rsaz_512_gather4:
.cfi_startproc
___
$code.=<<___ if ($win64);
.LSEH_begin_rsaz_512_gather4:
	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub    $0xa8,%rsp
	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
___
$code.=<<___;
	movd	$power,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000
	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
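# For example (exposition only), with $power == 2: %xmm0 starts out as
# the dword pattern {0,0,1,1} and each paddd step derives the next
# pair, so at compare time register k holds indices (2k,2k+1); pcmpeqd
# against the broadcast $power then turns exactly one 64-bit lane of
# one register (here the low lane of %xmm1) into all-ones, and the
# pand/por reduction in .Loop_gather collapses each 128-byte row of
# the table to that single entry.
#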
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7

	movl	\$8, %r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movdqa	16*0($inp),%xmm8
	movdqa	16*1($inp),%xmm9
	movdqa	16*2($inp),%xmm10
	movdqa	16*3($inp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($inp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($inp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($inp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($inp),%xmm15
	leaq	128($inp), $inp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9
	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9	# swap qwords so the selected entry
	por	%xmm9,%xmm8		# folds into the low half
	movq	%xmm8,($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
___
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	add	\$0xa8,%rsp
___
$code.=<<___;
	ret
.LSEH_end_rsaz_512_gather4:
.cfi_endproc
.size	rsaz_512_gather4,.-rsaz_512_gather4

.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
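#
# The handler follows the usual perlasm pattern: each HandlerData[]
# entry (see .xdata below) holds two labels bracketing a function's
# body. If the faulting RIP lies outside [body,epilogue) there is no
# frame to unwind yet (or it is already gone); otherwise the six saved
# non-volatile GPRs are lifted from their known slots back into the
# CONTEXT record, with an additional xmm6-xmm15 copy when the fault
# hit rsaz_512_mul_gather4, whose frame also parks the xmm registers.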
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax	# skip frame (128+24) and pushed regs (48)

	lea	.Lmul_gather4_epilogue(%rip),%rbx
	cmp	%r10,%rbx
	jne	.Lse_not_in_mul_gather4

	lea	0xb0(%rax),%rax		# mul_gather4 frame is 0xb0 bytes larger

	lea	-48-0xa8(%rax),%rsi	# saved xmm6-xmm15
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10 xmm registers, 20 qwords
	.long	0xa548f3fc		# cld; rep movsq

.Lse_not_in_mul_gather4:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

	.rva	.LSEH_begin_rsaz_512_gather4
	.rva	.LSEH_end_rsaz_512_gather4
	.rva	.LSEH_info_rsaz_512_gather4

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
.LSEH_info_rsaz_512_gather4:
	.byte	0x01,0x46,0x16,0x00
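					# decoded for reference: UNWIND_INFO
					# version 1, no flags, 0x46-byte
					# prologue, 0x16 unwind-code slots,
					# no frame register; the codes below
					# mirror the prologue in reverse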
	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x07,0x01,0x15,0x00	# sub    rsp,0xa8
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";