x25519-x86_64.pl 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131
  1. #!/usr/bin/env perl
  2. # Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # X25519 lower-level primitives for x86_64.
  17. #
  18. # February 2018.
  19. #
  20. # This module implements radix 2^51 multiplication and squaring, and
  21. # radix 2^64 multiplication, squaring, addition, subtraction and final
  22. # reduction. Latter radix is used on ADCX/ADOX-capable processors such
  23. # as Broadwell. On related note one should mention that there are
  24. # vector implementations that provide significantly better performance
  25. # on some processors(*), but they are large and overly complex. Which
  26. # in combination with them being effectively processor-specific makes
  27. # the undertaking hard to justify. The goal for this implementation
  28. # is rather versatility and simplicity [and ultimately formal
  29. # verification].
  30. #
  31. # (*) For example sandy2x should provide ~30% improvement on Sandy
  32. # Bridge, but only nominal ~5% on Haswell [and big loss on
  33. # Broadwell and successors].
  34. #
  35. ######################################################################
  36. # Improvement coefficients:
  37. #
  38. # amd64-51(*) gcc-5.x(**)
  39. #
  40. # P4 +22% +40%
  41. # Sandy Bridge -3% +11%
  42. # Haswell -1% +13%
  43. # Broadwell(***) +30% +35%
  44. # Skylake(***) +33% +47%
  45. # Silvermont +20% +26%
  46. # Goldmont +40% +50%
  47. # Bulldozer +20% +9%
  48. # Ryzen(***) +43% +40%
  49. # VIA +170% +120%
  50. #
  51. # (*) amd64-51 is popular assembly implementation with 2^51 radix,
  52. # only multiplication and squaring subroutines were linked
  53. # for comparison, but not complete ladder step; gain on most
  54. # processors is because this module refrains from shld, and
  55. # minor regression on others is because this does result in
  56. # higher instruction count;
  57. # (**) compiler is free to inline functions, in assembly one would
  58. # need to implement ladder step to do that, and it will improve
  59. # performance by several percent;
  60. # (***) ADCX/ADOX result for 2^64 radix, there is no corresponding
  61. # C implementation, so that comparison is always against
  62. # 2^51 radix;
  63. $flavour = shift;
  64. $output = shift;
  65. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  66. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  67. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  68. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  69. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  70. die "can't locate x86_64-xlate.pl";
  71. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  72. *STDOUT=*OUT;
  73. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  74. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  75. $addx = ($1>=2.23);
  76. }
  77. if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  78. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  79. $addx = ($1>=2.10);
  80. }
  81. if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  82. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  83. $addx = ($1>=12);
  84. }
  85. if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
  86. my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
  87. $addx = ($ver>=3.03);
  88. }
  89. $code.=<<___;
  90. .text
  91. .globl x25519_fe51_mul
  92. .type x25519_fe51_mul,\@function,3
  93. .align 32
  94. x25519_fe51_mul:
  95. .cfi_startproc
  96. push %rbp
  97. .cfi_push %rbp
  98. push %rbx
  99. .cfi_push %rbx
  100. push %r12
  101. .cfi_push %r12
  102. push %r13
  103. .cfi_push %r13
  104. push %r14
  105. .cfi_push %r14
  106. push %r15
  107. .cfi_push %r15
  108. lea -8*5(%rsp),%rsp
  109. .cfi_adjust_cfa_offset 40
  110. .Lfe51_mul_body:
  111. mov 8*0(%rsi),%rax # f[0]
  112. mov 8*0(%rdx),%r11 # load g[0-4]
  113. mov 8*1(%rdx),%r12
  114. mov 8*2(%rdx),%r13
  115. mov 8*3(%rdx),%rbp
  116. mov 8*4(%rdx),%r14
  117. mov %rdi,8*4(%rsp) # offload 1st argument
  118. mov %rax,%rdi
  119. mulq %r11 # f[0]*g[0]
  120. mov %r11,8*0(%rsp) # offload g[0]
  121. mov %rax,%rbx # %rbx:%rcx = h0
  122. mov %rdi,%rax
  123. mov %rdx,%rcx
  124. mulq %r12 # f[0]*g[1]
  125. mov %r12,8*1(%rsp) # offload g[1]
  126. mov %rax,%r8 # %r8:%r9 = h1
  127. mov %rdi,%rax
  128. lea (%r14,%r14,8),%r15
  129. mov %rdx,%r9
  130. mulq %r13 # f[0]*g[2]
  131. mov %r13,8*2(%rsp) # offload g[2]
  132. mov %rax,%r10 # %r10:%r11 = h2
  133. mov %rdi,%rax
  134. lea (%r14,%r15,2),%rdi # g[4]*19
  135. mov %rdx,%r11
  136. mulq %rbp # f[0]*g[3]
  137. mov %rax,%r12 # %r12:%r13 = h3
  138. mov 8*0(%rsi),%rax # f[0]
  139. mov %rdx,%r13
  140. mulq %r14 # f[0]*g[4]
  141. mov %rax,%r14 # %r14:%r15 = h4
  142. mov 8*1(%rsi),%rax # f[1]
  143. mov %rdx,%r15
  144. mulq %rdi # f[1]*g[4]*19
  145. add %rax,%rbx
  146. mov 8*2(%rsi),%rax # f[2]
  147. adc %rdx,%rcx
  148. mulq %rdi # f[2]*g[4]*19
  149. add %rax,%r8
  150. mov 8*3(%rsi),%rax # f[3]
  151. adc %rdx,%r9
  152. mulq %rdi # f[3]*g[4]*19
  153. add %rax,%r10
  154. mov 8*4(%rsi),%rax # f[4]
  155. adc %rdx,%r11
  156. mulq %rdi # f[4]*g[4]*19
  157. imulq \$19,%rbp,%rdi # g[3]*19
  158. add %rax,%r12
  159. mov 8*1(%rsi),%rax # f[1]
  160. adc %rdx,%r13
  161. mulq %rbp # f[1]*g[3]
  162. mov 8*2(%rsp),%rbp # g[2]
  163. add %rax,%r14
  164. mov 8*2(%rsi),%rax # f[2]
  165. adc %rdx,%r15
  166. mulq %rdi # f[2]*g[3]*19
  167. add %rax,%rbx
  168. mov 8*3(%rsi),%rax # f[3]
  169. adc %rdx,%rcx
  170. mulq %rdi # f[3]*g[3]*19
  171. add %rax,%r8
  172. mov 8*4(%rsi),%rax # f[4]
  173. adc %rdx,%r9
  174. mulq %rdi # f[4]*g[3]*19
  175. imulq \$19,%rbp,%rdi # g[2]*19
  176. add %rax,%r10
  177. mov 8*1(%rsi),%rax # f[1]
  178. adc %rdx,%r11
  179. mulq %rbp # f[1]*g[2]
  180. add %rax,%r12
  181. mov 8*2(%rsi),%rax # f[2]
  182. adc %rdx,%r13
  183. mulq %rbp # f[2]*g[2]
  184. mov 8*1(%rsp),%rbp # g[1]
  185. add %rax,%r14
  186. mov 8*3(%rsi),%rax # f[3]
  187. adc %rdx,%r15
  188. mulq %rdi # f[3]*g[2]*19
  189. add %rax,%rbx
  190. mov 8*4(%rsi),%rax # f[3]
  191. adc %rdx,%rcx
  192. mulq %rdi # f[4]*g[2]*19
  193. add %rax,%r8
  194. mov 8*1(%rsi),%rax # f[1]
  195. adc %rdx,%r9
  196. mulq %rbp # f[1]*g[1]
  197. imulq \$19,%rbp,%rdi
  198. add %rax,%r10
  199. mov 8*2(%rsi),%rax # f[2]
  200. adc %rdx,%r11
  201. mulq %rbp # f[2]*g[1]
  202. add %rax,%r12
  203. mov 8*3(%rsi),%rax # f[3]
  204. adc %rdx,%r13
  205. mulq %rbp # f[3]*g[1]
  206. mov 8*0(%rsp),%rbp # g[0]
  207. add %rax,%r14
  208. mov 8*4(%rsi),%rax # f[4]
  209. adc %rdx,%r15
  210. mulq %rdi # f[4]*g[1]*19
  211. add %rax,%rbx
  212. mov 8*1(%rsi),%rax # f[1]
  213. adc %rdx,%rcx
  214. mul %rbp # f[1]*g[0]
  215. add %rax,%r8
  216. mov 8*2(%rsi),%rax # f[2]
  217. adc %rdx,%r9
  218. mul %rbp # f[2]*g[0]
  219. add %rax,%r10
  220. mov 8*3(%rsi),%rax # f[3]
  221. adc %rdx,%r11
  222. mul %rbp # f[3]*g[0]
  223. add %rax,%r12
  224. mov 8*4(%rsi),%rax # f[4]
  225. adc %rdx,%r13
  226. mulq %rbp # f[4]*g[0]
  227. add %rax,%r14
  228. adc %rdx,%r15
  229. mov 8*4(%rsp),%rdi # restore 1st argument
  230. jmp .Lreduce51
  231. .Lfe51_mul_epilogue:
  232. .cfi_endproc
  233. .size x25519_fe51_mul,.-x25519_fe51_mul
# void x25519_fe51_sqr(uint64_t h[5], const uint64_t f[5]);
#
# Radix-2^51 field squaring, h = f*f mod 2^255-19. Exploits symmetry:
# each off-diagonal product is computed once against a pre-doubled
# operand; wrap-around terms are folded in multiplied by 19. Column
# accumulators and the shared .Lreduce51 tail are as in fe51_mul.
.globl	x25519_fe51_sqr
.type	x25519_fe51_sqr,\@function,2
.align	32
x25519_fe51_sqr:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp		# scratch: g[2] offload + dst
.cfi_adjust_cfa_offset	40
.Lfe51_sqr_body:
	mov	8*0(%rsi),%rax		# g[0]
	mov	8*2(%rsi),%r15		# g[2]
	mov	8*4(%rsi),%rbp		# g[4]

	mov	%rdi,8*4(%rsp)		# offload 1st argument
	lea	(%rax,%rax),%r14	# 2*g[0]
	mulq	%rax			# g[0]*g[0]
	mov	%rax,%rbx
	mov	8*1(%rsi),%rax		# g[1]
	mov	%rdx,%rcx
	mulq	%r14			# 2*g[0]*g[1]
	mov	%rax,%r8
	mov	%r15,%rax
	mov	%r15,8*0(%rsp)		# offload g[2]
	mov	%rdx,%r9
	mulq	%r14			# 2*g[0]*g[2]
	mov	%rax,%r10
	mov	8*3(%rsi),%rax		# g[3]
	mov	%rdx,%r11
	imulq	\$19,%rbp,%rdi		# g[4]*19
	mulq	%r14			# 2*g[0]*g[3]
	mov	%rax,%r12
	mov	%rbp,%rax
	mov	%rdx,%r13
	mulq	%r14			# 2*g[0]*g[4]
	mov	%rax,%r14
	mov	%rbp,%rax
	mov	%rdx,%r15

	mulq	%rdi			# g[4]*g[4]*19
	add	%rax,%r12
	mov	8*1(%rsi),%rax		# g[1]
	adc	%rdx,%r13

	mov	8*3(%rsi),%rsi		# g[3] (input pointer no longer needed whole)
	lea	(%rax,%rax),%rbp	# 2*g[1]
	mulq	%rax			# g[1]*g[1]
	add	%rax,%r10
	mov	8*0(%rsp),%rax		# g[2]
	adc	%rdx,%r11
	mulq	%rbp			# 2*g[1]*g[2]
	add	%rax,%r12
	mov	%rbp,%rax
	adc	%rdx,%r13
	mulq	%rsi			# 2*g[1]*g[3]
	add	%rax,%r14
	mov	%rbp,%rax
	adc	%rdx,%r15
	imulq	\$19,%rsi,%rbp		# g[3]*19
	mulq	%rdi			# 2*g[1]*g[4]*19
	add	%rax,%rbx
	lea	(%rsi,%rsi),%rax	# 2*g[3]
	adc	%rdx,%rcx

	mulq	%rdi			# 2*g[3]*g[4]*19
	add	%rax,%r10
	mov	%rsi,%rax
	adc	%rdx,%r11
	mulq	%rbp			# g[3]*g[3]*19
	add	%rax,%r8
	mov	8*0(%rsp),%rax		# g[2]
	adc	%rdx,%r9

	lea	(%rax,%rax),%rsi	# 2*g[2]
	mulq	%rax			# g[2]*g[2]
	add	%rax,%r14
	mov	%rbp,%rax		# g[3]*19
	adc	%rdx,%r15
	mulq	%rsi			# 2*g[2]*g[3]*19
	add	%rax,%rbx
	mov	%rsi,%rax
	adc	%rdx,%rcx
	mulq	%rdi			# 2*g[2]*g[4]*19
	add	%rax,%r8
	adc	%rdx,%r9

	mov	8*4(%rsp),%rdi		# restore 1st argument
	jmp	.Lreduce51
  326. .align 32
  327. .Lreduce51:
  328. mov \$0x7ffffffffffff,%rbp
  329. mov %r10,%rdx
  330. shr \$51,%r10
  331. shl \$13,%r11
  332. and %rbp,%rdx # %rdx = g2 = h2 & mask
  333. or %r10,%r11 # h2>>51
  334. add %r11,%r12
  335. adc \$0,%r13 # h3 += h2>>51
  336. mov %rbx,%rax
  337. shr \$51,%rbx
  338. shl \$13,%rcx
  339. and %rbp,%rax # %rax = g0 = h0 & mask
  340. or %rbx,%rcx # h0>>51
  341. add %rcx,%r8 # h1 += h0>>51
  342. adc \$0,%r9
  343. mov %r12,%rbx
  344. shr \$51,%r12
  345. shl \$13,%r13
  346. and %rbp,%rbx # %rbx = g3 = h3 & mask
  347. or %r12,%r13 # h3>>51
  348. add %r13,%r14 # h4 += h3>>51
  349. adc \$0,%r15
  350. mov %r8,%rcx
  351. shr \$51,%r8
  352. shl \$13,%r9
  353. and %rbp,%rcx # %rcx = g1 = h1 & mask
  354. or %r8,%r9
  355. add %r9,%rdx # g2 += h1>>51
  356. mov %r14,%r10
  357. shr \$51,%r14
  358. shl \$13,%r15
  359. and %rbp,%r10 # %r10 = g4 = h0 & mask
  360. or %r14,%r15 # h0>>51
  361. lea (%r15,%r15,8),%r14
  362. lea (%r15,%r14,2),%r15
  363. add %r15,%rax # g0 += (h0>>51)*19
  364. mov %rdx,%r8
  365. and %rbp,%rdx # g2 &= mask
  366. shr \$51,%r8
  367. add %r8,%rbx # g3 += g2>>51
  368. mov %rax,%r9
  369. and %rbp,%rax # g0 &= mask
  370. shr \$51,%r9
  371. add %r9,%rcx # g1 += g0>>51
  372. mov %rax,8*0(%rdi) # save the result
  373. mov %rcx,8*1(%rdi)
  374. mov %rdx,8*2(%rdi)
  375. mov %rbx,8*3(%rdi)
  376. mov %r10,8*4(%rdi)
  377. mov 8*5(%rsp),%r15
  378. .cfi_restore %r15
  379. mov 8*6(%rsp),%r14
  380. .cfi_restore %r14
  381. mov 8*7(%rsp),%r13
  382. .cfi_restore %r13
  383. mov 8*8(%rsp),%r12
  384. .cfi_restore %r12
  385. mov 8*9(%rsp),%rbx
  386. .cfi_restore %rbx
  387. mov 8*10(%rsp),%rbp
  388. .cfi_restore %rbp
  389. lea 8*11(%rsp),%rsp
  390. .cfi_adjust_cfa_offset 88
  391. .Lfe51_sqr_epilogue:
  392. ret
  393. .cfi_endproc
  394. .size x25519_fe51_sqr,.-x25519_fe51_sqr
# void x25519_fe51_mul121666(uint64_t h[5], const uint64_t f[5]);
#
# Multiply a radix-2^51 element by the curve constant 121666 (used in
# the Montgomery ladder step). Each limb is multiplied by the 64-bit
# immediate and the wide results are normalized by the shared
# .Lreduce51 tail, which also performs the epilogue.
.globl	x25519_fe51_mul121666
.type	x25519_fe51_mul121666,\@function,2
.align	32
x25519_fe51_mul121666:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp		# match .Lreduce51's expected frame
.cfi_adjust_cfa_offset	40
.Lfe51_mul121666_body:
	mov	\$121666,%eax
	mulq	8*0(%rsi)
	mov	%rax,%rbx		# %rbx:%rcx = h0
	mov	\$121666,%eax
	mov	%rdx,%rcx
	mulq	8*1(%rsi)
	mov	%rax,%r8		# %r8:%r9 = h1
	mov	\$121666,%eax
	mov	%rdx,%r9
	mulq	8*2(%rsi)
	mov	%rax,%r10		# %r10:%r11 = h2
	mov	\$121666,%eax
	mov	%rdx,%r11
	mulq	8*3(%rsi)
	mov	%rax,%r12		# %r12:%r13 = h3
	mov	\$121666,%eax
	mov	%rdx,%r13
	mulq	8*4(%rsi)
	mov	%rax,%r14		# %r14:%r15 = h4
	mov	%rdx,%r15

	jmp	.Lreduce51		# shared tail also runs the epilogue
.Lfe51_mul121666_epilogue:
.cfi_endproc
.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
___
########################################################################
# Base 2^64 subroutines modulo 2*(2^255-19)
#
# Emitted only when the toolchain supports ADCX/ADOX (see $addx probe
# above). Results are kept only partially reduced, modulo 2*(2^255-19);
# x25519_fe64_tobytes performs the final full reduction.
if ($addx) {
# Accumulator aliases: acc0..acc7 map onto r8..r15.
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15));

$code.=<<___;
.extern	OPENSSL_ia32cap_P

# int x25519_fe64_eligible(void);
#
# Runtime dispatch helper: returns non-zero iff the CPU reports both
# feature bits tested below in the third OPENSSL_ia32cap_P word
# (0x80100 = bits 8 and 19, i.e. BMI2 and ADX -- MULX/ADCX/ADOX),
# which the fe64 code path requires.
.globl	x25519_fe64_eligible
.type	x25519_fe64_eligible,\@abi-omnipotent
.align	32
x25519_fe64_eligible:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+8(%rip),%ecx
	xor	%eax,%eax		# default return: 0 (not eligible)
	and	\$0x80100,%ecx
	cmp	\$0x80100,%ecx
	cmove	%ecx,%eax		# both bits set -> return 0x80100
	ret
.cfi_endproc
.size	x25519_fe64_eligible,.-x25519_fe64_eligible
# void x25519_fe64_mul(uint64_t h[4], const uint64_t a[4], const uint64_t b[4]);
#
# Radix-2^64 multiplication using MULX with dual carry chains
# (ADCX/ADOX). Produces the 512-bit product in acc0..acc7, then jumps
# to .Lreduce64 which folds the high half back multiplied by 38
# (= 2*19) and stores a partially reduced result. %rdi is kept zero
# throughout as a carry-flag sink.
.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@function,3
.align	32
x25519_fe64_mul:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	%rdi			# offload dst
.cfi_push	%rdi
	lea	-8*2(%rsp),%rsp
.cfi_adjust_cfa_offset	16
.Lfe64_mul_body:
	mov	%rdx,%rax		# 3rd arg, b[], out of %rdx's way (MULX uses it)
	mov	8*0(%rdx),%rbp		# b[0]
	mov	8*0(%rsi),%rdx		# a[0]
	mov	8*1(%rax),%rcx		# b[1]
	mov	8*2(%rax),$acc6		# b[2]
	mov	8*3(%rax),$acc7		# b[3]

	mulx	%rbp,$acc0,%rax		# a[0]*b[0]
	xor	%edi,%edi		# cf=0,of=0
	mulx	%rcx,$acc1,%rbx		# a[0]*b[1]
	adcx	%rax,$acc1
	mulx	$acc6,$acc2,%rax	# a[0]*b[2]
	adcx	%rbx,$acc2
	mulx	$acc7,$acc3,$acc4	# a[0]*b[3]
	mov	8*1(%rsi),%rdx		# a[1]
	adcx	%rax,$acc3
	mov	$acc6,(%rsp)		# offload b[2]
	adcx	%rdi,$acc4		# cf=0

	mulx	%rbp,%rax,%rbx		# a[1]*b[0]
	adox	%rax,$acc1
	adcx	%rbx,$acc2
	mulx	%rcx,%rax,%rbx		# a[1]*b[1]
	adox	%rax,$acc2
	adcx	%rbx,$acc3
	mulx	$acc6,%rax,%rbx		# a[1]*b[2]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	$acc7,%rax,$acc5	# a[1]*b[3]
	mov	8*2(%rsi),%rdx		# a[2]
	adox	%rax,$acc4
	adcx	%rdi,$acc5		# cf=0
	adox	%rdi,$acc5		# of=0

	mulx	%rbp,%rax,%rbx		# a[2]*b[0]
	adcx	%rax,$acc2
	adox	%rbx,$acc3
	mulx	%rcx,%rax,%rbx		# a[2]*b[1]
	adcx	%rax,$acc3
	adox	%rbx,$acc4
	mulx	$acc6,%rax,%rbx		# a[2]*b[2]
	adcx	%rax,$acc4
	adox	%rbx,$acc5
	mulx	$acc7,%rax,$acc6	# a[2]*b[3]
	mov	8*3(%rsi),%rdx		# a[3]
	adcx	%rax,$acc5
	adox	%rdi,$acc6		# of=0
	adcx	%rdi,$acc6		# cf=0

	mulx	%rbp,%rax,%rbx		# a[3]*b[0]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	%rcx,%rax,%rbx		# a[3]*b[1]
	adox	%rax,$acc4
	adcx	%rbx,$acc5
	mulx	(%rsp),%rax,%rbx	# a[3]*b[2]
	adox	%rax,$acc5
	adcx	%rbx,$acc6
	mulx	$acc7,%rax,$acc7	# a[3]*b[3]
	mov	\$38,%edx		# reduction multiplier for .Lreduce64
	adox	%rax,$acc6
	adcx	%rdi,$acc7		# cf=0
	adox	%rdi,$acc7		# of=0

	jmp	.Lreduce64
.Lfe64_mul_epilogue:
.cfi_endproc
.size	x25519_fe64_mul,.-x25519_fe64_mul
# void x25519_fe64_sqr(uint64_t h[4], const uint64_t a[4]);
#
# Radix-2^64 squaring: computes the six off-diagonal products once,
# doubles the whole partial sum via the adcx reg,reg chain, then adds
# the four diagonal squares; reduction is shared with fe64_mul via
# .Lreduce64. %rdi is kept zero as a carry-flag sink.
.globl	x25519_fe64_sqr
.type	x25519_fe64_sqr,\@function,2
.align	32
x25519_fe64_sqr:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	%rdi			# offload dst
.cfi_push	%rdi
	lea	-8*2(%rsp),%rsp
.cfi_adjust_cfa_offset	16
.Lfe64_sqr_body:
	mov	8*0(%rsi),%rdx		# a[0]
	mov	8*1(%rsi),%rcx		# a[1]
	mov	8*2(%rsi),%rbp		# a[2]
	mov	8*3(%rsi),%rsi		# a[3]

	################################################################
	mulx	%rdx,$acc0,$acc7	# a[0]*a[0]
	mulx	%rcx,$acc1,%rax		# a[0]*a[1]
	xor	%edi,%edi		# cf=0,of=0
	mulx	%rbp,$acc2,%rbx		# a[0]*a[2]
	adcx	%rax,$acc2
	mulx	%rsi,$acc3,$acc4	# a[0]*a[3]
	mov	%rcx,%rdx		# a[1]
	adcx	%rbx,$acc3
	adcx	%rdi,$acc4		# cf=0

	################################################################
	mulx	%rbp,%rax,%rbx		# a[1]*a[2]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	%rsi,%rax,$acc5		# a[1]*a[3]
	mov	%rbp,%rdx		# a[2]
	adox	%rax,$acc4
	adcx	%rdi,$acc5

	################################################################
	mulx	%rsi,%rax,$acc6		# a[2]*a[3]
	mov	%rcx,%rdx		# a[1]
	adox	%rax,$acc5
	adcx	%rdi,$acc6		# cf=0
	adox	%rdi,$acc6		# of=0

	adcx	$acc1,$acc1		# acc1:6<<1, double the cross products
	adox	$acc7,$acc1		# + high half of a[0]*a[0]
	adcx	$acc2,$acc2
	mulx	%rdx,%rax,%rbx		# a[1]*a[1]
	mov	%rbp,%rdx		# a[2]
	adcx	$acc3,$acc3
	adox	%rax,$acc2
	adcx	$acc4,$acc4
	adox	%rbx,$acc3
	mulx	%rdx,%rax,%rbx		# a[2]*a[2]
	mov	%rsi,%rdx		# a[3]
	adcx	$acc5,$acc5
	adox	%rax,$acc4
	adcx	$acc6,$acc6
	adox	%rbx,$acc5
	mulx	%rdx,%rax,$acc7		# a[3]*a[3]
	mov	\$38,%edx		# reduction multiplier for .Lreduce64
	adox	%rax,$acc6
	adcx	%rdi,$acc7		# cf=0
	adox	%rdi,$acc7		# of=0

	jmp	.Lreduce64
  616. .align 32
  617. .Lreduce64:
  618. mulx $acc4,%rax,%rbx
  619. adcx %rax,$acc0
  620. adox %rbx,$acc1
  621. mulx $acc5,%rax,%rbx
  622. adcx %rax,$acc1
  623. adox %rbx,$acc2
  624. mulx $acc6,%rax,%rbx
  625. adcx %rax,$acc2
  626. adox %rbx,$acc3
  627. mulx $acc7,%rax,$acc4
  628. adcx %rax,$acc3
  629. adox %rdi,$acc4
  630. adcx %rdi,$acc4
  631. mov 8*2(%rsp),%rdi # restore dst
  632. imulq %rdx,$acc4
  633. add $acc4,$acc0
  634. adc \$0,$acc1
  635. adc \$0,$acc2
  636. adc \$0,$acc3
  637. sbb %rax,%rax # cf -> mask
  638. and \$38,%rax
  639. add %rax,$acc0
  640. mov $acc1,8*1(%rdi)
  641. mov $acc2,8*2(%rdi)
  642. mov $acc3,8*3(%rdi)
  643. mov $acc0,8*0(%rdi)
  644. mov 8*3(%rsp),%r15
  645. .cfi_restore %r15
  646. mov 8*4(%rsp),%r14
  647. .cfi_restore %r14
  648. mov 8*5(%rsp),%r13
  649. .cfi_restore %r13
  650. mov 8*6(%rsp),%r12
  651. .cfi_restore %r12
  652. mov 8*7(%rsp),%rbx
  653. .cfi_restore %rbx
  654. mov 8*8(%rsp),%rbp
  655. .cfi_restore %rbp
  656. lea 8*9(%rsp),%rsp
  657. .cfi_adjust_cfa_offset 88
  658. .Lfe64_sqr_epilogue:
  659. ret
  660. .cfi_endproc
  661. .size x25519_fe64_sqr,.-x25519_fe64_sqr
# void x25519_fe64_mul121666(uint64_t h[4], const uint64_t a[4]);
#
# Leaf routine (no stack frame): multiply by the ladder constant
# 121666 with MULX, fold the 64-bit overflow word back in times 38,
# store a partially reduced result.
.globl	x25519_fe64_mul121666
.type	x25519_fe64_mul121666,\@function,2
.align	32
x25519_fe64_mul121666:
.Lfe64_mul121666_body:
.cfi_startproc
	mov	\$121666,%edx
	mulx	8*0(%rsi),$acc0,%rcx
	mulx	8*1(%rsi),$acc1,%rax
	add	%rcx,$acc1
	mulx	8*2(%rsi),$acc2,%rcx
	adc	%rax,$acc2
	mulx	8*3(%rsi),$acc3,%rax
	adc	%rcx,$acc3
	adc	\$0,%rax		# %rax = bits 256+ of the product
	imulq	\$38,%rax,%rax		# fold back: 2^256 = 38 mod 2*(2^255-19)
	add	%rax,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,$acc3
	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax
	add	%rax,$acc0
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)
	mov	$acc0,8*0(%rdi)
.Lfe64_mul121666_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666
# void x25519_fe64_add(uint64_t h[4], const uint64_t a[4], const uint64_t b[4]);
#
# Leaf routine: 256-bit add with the carry out of bit 256 folded back
# in times 38 (elements are kept mod 2*(2^255-19)); done twice because
# the first fold-in can itself carry.
.globl	x25519_fe64_add
.type	x25519_fe64_add,\@function,3
.align	32
x25519_fe64_add:
.Lfe64_add_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	add	8*0(%rdx),$acc0
	adc	8*1(%rdx),$acc1
	adc	8*2(%rdx),$acc2
	adc	8*3(%rdx),$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax
	add	%rax,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	mov	$acc1,8*1(%rdi)
	adc	\$0,$acc3
	mov	$acc2,8*2(%rdi)
	sbb	%rax,%rax		# cf -> mask, second possible wrap
	mov	$acc3,8*3(%rdi)
	and	\$38,%rax
	add	%rax,$acc0
	mov	$acc0,8*0(%rdi)
.Lfe64_add_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_add,.-x25519_fe64_add
# void x25519_fe64_sub(uint64_t h[4], const uint64_t a[4], const uint64_t b[4]);
#
# Leaf routine: 256-bit subtract, with a borrow out of bit 256
# compensated by subtracting 38 (mirror image of fe64_add); done twice
# because the first compensation can itself borrow.
.globl	x25519_fe64_sub
.type	x25519_fe64_sub,\@function,3
.align	32
x25519_fe64_sub:
.Lfe64_sub_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	sub	8*0(%rdx),$acc0
	sbb	8*1(%rdx),$acc1
	sbb	8*2(%rdx),$acc2
	sbb	8*3(%rdx),$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax
	sub	%rax,$acc0
	sbb	\$0,$acc1
	sbb	\$0,$acc2
	mov	$acc1,8*1(%rdi)
	sbb	\$0,$acc3
	mov	$acc2,8*2(%rdi)
	sbb	%rax,%rax		# cf -> mask, second possible wrap
	mov	$acc3,8*3(%rdi)
	and	\$38,%rax
	sub	%rax,$acc0
	mov	$acc0,8*0(%rdi)
.Lfe64_sub_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_sub,.-x25519_fe64_sub
# void x25519_fe64_tobytes(uint8_t out[32], const uint64_t a[4]);
#
# Leaf routine: fully reduce a partially reduced element modulo
# 2^255-19 and store the canonical 32-byte little-endian value.
.globl	x25519_fe64_tobytes
.type	x25519_fe64_tobytes,\@function,2
.align	32
x25519_fe64_tobytes:
.Lfe64_to_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	################################# reduction modulo 2^255-19
	lea	($acc3,$acc3),%rax
	sar	\$63,$acc3		# most significant bit -> mask
	shr	\$1,%rax		# most significant bit cleared
	and	\$19,$acc3
	add	\$19,$acc3		# compare to modulus in the same go

	add	$acc3,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,%rax

	lea	(%rax,%rax),$acc3
	sar	\$63,%rax		# most significant bit -> mask
	shr	\$1,$acc3		# most significant bit cleared
	not	%rax
	and	\$19,%rax		# undo the +19 if we were below the modulus
	sub	%rax,$acc0
	sbb	\$0,$acc1
	sbb	\$0,$acc2
	sbb	\$0,$acc3

	mov	$acc0,8*0(%rdi)
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)
.Lfe64_to_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
___
} else {
# Toolchain lacks ADX support: emit stubs so the symbols still exist.
# x25519_fe64_eligible returns 0, steering callers to the fe51 path;
# the remaining entry points trap (ud2) if reached anyway.
$code.=<<___;
.globl	x25519_fe64_eligible
.type	x25519_fe64_eligible,\@abi-omnipotent
.align	32
x25519_fe64_eligible:
.cfi_startproc
	xor	%eax,%eax		# never eligible
	ret
.cfi_endproc
.size	x25519_fe64_eligible,.-x25519_fe64_eligible

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@abi-omnipotent
.globl	x25519_fe64_sqr
.globl	x25519_fe64_mul121666
.globl	x25519_fe64_add
.globl	x25519_fe64_sub
.globl	x25519_fe64_tobytes
x25519_fe64_mul:
x25519_fe64_sqr:
x25519_fe64_mul121666:
x25519_fe64_add:
x25519_fe64_sub:
x25519_fe64_tobytes:
.cfi_startproc
	.byte	0x0f,0x0b	# ud2, must not be called
	ret
.cfi_endproc
.size	x25519_fe64_mul,.-x25519_fe64_mul
___
}
$code.=<<___;
.asciz	"X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
# Win64 SEH unwind handlers. Register aliases per the MS x64 ABI.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind

# Unwind handler for leaf routines (no pushed registers): only decides
# whether the fault is before the body label, then falls into the
# common tail to virtually unwind.
.type	short_handler,\@abi-omnipotent
.align	16
short_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp		# home space + outgoing args for RtlVirtualUnwind

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp
	jmp	.Lcommon_seh_tail	# cleanup is shared with full_handler
.size	short_handler,.-short_handler
# Unwind handler for routines with a full prologue: if the fault lies
# inside the body, recover the six saved registers from the stack
# (frame size comes from HandlerData[2]) before the common unwind.
.type	full_handler,\@abi-omnipotent
.align	16
full_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp		# home space + outgoing args for RtlVirtualUnwind

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rax,%r10),%rax	# skip over the frame to the saved registers

	mov	-8(%rax),%rbp
	mov	-16(%rax),%rbx
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	full_handler,.-full_handler
# SEH function table: begin/end/info RVA triplets per exported routine
# (the .LSEH_begin/.LSEH_end labels are synthesized by x86_64-xlate.pl).
.section	.pdata
.align	4
	.rva	.LSEH_begin_x25519_fe51_mul
	.rva	.LSEH_end_x25519_fe51_mul
	.rva	.LSEH_info_x25519_fe51_mul

	.rva	.LSEH_begin_x25519_fe51_sqr
	.rva	.LSEH_end_x25519_fe51_sqr
	.rva	.LSEH_info_x25519_fe51_sqr

	.rva	.LSEH_begin_x25519_fe51_mul121666
	.rva	.LSEH_end_x25519_fe51_mul121666
	.rva	.LSEH_info_x25519_fe51_mul121666
___
# fe64 entries exist only when the ADX code path was emitted.
$code.=<<___	if ($addx);
	.rva	.LSEH_begin_x25519_fe64_mul
	.rva	.LSEH_end_x25519_fe64_mul
	.rva	.LSEH_info_x25519_fe64_mul

	.rva	.LSEH_begin_x25519_fe64_sqr
	.rva	.LSEH_end_x25519_fe64_sqr
	.rva	.LSEH_info_x25519_fe64_sqr

	.rva	.LSEH_begin_x25519_fe64_mul121666
	.rva	.LSEH_end_x25519_fe64_mul121666
	.rva	.LSEH_info_x25519_fe64_mul121666

	.rva	.LSEH_begin_x25519_fe64_add
	.rva	.LSEH_end_x25519_fe64_add
	.rva	.LSEH_info_x25519_fe64_add

	.rva	.LSEH_begin_x25519_fe64_sub
	.rva	.LSEH_end_x25519_fe64_sub
	.rva	.LSEH_info_x25519_fe64_sub

	.rva	.LSEH_begin_x25519_fe64_tobytes
	.rva	.LSEH_end_x25519_fe64_tobytes
	.rva	.LSEH_info_x25519_fe64_tobytes
___
$code.=<<___;
# Unwind info: handler RVA plus HandlerData = {body label, epilogue
# label[, frame size]}. Framed routines use full_handler with their
# stack-frame size (88 or 72 bytes); leaf routines use short_handler.
.section	.xdata
.align	8
.LSEH_info_x25519_fe51_mul:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_mul_body,.Lfe51_mul_epilogue	# HandlerData[]
	.long	88,0
.LSEH_info_x25519_fe51_sqr:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_sqr_body,.Lfe51_sqr_epilogue	# HandlerData[]
	.long	88,0
.LSEH_info_x25519_fe51_mul121666:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_mul121666_body,.Lfe51_mul121666_epilogue	# HandlerData[]
	.long	88,0
___
$code.=<<___	if ($addx);
.LSEH_info_x25519_fe64_mul:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe64_mul_body,.Lfe64_mul_epilogue	# HandlerData[]
	.long	72,0
.LSEH_info_x25519_fe64_sqr:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe64_sqr_body,.Lfe64_sqr_epilogue	# HandlerData[]
	.long	72,0
.LSEH_info_x25519_fe64_mul121666:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_mul121666_body,.Lfe64_mul121666_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_add:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_add_body,.Lfe64_add_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_sub:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_sub_body,.Lfe64_sub_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_tobytes:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_to_body,.Lfe64_to_epilogue	# HandlerData[]
___
}

# Expand any backtick-quoted Perl expressions embedded in the assembly
# text, then ship it through the translator pipe opened above.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";