#! /usr/bin/env perl
# Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# X25519 lower-level primitives for PPC64.
#
# July 2018.
#
# Base 2^64 is faster than base 2^51 on pre-POWER8, most notably ~15%
# faster on PPC970/G5. POWER8, on the other hand, seems to trip on its
# own shoelaces when handling longer carry chains. As base 2^51 has
# only single-carry pairs, it's 25% faster there than base 2^64. Since
# PPC970 is pretty old, the base 2^64 implementation is not engaged.
# Comparison to compiler-generated code is complicated by the fact
# that not all compilers support 128-bit integers. When the compiler
# doesn't, as with xlc, this module delivers more than a 2x
# improvement; when it does, improvements from 12% to 30% were
# measured...
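#
# For reference: the field is GF(2^255-19). In base 2^64 an element
# is four 64-bit limbs, and partial reduction relies on
# 2^256 = 2*(2^255-19) + 38, i.e. the top limbs fold back in
# multiplied by 38. In base 2^51 an element is five limbs of
# nominally 51 bits, which leaves 13 bits of headroom per 64-bit
# register for carries, and limbs crossing 2^255 fold in multiplied
# by 19.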
$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

my $sp = "r1";
my ($rp,$ap,$bp) = map("r$_",3..5);

####################################################### base 2^64
if (0) {
my ($bi,$a0,$a1,$a2,$a3,$t0,$t1, $t2,$t3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) =
    map("r$_",(6..12,22..31));
my $zero = "r0";
my $FRAME = 16*8;

$code.=<<___;
.text

.globl x25519_fe64_mul
.type x25519_fe64_mul,\@function
.align 5
x25519_fe64_mul:
	stdu $sp,-$FRAME($sp)
	std r22,`$FRAME-8*10`($sp)
	std r23,`$FRAME-8*9`($sp)
	std r24,`$FRAME-8*8`($sp)
	std r25,`$FRAME-8*7`($sp)
	std r26,`$FRAME-8*6`($sp)
	std r27,`$FRAME-8*5`($sp)
	std r28,`$FRAME-8*4`($sp)
	std r29,`$FRAME-8*3`($sp)
	std r30,`$FRAME-8*2`($sp)
	std r31,`$FRAME-8*1`($sp)

	ld $bi,0($bp)
	ld $a0,0($ap)
	xor $zero,$zero,$zero
	ld $a1,8($ap)
	ld $a2,16($ap)
	ld $a3,24($ap)

	mulld $acc0,$a0,$bi	# a[0]*b[0]
	mulhdu $t0,$a0,$bi
	mulld $acc1,$a1,$bi	# a[1]*b[0]
	mulhdu $t1,$a1,$bi
	mulld $acc2,$a2,$bi	# a[2]*b[0]
	mulhdu $t2,$a2,$bi
	mulld $acc3,$a3,$bi	# a[3]*b[0]
	mulhdu $t3,$a3,$bi
___
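# Each iteration of the loop below folds in one more word of b: the
# high parts of the previous round's products are accumulated first,
# then a[0..3]*b[i] is added in as low and high halves, growing the
# running total in acc0..acc7.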
for(my @acc=($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7),
    my $i=1; $i<4; shift(@acc), $i++) {
my $acc4 = $i==1? $zero : @acc[4];

$code.=<<___;
	ld $bi,`8*$i`($bp)
	addc @acc[1],@acc[1],$t0	# accumulate high parts
	mulld $t0,$a0,$bi
	adde @acc[2],@acc[2],$t1
	mulld $t1,$a1,$bi
	adde @acc[3],@acc[3],$t2
	mulld $t2,$a2,$bi
	adde @acc[4],$acc4,$t3
	mulld $t3,$a3,$bi
	addc @acc[1],@acc[1],$t0	# accumulate low parts
	mulhdu $t0,$a0,$bi
	adde @acc[2],@acc[2],$t1
	mulhdu $t1,$a1,$bi
	adde @acc[3],@acc[3],$t2
	mulhdu $t2,$a2,$bi
	adde @acc[4],@acc[4],$t3
	mulhdu $t3,$a3,$bi
	adde @acc[5],$zero,$zero
___
}
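# Reduce the 512-bit product acc0..acc7 modulo 2^255-19. Since
# 2^256 = 2*(2^255-19) + 38, the top four limbs fold into the low
# four as acc[0..3] += 38*acc[4..7]. The leftover carry word is
# folded in with another *38, and a final carry-to-mask step adds 38
# once more if that addition itself carries out; the result stays
# below 2^256, with full canonicalization deferred to
# x25519_fe64_tobytes.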
$code.=<<___;
	li $bi,38
	addc $acc4,$acc4,$t0
	mulld $t0,$acc4,$bi
	adde $acc5,$acc5,$t1
	mulld $t1,$acc5,$bi
	adde $acc6,$acc6,$t2
	mulld $t2,$acc6,$bi
	adde $acc7,$acc7,$t3
	mulld $t3,$acc7,$bi
	addc $acc0,$acc0,$t0
	mulhdu $t0,$acc4,$bi
	adde $acc1,$acc1,$t1
	mulhdu $t1,$acc5,$bi
	adde $acc2,$acc2,$t2
	mulhdu $t2,$acc6,$bi
	adde $acc3,$acc3,$t3
	mulhdu $t3,$acc7,$bi
	adde $acc4,$zero,$zero

	addc $acc1,$acc1,$t0
	adde $acc2,$acc2,$t1
	adde $acc3,$acc3,$t2
	adde $acc4,$acc4,$t3

	mulld $acc4,$acc4,$bi

	addc $acc0,$acc0,$acc4
	addze $acc1,$acc1
	addze $acc2,$acc2
	addze $acc3,$acc3

	subfe $acc4,$acc4,$acc4	# carry -> ~mask
	std $acc1,8($rp)
	andc $acc4,$bi,$acc4
	std $acc2,16($rp)
	add $acc0,$acc0,$acc4
	std $acc3,24($rp)
	std $acc0,0($rp)

	ld r22,`$FRAME-8*10`($sp)
	ld r23,`$FRAME-8*9`($sp)
	ld r24,`$FRAME-8*8`($sp)
	ld r25,`$FRAME-8*7`($sp)
	ld r26,`$FRAME-8*6`($sp)
	ld r27,`$FRAME-8*5`($sp)
	ld r28,`$FRAME-8*4`($sp)
	ld r29,`$FRAME-8*3`($sp)
	ld r30,`$FRAME-8*2`($sp)
	ld r31,`$FRAME-8*1`($sp)
	addi $sp,$sp,$FRAME
	blr
	.long 0
	.byte 0,12,4,0,0x80,10,3,0
	.long 0
.size x25519_fe64_mul,.-x25519_fe64_mul
.globl x25519_fe64_sqr
.type x25519_fe64_sqr,\@function
.align 5
x25519_fe64_sqr:
	stdu $sp,-$FRAME($sp)
	std r22,`$FRAME-8*10`($sp)
	std r23,`$FRAME-8*9`($sp)
	std r24,`$FRAME-8*8`($sp)
	std r25,`$FRAME-8*7`($sp)
	std r26,`$FRAME-8*6`($sp)
	std r27,`$FRAME-8*5`($sp)
	std r28,`$FRAME-8*4`($sp)
	std r29,`$FRAME-8*3`($sp)
	std r30,`$FRAME-8*2`($sp)
	std r31,`$FRAME-8*1`($sp)

	ld $a0,0($ap)
	xor $zero,$zero,$zero
	ld $a1,8($ap)
	ld $a2,16($ap)
	ld $a3,24($ap)

################################
#  |  |  |  |  |  |a1*a0|  |
#  |  |  |  |  |a2*a0|  |  |
#  |  |a3*a2|a3*a0|  |  |  |
#  |  |  |  |a2*a1|  |  |  |
#  |  |  |a3*a1|  |  |  |  |
# *|  |  |  |  |  |  |  | 2|
# +|a3*a3|a2*a2|a1*a1|a0*a0|
#  |--+--+--+--+--+--+--+--|
#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
#
# "can't overflow" below marks carrying into the high part of a
# multiplication result, which can't overflow because the high part
# can never be all ones.
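#
# The off-diagonal products a[i]*a[j], i<j, are accumulated first,
# then doubled by the acc[1-6]*=2 sequence below, and only then are
# the diagonal squares a[i]*a[i] added in, per the diagram above.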
	mulld $acc1,$a1,$a0	# a[1]*a[0]
	mulhdu $t1,$a1,$a0
	mulld $acc2,$a2,$a0	# a[2]*a[0]
	mulhdu $t2,$a2,$a0
	mulld $acc3,$a3,$a0	# a[3]*a[0]
	mulhdu $acc4,$a3,$a0

	addc $acc2,$acc2,$t1	# accumulate high parts of multiplication
	mulld $t0,$a2,$a1	# a[2]*a[1]
	mulhdu $t1,$a2,$a1
	adde $acc3,$acc3,$t2
	mulld $t2,$a3,$a1	# a[3]*a[1]
	mulhdu $t3,$a3,$a1
	addze $acc4,$acc4	# can't overflow

	mulld $acc5,$a3,$a2	# a[3]*a[2]
	mulhdu $acc6,$a3,$a2

	addc $t1,$t1,$t2	# accumulate high parts of multiplication
	mulld $acc0,$a0,$a0	# a[0]*a[0]
	addze $t2,$t3		# can't overflow

	addc $acc3,$acc3,$t0	# accumulate low parts of multiplication
	mulhdu $a0,$a0,$a0
	adde $acc4,$acc4,$t1
	mulld $t1,$a1,$a1	# a[1]*a[1]
	adde $acc5,$acc5,$t2
	mulhdu $a1,$a1,$a1
	addze $acc6,$acc6	# can't overflow

	addc $acc1,$acc1,$acc1	# acc[1-6]*=2
	mulld $t2,$a2,$a2	# a[2]*a[2]
	adde $acc2,$acc2,$acc2
	mulhdu $a2,$a2,$a2
	adde $acc3,$acc3,$acc3
	mulld $t3,$a3,$a3	# a[3]*a[3]
	adde $acc4,$acc4,$acc4
	mulhdu $a3,$a3,$a3
	adde $acc5,$acc5,$acc5
	adde $acc6,$acc6,$acc6
	addze $acc7,$zero

	addc $acc1,$acc1,$a0	# +a[i]*a[i]
	li $bi,38
	adde $acc2,$acc2,$t1
	adde $acc3,$acc3,$a1
	adde $acc4,$acc4,$t2
	adde $acc5,$acc5,$a2
	adde $acc6,$acc6,$t3
	adde $acc7,$acc7,$a3

	mulld $t0,$acc4,$bi
	mulld $t1,$acc5,$bi
	mulld $t2,$acc6,$bi
	mulld $t3,$acc7,$bi

	addc $acc0,$acc0,$t0
	mulhdu $t0,$acc4,$bi
	adde $acc1,$acc1,$t1
	mulhdu $t1,$acc5,$bi
	adde $acc2,$acc2,$t2
	mulhdu $t2,$acc6,$bi
	adde $acc3,$acc3,$t3
	mulhdu $t3,$acc7,$bi
	addze $acc4,$zero

	addc $acc1,$acc1,$t0
	adde $acc2,$acc2,$t1
	adde $acc3,$acc3,$t2
	adde $acc4,$acc4,$t3

	mulld $acc4,$acc4,$bi

	addc $acc0,$acc0,$acc4
	addze $acc1,$acc1
	addze $acc2,$acc2
	addze $acc3,$acc3

	subfe $acc4,$acc4,$acc4	# carry -> ~mask
	std $acc1,8($rp)
	andc $acc4,$bi,$acc4
	std $acc2,16($rp)
	add $acc0,$acc0,$acc4
	std $acc3,24($rp)
	std $acc0,0($rp)

	ld r22,`$FRAME-8*10`($sp)
	ld r23,`$FRAME-8*9`($sp)
	ld r24,`$FRAME-8*8`($sp)
	ld r25,`$FRAME-8*7`($sp)
	ld r26,`$FRAME-8*6`($sp)
	ld r27,`$FRAME-8*5`($sp)
	ld r28,`$FRAME-8*4`($sp)
	ld r29,`$FRAME-8*3`($sp)
	ld r30,`$FRAME-8*2`($sp)
	ld r31,`$FRAME-8*1`($sp)
	addi $sp,$sp,$FRAME
	blr
	.long 0
	.byte 0,12,4,0,0x80,10,2,0
	.long 0
.size x25519_fe64_sqr,.-x25519_fe64_sqr
.globl x25519_fe64_mul121666
.type x25519_fe64_mul121666,\@function
.align 5
x25519_fe64_mul121666:
	lis $bi,`65536>>16`
	ori $bi,$bi,`121666-65536`
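# $bi = 121666 = 1<<16 | 56130, assembled in two steps because lis
# and ori each carry only a 16-bit immediate.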
	ld $t0,0($ap)
	ld $t1,8($ap)
	ld $bp,16($ap)
	ld $ap,24($ap)

	mulld $a0,$t0,$bi
	mulhdu $t0,$t0,$bi
	mulld $a1,$t1,$bi
	mulhdu $t1,$t1,$bi
	mulld $a2,$bp,$bi
	mulhdu $bp,$bp,$bi
	mulld $a3,$ap,$bi
	mulhdu $ap,$ap,$bi

	addc $a1,$a1,$t0
	adde $a2,$a2,$t1
	adde $a3,$a3,$bp
	addze $ap,$ap

	mulli $ap,$ap,38

	addc $a0,$a0,$ap
	addze $a1,$a1
	addze $a2,$a2
	addze $a3,$a3
	li $t0,38		# reload 38: $t0 was clobbered above
	subfe $t1,$t1,$t1	# carry -> ~mask
	std $a1,8($rp)
	andc $t0,$t0,$t1
	std $a2,16($rp)
	add $a0,$a0,$t0
	std $a3,24($rp)
	std $a0,0($rp)
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,2,0
	.long 0
.size x25519_fe64_mul121666,.-x25519_fe64_mul121666

.globl x25519_fe64_add
.type x25519_fe64_add,\@function
.align 5
x25519_fe64_add:
	ld $a0,0($ap)
	ld $t0,0($bp)
	ld $a1,8($ap)
	ld $t1,8($bp)
	ld $a2,16($ap)
	ld $bi,16($bp)
	ld $a3,24($ap)
	ld $bp,24($bp)

	addc $a0,$a0,$t0
	adde $a1,$a1,$t1
	adde $a2,$a2,$bi
	adde $a3,$a3,$bp
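# If the addition carried out of 2^256, fold the carry back in as
# +38 (2^256 = 38 mod 2^255-19): subfe converts the carry bit into a
# mask and andc selects 38 or 0 accordingly. The second
# carry-to-mask round absorbs the carry this addition may itself
# produce.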
	li $t0,38
	subfe $t1,$t1,$t1	# carry -> ~mask
	andc $t1,$t0,$t1
	addc $a0,$a0,$t1
	addze $a1,$a1
	addze $a2,$a2
	addze $a3,$a3

	subfe $t1,$t1,$t1	# carry -> ~mask
	std $a1,8($rp)
	andc $t0,$t0,$t1
	std $a2,16($rp)
	add $a0,$a0,$t0
	std $a3,24($rp)
	std $a0,0($rp)

	blr
	.long 0
	.byte 0,12,0x14,0,0,0,3,0
	.long 0
.size x25519_fe64_add,.-x25519_fe64_add

.globl x25519_fe64_sub
.type x25519_fe64_sub,\@function
.align 5
x25519_fe64_sub:
	ld $a0,0($ap)
	ld $t0,0($bp)
	ld $a1,8($ap)
	ld $t1,8($bp)
	ld $a2,16($ap)
	ld $bi,16($bp)
	ld $a3,24($ap)
	ld $bp,24($bp)

	subfc $a0,$t0,$a0
	subfe $a1,$t1,$a1
	subfe $a2,$bi,$a2
	subfe $a3,$bp,$a3
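# A borrow means the result wrapped below zero, i.e. picked up an
# extra 2^256; since 2^256 = 38 mod 2^255-19, subtract 38 to
# compensate, then absorb a possible second borrow the same way.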
	li $t0,38
	subfe $t1,$t1,$t1	# borrow -> mask
	xor $zero,$zero,$zero
	and $t1,$t0,$t1

	subfc $a0,$t1,$a0
	subfe $a1,$zero,$a1
	subfe $a2,$zero,$a2
	subfe $a3,$zero,$a3

	subfe $t1,$t1,$t1	# borrow -> mask
	std $a1,8($rp)
	and $t0,$t0,$t1
	std $a2,16($rp)
	subf $a0,$t0,$a0
	std $a3,24($rp)
	std $a0,0($rp)

	blr
	.long 0
	.byte 0,12,0x14,0,0,0,3,0
	.long 0
.size x25519_fe64_sub,.-x25519_fe64_sub

.globl x25519_fe64_tobytes
.type x25519_fe64_tobytes,\@function
.align 5
x25519_fe64_tobytes:
	ld $a3,24($ap)
	ld $a0,0($ap)
	ld $a1,8($ap)
	ld $a2,16($ap)
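# Freeze to the canonical value in [0, 2^255-19): bit 255 is folded
# in as +19 (2^255 = 19 mod p) along with a +19 bias; if the biased
# sum reaches 2^255, the input was >= p and clearing that bit
# finishes the reduction, otherwise the 19 bias is subtracted back
# out below.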
	sradi $t0,$a3,63	# most significant bit -> mask
	li $t1,19
	and $t0,$t0,$t1
	sldi $a3,$a3,1
	add $t0,$t0,$t1		# compare to modulus in the same go
	srdi $a3,$a3,1		# most significant bit cleared

	addc $a0,$a0,$t0
	addze $a1,$a1
	addze $a2,$a2
	addze $a3,$a3

	xor $zero,$zero,$zero
	sradi $t0,$a3,63	# most significant bit -> mask
	sldi $a3,$a3,1
	andc $t0,$t1,$t0
	srdi $a3,$a3,1		# most significant bit cleared

	subi $rp,$rp,1
	subfc $a0,$t0,$a0
	subfe $a1,$zero,$a1
	subfe $a2,$zero,$a2
	subfe $a3,$zero,$a3
___
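# Store the four limbs little-endian a byte at a time; $rp was
# pre-decremented above so that stbu's update-form addressing
# advances the pointer with each store.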
for (my @a=($a0,$a1,$a2,$a3), my $i=0; $i<4; shift(@a), $i++) {
$code.=<<___;
	srdi $t0,@a[0],8
	stbu @a[0],1($rp)
	srdi @a[0],@a[0],16
	stbu $t0,1($rp)
	srdi $t0,@a[0],8
	stbu @a[0],1($rp)
	srdi @a[0],@a[0],16
	stbu $t0,1($rp)
	srdi $t0,@a[0],8
	stbu @a[0],1($rp)
	srdi @a[0],@a[0],16
	stbu $t0,1($rp)
	srdi $t0,@a[0],8
	stbu @a[0],1($rp)
	stbu $t0,1($rp)
___
}
$code.=<<___;
	blr
	.long 0
	.byte 0,12,0x14,0,0,0,2,0
	.long 0
.size x25519_fe64_tobytes,.-x25519_fe64_tobytes
___
}

####################################################### base 2^51
{
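# Field elements are five limbs of nominally 51 bits each,
# a[0]+a[1]*2^51+...+a[4]*2^204. 64x64-bit limb products are
# accumulated in the 128-bit (lo,hi) register pairs h0..h4, and
# limbs that cross 2^255 wrap around multiplied by 19, since
# 2^255 = 19 mod 2^255-19.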
my ($bi,$a0,$a1,$a2,$a3,$a4,$t0, $t1,
    $h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,$h4lo,$h4hi) =
    map("r$_",(6..12,21..31));
my $mask = "r0";
my $FRAME = 18*8;

$code.=<<___;
.text

.globl x25519_fe51_mul
.type x25519_fe51_mul,\@function
.align 5
x25519_fe51_mul:
	stdu $sp,-$FRAME($sp)
	std r21,`$FRAME-8*11`($sp)
	std r22,`$FRAME-8*10`($sp)
	std r23,`$FRAME-8*9`($sp)
	std r24,`$FRAME-8*8`($sp)
	std r25,`$FRAME-8*7`($sp)
	std r26,`$FRAME-8*6`($sp)
	std r27,`$FRAME-8*5`($sp)
	std r28,`$FRAME-8*4`($sp)
	std r29,`$FRAME-8*3`($sp)
	std r30,`$FRAME-8*2`($sp)
	std r31,`$FRAME-8*1`($sp)

	ld $bi,0($bp)
	ld $a0,0($ap)
	ld $a1,8($ap)
	ld $a2,16($ap)
	ld $a3,24($ap)
	ld $a4,32($ap)

	mulld $h0lo,$a0,$bi	# a[0]*b[0]
	mulhdu $h0hi,$a0,$bi
	mulld $h1lo,$a1,$bi	# a[1]*b[0]
	mulhdu $h1hi,$a1,$bi
	mulld $h4lo,$a4,$bi	# a[4]*b[0]
	mulhdu $h4hi,$a4,$bi
	ld $ap,8($bp)
	mulli $a4,$a4,19
	mulld $h2lo,$a2,$bi	# a[2]*b[0]
	mulhdu $h2hi,$a2,$bi
	mulld $h3lo,$a3,$bi	# a[3]*b[0]
	mulhdu $h3hi,$a3,$bi
___
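# Rounds 1..3 fold in b[i]: the $ap/$bi register names are swapped
# each round so that loading the next b word overlaps the
# multiplications, the limb array is rotated right once per round,
# and the limb whose products would cross 2^255 is pre-multiplied by
# 19 (2^255 = 19 mod p) so everything lands in h0..h4 directly.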
for(my @a=($a0,$a1,$a2,$a3,$a4),
    my $i=1; $i<4; $i++) {
($ap,$bi) = ($bi,$ap);
$code.=<<___;
	mulld $t0,@a[4],$bi
	mulhdu $t1,@a[4],$bi
	addc $h0lo,$h0lo,$t0
	adde $h0hi,$h0hi,$t1
	mulld $t0,@a[0],$bi
	mulhdu $t1,@a[0],$bi
	addc $h1lo,$h1lo,$t0
	adde $h1hi,$h1hi,$t1
	mulld $t0,@a[3],$bi
	mulhdu $t1,@a[3],$bi
	ld $ap,`8*($i+1)`($bp)
	mulli @a[3],@a[3],19
	addc $h4lo,$h4lo,$t0
	adde $h4hi,$h4hi,$t1
	mulld $t0,@a[1],$bi
	mulhdu $t1,@a[1],$bi
	addc $h2lo,$h2lo,$t0
	adde $h2hi,$h2hi,$t1
	mulld $t0,@a[2],$bi
	mulhdu $t1,@a[2],$bi
	addc $h3lo,$h3lo,$t0
	adde $h3hi,$h3hi,$t1
___
	unshift(@a,pop(@a));
}

($ap,$bi) = ($bi,$ap);
$code.=<<___;
	mulld $t0,$a1,$bi
	mulhdu $t1,$a1,$bi
	addc $h0lo,$h0lo,$t0
	adde $h0hi,$h0hi,$t1
	mulld $t0,$a2,$bi
	mulhdu $t1,$a2,$bi
	addc $h1lo,$h1lo,$t0
	adde $h1hi,$h1hi,$t1
	mulld $t0,$a3,$bi
	mulhdu $t1,$a3,$bi
	addc $h2lo,$h2lo,$t0
	adde $h2hi,$h2hi,$t1
	mulld $t0,$a4,$bi
	mulhdu $t1,$a4,$bi
	addc $h3lo,$h3lo,$t0
	adde $h3hi,$h3hi,$t1
	mulld $t0,$a0,$bi
	mulhdu $t1,$a0,$bi
	addc $h4lo,$h4lo,$t0
	adde $h4hi,$h4hi,$t1

.Lfe51_reduce:
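# Propagate carries to bring the 128-bit accumulators back to 51-bit
# limbs: h2->h3 and h0->h1 first, then h3->h4 and h1->h2, with the
# h4 overflow wrapping to the bottom limb multiplied by 19. One more
# short carry pass (a2->a3, a0->a1) leaves every limb within a bit
# of 51 bits.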
	li $mask,-1
	srdi $mask,$mask,13	# 0x7ffffffffffff
	srdi $t0,$h2lo,51
	and $a2,$h2lo,$mask
	insrdi $t0,$h2hi,51,0	# h2>>51
	srdi $t1,$h0lo,51
	and $a0,$h0lo,$mask
	insrdi $t1,$h0hi,51,0	# h0>>51

	addc $h3lo,$h3lo,$t0
	addze $h3hi,$h3hi
	addc $h1lo,$h1lo,$t1
	addze $h1hi,$h1hi

	srdi $t0,$h3lo,51
	and $a3,$h3lo,$mask
	insrdi $t0,$h3hi,51,0	# h3>>51
	srdi $t1,$h1lo,51
	and $a1,$h1lo,$mask
	insrdi $t1,$h1hi,51,0	# h1>>51

	addc $h4lo,$h4lo,$t0
	addze $h4hi,$h4hi
	add $a2,$a2,$t1

	srdi $t0,$h4lo,51
	and $a4,$h4lo,$mask
	insrdi $t0,$h4hi,51,0
	mulli $t0,$t0,19	# (h4>>51)*19
	add $a0,$a0,$t0

	srdi $t1,$a2,51
	and $a2,$a2,$mask
	add $a3,$a3,$t1

	srdi $t0,$a0,51
	and $a0,$a0,$mask
	add $a1,$a1,$t0

	std $a2,16($rp)
	std $a3,24($rp)
	std $a4,32($rp)
	std $a0,0($rp)
	std $a1,8($rp)

	ld r21,`$FRAME-8*11`($sp)
	ld r22,`$FRAME-8*10`($sp)
	ld r23,`$FRAME-8*9`($sp)
	ld r24,`$FRAME-8*8`($sp)
	ld r25,`$FRAME-8*7`($sp)
	ld r26,`$FRAME-8*6`($sp)
	ld r27,`$FRAME-8*5`($sp)
	ld r28,`$FRAME-8*4`($sp)
	ld r29,`$FRAME-8*3`($sp)
	ld r30,`$FRAME-8*2`($sp)
	ld r31,`$FRAME-8*1`($sp)
	addi $sp,$sp,$FRAME
	blr
	.long 0
	.byte 0,12,4,0,0x80,11,3,0
	.long 0
.size x25519_fe51_mul,.-x25519_fe51_mul
___
{
my ($a0,$a1,$a2,$a3,$a4,$t0,$t1) = ($a0,$a1,$a2,$a3,$a4,$t0,$t1);	# fresh lexicals, so the register swaps below stay local to this block

$code.=<<___;
.globl x25519_fe51_sqr
.type x25519_fe51_sqr,\@function
.align 5
x25519_fe51_sqr:
	stdu $sp,-$FRAME($sp)
	std r21,`$FRAME-8*11`($sp)
	std r22,`$FRAME-8*10`($sp)
	std r23,`$FRAME-8*9`($sp)
	std r24,`$FRAME-8*8`($sp)
	std r25,`$FRAME-8*7`($sp)
	std r26,`$FRAME-8*6`($sp)
	std r27,`$FRAME-8*5`($sp)
	std r28,`$FRAME-8*4`($sp)
	std r29,`$FRAME-8*3`($sp)
	std r30,`$FRAME-8*2`($sp)
	std r31,`$FRAME-8*1`($sp)

	ld $a0,0($ap)
	ld $a1,8($ap)
	ld $a2,16($ap)
	ld $a3,24($ap)
	ld $a4,32($ap)

	add $bi,$a0,$a0		# a[0]*2
	mulli $t1,$a4,19	# a[4]*19

	mulld $h0lo,$a0,$a0
	mulhdu $h0hi,$a0,$a0
	mulld $h1lo,$a1,$bi
	mulhdu $h1hi,$a1,$bi
	mulld $h2lo,$a2,$bi
	mulhdu $h2hi,$a2,$bi
	mulld $h3lo,$a3,$bi
	mulhdu $h3hi,$a3,$bi
	mulld $h4lo,$a4,$bi
	mulhdu $h4hi,$a4,$bi
	add $bi,$a1,$a1		# a[1]*2
___
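# Swap register names so that $a4 now refers to a[4]*19 (computed by
# the mulli above) and $t1 to the original a[4]; the product below
# is then 19*a[4]*a[4], i.e. the wrapped a[4]^2 term of h3. The
# ($a3,$t1) swap further down plays the same trick with a[3]*19.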
($a4,$t1) = ($t1,$a4);
$code.=<<___;
	mulld $t0,$t1,$a4
	mulhdu $t1,$t1,$a4
	addc $h3lo,$h3lo,$t0
	adde $h3hi,$h3hi,$t1

	mulli $bp,$a3,19	# a[3]*19

	mulld $t0,$a1,$a1
	mulhdu $t1,$a1,$a1
	addc $h2lo,$h2lo,$t0
	adde $h2hi,$h2hi,$t1
	mulld $t0,$a2,$bi
	mulhdu $t1,$a2,$bi
	addc $h3lo,$h3lo,$t0
	adde $h3hi,$h3hi,$t1
	mulld $t0,$a3,$bi
	mulhdu $t1,$a3,$bi
	addc $h4lo,$h4lo,$t0
	adde $h4hi,$h4hi,$t1
	mulld $t0,$a4,$bi
	mulhdu $t1,$a4,$bi
	add $bi,$a3,$a3		# a[3]*2
	addc $h0lo,$h0lo,$t0
	adde $h0hi,$h0hi,$t1
___
($a3,$t1) = ($bp,$a3);
$code.=<<___;
	mulld $t0,$t1,$a3
	mulhdu $t1,$t1,$a3
	addc $h1lo,$h1lo,$t0
	adde $h1hi,$h1hi,$t1
	mulld $t0,$bi,$a4
	mulhdu $t1,$bi,$a4
	add $bi,$a2,$a2		# a[2]*2
	addc $h2lo,$h2lo,$t0
	adde $h2hi,$h2hi,$t1
	mulld $t0,$a2,$a2
	mulhdu $t1,$a2,$a2
	addc $h4lo,$h4lo,$t0
	adde $h4hi,$h4hi,$t1
	mulld $t0,$a3,$bi
	mulhdu $t1,$a3,$bi
	addc $h0lo,$h0lo,$t0
	adde $h0hi,$h0hi,$t1
	mulld $t0,$a4,$bi
	mulhdu $t1,$a4,$bi
	addc $h1lo,$h1lo,$t0
	adde $h1hi,$h1hi,$t1

	b .Lfe51_reduce
	.long 0
	.byte 0,12,4,0,0x80,11,2,0
	.long 0
.size x25519_fe51_sqr,.-x25519_fe51_sqr
___
}

$code.=<<___;
.globl x25519_fe51_mul121666
.type x25519_fe51_mul121666,\@function
.align 5
x25519_fe51_mul121666:
	stdu $sp,-$FRAME($sp)
	std r21,`$FRAME-8*11`($sp)
	std r22,`$FRAME-8*10`($sp)
	std r23,`$FRAME-8*9`($sp)
	std r24,`$FRAME-8*8`($sp)
	std r25,`$FRAME-8*7`($sp)
	std r26,`$FRAME-8*6`($sp)
	std r27,`$FRAME-8*5`($sp)
	std r28,`$FRAME-8*4`($sp)
	std r29,`$FRAME-8*3`($sp)
	std r30,`$FRAME-8*2`($sp)
	std r31,`$FRAME-8*1`($sp)

	lis $bi,`65536>>16`
	ori $bi,$bi,`121666-65536`

	ld $a0,0($ap)
	ld $a1,8($ap)
	ld $a2,16($ap)
	ld $a3,24($ap)
	ld $a4,32($ap)

	mulld $h0lo,$a0,$bi	# a[0]*121666
	mulhdu $h0hi,$a0,$bi
	mulld $h1lo,$a1,$bi	# a[1]*121666
	mulhdu $h1hi,$a1,$bi
	mulld $h2lo,$a2,$bi	# a[2]*121666
	mulhdu $h2hi,$a2,$bi
	mulld $h3lo,$a3,$bi	# a[3]*121666
	mulhdu $h3hi,$a3,$bi
	mulld $h4lo,$a4,$bi	# a[4]*121666
	mulhdu $h4hi,$a4,$bi

	b .Lfe51_reduce
	.long 0
	.byte 0,12,4,0,0x80,11,2,0
	.long 0
.size x25519_fe51_mul121666,.-x25519_fe51_mul121666
___
}
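# Evaluate all `...` expressions (frame offsets and immediate
# constants) at generation time, then emit the result.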
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";