armv8-mont.pl 36 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # March 2015
  15. #
  16. # "Teaser" Montgomery multiplication module for ARMv8. Needs more
  17. # work. While it does improve RSA sign performance by 20-30% (less for
  18. # longer keys) on most processors, for some reason RSA2048 is not
  19. # faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
  20. # instruction issue rate is limited on processor in question, meaning
  21. # that dedicated squaring procedure is a must. Well, actually all
  22. # contemporary AArch64 processors seem to have limited multiplication
  23. # issue rate, i.e. they can't issue multiplication every cycle, which
  24. # explains moderate improvement coefficients in comparison to
  25. # compiler-generated code. Recall that compiler is instructed to use
  26. # umulh and therefore uses same amount of multiplication instructions
  27. # to do the job. Assembly's edge is to minimize number of "collateral"
  28. # instructions and of course instruction scheduling.
  29. #
  30. # April 2015
  31. #
  32. # Squaring procedure that handles lengths divisible by 8 improves
  33. # RSA/DSA performance by 25-40-60% depending on processor and key
  34. # length. Overall improvement coefficients are always positive in
  35. # comparison to compiler-generated code. On Cortex-A57 improvement
  36. # is still modest on longest key lengths, while others exhibit e.g.
  37. # 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
  38. # on Cortex-A57 and ~60-100% faster on others.
  39. $flavour = shift; # 1st command-line argument, forwarded verbatim to arm-xlate.pl
  40. $output = shift; # 2nd command-line argument, forwarded verbatim to arm-xlate.pl
  41. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's own path
  42. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  43. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  44. die "can't locate arm-xlate.pl";
  45. open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!"; # abort if the translator cannot be spawned rather than printing into a dead handle
  46. *STDOUT=*OUT; # from here on anything printed to STDOUT is piped through arm-xlate.pl
  47. ($lo0,$hi0,$aj,$m0,$alo,$ahi, # ap[]*bp[i] side: running lo/hi, current ap word, bp word, fresh product lo/hi
  48. $lo1,$hi1,$nj,$m1,$nlo,$nhi, # np[]*m1 side: running lo/hi, current np word, m1 factor, fresh product lo/hi
  49. $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24); # overflow bit, outer/inner byte counters, tp cursor, tp word; x18 is skipped, and the 18th value (x24) has no name to bind to
  50. # int bn_mul_mont(
  51. $rp="x0"; # BN_ULONG *rp,
  52. $ap="x1"; # const BN_ULONG *ap,
  53. $bp="x2"; # const BN_ULONG *bp,
  54. $np="x3"; # const BN_ULONG *np,
  55. $n0="x4"; # const BN_ULONG *n0,
  56. $num="x5"; # int num);
  57. $code.=<<___; # bn_mul_mont: branches to __bn_sqr8x_mont when num%8==0, to __bn_mul4x_mont when num%4==0, else runs the word-by-word .Lmul_mont path
  58. .text
  59. .globl bn_mul_mont
  60. .type bn_mul_mont,%function
  61. .align 5
  62. bn_mul_mont:
  63. tst $num,#7
  64. b.eq __bn_sqr8x_mont
  65. tst $num,#3
  66. b.eq __bn_mul4x_mont
  67. .Lmul_mont:
  68. stp x29,x30,[sp,#-64]!
  69. add x29,sp,#0
  70. stp x19,x20,[sp,#16]
  71. stp x21,x22,[sp,#32]
  72. stp x23,x24,[sp,#48]
  73. ldr $m0,[$bp],#8 // bp[0]
  74. sub $tp,sp,$num,lsl#3
  75. ldp $hi0,$aj,[$ap],#16 // ap[0..1]
  76. lsl $num,$num,#3
  77. ldr $n0,[$n0] // *n0
  78. and $tp,$tp,#-16 // ABI says so
  79. ldp $hi1,$nj,[$np],#16 // np[0..1]
  80. mul $lo0,$hi0,$m0 // ap[0]*bp[0]
  81. sub $j,$num,#16 // j=num-2
  82. umulh $hi0,$hi0,$m0
  83. mul $alo,$aj,$m0 // ap[1]*bp[0]
  84. umulh $ahi,$aj,$m0
  85. mul $m1,$lo0,$n0 // "tp[0]"*n0
  86. mov sp,$tp // alloca
  87. // (*) mul $lo1,$hi1,$m1 // np[0]*m1
  88. umulh $hi1,$hi1,$m1
  89. mul $nlo,$nj,$m1 // np[1]*m1
  90. // (*) adds $lo1,$lo1,$lo0 // discarded
  91. // (*) As for removal of first multiplication and addition
  92. // instructions. The outcome of first addition is
  93. // guaranteed to be zero, which leaves two computationally
  94. // significant outcomes: it either carries or not. Then
  95. // question is when does it carry? Is there alternative
  96. // way to deduce it? If you follow operations, you can
  97. // observe that condition for carry is quite simple:
  98. // $lo0 being non-zero. So that carry can be calculated
  99. // by adding -1 to $lo0. That's what next instruction does.
  100. subs xzr,$lo0,#1 // (*)
  101. umulh $nhi,$nj,$m1
  102. adc $hi1,$hi1,xzr
  103. cbz $j,.L1st_skip
  104. .L1st:
  105. ldr $aj,[$ap],#8
  106. adds $lo0,$alo,$hi0
  107. sub $j,$j,#8 // j--
  108. adc $hi0,$ahi,xzr
  109. ldr $nj,[$np],#8
  110. adds $lo1,$nlo,$hi1
  111. mul $alo,$aj,$m0 // ap[j]*bp[0]
  112. adc $hi1,$nhi,xzr
  113. umulh $ahi,$aj,$m0
  114. adds $lo1,$lo1,$lo0
  115. mul $nlo,$nj,$m1 // np[j]*m1
  116. adc $hi1,$hi1,xzr
  117. umulh $nhi,$nj,$m1
  118. str $lo1,[$tp],#8 // tp[j-1]
  119. cbnz $j,.L1st
  120. .L1st_skip:
  121. adds $lo0,$alo,$hi0
  122. sub $ap,$ap,$num // rewind $ap
  123. adc $hi0,$ahi,xzr
  124. adds $lo1,$nlo,$hi1
  125. sub $np,$np,$num // rewind $np
  126. adc $hi1,$nhi,xzr
  127. adds $lo1,$lo1,$lo0
  128. sub $i,$num,#8 // i=num-1
  129. adcs $hi1,$hi1,$hi0
  130. adc $ovf,xzr,xzr // upmost overflow bit
  131. stp $lo1,$hi1,[$tp]
  132. .Louter:
  133. ldr $m0,[$bp],#8 // bp[i]
  134. ldp $hi0,$aj,[$ap],#16
  135. ldr $tj,[sp] // tp[0]
  136. add $tp,sp,#8
  137. mul $lo0,$hi0,$m0 // ap[0]*bp[i]
  138. sub $j,$num,#16 // j=num-2
  139. umulh $hi0,$hi0,$m0
  140. ldp $hi1,$nj,[$np],#16
  141. mul $alo,$aj,$m0 // ap[1]*bp[i]
  142. adds $lo0,$lo0,$tj
  143. umulh $ahi,$aj,$m0
  144. adc $hi0,$hi0,xzr
  145. mul $m1,$lo0,$n0
  146. sub $i,$i,#8 // i--
  147. // (*) mul $lo1,$hi1,$m1 // np[0]*m1
  148. umulh $hi1,$hi1,$m1
  149. mul $nlo,$nj,$m1 // np[1]*m1
  150. // (*) adds $lo1,$lo1,$lo0
  151. subs xzr,$lo0,#1 // (*)
  152. umulh $nhi,$nj,$m1
  153. cbz $j,.Linner_skip
  154. .Linner:
  155. ldr $aj,[$ap],#8
  156. adc $hi1,$hi1,xzr
  157. ldr $tj,[$tp],#8 // tp[j]
  158. adds $lo0,$alo,$hi0
  159. sub $j,$j,#8 // j--
  160. adc $hi0,$ahi,xzr
  161. adds $lo1,$nlo,$hi1
  162. ldr $nj,[$np],#8
  163. adc $hi1,$nhi,xzr
  164. mul $alo,$aj,$m0 // ap[j]*bp[i]
  165. adds $lo0,$lo0,$tj
  166. umulh $ahi,$aj,$m0
  167. adc $hi0,$hi0,xzr
  168. mul $nlo,$nj,$m1 // np[j]*m1
  169. adds $lo1,$lo1,$lo0
  170. umulh $nhi,$nj,$m1
  171. str $lo1,[$tp,#-16] // tp[j-1]
  172. cbnz $j,.Linner
  173. .Linner_skip:
  174. ldr $tj,[$tp],#8 // tp[j]
  175. adc $hi1,$hi1,xzr
  176. adds $lo0,$alo,$hi0
  177. sub $ap,$ap,$num // rewind $ap
  178. adc $hi0,$ahi,xzr
  179. adds $lo1,$nlo,$hi1
  180. sub $np,$np,$num // rewind $np
  181. adcs $hi1,$nhi,$ovf
  182. adc $ovf,xzr,xzr
  183. adds $lo0,$lo0,$tj
  184. adc $hi0,$hi0,xzr
  185. adds $lo1,$lo1,$lo0
  186. adcs $hi1,$hi1,$hi0
  187. adc $ovf,$ovf,xzr // upmost overflow bit
  188. stp $lo1,$hi1,[$tp,#-16]
  189. cbnz $i,.Louter
  190. // Final step. We see if result is larger than modulus, and
  191. // if it is, subtract the modulus. But comparison implies
  192. // subtraction. So we subtract modulus, see if it borrowed,
  193. // and conditionally copy original value.
  194. ldr $tj,[sp] // tp[0]
  195. add $tp,sp,#8
  196. ldr $nj,[$np],#8 // np[0]
  197. subs $j,$num,#8 // j=num-1 and clear borrow
  198. mov $ap,$rp
  199. .Lsub:
  200. sbcs $aj,$tj,$nj // tp[j]-np[j]
  201. ldr $tj,[$tp],#8
  202. sub $j,$j,#8 // j--
  203. ldr $nj,[$np],#8
  204. str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
  205. cbnz $j,.Lsub
  206. sbcs $aj,$tj,$nj
  207. sbcs $ovf,$ovf,xzr // did it borrow?
  208. str $aj,[$ap],#8 // rp[num-1]
  209. ldr $tj,[sp] // tp[0]
  210. add $tp,sp,#8
  211. ldr $aj,[$rp],#8 // rp[0]
  212. sub $num,$num,#8 // num--
  213. nop
  214. .Lcond_copy:
  215. sub $num,$num,#8 // num--
  216. csel $nj,$tj,$aj,lo // did it borrow?
  217. ldr $tj,[$tp],#8
  218. ldr $aj,[$rp],#8
  219. str xzr,[$tp,#-16] // wipe tp
  220. str $nj,[$rp,#-16]
  221. cbnz $num,.Lcond_copy
  222. csel $nj,$tj,$aj,lo
  223. str xzr,[$tp,#-8] // wipe tp
  224. str $nj,[$rp,#-8]
  225. ldp x19,x20,[x29,#16]
  226. mov sp,x29
  227. ldp x21,x22,[x29,#32]
  228. mov x0,#1
  229. ldp x23,x24,[x29,#48]
  230. ldr x29,[sp],#64
  231. ret
  232. .size bn_mul_mont,.-bn_mul_mont
  233. ___
  234. {
  235. ########################################################################
  236. # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
  237. my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13)); # eight operand words: loaded from a[] while squaring, from n[] while reducing
  238. my ($t0,$t1,$t2,$t3)=map("x$_",(14..17)); # scratch for mul/umulh results and temporaries
  239. my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26)); # eight-word accumulator window
  240. my ($cnt,$carry,$topmost)=("x27","x28","x30"); # loop counter, carry word, top-most carry; x30 is reloaded from the frame before returning
  241. my ($tp,$ap_end,$na0)=($bp,$np,$carry); # register reuse: $tp takes $bp's register, $ap_end takes $np's (rp and np are saved in the frame first), $na0 shares $carry's
      # NOTE(review): in the heredoc below, the asm comment "t[2]+lo(a[1]*a[0])" sits on an
      # addition whose operand was produced by umulh, i.e. the high half of a[1]*a[0].
  242. $code.=<<___; # __bn_sqr8x_mont, part 1: prologue, zeroing of the temporary area, off-diagonal products, start of the shift-and-add loop
  243. .type __bn_sqr8x_mont,%function
  244. .align 5
  245. __bn_sqr8x_mont:
  246. cmp $ap,$bp
  247. b.ne __bn_mul4x_mont
  248. .Lsqr8x_mont:
  249. .inst 0xd503233f // paciasp
  250. stp x29,x30,[sp,#-128]!
  251. add x29,sp,#0
  252. stp x19,x20,[sp,#16]
  253. stp x21,x22,[sp,#32]
  254. stp x23,x24,[sp,#48]
  255. stp x25,x26,[sp,#64]
  256. stp x27,x28,[sp,#80]
  257. stp $rp,$np,[sp,#96] // offload rp and np
  258. ldp $a0,$a1,[$ap,#8*0]
  259. ldp $a2,$a3,[$ap,#8*2]
  260. ldp $a4,$a5,[$ap,#8*4]
  261. ldp $a6,$a7,[$ap,#8*6]
  262. sub $tp,sp,$num,lsl#4
  263. lsl $num,$num,#3
  264. ldr $n0,[$n0] // *n0
  265. mov sp,$tp // alloca
  266. sub $cnt,$num,#8*8
  267. b .Lsqr8x_zero_start
  268. .Lsqr8x_zero:
  269. sub $cnt,$cnt,#8*8
  270. stp xzr,xzr,[$tp,#8*0]
  271. stp xzr,xzr,[$tp,#8*2]
  272. stp xzr,xzr,[$tp,#8*4]
  273. stp xzr,xzr,[$tp,#8*6]
  274. .Lsqr8x_zero_start:
  275. stp xzr,xzr,[$tp,#8*8]
  276. stp xzr,xzr,[$tp,#8*10]
  277. stp xzr,xzr,[$tp,#8*12]
  278. stp xzr,xzr,[$tp,#8*14]
  279. add $tp,$tp,#8*16
  280. cbnz $cnt,.Lsqr8x_zero
  281. add $ap_end,$ap,$num
  282. add $ap,$ap,#8*8
  283. mov $acc0,xzr
  284. mov $acc1,xzr
  285. mov $acc2,xzr
  286. mov $acc3,xzr
  287. mov $acc4,xzr
  288. mov $acc5,xzr
  289. mov $acc6,xzr
  290. mov $acc7,xzr
  291. mov $tp,sp
  292. str $n0,[x29,#112] // offload n0
  293. // Multiply everything but a[i]*a[i]
  294. .align 4
  295. .Lsqr8x_outer_loop:
  296. // a[1]a[0] (i)
  297. // a[2]a[0]
  298. // a[3]a[0]
  299. // a[4]a[0]
  300. // a[5]a[0]
  301. // a[6]a[0]
  302. // a[7]a[0]
  303. // a[2]a[1] (ii)
  304. // a[3]a[1]
  305. // a[4]a[1]
  306. // a[5]a[1]
  307. // a[6]a[1]
  308. // a[7]a[1]
  309. // a[3]a[2] (iii)
  310. // a[4]a[2]
  311. // a[5]a[2]
  312. // a[6]a[2]
  313. // a[7]a[2]
  314. // a[4]a[3] (iv)
  315. // a[5]a[3]
  316. // a[6]a[3]
  317. // a[7]a[3]
  318. // a[5]a[4] (v)
  319. // a[6]a[4]
  320. // a[7]a[4]
  321. // a[6]a[5] (vi)
  322. // a[7]a[5]
  323. // a[7]a[6] (vii)
  324. mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
  325. mul $t1,$a2,$a0
  326. mul $t2,$a3,$a0
  327. mul $t3,$a4,$a0
  328. adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
  329. mul $t0,$a5,$a0
  330. adcs $acc2,$acc2,$t1
  331. mul $t1,$a6,$a0
  332. adcs $acc3,$acc3,$t2
  333. mul $t2,$a7,$a0
  334. adcs $acc4,$acc4,$t3
  335. umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
  336. adcs $acc5,$acc5,$t0
  337. umulh $t0,$a2,$a0
  338. adcs $acc6,$acc6,$t1
  339. umulh $t1,$a3,$a0
  340. adcs $acc7,$acc7,$t2
  341. umulh $t2,$a4,$a0
  342. stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
  343. adc $acc0,xzr,xzr // t[8]
  344. adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
  345. umulh $t3,$a5,$a0
  346. adcs $acc3,$acc3,$t0
  347. umulh $t0,$a6,$a0
  348. adcs $acc4,$acc4,$t1
  349. umulh $t1,$a7,$a0
  350. adcs $acc5,$acc5,$t2
  351. mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
  352. adcs $acc6,$acc6,$t3
  353. mul $t3,$a3,$a1
  354. adcs $acc7,$acc7,$t0
  355. mul $t0,$a4,$a1
  356. adc $acc0,$acc0,$t1
  357. mul $t1,$a5,$a1
  358. adds $acc3,$acc3,$t2
  359. mul $t2,$a6,$a1
  360. adcs $acc4,$acc4,$t3
  361. mul $t3,$a7,$a1
  362. adcs $acc5,$acc5,$t0
  363. umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
  364. adcs $acc6,$acc6,$t1
  365. umulh $t1,$a3,$a1
  366. adcs $acc7,$acc7,$t2
  367. umulh $t2,$a4,$a1
  368. adcs $acc0,$acc0,$t3
  369. umulh $t3,$a5,$a1
  370. stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
  371. adc $acc1,xzr,xzr // t[9]
  372. adds $acc4,$acc4,$t0
  373. umulh $t0,$a6,$a1
  374. adcs $acc5,$acc5,$t1
  375. umulh $t1,$a7,$a1
  376. adcs $acc6,$acc6,$t2
  377. mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
  378. adcs $acc7,$acc7,$t3
  379. mul $t3,$a4,$a2
  380. adcs $acc0,$acc0,$t0
  381. mul $t0,$a5,$a2
  382. adc $acc1,$acc1,$t1
  383. mul $t1,$a6,$a2
  384. adds $acc5,$acc5,$t2
  385. mul $t2,$a7,$a2
  386. adcs $acc6,$acc6,$t3
  387. umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
  388. adcs $acc7,$acc7,$t0
  389. umulh $t0,$a4,$a2
  390. adcs $acc0,$acc0,$t1
  391. umulh $t1,$a5,$a2
  392. adcs $acc1,$acc1,$t2
  393. umulh $t2,$a6,$a2
  394. stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
  395. adc $acc2,xzr,xzr // t[10]
  396. adds $acc6,$acc6,$t3
  397. umulh $t3,$a7,$a2
  398. adcs $acc7,$acc7,$t0
  399. mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
  400. adcs $acc0,$acc0,$t1
  401. mul $t1,$a5,$a3
  402. adcs $acc1,$acc1,$t2
  403. mul $t2,$a6,$a3
  404. adc $acc2,$acc2,$t3
  405. mul $t3,$a7,$a3
  406. adds $acc7,$acc7,$t0
  407. umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
  408. adcs $acc0,$acc0,$t1
  409. umulh $t1,$a5,$a3
  410. adcs $acc1,$acc1,$t2
  411. umulh $t2,$a6,$a3
  412. adcs $acc2,$acc2,$t3
  413. umulh $t3,$a7,$a3
  414. stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
  415. adc $acc3,xzr,xzr // t[11]
  416. adds $acc0,$acc0,$t0
  417. mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
  418. adcs $acc1,$acc1,$t1
  419. mul $t1,$a6,$a4
  420. adcs $acc2,$acc2,$t2
  421. mul $t2,$a7,$a4
  422. adc $acc3,$acc3,$t3
  423. umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
  424. adds $acc1,$acc1,$t0
  425. umulh $t0,$a6,$a4
  426. adcs $acc2,$acc2,$t1
  427. umulh $t1,$a7,$a4
  428. adcs $acc3,$acc3,$t2
  429. mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
  430. adc $acc4,xzr,xzr // t[12]
  431. adds $acc2,$acc2,$t3
  432. mul $t3,$a7,$a5
  433. adcs $acc3,$acc3,$t0
  434. umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
  435. adc $acc4,$acc4,$t1
  436. umulh $t1,$a7,$a5
  437. adds $acc3,$acc3,$t2
  438. mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
  439. adcs $acc4,$acc4,$t3
  440. umulh $t3,$a7,$a6 // hi(a[7]*a[6])
  441. adc $acc5,xzr,xzr // t[13]
  442. adds $acc4,$acc4,$t0
  443. sub $cnt,$ap_end,$ap // done yet?
  444. adc $acc5,$acc5,$t1
  445. adds $acc5,$acc5,$t2
  446. sub $t0,$ap_end,$num // rewinded ap
  447. adc $acc6,xzr,xzr // t[14]
  448. add $acc6,$acc6,$t3
  449. cbz $cnt,.Lsqr8x_outer_break
  450. mov $n0,$a0
  451. ldp $a0,$a1,[$tp,#8*0]
  452. ldp $a2,$a3,[$tp,#8*2]
  453. ldp $a4,$a5,[$tp,#8*4]
  454. ldp $a6,$a7,[$tp,#8*6]
  455. adds $acc0,$acc0,$a0
  456. adcs $acc1,$acc1,$a1
  457. ldp $a0,$a1,[$ap,#8*0]
  458. adcs $acc2,$acc2,$a2
  459. adcs $acc3,$acc3,$a3
  460. ldp $a2,$a3,[$ap,#8*2]
  461. adcs $acc4,$acc4,$a4
  462. adcs $acc5,$acc5,$a5
  463. ldp $a4,$a5,[$ap,#8*4]
  464. adcs $acc6,$acc6,$a6
  465. mov $rp,$ap
  466. adcs $acc7,xzr,$a7
  467. ldp $a6,$a7,[$ap,#8*6]
  468. add $ap,$ap,#8*8
  469. //adc $carry,xzr,xzr // moved below
  470. mov $cnt,#-8*8
  471. // a[8]a[0]
  472. // a[9]a[0]
  473. // a[a]a[0]
  474. // a[b]a[0]
  475. // a[c]a[0]
  476. // a[d]a[0]
  477. // a[e]a[0]
  478. // a[f]a[0]
  479. // a[8]a[1]
  480. // a[f]a[1]........................
  481. // a[8]a[2]
  482. // a[f]a[2]........................
  483. // a[8]a[3]
  484. // a[f]a[3]........................
  485. // a[8]a[4]
  486. // a[f]a[4]........................
  487. // a[8]a[5]
  488. // a[f]a[5]........................
  489. // a[8]a[6]
  490. // a[f]a[6]........................
  491. // a[8]a[7]
  492. // a[f]a[7]........................
  493. .Lsqr8x_mul:
  494. mul $t0,$a0,$n0
  495. adc $carry,xzr,xzr // carry bit, modulo-scheduled
  496. mul $t1,$a1,$n0
  497. add $cnt,$cnt,#8
  498. mul $t2,$a2,$n0
  499. mul $t3,$a3,$n0
  500. adds $acc0,$acc0,$t0
  501. mul $t0,$a4,$n0
  502. adcs $acc1,$acc1,$t1
  503. mul $t1,$a5,$n0
  504. adcs $acc2,$acc2,$t2
  505. mul $t2,$a6,$n0
  506. adcs $acc3,$acc3,$t3
  507. mul $t3,$a7,$n0
  508. adcs $acc4,$acc4,$t0
  509. umulh $t0,$a0,$n0
  510. adcs $acc5,$acc5,$t1
  511. umulh $t1,$a1,$n0
  512. adcs $acc6,$acc6,$t2
  513. umulh $t2,$a2,$n0
  514. adcs $acc7,$acc7,$t3
  515. umulh $t3,$a3,$n0
  516. adc $carry,$carry,xzr
  517. str $acc0,[$tp],#8
  518. adds $acc0,$acc1,$t0
  519. umulh $t0,$a4,$n0
  520. adcs $acc1,$acc2,$t1
  521. umulh $t1,$a5,$n0
  522. adcs $acc2,$acc3,$t2
  523. umulh $t2,$a6,$n0
  524. adcs $acc3,$acc4,$t3
  525. umulh $t3,$a7,$n0
  526. ldr $n0,[$rp,$cnt]
  527. adcs $acc4,$acc5,$t0
  528. adcs $acc5,$acc6,$t1
  529. adcs $acc6,$acc7,$t2
  530. adcs $acc7,$carry,$t3
  531. //adc $carry,xzr,xzr // moved above
  532. cbnz $cnt,.Lsqr8x_mul
  533. // note that carry flag is guaranteed
  534. // to be zero at this point
  535. cmp $ap,$ap_end // done yet?
  536. b.eq .Lsqr8x_break
  537. ldp $a0,$a1,[$tp,#8*0]
  538. ldp $a2,$a3,[$tp,#8*2]
  539. ldp $a4,$a5,[$tp,#8*4]
  540. ldp $a6,$a7,[$tp,#8*6]
  541. adds $acc0,$acc0,$a0
  542. ldr $n0,[$rp,#-8*8]
  543. adcs $acc1,$acc1,$a1
  544. ldp $a0,$a1,[$ap,#8*0]
  545. adcs $acc2,$acc2,$a2
  546. adcs $acc3,$acc3,$a3
  547. ldp $a2,$a3,[$ap,#8*2]
  548. adcs $acc4,$acc4,$a4
  549. adcs $acc5,$acc5,$a5
  550. ldp $a4,$a5,[$ap,#8*4]
  551. adcs $acc6,$acc6,$a6
  552. mov $cnt,#-8*8
  553. adcs $acc7,$acc7,$a7
  554. ldp $a6,$a7,[$ap,#8*6]
  555. add $ap,$ap,#8*8
  556. //adc $carry,xzr,xzr // moved above
  557. b .Lsqr8x_mul
  558. .align 4
  559. .Lsqr8x_break:
  560. ldp $a0,$a1,[$rp,#8*0]
  561. add $ap,$rp,#8*8
  562. ldp $a2,$a3,[$rp,#8*2]
  563. sub $t0,$ap_end,$ap // is it last iteration?
  564. ldp $a4,$a5,[$rp,#8*4]
  565. sub $t1,$tp,$t0
  566. ldp $a6,$a7,[$rp,#8*6]
  567. cbz $t0,.Lsqr8x_outer_loop
  568. stp $acc0,$acc1,[$tp,#8*0]
  569. ldp $acc0,$acc1,[$t1,#8*0]
  570. stp $acc2,$acc3,[$tp,#8*2]
  571. ldp $acc2,$acc3,[$t1,#8*2]
  572. stp $acc4,$acc5,[$tp,#8*4]
  573. ldp $acc4,$acc5,[$t1,#8*4]
  574. stp $acc6,$acc7,[$tp,#8*6]
  575. mov $tp,$t1
  576. ldp $acc6,$acc7,[$t1,#8*6]
  577. b .Lsqr8x_outer_loop
  578. .align 4
  579. .Lsqr8x_outer_break:
  580. // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
  581. ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
  582. ldp $t1,$t2,[sp,#8*1]
  583. ldp $a5,$a7,[$t0,#8*2]
  584. add $ap,$t0,#8*4
  585. ldp $t3,$t0,[sp,#8*3]
  586. stp $acc0,$acc1,[$tp,#8*0]
  587. mul $acc0,$a1,$a1
  588. stp $acc2,$acc3,[$tp,#8*2]
  589. umulh $a1,$a1,$a1
  590. stp $acc4,$acc5,[$tp,#8*4]
  591. mul $a2,$a3,$a3
  592. stp $acc6,$acc7,[$tp,#8*6]
  593. mov $tp,sp
  594. umulh $a3,$a3,$a3
  595. adds $acc1,$a1,$t1,lsl#1
  596. extr $t1,$t2,$t1,#63
  597. sub $cnt,$num,#8*4
  598. .Lsqr4x_shift_n_add:
  599. adcs $acc2,$a2,$t1
  600. extr $t2,$t3,$t2,#63
  601. sub $cnt,$cnt,#8*4
  602. adcs $acc3,$a3,$t2
  603. ldp $t1,$t2,[$tp,#8*5]
  604. mul $a4,$a5,$a5
  605. ldp $a1,$a3,[$ap],#8*2
  606. umulh $a5,$a5,$a5
  607. mul $a6,$a7,$a7
  608. umulh $a7,$a7,$a7
  609. extr $t3,$t0,$t3,#63
  610. stp $acc0,$acc1,[$tp,#8*0]
  611. adcs $acc4,$a4,$t3
  612. extr $t0,$t1,$t0,#63
  613. stp $acc2,$acc3,[$tp,#8*2]
  614. adcs $acc5,$a5,$t0
  615. ldp $t3,$t0,[$tp,#8*7]
  616. extr $t1,$t2,$t1,#63
  617. adcs $acc6,$a6,$t1
  618. extr $t2,$t3,$t2,#63
  619. adcs $acc7,$a7,$t2
  620. ldp $t1,$t2,[$tp,#8*9]
  621. mul $a0,$a1,$a1
  622. ldp $a5,$a7,[$ap],#8*2
  623. umulh $a1,$a1,$a1
  624. mul $a2,$a3,$a3
  625. umulh $a3,$a3,$a3
  626. stp $acc4,$acc5,[$tp,#8*4]
  627. extr $t3,$t0,$t3,#63
  628. stp $acc6,$acc7,[$tp,#8*6]
  629. add $tp,$tp,#8*8
  630. adcs $acc0,$a0,$t3
  631. extr $t0,$t1,$t0,#63
  632. adcs $acc1,$a1,$t0
  633. ldp $t3,$t0,[$tp,#8*3]
  634. extr $t1,$t2,$t1,#63
  635. cbnz $cnt,.Lsqr4x_shift_n_add
  636. ___
  637. my ($np,$np_end)=($ap,$ap_end); # squaring is finished with a[]: the registers that walked a[] now walk n[] (shadows the global $np for the rest of this scope)
  638. $code.=<<___; # __bn_sqr8x_mont, part 2: tail of shift-and-add, reduction, final conditional subtraction, epilogue
  639. ldp $np,$n0,[x29,#104] // pull np and n0
  640. adcs $acc2,$a2,$t1
  641. extr $t2,$t3,$t2,#63
  642. adcs $acc3,$a3,$t2
  643. ldp $t1,$t2,[$tp,#8*5]
  644. mul $a4,$a5,$a5
  645. umulh $a5,$a5,$a5
  646. stp $acc0,$acc1,[$tp,#8*0]
  647. mul $a6,$a7,$a7
  648. umulh $a7,$a7,$a7
  649. stp $acc2,$acc3,[$tp,#8*2]
  650. extr $t3,$t0,$t3,#63
  651. adcs $acc4,$a4,$t3
  652. extr $t0,$t1,$t0,#63
  653. ldp $acc0,$acc1,[sp,#8*0]
  654. adcs $acc5,$a5,$t0
  655. extr $t1,$t2,$t1,#63
  656. ldp $a0,$a1,[$np,#8*0]
  657. adcs $acc6,$a6,$t1
  658. extr $t2,xzr,$t2,#63
  659. ldp $a2,$a3,[$np,#8*2]
  660. adc $acc7,$a7,$t2
  661. ldp $a4,$a5,[$np,#8*4]
  662. // Reduce by 512 bits per iteration
  663. mul $na0,$n0,$acc0 // t[0]*n0
  664. ldp $a6,$a7,[$np,#8*6]
  665. add $np_end,$np,$num
  666. ldp $acc2,$acc3,[sp,#8*2]
  667. stp $acc4,$acc5,[$tp,#8*4]
  668. ldp $acc4,$acc5,[sp,#8*4]
  669. stp $acc6,$acc7,[$tp,#8*6]
  670. ldp $acc6,$acc7,[sp,#8*6]
  671. add $np,$np,#8*8
  672. mov $topmost,xzr // initial top-most carry
  673. mov $tp,sp
  674. mov $cnt,#8
  675. .Lsqr8x_reduction:
  676. // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
  677. mul $t1,$a1,$na0
  678. sub $cnt,$cnt,#1
  679. mul $t2,$a2,$na0
  680. str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
  681. mul $t3,$a3,$na0
  682. // (*) adds xzr,$acc0,$t0
  683. subs xzr,$acc0,#1 // (*)
  684. mul $t0,$a4,$na0
  685. adcs $acc0,$acc1,$t1
  686. mul $t1,$a5,$na0
  687. adcs $acc1,$acc2,$t2
  688. mul $t2,$a6,$na0
  689. adcs $acc2,$acc3,$t3
  690. mul $t3,$a7,$na0
  691. adcs $acc3,$acc4,$t0
  692. umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
  693. adcs $acc4,$acc5,$t1
  694. umulh $t1,$a1,$na0
  695. adcs $acc5,$acc6,$t2
  696. umulh $t2,$a2,$na0
  697. adcs $acc6,$acc7,$t3
  698. umulh $t3,$a3,$na0
  699. adc $acc7,xzr,xzr
  700. adds $acc0,$acc0,$t0
  701. umulh $t0,$a4,$na0
  702. adcs $acc1,$acc1,$t1
  703. umulh $t1,$a5,$na0
  704. adcs $acc2,$acc2,$t2
  705. umulh $t2,$a6,$na0
  706. adcs $acc3,$acc3,$t3
  707. umulh $t3,$a7,$na0
  708. mul $na0,$n0,$acc0 // next t[0]*n0
  709. adcs $acc4,$acc4,$t0
  710. adcs $acc5,$acc5,$t1
  711. adcs $acc6,$acc6,$t2
  712. adc $acc7,$acc7,$t3
  713. cbnz $cnt,.Lsqr8x_reduction
  714. ldp $t0,$t1,[$tp,#8*0]
  715. ldp $t2,$t3,[$tp,#8*2]
  716. mov $rp,$tp
  717. sub $cnt,$np_end,$np // done yet?
  718. adds $acc0,$acc0,$t0
  719. adcs $acc1,$acc1,$t1
  720. ldp $t0,$t1,[$tp,#8*4]
  721. adcs $acc2,$acc2,$t2
  722. adcs $acc3,$acc3,$t3
  723. ldp $t2,$t3,[$tp,#8*6]
  724. adcs $acc4,$acc4,$t0
  725. adcs $acc5,$acc5,$t1
  726. adcs $acc6,$acc6,$t2
  727. adcs $acc7,$acc7,$t3
  728. //adc $carry,xzr,xzr // moved below
  729. cbz $cnt,.Lsqr8x8_post_condition
  730. ldr $n0,[$tp,#-8*8]
  731. ldp $a0,$a1,[$np,#8*0]
  732. ldp $a2,$a3,[$np,#8*2]
  733. ldp $a4,$a5,[$np,#8*4]
  734. mov $cnt,#-8*8
  735. ldp $a6,$a7,[$np,#8*6]
  736. add $np,$np,#8*8
  737. .Lsqr8x_tail:
  738. mul $t0,$a0,$n0
  739. adc $carry,xzr,xzr // carry bit, modulo-scheduled
  740. mul $t1,$a1,$n0
  741. add $cnt,$cnt,#8
  742. mul $t2,$a2,$n0
  743. mul $t3,$a3,$n0
  744. adds $acc0,$acc0,$t0
  745. mul $t0,$a4,$n0
  746. adcs $acc1,$acc1,$t1
  747. mul $t1,$a5,$n0
  748. adcs $acc2,$acc2,$t2
  749. mul $t2,$a6,$n0
  750. adcs $acc3,$acc3,$t3
  751. mul $t3,$a7,$n0
  752. adcs $acc4,$acc4,$t0
  753. umulh $t0,$a0,$n0
  754. adcs $acc5,$acc5,$t1
  755. umulh $t1,$a1,$n0
  756. adcs $acc6,$acc6,$t2
  757. umulh $t2,$a2,$n0
  758. adcs $acc7,$acc7,$t3
  759. umulh $t3,$a3,$n0
  760. adc $carry,$carry,xzr
  761. str $acc0,[$tp],#8
  762. adds $acc0,$acc1,$t0
  763. umulh $t0,$a4,$n0
  764. adcs $acc1,$acc2,$t1
  765. umulh $t1,$a5,$n0
  766. adcs $acc2,$acc3,$t2
  767. umulh $t2,$a6,$n0
  768. adcs $acc3,$acc4,$t3
  769. umulh $t3,$a7,$n0
  770. ldr $n0,[$rp,$cnt]
  771. adcs $acc4,$acc5,$t0
  772. adcs $acc5,$acc6,$t1
  773. adcs $acc6,$acc7,$t2
  774. adcs $acc7,$carry,$t3
  775. //adc $carry,xzr,xzr // moved above
  776. cbnz $cnt,.Lsqr8x_tail
  777. // note that carry flag is guaranteed
  778. // to be zero at this point
  779. ldp $a0,$a1,[$tp,#8*0]
  780. sub $cnt,$np_end,$np // done yet?
  781. sub $t2,$np_end,$num // rewinded np
  782. ldp $a2,$a3,[$tp,#8*2]
  783. ldp $a4,$a5,[$tp,#8*4]
  784. ldp $a6,$a7,[$tp,#8*6]
  785. cbz $cnt,.Lsqr8x_tail_break
  786. ldr $n0,[$rp,#-8*8]
  787. adds $acc0,$acc0,$a0
  788. adcs $acc1,$acc1,$a1
  789. ldp $a0,$a1,[$np,#8*0]
  790. adcs $acc2,$acc2,$a2
  791. adcs $acc3,$acc3,$a3
  792. ldp $a2,$a3,[$np,#8*2]
  793. adcs $acc4,$acc4,$a4
  794. adcs $acc5,$acc5,$a5
  795. ldp $a4,$a5,[$np,#8*4]
  796. adcs $acc6,$acc6,$a6
  797. mov $cnt,#-8*8
  798. adcs $acc7,$acc7,$a7
  799. ldp $a6,$a7,[$np,#8*6]
  800. add $np,$np,#8*8
  801. //adc $carry,xzr,xzr // moved above
  802. b .Lsqr8x_tail
  803. .align 4
  804. .Lsqr8x_tail_break:
  805. ldr $n0,[x29,#112] // pull n0
  806. add $cnt,$tp,#8*8 // end of current t[num] window
  807. subs xzr,$topmost,#1 // "move" top-most carry to carry bit
  808. adcs $t0,$acc0,$a0
  809. adcs $t1,$acc1,$a1
  810. ldp $acc0,$acc1,[$rp,#8*0]
  811. adcs $acc2,$acc2,$a2
  812. ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
  813. adcs $acc3,$acc3,$a3
  814. ldp $a2,$a3,[$t2,#8*2]
  815. adcs $acc4,$acc4,$a4
  816. adcs $acc5,$acc5,$a5
  817. ldp $a4,$a5,[$t2,#8*4]
  818. adcs $acc6,$acc6,$a6
  819. adcs $acc7,$acc7,$a7
  820. ldp $a6,$a7,[$t2,#8*6]
  821. add $np,$t2,#8*8
  822. adc $topmost,xzr,xzr // top-most carry
  823. mul $na0,$n0,$acc0
  824. stp $t0,$t1,[$tp,#8*0]
  825. stp $acc2,$acc3,[$tp,#8*2]
  826. ldp $acc2,$acc3,[$rp,#8*2]
  827. stp $acc4,$acc5,[$tp,#8*4]
  828. ldp $acc4,$acc5,[$rp,#8*4]
  829. cmp $cnt,x29 // did we hit the bottom?
  830. stp $acc6,$acc7,[$tp,#8*6]
  831. mov $tp,$rp // slide the window
  832. ldp $acc6,$acc7,[$rp,#8*6]
  833. mov $cnt,#8
  834. b.ne .Lsqr8x_reduction
  835. // Final step. We see if result is larger than modulus, and
  836. // if it is, subtract the modulus. But comparison implies
  837. // subtraction. So we subtract modulus, see if it borrowed,
  838. // and conditionally copy original value.
  839. ldr $rp,[x29,#96] // pull rp
  840. add $tp,$tp,#8*8
  841. subs $t0,$acc0,$a0
  842. sbcs $t1,$acc1,$a1
  843. sub $cnt,$num,#8*8
  844. mov $ap_end,$rp // $rp copy
  845. .Lsqr8x_sub:
  846. sbcs $t2,$acc2,$a2
  847. ldp $a0,$a1,[$np,#8*0]
  848. sbcs $t3,$acc3,$a3
  849. stp $t0,$t1,[$rp,#8*0]
  850. sbcs $t0,$acc4,$a4
  851. ldp $a2,$a3,[$np,#8*2]
  852. sbcs $t1,$acc5,$a5
  853. stp $t2,$t3,[$rp,#8*2]
  854. sbcs $t2,$acc6,$a6
  855. ldp $a4,$a5,[$np,#8*4]
  856. sbcs $t3,$acc7,$a7
  857. ldp $a6,$a7,[$np,#8*6]
  858. add $np,$np,#8*8
  859. ldp $acc0,$acc1,[$tp,#8*0]
  860. sub $cnt,$cnt,#8*8
  861. ldp $acc2,$acc3,[$tp,#8*2]
  862. ldp $acc4,$acc5,[$tp,#8*4]
  863. ldp $acc6,$acc7,[$tp,#8*6]
  864. add $tp,$tp,#8*8
  865. stp $t0,$t1,[$rp,#8*4]
  866. sbcs $t0,$acc0,$a0
  867. stp $t2,$t3,[$rp,#8*6]
  868. add $rp,$rp,#8*8
  869. sbcs $t1,$acc1,$a1
  870. cbnz $cnt,.Lsqr8x_sub
  871. sbcs $t2,$acc2,$a2
  872. mov $tp,sp
  873. add $ap,sp,$num
  874. ldp $a0,$a1,[$ap_end,#8*0]
  875. sbcs $t3,$acc3,$a3
  876. stp $t0,$t1,[$rp,#8*0]
  877. sbcs $t0,$acc4,$a4
  878. ldp $a2,$a3,[$ap_end,#8*2]
  879. sbcs $t1,$acc5,$a5
  880. stp $t2,$t3,[$rp,#8*2]
  881. sbcs $t2,$acc6,$a6
  882. ldp $acc0,$acc1,[$ap,#8*0]
  883. sbcs $t3,$acc7,$a7
  884. ldp $acc2,$acc3,[$ap,#8*2]
  885. sbcs xzr,$topmost,xzr // did it borrow?
  886. ldr x30,[x29,#8] // pull return address
  887. stp $t0,$t1,[$rp,#8*4]
  888. stp $t2,$t3,[$rp,#8*6]
  889. sub $cnt,$num,#8*4
  890. .Lsqr4x_cond_copy:
  891. sub $cnt,$cnt,#8*4
  892. csel $t0,$acc0,$a0,lo
  893. stp xzr,xzr,[$tp,#8*0]
  894. csel $t1,$acc1,$a1,lo
  895. ldp $a0,$a1,[$ap_end,#8*4]
  896. ldp $acc0,$acc1,[$ap,#8*4]
  897. csel $t2,$acc2,$a2,lo
  898. stp xzr,xzr,[$tp,#8*2]
  899. add $tp,$tp,#8*4
  900. csel $t3,$acc3,$a3,lo
  901. ldp $a2,$a3,[$ap_end,#8*6]
  902. ldp $acc2,$acc3,[$ap,#8*6]
  903. add $ap,$ap,#8*4
  904. stp $t0,$t1,[$ap_end,#8*0]
  905. stp $t2,$t3,[$ap_end,#8*2]
  906. add $ap_end,$ap_end,#8*4
  907. stp xzr,xzr,[$ap,#8*0]
  908. stp xzr,xzr,[$ap,#8*2]
  909. cbnz $cnt,.Lsqr4x_cond_copy
  910. csel $t0,$acc0,$a0,lo
  911. stp xzr,xzr,[$tp,#8*0]
  912. csel $t1,$acc1,$a1,lo
  913. stp xzr,xzr,[$tp,#8*2]
  914. csel $t2,$acc2,$a2,lo
  915. csel $t3,$acc3,$a3,lo
  916. stp $t0,$t1,[$ap_end,#8*0]
  917. stp $t2,$t3,[$ap_end,#8*2]
  918. b .Lsqr8x_done
  919. .align 4
  920. .Lsqr8x8_post_condition:
  921. adc $carry,xzr,xzr
  922. ldr x30,[x29,#8] // pull return address
  923. // $acc0-7,$carry hold result, $a0-7 hold modulus
  924. subs $a0,$acc0,$a0
  925. ldr $ap,[x29,#96] // pull rp
  926. sbcs $a1,$acc1,$a1
  927. stp xzr,xzr,[sp,#8*0]
  928. sbcs $a2,$acc2,$a2
  929. stp xzr,xzr,[sp,#8*2]
  930. sbcs $a3,$acc3,$a3
  931. stp xzr,xzr,[sp,#8*4]
  932. sbcs $a4,$acc4,$a4
  933. stp xzr,xzr,[sp,#8*6]
  934. sbcs $a5,$acc5,$a5
  935. stp xzr,xzr,[sp,#8*8]
  936. sbcs $a6,$acc6,$a6
  937. stp xzr,xzr,[sp,#8*10]
  938. sbcs $a7,$acc7,$a7
  939. stp xzr,xzr,[sp,#8*12]
  940. sbcs $carry,$carry,xzr // did it borrow?
  941. stp xzr,xzr,[sp,#8*14]
  942. // $a0-7 hold result-modulus
  943. csel $a0,$acc0,$a0,lo
  944. csel $a1,$acc1,$a1,lo
  945. csel $a2,$acc2,$a2,lo
  946. csel $a3,$acc3,$a3,lo
  947. stp $a0,$a1,[$ap,#8*0]
  948. csel $a4,$acc4,$a4,lo
  949. csel $a5,$acc5,$a5,lo
  950. stp $a2,$a3,[$ap,#8*2]
  951. csel $a6,$acc6,$a6,lo
  952. csel $a7,$acc7,$a7,lo
  953. stp $a4,$a5,[$ap,#8*4]
  954. stp $a6,$a7,[$ap,#8*6]
  955. .Lsqr8x_done:
  956. ldp x19,x20,[x29,#16]
  957. mov sp,x29
  958. ldp x21,x22,[x29,#32]
  959. mov x0,#1
  960. ldp x23,x24,[x29,#48]
  961. ldp x25,x26,[x29,#64]
  962. ldp x27,x28,[x29,#80]
  963. ldr x29,[sp],#128
  964. .inst 0xd50323bf // autiasp
  965. ret
  966. .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
  967. ___
  968. }
  969. {
  970. ########################################################################
  971. # Even though this might look like an ARMv8 adaptation of mulx4x_mont
  972. # from the x86_64-mont5 module, it differs in that it performs
  973. # reduction 256 bits (four 64-bit words) at a time.
  #
  # This bare block appends the __bn_mul4x_mont routine to $code.
  # NOTE(review): $rp, $ap, $bp, $np, $n0 and $num are declared earlier in
  # the file, outside this block; presumably they name the argument
  # registers of bn_mul_mont -- confirm against the top of the file.
  #
  # Register names handed out by the map() below (x18 is skipped,
  # presumably because it is the platform register -- confirm):
  #   $a0-$a3     x6-x9    four words loaded from $ap
  #   $t0-$t3     x10-x13  mul/umulh results and other temporaries
  #   $m0-$m3     x14-x17  four words loaded from $np
  #   $acc0-$acc4 x19-x23  running accumulator
  #   $bi         x24      word loaded from $bp
  #   $mi         x25      reduction multiplier (acc0 * n0)
  #   $tp         x26      pointer into the temporary area on the stack
  #   $ap_end     x27      $ap + $num bytes; reused as a copy of rp in
  #                        .Lmul4x_post
  #   $cnt        x28      byte offset; the loops step it through
  #                        8, 16, 24, 0 (add #8, and #31)
  974. my ($a0,$a1,$a2,$a3,
  975. $t0,$t1,$t2,$t3,
  976. $m0,$m1,$m2,$m3,
  977. $acc0,$acc1,$acc2,$acc3,$acc4,
  978. $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
  # $bp_end aliases the $rp register; it is not referenced by the template
  # below, which keeps &b[num] in the frame at [x29,#96+8] instead.
  979. my $bp_end=$rp;
  # $carry also aliases the $rp register, which is why the template stores
  # rp to [x29,#96] before zeroing $carry. $topmost lives in x30 (the link
  # register); the return address is reloaded from [x29,#8] before "ret".
  980. my ($carry,$topmost) = ($rp,"x30");
  # Layout of the generated routine:
  #  * prologue: signs the return address (paciasp, emitted via .inst),
  #    pushes a 128-byte frame, saves x19-x28, scales $num to bytes, loads
  #    *n0 and lowers sp by a further $num+32 bytes for the temporary area;
  #  * .Loop_mul4x_1st_reduction / .Loop_mul4x_1st_tail: first group of
  #    four b[] words, starting from a zeroed accumulator;
  #  * .Loop_mul4x_reduction / .Loop_mul4x_tail: each following group of
  #    four b[] words, adding in the words stored on the stack earlier;
  #  * .Lmul4x_post / .Lmul4x_sub / .Lmul4x_cond_copy: subtract the modulus
  #    into rp, then use csel on the borrow to keep either the difference
  #    or the original value, storing zeros over the temporary area;
  #  * .Lmul4x4_post_condition: the same final step done in registers,
  #    taken when $ap already equals $ap_end after the first reduction
  #    loop (i.e. a 4-word operand);
  #  * .Lmul4x_done: restores x19-x29, sets x0 to 1, autiasp, ret.
  # The instructions marked "(*)" replace the commented-out mul/adds pair
  # with "subs xzr,acc0,#1"; presumably this works because the discarded
  # low word is zero by construction, so only the carry is needed --
  # confirm against the explanation elsewhere in this file.
  981. $code.=<<___;
  982. .type __bn_mul4x_mont,%function
  983. .align 5
  984. __bn_mul4x_mont:
  985. .inst 0xd503233f // paciasp
  986. stp x29,x30,[sp,#-128]!
  987. add x29,sp,#0
  988. stp x19,x20,[sp,#16]
  989. stp x21,x22,[sp,#32]
  990. stp x23,x24,[sp,#48]
  991. stp x25,x26,[sp,#64]
  992. stp x27,x28,[sp,#80]
  993. sub $tp,sp,$num,lsl#3
  994. lsl $num,$num,#3
  995. ldr $n0,[$n0] // *n0
  996. sub sp,$tp,#8*4 // alloca
  997. add $t0,$bp,$num
  998. add $ap_end,$ap,$num
  999. stp $rp,$t0,[x29,#96] // offload rp and &b[num]
  1000. ldr $bi,[$bp,#8*0] // b[0]
  1001. ldp $a0,$a1,[$ap,#8*0] // a[0..3]
  1002. ldp $a2,$a3,[$ap,#8*2]
  1003. add $ap,$ap,#8*4
  1004. mov $acc0,xzr
  1005. mov $acc1,xzr
  1006. mov $acc2,xzr
  1007. mov $acc3,xzr
  1008. ldp $m0,$m1,[$np,#8*0] // n[0..3]
  1009. ldp $m2,$m3,[$np,#8*2]
  1010. adds $np,$np,#8*4 // clear carry bit
  1011. mov $carry,xzr
  1012. mov $cnt,#0
  1013. mov $tp,sp
  1014. .Loop_mul4x_1st_reduction:
  1015. mul $t0,$a0,$bi // lo(a[0..3]*b[0])
  1016. adc $carry,$carry,xzr // modulo-scheduled
  1017. mul $t1,$a1,$bi
  1018. add $cnt,$cnt,#8
  1019. mul $t2,$a2,$bi
  1020. and $cnt,$cnt,#31
  1021. mul $t3,$a3,$bi
  1022. adds $acc0,$acc0,$t0
  1023. umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
  1024. adcs $acc1,$acc1,$t1
  1025. mul $mi,$acc0,$n0 // t[0]*n0
  1026. adcs $acc2,$acc2,$t2
  1027. umulh $t1,$a1,$bi
  1028. adcs $acc3,$acc3,$t3
  1029. umulh $t2,$a2,$bi
  1030. adc $acc4,xzr,xzr
  1031. umulh $t3,$a3,$bi
  1032. ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
  1033. adds $acc1,$acc1,$t0
  1034. // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
  1035. str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
  1036. adcs $acc2,$acc2,$t1
  1037. mul $t1,$m1,$mi
  1038. adcs $acc3,$acc3,$t2
  1039. mul $t2,$m2,$mi
  1040. adc $acc4,$acc4,$t3 // can't overflow
  1041. mul $t3,$m3,$mi
  1042. // (*) adds xzr,$acc0,$t0
  1043. subs xzr,$acc0,#1 // (*)
  1044. umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
  1045. adcs $acc0,$acc1,$t1
  1046. umulh $t1,$m1,$mi
  1047. adcs $acc1,$acc2,$t2
  1048. umulh $t2,$m2,$mi
  1049. adcs $acc2,$acc3,$t3
  1050. umulh $t3,$m3,$mi
  1051. adcs $acc3,$acc4,$carry
  1052. adc $carry,xzr,xzr
  1053. adds $acc0,$acc0,$t0
  1054. sub $t0,$ap_end,$ap
  1055. adcs $acc1,$acc1,$t1
  1056. adcs $acc2,$acc2,$t2
  1057. adcs $acc3,$acc3,$t3
  1058. //adc $carry,$carry,xzr
  1059. cbnz $cnt,.Loop_mul4x_1st_reduction
  1060. cbz $t0,.Lmul4x4_post_condition
  1061. ldp $a0,$a1,[$ap,#8*0] // a[4..7]
  1062. ldp $a2,$a3,[$ap,#8*2]
  1063. add $ap,$ap,#8*4
  1064. ldr $mi,[sp] // a[0]*n0
  1065. ldp $m0,$m1,[$np,#8*0] // n[4..7]
  1066. ldp $m2,$m3,[$np,#8*2]
  1067. add $np,$np,#8*4
  1068. .Loop_mul4x_1st_tail:
  1069. mul $t0,$a0,$bi // lo(a[4..7]*b[i])
  1070. adc $carry,$carry,xzr // modulo-scheduled
  1071. mul $t1,$a1,$bi
  1072. add $cnt,$cnt,#8
  1073. mul $t2,$a2,$bi
  1074. and $cnt,$cnt,#31
  1075. mul $t3,$a3,$bi
  1076. adds $acc0,$acc0,$t0
  1077. umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
  1078. adcs $acc1,$acc1,$t1
  1079. umulh $t1,$a1,$bi
  1080. adcs $acc2,$acc2,$t2
  1081. umulh $t2,$a2,$bi
  1082. adcs $acc3,$acc3,$t3
  1083. umulh $t3,$a3,$bi
  1084. adc $acc4,xzr,xzr
  1085. ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
  1086. adds $acc1,$acc1,$t0
  1087. mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
  1088. adcs $acc2,$acc2,$t1
  1089. mul $t1,$m1,$mi
  1090. adcs $acc3,$acc3,$t2
  1091. mul $t2,$m2,$mi
  1092. adc $acc4,$acc4,$t3 // can't overflow
  1093. mul $t3,$m3,$mi
  1094. adds $acc0,$acc0,$t0
  1095. umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
  1096. adcs $acc1,$acc1,$t1
  1097. umulh $t1,$m1,$mi
  1098. adcs $acc2,$acc2,$t2
  1099. umulh $t2,$m2,$mi
  1100. adcs $acc3,$acc3,$t3
  1101. adcs $acc4,$acc4,$carry
  1102. umulh $t3,$m3,$mi
  1103. adc $carry,xzr,xzr
  1104. ldr $mi,[sp,$cnt] // next t[0]*n0
  1105. str $acc0,[$tp],#8 // result!!!
  1106. adds $acc0,$acc1,$t0
  1107. sub $t0,$ap_end,$ap // done yet?
  1108. adcs $acc1,$acc2,$t1
  1109. adcs $acc2,$acc3,$t2
  1110. adcs $acc3,$acc4,$t3
  1111. //adc $carry,$carry,xzr
  1112. cbnz $cnt,.Loop_mul4x_1st_tail
  1113. sub $t1,$ap_end,$num // rewinded $ap
  1114. cbz $t0,.Lmul4x_proceed
  1115. ldp $a0,$a1,[$ap,#8*0]
  1116. ldp $a2,$a3,[$ap,#8*2]
  1117. add $ap,$ap,#8*4
  1118. ldp $m0,$m1,[$np,#8*0]
  1119. ldp $m2,$m3,[$np,#8*2]
  1120. add $np,$np,#8*4
  1121. b .Loop_mul4x_1st_tail
  1122. .align 5
  1123. .Lmul4x_proceed:
  1124. ldr $bi,[$bp,#8*4]! // *++b
  1125. adc $topmost,$carry,xzr
  1126. ldp $a0,$a1,[$t1,#8*0] // a[0..3]
  1127. sub $np,$np,$num // rewind np
  1128. ldp $a2,$a3,[$t1,#8*2]
  1129. add $ap,$t1,#8*4
  1130. stp $acc0,$acc1,[$tp,#8*0] // result!!!
  1131. ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
  1132. stp $acc2,$acc3,[$tp,#8*2] // result!!!
  1133. ldp $acc2,$acc3,[sp,#8*6]
  1134. ldp $m0,$m1,[$np,#8*0] // n[0..3]
  1135. mov $tp,sp
  1136. ldp $m2,$m3,[$np,#8*2]
  1137. adds $np,$np,#8*4 // clear carry bit
  1138. mov $carry,xzr
  1139. .align 4
  1140. .Loop_mul4x_reduction:
  1141. mul $t0,$a0,$bi // lo(a[0..3]*b[4])
  1142. adc $carry,$carry,xzr // modulo-scheduled
  1143. mul $t1,$a1,$bi
  1144. add $cnt,$cnt,#8
  1145. mul $t2,$a2,$bi
  1146. and $cnt,$cnt,#31
  1147. mul $t3,$a3,$bi
  1148. adds $acc0,$acc0,$t0
  1149. umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
  1150. adcs $acc1,$acc1,$t1
  1151. mul $mi,$acc0,$n0 // t[0]*n0
  1152. adcs $acc2,$acc2,$t2
  1153. umulh $t1,$a1,$bi
  1154. adcs $acc3,$acc3,$t3
  1155. umulh $t2,$a2,$bi
  1156. adc $acc4,xzr,xzr
  1157. umulh $t3,$a3,$bi
  1158. ldr $bi,[$bp,$cnt] // next b[i]
  1159. adds $acc1,$acc1,$t0
  1160. // (*) mul $t0,$m0,$mi
  1161. str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
  1162. adcs $acc2,$acc2,$t1
  1163. mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0
  1164. adcs $acc3,$acc3,$t2
  1165. mul $t2,$m2,$mi
  1166. adc $acc4,$acc4,$t3 // can't overflow
  1167. mul $t3,$m3,$mi
  1168. // (*) adds xzr,$acc0,$t0
  1169. subs xzr,$acc0,#1 // (*)
  1170. umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0
  1171. adcs $acc0,$acc1,$t1
  1172. umulh $t1,$m1,$mi
  1173. adcs $acc1,$acc2,$t2
  1174. umulh $t2,$m2,$mi
  1175. adcs $acc2,$acc3,$t3
  1176. umulh $t3,$m3,$mi
  1177. adcs $acc3,$acc4,$carry
  1178. adc $carry,xzr,xzr
  1179. adds $acc0,$acc0,$t0
  1180. adcs $acc1,$acc1,$t1
  1181. adcs $acc2,$acc2,$t2
  1182. adcs $acc3,$acc3,$t3
  1183. //adc $carry,$carry,xzr
  1184. cbnz $cnt,.Loop_mul4x_reduction
  1185. adc $carry,$carry,xzr
  1186. ldp $t0,$t1,[$tp,#8*4] // t[4..7]
  1187. ldp $t2,$t3,[$tp,#8*6]
  1188. ldp $a0,$a1,[$ap,#8*0] // a[4..7]
  1189. ldp $a2,$a3,[$ap,#8*2]
  1190. add $ap,$ap,#8*4
  1191. adds $acc0,$acc0,$t0
  1192. adcs $acc1,$acc1,$t1
  1193. adcs $acc2,$acc2,$t2
  1194. adcs $acc3,$acc3,$t3
  1195. //adc $carry,$carry,xzr
  1196. ldr $mi,[sp] // t[0]*n0
  1197. ldp $m0,$m1,[$np,#8*0] // n[4..7]
  1198. ldp $m2,$m3,[$np,#8*2]
  1199. add $np,$np,#8*4
  1200. .align 4
  1201. .Loop_mul4x_tail:
  1202. mul $t0,$a0,$bi // lo(a[4..7]*b[4])
  1203. adc $carry,$carry,xzr // modulo-scheduled
  1204. mul $t1,$a1,$bi
  1205. add $cnt,$cnt,#8
  1206. mul $t2,$a2,$bi
  1207. and $cnt,$cnt,#31
  1208. mul $t3,$a3,$bi
  1209. adds $acc0,$acc0,$t0
  1210. umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
  1211. adcs $acc1,$acc1,$t1
  1212. umulh $t1,$a1,$bi
  1213. adcs $acc2,$acc2,$t2
  1214. umulh $t2,$a2,$bi
  1215. adcs $acc3,$acc3,$t3
  1216. umulh $t3,$a3,$bi
  1217. adc $acc4,xzr,xzr
  1218. ldr $bi,[$bp,$cnt] // next b[i]
  1219. adds $acc1,$acc1,$t0
  1220. mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
  1221. adcs $acc2,$acc2,$t1
  1222. mul $t1,$m1,$mi
  1223. adcs $acc3,$acc3,$t2
  1224. mul $t2,$m2,$mi
  1225. adc $acc4,$acc4,$t3 // can't overflow
  1226. mul $t3,$m3,$mi
  1227. adds $acc0,$acc0,$t0
  1228. umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
  1229. adcs $acc1,$acc1,$t1
  1230. umulh $t1,$m1,$mi
  1231. adcs $acc2,$acc2,$t2
  1232. umulh $t2,$m2,$mi
  1233. adcs $acc3,$acc3,$t3
  1234. umulh $t3,$m3,$mi
  1235. adcs $acc4,$acc4,$carry
  1236. ldr $mi,[sp,$cnt] // next a[0]*n0
  1237. adc $carry,xzr,xzr
  1238. str $acc0,[$tp],#8 // result!!!
  1239. adds $acc0,$acc1,$t0
  1240. sub $t0,$ap_end,$ap // done yet?
  1241. adcs $acc1,$acc2,$t1
  1242. adcs $acc2,$acc3,$t2
  1243. adcs $acc3,$acc4,$t3
  1244. //adc $carry,$carry,xzr
  1245. cbnz $cnt,.Loop_mul4x_tail
  1246. sub $t1,$np,$num // rewinded np?
  1247. adc $carry,$carry,xzr
  1248. cbz $t0,.Loop_mul4x_break
  1249. ldp $t0,$t1,[$tp,#8*4]
  1250. ldp $t2,$t3,[$tp,#8*6]
  1251. ldp $a0,$a1,[$ap,#8*0]
  1252. ldp $a2,$a3,[$ap,#8*2]
  1253. add $ap,$ap,#8*4
  1254. adds $acc0,$acc0,$t0
  1255. adcs $acc1,$acc1,$t1
  1256. adcs $acc2,$acc2,$t2
  1257. adcs $acc3,$acc3,$t3
  1258. //adc $carry,$carry,xzr
  1259. ldp $m0,$m1,[$np,#8*0]
  1260. ldp $m2,$m3,[$np,#8*2]
  1261. add $np,$np,#8*4
  1262. b .Loop_mul4x_tail
  1263. .align 4
  1264. .Loop_mul4x_break:
  1265. ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
  1266. adds $acc0,$acc0,$topmost
  1267. add $bp,$bp,#8*4 // bp++
  1268. adcs $acc1,$acc1,xzr
  1269. sub $ap,$ap,$num // rewind ap
  1270. adcs $acc2,$acc2,xzr
  1271. stp $acc0,$acc1,[$tp,#8*0] // result!!!
  1272. adcs $acc3,$acc3,xzr
  1273. ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
  1274. adc $topmost,$carry,xzr
  1275. stp $acc2,$acc3,[$tp,#8*2] // result!!!
  1276. cmp $bp,$t3 // done yet?
  1277. ldp $acc2,$acc3,[sp,#8*6]
  1278. ldp $m0,$m1,[$t1,#8*0] // n[0..3]
  1279. ldp $m2,$m3,[$t1,#8*2]
  1280. add $np,$t1,#8*4
  1281. b.eq .Lmul4x_post
  1282. ldr $bi,[$bp]
  1283. ldp $a0,$a1,[$ap,#8*0] // a[0..3]
  1284. ldp $a2,$a3,[$ap,#8*2]
  1285. adds $ap,$ap,#8*4 // clear carry bit
  1286. mov $carry,xzr
  1287. mov $tp,sp
  1288. b .Loop_mul4x_reduction
  1289. .align 4
  1290. .Lmul4x_post:
  1291. // Final step. We see if result is larger than modulus, and
  1292. // if it is, subtract the modulus. But comparison implies
  1293. // subtraction. So we subtract modulus, see if it borrowed,
  1294. // and conditionally copy original value.
  1295. mov $rp,$t2
  1296. mov $ap_end,$t2 // $rp copy
  1297. subs $t0,$acc0,$m0
  1298. add $tp,sp,#8*8
  1299. sbcs $t1,$acc1,$m1
  1300. sub $cnt,$num,#8*4
  1301. .Lmul4x_sub:
  1302. sbcs $t2,$acc2,$m2
  1303. ldp $m0,$m1,[$np,#8*0]
  1304. sub $cnt,$cnt,#8*4
  1305. ldp $acc0,$acc1,[$tp,#8*0]
  1306. sbcs $t3,$acc3,$m3
  1307. ldp $m2,$m3,[$np,#8*2]
  1308. add $np,$np,#8*4
  1309. ldp $acc2,$acc3,[$tp,#8*2]
  1310. add $tp,$tp,#8*4
  1311. stp $t0,$t1,[$rp,#8*0]
  1312. sbcs $t0,$acc0,$m0
  1313. stp $t2,$t3,[$rp,#8*2]
  1314. add $rp,$rp,#8*4
  1315. sbcs $t1,$acc1,$m1
  1316. cbnz $cnt,.Lmul4x_sub
  1317. sbcs $t2,$acc2,$m2
  1318. mov $tp,sp
  1319. add $ap,sp,#8*4
  1320. ldp $a0,$a1,[$ap_end,#8*0]
  1321. sbcs $t3,$acc3,$m3
  1322. stp $t0,$t1,[$rp,#8*0]
  1323. ldp $a2,$a3,[$ap_end,#8*2]
  1324. stp $t2,$t3,[$rp,#8*2]
  1325. ldp $acc0,$acc1,[$ap,#8*0]
  1326. ldp $acc2,$acc3,[$ap,#8*2]
  1327. sbcs xzr,$topmost,xzr // did it borrow?
  1328. ldr x30,[x29,#8] // pull return address
  1329. sub $cnt,$num,#8*4
  1330. .Lmul4x_cond_copy:
  1331. sub $cnt,$cnt,#8*4
  1332. csel $t0,$acc0,$a0,lo
  1333. stp xzr,xzr,[$tp,#8*0]
  1334. csel $t1,$acc1,$a1,lo
  1335. ldp $a0,$a1,[$ap_end,#8*4]
  1336. ldp $acc0,$acc1,[$ap,#8*4]
  1337. csel $t2,$acc2,$a2,lo
  1338. stp xzr,xzr,[$tp,#8*2]
  1339. add $tp,$tp,#8*4
  1340. csel $t3,$acc3,$a3,lo
  1341. ldp $a2,$a3,[$ap_end,#8*6]
  1342. ldp $acc2,$acc3,[$ap,#8*6]
  1343. add $ap,$ap,#8*4
  1344. stp $t0,$t1,[$ap_end,#8*0]
  1345. stp $t2,$t3,[$ap_end,#8*2]
  1346. add $ap_end,$ap_end,#8*4
  1347. cbnz $cnt,.Lmul4x_cond_copy
  1348. csel $t0,$acc0,$a0,lo
  1349. stp xzr,xzr,[$tp,#8*0]
  1350. csel $t1,$acc1,$a1,lo
  1351. stp xzr,xzr,[$tp,#8*2]
  1352. csel $t2,$acc2,$a2,lo
  1353. stp xzr,xzr,[$tp,#8*3]
  1354. csel $t3,$acc3,$a3,lo
  1355. stp xzr,xzr,[$tp,#8*4]
  1356. stp $t0,$t1,[$ap_end,#8*0]
  1357. stp $t2,$t3,[$ap_end,#8*2]
  1358. b .Lmul4x_done
  1359. .align 4
  1360. .Lmul4x4_post_condition:
  1361. adc $carry,$carry,xzr
  1362. ldr $ap,[x29,#96] // pull rp
  1363. // $acc0-3,$carry hold result, $m0-7 hold modulus
  1364. subs $a0,$acc0,$m0
  1365. ldr x30,[x29,#8] // pull return address
  1366. sbcs $a1,$acc1,$m1
  1367. stp xzr,xzr,[sp,#8*0]
  1368. sbcs $a2,$acc2,$m2
  1369. stp xzr,xzr,[sp,#8*2]
  1370. sbcs $a3,$acc3,$m3
  1371. stp xzr,xzr,[sp,#8*4]
  1372. sbcs xzr,$carry,xzr // did it borrow?
  1373. stp xzr,xzr,[sp,#8*6]
  1374. // $a0-3 hold result-modulus
  1375. csel $a0,$acc0,$a0,lo
  1376. csel $a1,$acc1,$a1,lo
  1377. csel $a2,$acc2,$a2,lo
  1378. csel $a3,$acc3,$a3,lo
  1379. stp $a0,$a1,[$ap,#8*0]
  1380. stp $a2,$a3,[$ap,#8*2]
  1381. .Lmul4x_done:
  1382. ldp x19,x20,[x29,#16]
  1383. mov sp,x29
  1384. ldp x21,x22,[x29,#32]
  1385. mov x0,#1
  1386. ldp x23,x24,[x29,#48]
  1387. ldp x25,x26,[x29,#64]
  1388. ldp x27,x28,[x29,#80]
  1389. ldr x29,[sp],#128
  1390. .inst 0xd50323bf // autiasp
  1391. ret
  1392. .size __bn_mul4x_mont,.-__bn_mul4x_mont
  1393. ___
  1394. }
  # Append the module's identification string and a final alignment
  # directive to the accumulated assembly text. The heredoc interpolates,
  # so the backslash before "@" keeps Perl from treating "@openssl" as an
  # array.
  1395. $code.=<<___;
  1396. .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  1397. .align 4
  1398. ___
  # Write the complete generated assembly to standard output.
  1399. print $code;
  # Close STDOUT explicitly and die if that fails, so a failed close is
  # reported instead of being silently ignored.
  1400. close STDOUT or die "error closing STDOUT: $!";