poly1305-armv4.pl 29 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # IALU(*)/gcc-4.4 NEON
  17. #
  18. # ARM11xx(ARMv6) 7.78/+100% -
  19. # Cortex-A5 6.35/+130% 3.00
  20. # Cortex-A8 6.25/+115% 2.36
  21. # Cortex-A9 5.10/+95% 2.55
  22. # Cortex-A15 3.85/+85% 1.25(**)
  23. # Snapdragon S4 5.70/+100% 1.48(**)
  24. #
  25. # (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
  26. # (**) these are trade-off results, they can be improved by ~8% but at
  27. # the cost of 15/12% regression on Cortex-A5/A7, it's even possible
  28. # to improve Cortex-A9 result, but then A5/A7 lose more than 20%;
  # Argument handling: the first argument is the "flavour" unless it already
  # looks like a file name with an extension, in which case it is taken as
  # the output file and no flavour is set.  Otherwise further arguments are
  # consumed until one looks like a file name (or the list is exhausted).
  29. $flavour = shift;
  30. if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
  31. else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
  # With a flavour other than "void" the generated text is piped through
  # arm-xlate.pl, looked up next to this script and then in ../../perlasm.
  32. if ($flavour && $flavour ne "void") {
  33. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  34. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  35. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  36. die "can't locate arm-xlate.pl";
  # Fail loudly when the translator cannot be started instead of silently
  # producing nothing; the output name is quoted so that a path containing
  # spaces reaches the translator as a single argument.
  37. open STDOUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!";
  38. } else {
  # No translator: write straight to the output file.  Without an output
  # name STDOUT is left untouched rather than being closed by a failing
  # open(STDOUT,">").
  39. if ($output) { open STDOUT,">$output" or die "can't open $output: $!"; }
  40. }
  # Names for r0-r3 as used by the assembly templates that follow.
  41. ($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
  # Emits the assembler prologue and poly1305_init.
  #
  # Prologue: includes arm_arch.h, selects Thumb-2 unified syntax when
  # __thumb2__ is defined (.code 32 otherwise) and exports poly1305_emit,
  # poly1305_blocks and poly1305_init.
  #
  # poly1305_init, as written below:
  #   r0 ($ctx) - context; the words at offsets 0..16 (hash value) and 36
  #               (is_base2_26) are zeroed, then $ctx is advanced by 20
  #   r1 ($inp) - 16 key bytes, read one byte at a time; when it is 0 the
  #               routine returns 0 right after the zeroing (.Lno_key)
  #   r2        - destination of the two-word function table, written only
  #               when __ARM_MAX_ARCH__>=7
  #
  # The key bytes are assembled little-endian into r4-r7.  r4 is masked with
  # 0x0fffffff and r5-r7 with 0x0ffffffc before being stored through the
  # advanced $ctx, i.e. at original context offsets 20..32.
  #
  # With __ARM_MAX_ARCH__>=7 the code additionally loads OPENSSL_armcap_P
  # relative to .Lpoly1305_init using the .LOPENSSL_armcap literal (defined
  # outside this heredoc), tests ARMV7_NEON and picks either the scalar or
  # the NEON entry points: r11 = blocks, r12 = emit.  Under __thumb2__ bit 0
  # of both addresses is set.  The pair is stored through r2 and 1 is
  # returned; without __ARM_MAX_ARCH__>=7 the return value is 0.
  #
  # The load/orr/and instructions are interleaved with the capability probe,
  # so statement order inside the heredoc should be treated as significant.
  42. $code.=<<___;
  43. #include "arm_arch.h"
  44. .text
  45. #if defined(__thumb2__)
  46. .syntax unified
  47. .thumb
  48. #else
  49. .code 32
  50. #endif
  51. .globl poly1305_emit
  52. .globl poly1305_blocks
  53. .globl poly1305_init
  54. .type poly1305_init,%function
  55. .align 5
  56. poly1305_init:
  57. .Lpoly1305_init:
  58. stmdb sp!,{r4-r11}
  59. eor r3,r3,r3
  60. cmp $inp,#0
  61. str r3,[$ctx,#0] @ zero hash value
  62. str r3,[$ctx,#4]
  63. str r3,[$ctx,#8]
  64. str r3,[$ctx,#12]
  65. str r3,[$ctx,#16]
  66. str r3,[$ctx,#36] @ is_base2_26
  67. add $ctx,$ctx,#20
  68. #ifdef __thumb2__
  69. it eq
  70. #endif
  71. moveq r0,#0
  72. beq .Lno_key
  73. #if __ARM_MAX_ARCH__>=7
  74. adr r11,.Lpoly1305_init
  75. ldr r12,.LOPENSSL_armcap
  76. #endif
  77. ldrb r4,[$inp,#0]
  78. mov r10,#0x0fffffff
  79. ldrb r5,[$inp,#1]
  80. and r3,r10,#-4 @ 0x0ffffffc
  81. ldrb r6,[$inp,#2]
  82. ldrb r7,[$inp,#3]
  83. orr r4,r4,r5,lsl#8
  84. ldrb r5,[$inp,#4]
  85. orr r4,r4,r6,lsl#16
  86. ldrb r6,[$inp,#5]
  87. orr r4,r4,r7,lsl#24
  88. ldrb r7,[$inp,#6]
  89. and r4,r4,r10
  90. #if __ARM_MAX_ARCH__>=7
  91. ldr r12,[r11,r12] @ OPENSSL_armcap_P
  92. # ifdef __APPLE__
  93. ldr r12,[r12]
  94. # endif
  95. #endif
  96. ldrb r8,[$inp,#7]
  97. orr r5,r5,r6,lsl#8
  98. ldrb r6,[$inp,#8]
  99. orr r5,r5,r7,lsl#16
  100. ldrb r7,[$inp,#9]
  101. orr r5,r5,r8,lsl#24
  102. ldrb r8,[$inp,#10]
  103. and r5,r5,r3
  104. #if __ARM_MAX_ARCH__>=7
  105. tst r12,#ARMV7_NEON @ check for NEON
  106. # ifdef __APPLE__
  107. adr r9,poly1305_blocks_neon
  108. adr r11,poly1305_blocks
  109. # ifdef __thumb2__
  110. it ne
  111. # endif
  112. movne r11,r9
  113. adr r12,poly1305_emit
  114. adr r10,poly1305_emit_neon
  115. # ifdef __thumb2__
  116. it ne
  117. # endif
  118. movne r12,r10
  119. # else
  120. # ifdef __thumb2__
  121. itete eq
  122. # endif
  123. addeq r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
  124. addne r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
  125. addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
  126. addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
  127. # endif
  128. # ifdef __thumb2__
  129. orr r12,r12,#1 @ thumb-ify address
  130. orr r11,r11,#1
  131. # endif
  132. #endif
  133. ldrb r9,[$inp,#11]
  134. orr r6,r6,r7,lsl#8
  135. ldrb r7,[$inp,#12]
  136. orr r6,r6,r8,lsl#16
  137. ldrb r8,[$inp,#13]
  138. orr r6,r6,r9,lsl#24
  139. ldrb r9,[$inp,#14]
  140. and r6,r6,r3
  141. ldrb r10,[$inp,#15]
  142. orr r7,r7,r8,lsl#8
  143. str r4,[$ctx,#0]
  144. orr r7,r7,r9,lsl#16
  145. str r5,[$ctx,#4]
  146. orr r7,r7,r10,lsl#24
  147. str r6,[$ctx,#8]
  148. and r7,r7,r3
  149. str r7,[$ctx,#12]
  150. #if __ARM_MAX_ARCH__>=7
  151. stmia r2,{r11,r12} @ fill functions table
  152. mov r0,#1
  153. #else
  154. mov r0,#0
  155. #endif
  156. .Lno_key:
  157. ldmia sp!,{r4-r11}
  158. #if __ARM_ARCH__>=5
  159. ret @ bx lr
  160. #else
  161. tst lr,#1
  162. moveq pc,lr @ be binary compatible with V4, yet
  163. bx lr @ interoperable with Thumb ISA:-)
  164. #endif
  165. .size poly1305_init,.-poly1305_init
  166. ___
  # poly1305_blocks: integer-only block function.
  #
  # Perl-side register names used by the heredoc:
  #   $h0-$h4 = r4-r8    hash accumulator words
  #   $r0-$r3 = r9-r12   key words, loaded together with the hash by the
  #                      single ldmia from $ctx (nine consecutive words)
  #   $s1-$s3            the very same registers as $r1-$r3; each one is
  #                      overwritten with r + (r>>2) near the top of a pass
  #                      and the plain key word is reloaded from the stack
  #                      where the "reload" comments appear
  #
  # Arguments use the file-level names $ctx=r0, $inp=r1, $len=r2, $padbit=r3.
  # $len is rounded down to a multiple of 16; when nothing is left the
  # routine returns through .Lno_data without touching the context.
  #
  # 32-byte stack frame set up before the loop:
  #   sp+0, sp+4   low two result words of the current pass
  #   sp+8         input pointer (lr serves both as input pointer and as a
  #                temporary for the third result word)
  #   sp+12        $ctx
  #   sp+16        end pointer, $inp + $len
  #   sp+20..28    key words r1..r3
  #
  # Each .Loop pass adds one 16-byte block to h0..h3 (byte loads when
  # __ARM_ARCH__<7, word loads plus rev under __ARMEB__ otherwise), forms
  # the products with umull/umlal/mul, propagates the high halves upward
  # and finally folds h4 & -4, scaled to 5*(h4>>2), back into h0 while
  # keeping only the low two bits in h4.
  #
  # The HI condition consumed by "addhi $h4,$h4,#1" comes from
  # "cmp $padbit,#0" on the first pass and from the "cmp r0,lr" that guards
  # "bhi .Loop" on later passes; no flag-setting instruction sits between
  # either compare and the addhi, so instruction order matters here.
  167. {
  168. my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
  169. my ($s1,$s2,$s3)=($r1,$r2,$r3);
  170. $code.=<<___;
  171. .type poly1305_blocks,%function
  172. .align 5
  173. poly1305_blocks:
  174. .Lpoly1305_blocks:
  175. stmdb sp!,{r3-r11,lr}
  176. ands $len,$len,#-16
  177. beq .Lno_data
  178. cmp $padbit,#0
  179. add $len,$len,$inp @ end pointer
  180. sub sp,sp,#32
  181. ldmia $ctx,{$h0-$r3} @ load context
  182. str $ctx,[sp,#12] @ offload stuff
  183. mov lr,$inp
  184. str $len,[sp,#16]
  185. str $r1,[sp,#20]
  186. str $r2,[sp,#24]
  187. str $r3,[sp,#28]
  188. b .Loop
  189. .Loop:
  190. #if __ARM_ARCH__<7
  191. ldrb r0,[lr],#16 @ load input
  192. # ifdef __thumb2__
  193. it hi
  194. # endif
  195. addhi $h4,$h4,#1 @ 1<<128
  196. ldrb r1,[lr,#-15]
  197. ldrb r2,[lr,#-14]
  198. ldrb r3,[lr,#-13]
  199. orr r1,r0,r1,lsl#8
  200. ldrb r0,[lr,#-12]
  201. orr r2,r1,r2,lsl#16
  202. ldrb r1,[lr,#-11]
  203. orr r3,r2,r3,lsl#24
  204. ldrb r2,[lr,#-10]
  205. adds $h0,$h0,r3 @ accumulate input
  206. ldrb r3,[lr,#-9]
  207. orr r1,r0,r1,lsl#8
  208. ldrb r0,[lr,#-8]
  209. orr r2,r1,r2,lsl#16
  210. ldrb r1,[lr,#-7]
  211. orr r3,r2,r3,lsl#24
  212. ldrb r2,[lr,#-6]
  213. adcs $h1,$h1,r3
  214. ldrb r3,[lr,#-5]
  215. orr r1,r0,r1,lsl#8
  216. ldrb r0,[lr,#-4]
  217. orr r2,r1,r2,lsl#16
  218. ldrb r1,[lr,#-3]
  219. orr r3,r2,r3,lsl#24
  220. ldrb r2,[lr,#-2]
  221. adcs $h2,$h2,r3
  222. ldrb r3,[lr,#-1]
  223. orr r1,r0,r1,lsl#8
  224. str lr,[sp,#8] @ offload input pointer
  225. orr r2,r1,r2,lsl#16
  226. add $s1,$r1,$r1,lsr#2
  227. orr r3,r2,r3,lsl#24
  228. #else
  229. ldr r0,[lr],#16 @ load input
  230. # ifdef __thumb2__
  231. it hi
  232. # endif
  233. addhi $h4,$h4,#1 @ padbit
  234. ldr r1,[lr,#-12]
  235. ldr r2,[lr,#-8]
  236. ldr r3,[lr,#-4]
  237. # ifdef __ARMEB__
  238. rev r0,r0
  239. rev r1,r1
  240. rev r2,r2
  241. rev r3,r3
  242. # endif
  243. adds $h0,$h0,r0 @ accumulate input
  244. str lr,[sp,#8] @ offload input pointer
  245. adcs $h1,$h1,r1
  246. add $s1,$r1,$r1,lsr#2
  247. adcs $h2,$h2,r2
  248. #endif
  249. add $s2,$r2,$r2,lsr#2
  250. adcs $h3,$h3,r3
  251. add $s3,$r3,$r3,lsr#2
  252. umull r2,r3,$h1,$r0
  253. adc $h4,$h4,#0
  254. umull r0,r1,$h0,$r0
  255. umlal r2,r3,$h4,$s1
  256. umlal r0,r1,$h3,$s1
  257. ldr $r1,[sp,#20] @ reload $r1
  258. umlal r2,r3,$h2,$s3
  259. umlal r0,r1,$h1,$s3
  260. umlal r2,r3,$h3,$s2
  261. umlal r0,r1,$h2,$s2
  262. umlal r2,r3,$h0,$r1
  263. str r0,[sp,#0] @ future $h0
  264. mul r0,$s2,$h4
  265. ldr $r2,[sp,#24] @ reload $r2
  266. adds r2,r2,r1 @ d1+=d0>>32
  267. eor r1,r1,r1
  268. adc lr,r3,#0 @ future $h2
  269. str r2,[sp,#4] @ future $h1
  270. mul r2,$s3,$h4
  271. eor r3,r3,r3
  272. umlal r0,r1,$h3,$s3
  273. ldr $r3,[sp,#28] @ reload $r3
  274. umlal r2,r3,$h3,$r0
  275. umlal r0,r1,$h2,$r0
  276. umlal r2,r3,$h2,$r1
  277. umlal r0,r1,$h1,$r1
  278. umlal r2,r3,$h1,$r2
  279. umlal r0,r1,$h0,$r2
  280. umlal r2,r3,$h0,$r3
  281. ldr $h0,[sp,#0]
  282. mul $h4,$r0,$h4
  283. ldr $h1,[sp,#4]
  284. adds $h2,lr,r0 @ d2+=d1>>32
  285. ldr lr,[sp,#8] @ reload input pointer
  286. adc r1,r1,#0
  287. adds $h3,r2,r1 @ d3+=d2>>32
  288. ldr r0,[sp,#16] @ reload end pointer
  289. adc r3,r3,#0
  290. add $h4,$h4,r3 @ h4+=d3>>32
  291. and r1,$h4,#-4
  292. and $h4,$h4,#3
  293. add r1,r1,r1,lsr#2 @ *=5
  294. adds $h0,$h0,r1
  295. adcs $h1,$h1,#0
  296. adcs $h2,$h2,#0
  297. adcs $h3,$h3,#0
  298. adc $h4,$h4,#0
  299. cmp r0,lr @ done yet?
  300. bhi .Loop
  301. ldr $ctx,[sp,#12]
  302. add sp,sp,#32
  303. stmia $ctx,{$h0-$h4} @ store the result
  304. .Lno_data:
  305. #if __ARM_ARCH__>=5
  306. ldmia sp!,{r3-r11,pc}
  307. #else
  308. ldmia sp!,{r3-r11,lr}
  309. tst lr,#1
  310. moveq pc,lr @ be binary compatible with V4, yet
  311. bx lr @ interoperable with Thumb ISA:-)
  312. #endif
  313. .size poly1305_blocks,.-poly1305_blocks
  314. ___
  315. }
  # poly1305_emit: final reduction, nonce addition and tag output.
  #
  # The scope opened here redefines the register names for the emit code:
  #   $ctx=r0, $mac=r1, $nonce=r2
  #   $h0-$h4 = r3-r7, $g0-$g3 = r8-r11, $g4 is the same register as $h4
  # Its closing brace is not part of this block; the NEON emit code further
  # down in the file keeps using $h0-$h4 and $g0.
  #
  # Flow of the heredoc below:
  #   1. load h0..h4 from $ctx and compute g = h + 5 with a carry chain
  #   2. "tst $g4,#4": when that bit is set, g0..g3 replace h0..h3
  #      (each conditional move is paired with a load of the next nonce word)
  #   3. add the four nonce words with carry
  #   4. store 16 bytes at $mac - word stores when __ARM_ARCH__>=7 (after
  #      rev under __ARMEB__), single-byte stores otherwise
  #
  # .Lpoly1305_emit_enter sits after the register push so that
  # poly1305_emit_neon, which pushes r4-r11 itself, can branch straight in.
  316. {
  317. my ($ctx,$mac,$nonce)=map("r$_",(0..2));
  318. my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
  319. my $g4=$h4;
  320. $code.=<<___;
  321. .type poly1305_emit,%function
  322. .align 5
  323. poly1305_emit:
  324. .Lpoly1305_emit:
  325. stmdb sp!,{r4-r11}
  326. .Lpoly1305_emit_enter:
  327. ldmia $ctx,{$h0-$h4}
  328. adds $g0,$h0,#5 @ compare to modulus
  329. adcs $g1,$h1,#0
  330. adcs $g2,$h2,#0
  331. adcs $g3,$h3,#0
  332. adc $g4,$h4,#0
  333. tst $g4,#4 @ did it carry/borrow?
  334. #ifdef __thumb2__
  335. it ne
  336. #endif
  337. movne $h0,$g0
  338. ldr $g0,[$nonce,#0]
  339. #ifdef __thumb2__
  340. it ne
  341. #endif
  342. movne $h1,$g1
  343. ldr $g1,[$nonce,#4]
  344. #ifdef __thumb2__
  345. it ne
  346. #endif
  347. movne $h2,$g2
  348. ldr $g2,[$nonce,#8]
  349. #ifdef __thumb2__
  350. it ne
  351. #endif
  352. movne $h3,$g3
  353. ldr $g3,[$nonce,#12]
  354. adds $h0,$h0,$g0
  355. adcs $h1,$h1,$g1
  356. adcs $h2,$h2,$g2
  357. adc $h3,$h3,$g3
  358. #if __ARM_ARCH__>=7
  359. # ifdef __ARMEB__
  360. rev $h0,$h0
  361. rev $h1,$h1
  362. rev $h2,$h2
  363. rev $h3,$h3
  364. # endif
  365. str $h0,[$mac,#0]
  366. str $h1,[$mac,#4]
  367. str $h2,[$mac,#8]
  368. str $h3,[$mac,#12]
  369. #else
  370. strb $h0,[$mac,#0]
  371. mov $h0,$h0,lsr#8
  372. strb $h1,[$mac,#4]
  373. mov $h1,$h1,lsr#8
  374. strb $h2,[$mac,#8]
  375. mov $h2,$h2,lsr#8
  376. strb $h3,[$mac,#12]
  377. mov $h3,$h3,lsr#8
  378. strb $h0,[$mac,#1]
  379. mov $h0,$h0,lsr#8
  380. strb $h1,[$mac,#5]
  381. mov $h1,$h1,lsr#8
  382. strb $h2,[$mac,#9]
  383. mov $h2,$h2,lsr#8
  384. strb $h3,[$mac,#13]
  385. mov $h3,$h3,lsr#8
  386. strb $h0,[$mac,#2]
  387. mov $h0,$h0,lsr#8
  388. strb $h1,[$mac,#6]
  389. mov $h1,$h1,lsr#8
  390. strb $h2,[$mac,#10]
  391. mov $h2,$h2,lsr#8
  392. strb $h3,[$mac,#14]
  393. mov $h3,$h3,lsr#8
  394. strb $h0,[$mac,#3]
  395. strb $h1,[$mac,#7]
  396. strb $h2,[$mac,#11]
  397. strb $h3,[$mac,#15]
  398. #endif
  399. ldmia sp!,{r4-r11}
  400. #if __ARM_ARCH__>=5
  401. ret @ bx lr
  402. #else
  403. tst lr,#1
  404. moveq pc,lr @ be binary compatible with V4, yet
  405. bx lr @ interoperable with Thumb ISA:-)
  406. #endif
  407. .size poly1305_emit,.-poly1305_emit
  408. ___
  409. {
  410. my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
  411. my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
  412. my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
  413. my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
  414. $code.=<<___;
  415. #if __ARM_MAX_ARCH__>=7
  416. .fpu neon
  417. .type poly1305_init_neon,%function
  418. .align 5
  419. poly1305_init_neon:
  420. ldr r4,[$ctx,#20] @ load key base 2^32
  421. ldr r5,[$ctx,#24]
  422. ldr r6,[$ctx,#28]
  423. ldr r7,[$ctx,#32]
  424. and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
  425. mov r3,r4,lsr#26
  426. mov r4,r5,lsr#20
  427. orr r3,r3,r5,lsl#6
  428. mov r5,r6,lsr#14
  429. orr r4,r4,r6,lsl#12
  430. mov r6,r7,lsr#8
  431. orr r5,r5,r7,lsl#18
  432. and r3,r3,#0x03ffffff
  433. and r4,r4,#0x03ffffff
  434. and r5,r5,#0x03ffffff
  435. vdup.32 $R0,r2 @ r^1 in both lanes
  436. add r2,r3,r3,lsl#2 @ *5
  437. vdup.32 $R1,r3
  438. add r3,r4,r4,lsl#2
  439. vdup.32 $S1,r2
  440. vdup.32 $R2,r4
  441. add r4,r5,r5,lsl#2
  442. vdup.32 $S2,r3
  443. vdup.32 $R3,r5
  444. add r5,r6,r6,lsl#2
  445. vdup.32 $S3,r4
  446. vdup.32 $R4,r6
  447. vdup.32 $S4,r5
  448. mov $zeros,#2 @ counter
  449. .Lsquare_neon:
  450. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  451. @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  452. @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  453. @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  454. @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  455. @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  456. vmull.u32 $D0,$R0,${R0}[1]
  457. vmull.u32 $D1,$R1,${R0}[1]
  458. vmull.u32 $D2,$R2,${R0}[1]
  459. vmull.u32 $D3,$R3,${R0}[1]
  460. vmull.u32 $D4,$R4,${R0}[1]
  461. vmlal.u32 $D0,$R4,${S1}[1]
  462. vmlal.u32 $D1,$R0,${R1}[1]
  463. vmlal.u32 $D2,$R1,${R1}[1]
  464. vmlal.u32 $D3,$R2,${R1}[1]
  465. vmlal.u32 $D4,$R3,${R1}[1]
  466. vmlal.u32 $D0,$R3,${S2}[1]
  467. vmlal.u32 $D1,$R4,${S2}[1]
  468. vmlal.u32 $D3,$R1,${R2}[1]
  469. vmlal.u32 $D2,$R0,${R2}[1]
  470. vmlal.u32 $D4,$R2,${R2}[1]
  471. vmlal.u32 $D0,$R2,${S3}[1]
  472. vmlal.u32 $D3,$R0,${R3}[1]
  473. vmlal.u32 $D1,$R3,${S3}[1]
  474. vmlal.u32 $D2,$R4,${S3}[1]
  475. vmlal.u32 $D4,$R1,${R3}[1]
  476. vmlal.u32 $D3,$R4,${S4}[1]
  477. vmlal.u32 $D0,$R1,${S4}[1]
  478. vmlal.u32 $D1,$R2,${S4}[1]
  479. vmlal.u32 $D2,$R3,${S4}[1]
  480. vmlal.u32 $D4,$R0,${R4}[1]
  481. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  482. @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
  483. @ and P. Schwabe
  484. @
  485. @ H0>>+H1>>+H2>>+H3>>+H4
  486. @ H3>>+H4>>*5+H0>>+H1
  487. @
  488. @ Trivia.
  489. @
  490. @ Result of multiplication of n-bit number by m-bit number is
  491. @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
  492. @ m-bit number multiplied by 2^n is still n+m bits wide.
  493. @
  494. @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
  495. @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
  496. @ one is n+1 bits wide.
  497. @
  498. @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
  499. @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
  500. @ can be 27. However! In cases when their width exceeds 26 bits
  501. @ they are limited by 2^26+2^6. This in turn means that *sum*
  502. @ of the products with these values can still be viewed as sum
  503. @ of 52-bit numbers as long as the amount of addends is not a
  504. @ power of 2. For example,
  505. @
  506. @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
  507. @
  508. @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
  509. @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
  510. @ 8 * (2^52) or 2^55. However, the value is then multiplied by
  511. @ 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
  512. @ which is less than 32 * (2^52) or 2^57. And when processing
  513. @ data we are looking at triple as many addends...
  514. @
  515. @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
  516. @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
  517. @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
  518. @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
  519. @ instruction accepts 2x32-bit input and writes 2x64-bit result.
  520. @ This means that result of reduction have to be compressed upon
  521. @ loop wrap-around. This can be done in the process of reduction
  522. @ to minimize amount of instructions [as well as amount of
  523. @ 128-bit instructions, which benefits low-end processors], but
  524. @ one has to watch for H2 (which is narrower than H0) and 5*H4
  525. @ not being wider than 58 bits, so that result of right shift
  526. @ by 26 bits fits in 32 bits. This is also useful on x86,
  527. @ because it allows to use paddd in place for paddq, which
  528. @ benefits Atom, where paddq is ridiculously slow.
  529. vshr.u64 $T0,$D3,#26
  530. vmovn.i64 $D3#lo,$D3
  531. vshr.u64 $T1,$D0,#26
  532. vmovn.i64 $D0#lo,$D0
  533. vadd.i64 $D4,$D4,$T0 @ h3 -> h4
  534. vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
  535. vadd.i64 $D1,$D1,$T1 @ h0 -> h1
  536. vbic.i32 $D0#lo,#0xfc000000
  537. vshrn.u64 $T0#lo,$D4,#26
  538. vmovn.i64 $D4#lo,$D4
  539. vshr.u64 $T1,$D1,#26
  540. vmovn.i64 $D1#lo,$D1
  541. vadd.i64 $D2,$D2,$T1 @ h1 -> h2
  542. vbic.i32 $D4#lo,#0xfc000000
  543. vbic.i32 $D1#lo,#0xfc000000
  544. vadd.i32 $D0#lo,$D0#lo,$T0#lo
  545. vshl.u32 $T0#lo,$T0#lo,#2
  546. vshrn.u64 $T1#lo,$D2,#26
  547. vmovn.i64 $D2#lo,$D2
  548. vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
  549. vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
  550. vbic.i32 $D2#lo,#0xfc000000
  551. vshr.u32 $T0#lo,$D0#lo,#26
  552. vbic.i32 $D0#lo,#0xfc000000
  553. vshr.u32 $T1#lo,$D3#lo,#26
  554. vbic.i32 $D3#lo,#0xfc000000
  555. vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
  556. vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
  557. subs $zeros,$zeros,#1
  558. beq .Lsquare_break_neon
  559. add $tbl0,$ctx,#(48+0*9*4)
  560. add $tbl1,$ctx,#(48+1*9*4)
  561. vtrn.32 $R0,$D0#lo @ r^2:r^1
  562. vtrn.32 $R2,$D2#lo
  563. vtrn.32 $R3,$D3#lo
  564. vtrn.32 $R1,$D1#lo
  565. vtrn.32 $R4,$D4#lo
  566. vshl.u32 $S2,$R2,#2 @ *5
  567. vshl.u32 $S3,$R3,#2
  568. vshl.u32 $S1,$R1,#2
  569. vshl.u32 $S4,$R4,#2
  570. vadd.i32 $S2,$S2,$R2
  571. vadd.i32 $S1,$S1,$R1
  572. vadd.i32 $S3,$S3,$R3
  573. vadd.i32 $S4,$S4,$R4
  574. vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
  575. vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
  576. vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
  577. vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
  578. vst1.32 {${S4}[0]},[$tbl0,:32]
  579. vst1.32 {${S4}[1]},[$tbl1,:32]
  580. b .Lsquare_neon
  581. .align 4
  582. .Lsquare_break_neon:
  583. add $tbl0,$ctx,#(48+2*4*9)
  584. add $tbl1,$ctx,#(48+3*4*9)
  585. vmov $R0,$D0#lo @ r^4:r^3
  586. vshl.u32 $S1,$D1#lo,#2 @ *5
  587. vmov $R1,$D1#lo
  588. vshl.u32 $S2,$D2#lo,#2
  589. vmov $R2,$D2#lo
  590. vshl.u32 $S3,$D3#lo,#2
  591. vmov $R3,$D3#lo
  592. vshl.u32 $S4,$D4#lo,#2
  593. vmov $R4,$D4#lo
  594. vadd.i32 $S1,$S1,$D1#lo
  595. vadd.i32 $S2,$S2,$D2#lo
  596. vadd.i32 $S3,$S3,$D3#lo
  597. vadd.i32 $S4,$S4,$D4#lo
  598. vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
  599. vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
  600. vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
  601. vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
  602. vst1.32 {${S4}[0]},[$tbl0]
  603. vst1.32 {${S4}[1]},[$tbl1]
  604. ret @ bx lr
  605. .size poly1305_init_neon,.-poly1305_init_neon
  606. .type poly1305_blocks_neon,%function
  607. .align 5
  608. poly1305_blocks_neon:
  609. .Lpoly1305_blocks_neon:
  610. ldr ip,[$ctx,#36] @ is_base2_26
  611. ands $len,$len,#-16
  612. beq .Lno_data_neon
  613. cmp $len,#64
  614. bhs .Lenter_neon
  615. tst ip,ip @ is_base2_26?
  616. beq .Lpoly1305_blocks
  617. .Lenter_neon:
  618. stmdb sp!,{r4-r7}
  619. vstmdb sp!,{d8-d15} @ ABI specification says so
  620. tst ip,ip @ is_base2_26?
  621. bne .Lbase2_26_neon
  622. stmdb sp!,{r1-r3,lr}
  623. bl poly1305_init_neon
  624. ldr r4,[$ctx,#0] @ load hash value base 2^32
  625. ldr r5,[$ctx,#4]
  626. ldr r6,[$ctx,#8]
  627. ldr r7,[$ctx,#12]
  628. ldr ip,[$ctx,#16]
  629. and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
  630. mov r3,r4,lsr#26
  631. veor $D0#lo,$D0#lo,$D0#lo
  632. mov r4,r5,lsr#20
  633. orr r3,r3,r5,lsl#6
  634. veor $D1#lo,$D1#lo,$D1#lo
  635. mov r5,r6,lsr#14
  636. orr r4,r4,r6,lsl#12
  637. veor $D2#lo,$D2#lo,$D2#lo
  638. mov r6,r7,lsr#8
  639. orr r5,r5,r7,lsl#18
  640. veor $D3#lo,$D3#lo,$D3#lo
  641. and r3,r3,#0x03ffffff
  642. orr r6,r6,ip,lsl#24
  643. veor $D4#lo,$D4#lo,$D4#lo
  644. and r4,r4,#0x03ffffff
  645. mov r1,#1
  646. and r5,r5,#0x03ffffff
  647. str r1,[$ctx,#36] @ is_base2_26
  648. vmov.32 $D0#lo[0],r2
  649. vmov.32 $D1#lo[0],r3
  650. vmov.32 $D2#lo[0],r4
  651. vmov.32 $D3#lo[0],r5
  652. vmov.32 $D4#lo[0],r6
  653. adr $zeros,.Lzeros
  654. ldmia sp!,{r1-r3,lr}
  655. b .Lbase2_32_neon
  656. .align 4
  657. .Lbase2_26_neon:
  658. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  659. @ load hash value
  660. veor $D0#lo,$D0#lo,$D0#lo
  661. veor $D1#lo,$D1#lo,$D1#lo
  662. veor $D2#lo,$D2#lo,$D2#lo
  663. veor $D3#lo,$D3#lo,$D3#lo
  664. veor $D4#lo,$D4#lo,$D4#lo
  665. vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
  666. adr $zeros,.Lzeros
  667. vld1.32 {$D4#lo[0]},[$ctx]
  668. sub $ctx,$ctx,#16 @ rewind
  669. .Lbase2_32_neon:
  670. add $in2,$inp,#32
  671. mov $padbit,$padbit,lsl#24
  672. tst $len,#31
  673. beq .Leven
  674. vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
  675. vmov.32 $H4#lo[0],$padbit
  676. sub $len,$len,#16
  677. add $in2,$inp,#32
  678. # ifdef __ARMEB__
  679. vrev32.8 $H0,$H0
  680. vrev32.8 $H3,$H3
  681. vrev32.8 $H1,$H1
  682. vrev32.8 $H2,$H2
  683. # endif
  684. vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
  685. vshl.u32 $H3#lo,$H3#lo,#18
  686. vsri.u32 $H3#lo,$H2#lo,#14
  687. vshl.u32 $H2#lo,$H2#lo,#12
  688. vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
  689. vbic.i32 $H3#lo,#0xfc000000
  690. vsri.u32 $H2#lo,$H1#lo,#20
  691. vshl.u32 $H1#lo,$H1#lo,#6
  692. vbic.i32 $H2#lo,#0xfc000000
  693. vsri.u32 $H1#lo,$H0#lo,#26
  694. vadd.i32 $H3#hi,$H3#lo,$D3#lo
  695. vbic.i32 $H0#lo,#0xfc000000
  696. vbic.i32 $H1#lo,#0xfc000000
  697. vadd.i32 $H2#hi,$H2#lo,$D2#lo
  698. vadd.i32 $H0#hi,$H0#lo,$D0#lo
  699. vadd.i32 $H1#hi,$H1#lo,$D1#lo
  700. mov $tbl1,$zeros
  701. add $tbl0,$ctx,#48
  702. cmp $len,$len
  703. b .Long_tail
  704. .align 4
  705. .Leven:
  706. subs $len,$len,#64
  707. it lo
  708. movlo $in2,$zeros
  709. vmov.i32 $H4,#1<<24 @ padbit, yes, always
  710. vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
  711. add $inp,$inp,#64
  712. vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
  713. add $in2,$in2,#64
  714. itt hi
  715. addhi $tbl1,$ctx,#(48+1*9*4)
  716. addhi $tbl0,$ctx,#(48+3*9*4)
  717. # ifdef __ARMEB__
  718. vrev32.8 $H0,$H0
  719. vrev32.8 $H3,$H3
  720. vrev32.8 $H1,$H1
  721. vrev32.8 $H2,$H2
  722. # endif
  723. vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
  724. vshl.u32 $H3,$H3,#18
  725. vsri.u32 $H3,$H2,#14
  726. vshl.u32 $H2,$H2,#12
  727. vbic.i32 $H3,#0xfc000000
  728. vsri.u32 $H2,$H1,#20
  729. vshl.u32 $H1,$H1,#6
  730. vbic.i32 $H2,#0xfc000000
  731. vsri.u32 $H1,$H0,#26
  732. vbic.i32 $H0,#0xfc000000
  733. vbic.i32 $H1,#0xfc000000
  734. bls .Lskip_loop
  735. vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
  736. vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
  737. vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
  738. vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
  739. b .Loop_neon
  740. .align 5
  741. .Loop_neon:
  742. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  743. @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
  744. @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
  745. @ \___________________/
  746. @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
  747. @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
  748. @ \___________________/ \____________________/
  749. @
  750. @ Note that we start with inp[2:3]*r^2. This is because it
  751. @ doesn't depend on reduction in previous iteration.
  752. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  753. @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  754. @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  755. @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  756. @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  757. @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  758. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  759. @ inp[2:3]*r^2
  760. vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
  761. vmull.u32 $D2,$H2#hi,${R0}[1]
  762. vadd.i32 $H0#lo,$H0#lo,$D0#lo
  763. vmull.u32 $D0,$H0#hi,${R0}[1]
  764. vadd.i32 $H3#lo,$H3#lo,$D3#lo
  765. vmull.u32 $D3,$H3#hi,${R0}[1]
  766. vmlal.u32 $D2,$H1#hi,${R1}[1]
  767. vadd.i32 $H1#lo,$H1#lo,$D1#lo
  768. vmull.u32 $D1,$H1#hi,${R0}[1]
  769. vadd.i32 $H4#lo,$H4#lo,$D4#lo
  770. vmull.u32 $D4,$H4#hi,${R0}[1]
  771. subs $len,$len,#64
  772. vmlal.u32 $D0,$H4#hi,${S1}[1]
  773. it lo
  774. movlo $in2,$zeros
  775. vmlal.u32 $D3,$H2#hi,${R1}[1]
  776. vld1.32 ${S4}[1],[$tbl1,:32]
  777. vmlal.u32 $D1,$H0#hi,${R1}[1]
  778. vmlal.u32 $D4,$H3#hi,${R1}[1]
  779. vmlal.u32 $D0,$H3#hi,${S2}[1]
  780. vmlal.u32 $D3,$H1#hi,${R2}[1]
  781. vmlal.u32 $D4,$H2#hi,${R2}[1]
  782. vmlal.u32 $D1,$H4#hi,${S2}[1]
  783. vmlal.u32 $D2,$H0#hi,${R2}[1]
  784. vmlal.u32 $D3,$H0#hi,${R3}[1]
  785. vmlal.u32 $D0,$H2#hi,${S3}[1]
  786. vmlal.u32 $D4,$H1#hi,${R3}[1]
  787. vmlal.u32 $D1,$H3#hi,${S3}[1]
  788. vmlal.u32 $D2,$H4#hi,${S3}[1]
  789. vmlal.u32 $D3,$H4#hi,${S4}[1]
  790. vmlal.u32 $D0,$H1#hi,${S4}[1]
  791. vmlal.u32 $D4,$H0#hi,${R4}[1]
  792. vmlal.u32 $D1,$H2#hi,${S4}[1]
  793. vmlal.u32 $D2,$H3#hi,${S4}[1]
  794. vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
  795. add $in2,$in2,#64
  796. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  797. @ (hash+inp[0:1])*r^4 and accumulate
  798. vmlal.u32 $D3,$H3#lo,${R0}[0]
  799. vmlal.u32 $D0,$H0#lo,${R0}[0]
  800. vmlal.u32 $D4,$H4#lo,${R0}[0]
  801. vmlal.u32 $D1,$H1#lo,${R0}[0]
  802. vmlal.u32 $D2,$H2#lo,${R0}[0]
  803. vld1.32 ${S4}[0],[$tbl0,:32]
  804. vmlal.u32 $D3,$H2#lo,${R1}[0]
  805. vmlal.u32 $D0,$H4#lo,${S1}[0]
  806. vmlal.u32 $D4,$H3#lo,${R1}[0]
  807. vmlal.u32 $D1,$H0#lo,${R1}[0]
  808. vmlal.u32 $D2,$H1#lo,${R1}[0]
  809. vmlal.u32 $D3,$H1#lo,${R2}[0]
  810. vmlal.u32 $D0,$H3#lo,${S2}[0]
  811. vmlal.u32 $D4,$H2#lo,${R2}[0]
  812. vmlal.u32 $D1,$H4#lo,${S2}[0]
  813. vmlal.u32 $D2,$H0#lo,${R2}[0]
  814. vmlal.u32 $D3,$H0#lo,${R3}[0]
  815. vmlal.u32 $D0,$H2#lo,${S3}[0]
  816. vmlal.u32 $D4,$H1#lo,${R3}[0]
  817. vmlal.u32 $D1,$H3#lo,${S3}[0]
  818. vmlal.u32 $D3,$H4#lo,${S4}[0]
  819. vmlal.u32 $D2,$H4#lo,${S3}[0]
  820. vmlal.u32 $D0,$H1#lo,${S4}[0]
  821. vmlal.u32 $D4,$H0#lo,${R4}[0]
  822. vmov.i32 $H4,#1<<24 @ padbit, yes, always
  823. vmlal.u32 $D1,$H2#lo,${S4}[0]
  824. vmlal.u32 $D2,$H3#lo,${S4}[0]
  825. vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
  826. add $inp,$inp,#64
  827. # ifdef __ARMEB__
  828. vrev32.8 $H0,$H0
  829. vrev32.8 $H1,$H1
  830. vrev32.8 $H2,$H2
  831. vrev32.8 $H3,$H3
  832. # endif
  833. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  834. @ lazy reduction interleaved with base 2^32 -> base 2^26 of
  835. @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
  836. vshr.u64 $T0,$D3,#26
  837. vmovn.i64 $D3#lo,$D3
  838. vshr.u64 $T1,$D0,#26
  839. vmovn.i64 $D0#lo,$D0
  840. vadd.i64 $D4,$D4,$T0 @ h3 -> h4
  841. vbic.i32 $D3#lo,#0xfc000000
  842. vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
  843. vadd.i64 $D1,$D1,$T1 @ h0 -> h1
  844. vshl.u32 $H3,$H3,#18
  845. vbic.i32 $D0#lo,#0xfc000000
  846. vshrn.u64 $T0#lo,$D4,#26
  847. vmovn.i64 $D4#lo,$D4
  848. vshr.u64 $T1,$D1,#26
  849. vmovn.i64 $D1#lo,$D1
  850. vadd.i64 $D2,$D2,$T1 @ h1 -> h2
  851. vsri.u32 $H3,$H2,#14
  852. vbic.i32 $D4#lo,#0xfc000000
  853. vshl.u32 $H2,$H2,#12
  854. vbic.i32 $D1#lo,#0xfc000000
  855. vadd.i32 $D0#lo,$D0#lo,$T0#lo
  856. vshl.u32 $T0#lo,$T0#lo,#2
  857. vbic.i32 $H3,#0xfc000000
  858. vshrn.u64 $T1#lo,$D2,#26
  859. vmovn.i64 $D2#lo,$D2
  860. vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
  861. vsri.u32 $H2,$H1,#20
  862. vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
  863. vshl.u32 $H1,$H1,#6
  864. vbic.i32 $D2#lo,#0xfc000000
  865. vbic.i32 $H2,#0xfc000000
  866. vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
  867. vmovn.i64 $D0#lo,$D0
  868. vsri.u32 $H1,$H0,#26
  869. vbic.i32 $H0,#0xfc000000
  870. vshr.u32 $T1#lo,$D3#lo,#26
  871. vbic.i32 $D3#lo,#0xfc000000
  872. vbic.i32 $D0#lo,#0xfc000000
  873. vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
  874. vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
  875. vbic.i32 $H1,#0xfc000000
  876. bhi .Loop_neon
  877. .Lskip_loop:
  878. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  879. @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
  880. add $tbl1,$ctx,#(48+0*9*4)
  881. add $tbl0,$ctx,#(48+1*9*4)
  882. adds $len,$len,#32
  883. it ne
  884. movne $len,#0
  885. bne .Long_tail
  886. vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
  887. vadd.i32 $H0#hi,$H0#lo,$D0#lo
  888. vadd.i32 $H3#hi,$H3#lo,$D3#lo
  889. vadd.i32 $H1#hi,$H1#lo,$D1#lo
  890. vadd.i32 $H4#hi,$H4#lo,$D4#lo
  891. .Long_tail:
  892. vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
  893. vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
  894. vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
  895. vmull.u32 $D2,$H2#hi,$R0
  896. vadd.i32 $H0#lo,$H0#lo,$D0#lo
  897. vmull.u32 $D0,$H0#hi,$R0
  898. vadd.i32 $H3#lo,$H3#lo,$D3#lo
  899. vmull.u32 $D3,$H3#hi,$R0
  900. vadd.i32 $H1#lo,$H1#lo,$D1#lo
  901. vmull.u32 $D1,$H1#hi,$R0
  902. vadd.i32 $H4#lo,$H4#lo,$D4#lo
  903. vmull.u32 $D4,$H4#hi,$R0
  904. vmlal.u32 $D0,$H4#hi,$S1
  905. vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
  906. vmlal.u32 $D3,$H2#hi,$R1
  907. vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
  908. vmlal.u32 $D1,$H0#hi,$R1
  909. vmlal.u32 $D4,$H3#hi,$R1
  910. vmlal.u32 $D2,$H1#hi,$R1
  911. vmlal.u32 $D3,$H1#hi,$R2
  912. vld1.32 ${S4}[1],[$tbl1,:32]
  913. vmlal.u32 $D0,$H3#hi,$S2
  914. vld1.32 ${S4}[0],[$tbl0,:32]
  915. vmlal.u32 $D4,$H2#hi,$R2
  916. vmlal.u32 $D1,$H4#hi,$S2
  917. vmlal.u32 $D2,$H0#hi,$R2
  918. vmlal.u32 $D3,$H0#hi,$R3
  919. it ne
  920. addne $tbl1,$ctx,#(48+2*9*4)
  921. vmlal.u32 $D0,$H2#hi,$S3
  922. it ne
  923. addne $tbl0,$ctx,#(48+3*9*4)
  924. vmlal.u32 $D4,$H1#hi,$R3
  925. vmlal.u32 $D1,$H3#hi,$S3
  926. vmlal.u32 $D2,$H4#hi,$S3
  927. vmlal.u32 $D3,$H4#hi,$S4
  928. vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
  929. vmlal.u32 $D0,$H1#hi,$S4
  930. vshr.u64 $MASK,$MASK,#38
  931. vmlal.u32 $D4,$H0#hi,$R4
  932. vmlal.u32 $D1,$H2#hi,$S4
  933. vmlal.u32 $D2,$H3#hi,$S4
  934. beq .Lshort_tail
  935. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  936. @ (hash+inp[0:1])*r^4:r^3 and accumulate
  937. vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
  938. vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
  939. vmlal.u32 $D2,$H2#lo,$R0
  940. vmlal.u32 $D0,$H0#lo,$R0
  941. vmlal.u32 $D3,$H3#lo,$R0
  942. vmlal.u32 $D1,$H1#lo,$R0
  943. vmlal.u32 $D4,$H4#lo,$R0
  944. vmlal.u32 $D0,$H4#lo,$S1
  945. vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
  946. vmlal.u32 $D3,$H2#lo,$R1
  947. vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
  948. vmlal.u32 $D1,$H0#lo,$R1
  949. vmlal.u32 $D4,$H3#lo,$R1
  950. vmlal.u32 $D2,$H1#lo,$R1
  951. vmlal.u32 $D3,$H1#lo,$R2
  952. vld1.32 ${S4}[1],[$tbl1,:32]
  953. vmlal.u32 $D0,$H3#lo,$S2
  954. vld1.32 ${S4}[0],[$tbl0,:32]
  955. vmlal.u32 $D4,$H2#lo,$R2
  956. vmlal.u32 $D1,$H4#lo,$S2
  957. vmlal.u32 $D2,$H0#lo,$R2
  958. vmlal.u32 $D3,$H0#lo,$R3
  959. vmlal.u32 $D0,$H2#lo,$S3
  960. vmlal.u32 $D4,$H1#lo,$R3
  961. vmlal.u32 $D1,$H3#lo,$S3
  962. vmlal.u32 $D2,$H4#lo,$S3
  963. vmlal.u32 $D3,$H4#lo,$S4
  964. vorn $MASK,$MASK,$MASK @ all-ones
  965. vmlal.u32 $D0,$H1#lo,$S4
  966. vshr.u64 $MASK,$MASK,#38
  967. vmlal.u32 $D4,$H0#lo,$R4
  968. vmlal.u32 $D1,$H2#lo,$S4
  969. vmlal.u32 $D2,$H3#lo,$S4
  970. .Lshort_tail:
  971. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  972. @ horizontal addition
  973. vadd.i64 $D3#lo,$D3#lo,$D3#hi
  974. vadd.i64 $D0#lo,$D0#lo,$D0#hi
  975. vadd.i64 $D4#lo,$D4#lo,$D4#hi
  976. vadd.i64 $D1#lo,$D1#lo,$D1#hi
  977. vadd.i64 $D2#lo,$D2#lo,$D2#hi
  978. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  979. @ lazy reduction, but without narrowing
  980. vshr.u64 $T0,$D3,#26
  981. vand.i64 $D3,$D3,$MASK
  982. vshr.u64 $T1,$D0,#26
  983. vand.i64 $D0,$D0,$MASK
  984. vadd.i64 $D4,$D4,$T0 @ h3 -> h4
  985. vadd.i64 $D1,$D1,$T1 @ h0 -> h1
  986. vshr.u64 $T0,$D4,#26
  987. vand.i64 $D4,$D4,$MASK
  988. vshr.u64 $T1,$D1,#26
  989. vand.i64 $D1,$D1,$MASK
  990. vadd.i64 $D2,$D2,$T1 @ h1 -> h2
  991. vadd.i64 $D0,$D0,$T0
  992. vshl.u64 $T0,$T0,#2
  993. vshr.u64 $T1,$D2,#26
  994. vand.i64 $D2,$D2,$MASK
  995. vadd.i64 $D0,$D0,$T0 @ h4 -> h0
  996. vadd.i64 $D3,$D3,$T1 @ h2 -> h3
  997. vshr.u64 $T0,$D0,#26
  998. vand.i64 $D0,$D0,$MASK
  999. vshr.u64 $T1,$D3,#26
  1000. vand.i64 $D3,$D3,$MASK
  1001. vadd.i64 $D1,$D1,$T0 @ h0 -> h1
  1002. vadd.i64 $D4,$D4,$T1 @ h3 -> h4
  1003. cmp $len,#0
  1004. bne .Leven
  1005. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  1006. @ store hash value
  1007. vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
  1008. vst1.32 {$D4#lo[0]},[$ctx]
  1009. vldmia sp!,{d8-d15} @ epilogue
  1010. ldmia sp!,{r4-r7}
  1011. .Lno_data_neon:
  1012. ret @ bx lr
  1013. .size poly1305_blocks_neon,.-poly1305_blocks_neon
  1014. .type poly1305_emit_neon,%function
  1015. .align 5
  1016. poly1305_emit_neon: @ finalize a hash kept in base 2^26: reduce, add nonce, store the 16-byte result
  1017. .Lpoly1305_emit_neon:
  1018. ldr ip,[$ctx,#36] @ is_base2_26
  1019. stmdb sp!,{r4-r11} @ preserve r4-r11, popped again before returning
  1020. tst ip,ip
  1021. beq .Lpoly1305_emit_enter @ flag clear: continue at the entry point defined outside this routine
  1022. ldmia $ctx,{$h0-$h4} @ load the five hash limbs
  1023. eor $g0,$g0,$g0 @ zero, base for the top word below
  1024. adds $h0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
  1025. mov $h1,$h1,lsr#6
  1026. adcs $h1,$h1,$h2,lsl#20
  1027. mov $h2,$h2,lsr#12
  1028. adcs $h2,$h2,$h3,lsl#14
  1029. mov $h3,$h3,lsr#18
  1030. adcs $h3,$h3,$h4,lsl#8
  1031. adc $h4,$g0,$h4,lsr#24 @ can be partially reduced ...
  1032. and $g0,$h4,#-4 @ ... so reduce
  1033. and $h4,$h4,#3 @ keep bits 0-1 of the top word itself (was masking h3 by mistake)
  1034. add $g0,$g0,$g0,lsr#2 @ *= 5
  1035. adds $h0,$h0,$g0 @ fold the excess back into the low word
  1036. adcs $h1,$h1,#0
  1037. adcs $h2,$h2,#0
  1038. adcs $h3,$h3,#0
  1039. adc $h4,$h4,#0
  1040. adds $g0,$h0,#5 @ compare to modulus
  1041. adcs $g1,$h1,#0
  1042. adcs $g2,$h2,#0
  1043. adcs $g3,$h3,#0
  1044. adc $g4,$h4,#0
  1045. tst $g4,#4 @ did it carry/borrow?
  1046. it ne
  1047. movne $h0,$g0 @ if so, take the value with 5 added instead
  1048. ldr $g0,[$nonce,#0] @ nonce loads are interleaved, ldr leaves the flags alone
  1049. it ne
  1050. movne $h1,$g1
  1051. ldr $g1,[$nonce,#4]
  1052. it ne
  1053. movne $h2,$g2
  1054. ldr $g2,[$nonce,#8]
  1055. it ne
  1056. movne $h3,$g3
  1057. ldr $g3,[$nonce,#12]
  1058. adds $h0,$h0,$g0 @ accumulate nonce
  1059. adcs $h1,$h1,$g1
  1060. adcs $h2,$h2,$g2
  1061. adc $h3,$h3,$g3 @ carry out of the top word is dropped
  1062. # ifdef __ARMEB__
  1063. rev $h0,$h0 @ big-endian build: byte-swap each word before storing
  1064. rev $h1,$h1
  1065. rev $h2,$h2
  1066. rev $h3,$h3
  1067. # endif
  1068. str $h0,[$mac,#0] @ store the result
  1069. str $h1,[$mac,#4]
  1070. str $h2,[$mac,#8]
  1071. str $h3,[$mac,#12]
  1072. ldmia sp!,{r4-r11}
  1073. ret @ bx lr
  1074. .size poly1305_emit_neon,.-poly1305_emit_neon
  1075. .align 5
  1076. .Lzeros:
  1077. .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
  1078. .LOPENSSL_armcap:
  1079. .word OPENSSL_armcap_P-.Lpoly1305_init
  1080. #endif
  1081. ___
  1082. } }
  1083. $code.=<<___; # trailer: ident string, plus the OPENSSL_armcap_P common symbol when __ARM_MAX_ARCH__>=7
  1084. .asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
  1085. .align 2
  1086. #if __ARM_MAX_ARCH__>=7
  1087. .comm OPENSSL_armcap_P,4,4
  1088. #endif
  1089. ___
  1090. foreach (split("\n",$code)) { # post-process the accumulated text one line at a time, then print it
  1091. s/\`([^\`]*)\`/eval $1/geo; # replace every backquoted span with the result of evaluating it as Perl
  1092. s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or # qN#lo -> d(2N), qN#hi -> d(2N+1); later rewrites run only if this matched nothing
  1093. s/\bret\b/bx lr/go or # ret -> bx lr; a line rewritten here is not passed to the next substitution
  1094. s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
  1095. print $_,"\n"; # goes to STDOUT
  1096. }
  1097. close STDOUT or die "error closing STDOUT: $!"; # enforce flush