
#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2015
#
# ChaCha20 for ARMv8.
#
# Performance in cycles per byte processed out of a large buffer.
#
#			IALU/gcc-4.9	3xNEON+1xIALU	6xNEON+2xIALU
#
# Apple A7		5.50/+49%	3.33		1.70
# Cortex-A53		8.40/+80%	4.72		4.72(*)
# Cortex-A57		8.06/+43%	4.90		4.43(**)
# Denver		4.50/+82%	2.63		2.67(*)
# X-Gene		9.50/+46%	8.82		8.89(*)
# Mongoose		8.00/+44%	3.64		3.25
# Kryo			8.17/+50%	4.83		4.65
#
# (*)	it's expected that doubling the interleave factor doesn't help
#	all processors, only those with higher NEON latency and
#	higher instruction issue rate;
# (**)	expected improvement was actually higher;
$flavour=shift;
$output=shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
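# Note: everything printed below is piped through arm-xlate.pl, which turns
# this perlasm into the assembler dialect selected by $flavour.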
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
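# So, for example, "&add_32(@x[0],@x[0],@x[4])" appends "\tadd.32\tx5,x5,x9\n"
# to $code, and "&ror_32(@x[12],@x[12],16)" appends "\tror.32\tx17,x17,#16\n":
# a numeric last argument picks up a "#" prefix, and the ".32" suffix is later
# rewritten to 32-bit "w" registers by the post-processing loop at the bottom.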
my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));

my @x=map("x$_",(5..17,19..21));
my @d=map("x$_",(22..28,30));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

    (
	"&add_32 (@x[$a0],@x[$a0],@x[$b0])",
	"&add_32 (@x[$a1],@x[$a1],@x[$b1])",
	"&add_32 (@x[$a2],@x[$a2],@x[$b2])",
	"&add_32 (@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32 (@x[$d0],@x[$d0],16)",
	"&ror_32 (@x[$d1],@x[$d1],16)",
	"&ror_32 (@x[$d2],@x[$d2],16)",
	"&ror_32 (@x[$d3],@x[$d3],16)",
	"&add_32 (@x[$c0],@x[$c0],@x[$d0])",
	"&add_32 (@x[$c1],@x[$c1],@x[$d1])",
	"&add_32 (@x[$c2],@x[$c2],@x[$d2])",
	"&add_32 (@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32 (@x[$b0],@x[$b0],20)",
	"&ror_32 (@x[$b1],@x[$b1],20)",
	"&ror_32 (@x[$b2],@x[$b2],20)",
	"&ror_32 (@x[$b3],@x[$b3],20)",
	"&add_32 (@x[$a0],@x[$a0],@x[$b0])",
	"&add_32 (@x[$a1],@x[$a1],@x[$b1])",
	"&add_32 (@x[$a2],@x[$a2],@x[$b2])",
	"&add_32 (@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32 (@x[$d0],@x[$d0],24)",
	"&ror_32 (@x[$d1],@x[$d1],24)",
	"&ror_32 (@x[$d2],@x[$d2],24)",
	"&ror_32 (@x[$d3],@x[$d3],24)",
	"&add_32 (@x[$c0],@x[$c0],@x[$d0])",
	"&add_32 (@x[$c1],@x[$c1],@x[$d1])",
	"&add_32 (@x[$c2],@x[$c2],@x[$d2])",
	"&add_32 (@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32 (@x[$b0],@x[$b0],25)",
	"&ror_32 (@x[$b1],@x[$b1],25)",
	"&ror_32 (@x[$b2],@x[$b2],25)",
	"&ror_32 (@x[$b3],@x[$b3],25)"
    );
}
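# For reference only, and never called by the generator below: one ChaCha20
# quarter-round on plain 32-bit integers. Each &ROUND above emits four such
# quarter-rounds interleaved across the scalar registers; the right-rotate
# counts 16/20/24/25 used there are the equivalents of the usual left-rotates
# by 16/12/8/7. The sub name is illustrative and not used anywhere else.
sub quarter_round_ref {
my ($a,$b,$c,$d)=@_;
my $rotl = sub { my ($v,$n)=@_; (($v<<$n)|($v>>(32-$n)))&0xffffffff };
    $a=($a+$b)&0xffffffff; $d=$rotl->($d^$a,16);
    $c=($c+$d)&0xffffffff; $b=$rotl->($b^$c,12);
    $a=($a+$b)&0xffffffff; $d=$rotl->($d^$a,8);
    $c=($c+$d)&0xffffffff; $b=$rotl->($b^$c,7);
    ($a,$b,$c,$d);
}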
$code.=<<___;
#include "arm_arch.h"

.text

.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P

.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32	// endian-neutral
.Lone:
.long	1,0,0,0
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	$len,.Labort
	adr	@x[0],.LOPENSSL_armcap_P
	cmp	$len,#192
	b.lo	.Lshort
#ifdef __ILP32__
	ldrsw	@x[1],[@x[0]]
#else
	ldr	@x[1],[@x[0]]
#endif
	ldr	w17,[@x[1],@x[0]]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]	// load sigma
	ldp	@d[2],@d[3],[$key]	// load key
	ldp	@d[4],@d[5],[$key,#16]
	ldp	@d[6],@d[7],[$ctr]	// load counter
#ifdef __ARMEB__
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32	@x[0],@d[0]		// unpack key block
	lsr	@x[1],@d[0],#32
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#64
.Loop:
	sub	$ctr,$ctr,#1
___
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
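# Each trip through .Loop above is one ChaCha20 "double round": &ROUND(0,4,8,12)
# works the four columns and &ROUND(0,5,10,15) the four diagonals, so the
# counter of 10 loaded into $ctr yields the full 20 rounds.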
$code.=<<___;
	cbnz	$ctr,.Loop

	add.32	@x[0],@x[0],@d[0]	// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef __ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]	// store output
	add	@d[6],@d[6],#1		// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf		// autiasp
.Labort:
	ret

.align	4
.Ltail:
	add	$len,$len,#64
.Less_than_64:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
#ifdef __ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	stp	@x[0],@x[2],[sp,#0]
	stp	@x[4],@x[6],[sp,#16]
	stp	@x[8],@x[10],[sp,#32]
	stp	@x[12],@x[14],[sp,#48]

.Loop_tail:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf		// autiasp
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

___
{{{
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
    map("v$_.4s",(0..7,16..23));
my (@K)=map("v$_.4s",(24..30));
my $ONE="v31.4s";

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

    (
	"&add ('$a','$a','$b')",
	"&eor ('$d','$d','$a')",
	"&rev32_16 ('$d','$d')", # vrot ($d,16)
	"&add ('$c','$c','$d')",
	"&eor ('$t','$b','$c')",
	"&ushr ('$b','$t',20)",
	"&sli ('$b','$t',12)",
	"&add ('$a','$a','$b')",
	"&eor ('$t','$d','$a')",
	"&ushr ('$d','$t',24)",
	"&sli ('$d','$t',8)",
	"&add ('$c','$c','$d')",
	"&eor ('$t','$b','$c')",
	"&ushr ('$b','$t',25)",
	"&sli ('$b','$t',7)",
	"&ext ('$c','$c','$c',8)",
	"&ext ('$d','$d','$d',$odd?4:12)",
	"&ext ('$b','$b','$b',$odd?12:4)"
    );
}
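# Notes on the NEON round above: AArch64 SIMD has no vector rotate, so the
# rotate by 16 is done with rev32 on 16-bit lanes, and the other rotates are
# synthesized with ushr into a scratch register followed by sli (shift left
# and insert). The trailing ext instructions rotate the $b/$c/$d rows by
# one/two/three lanes so that the same column code operates on the diagonals
# on the next call; $odd selects the forward or the reverse lane rotation.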
$code.=<<___;

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	$len,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]	// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]	// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]	// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef __ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE	// += 1
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	shl	$ONE,$ONE,#2		// 1 -> 4

.Loop_outer_neon:
	mov.32	@x[0],@d[0]		// unpack key block
	lsr	@x[1],@d[0],#32
	mov	$A0,@K[0]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov	$A1,@K[0]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov	$A2,@K[0]
	mov.32	@x[6],@d[3]
	mov	$B0,@K[1]
	lsr	@x[7],@d[3],#32
	mov	$B1,@K[1]
	mov.32	@x[8],@d[4]
	mov	$B2,@K[1]
	lsr	@x[9],@d[4],#32
	mov	$D0,@K[3]
	mov.32	@x[10],@d[5]
	mov	$D1,@K[4]
	lsr	@x[11],@d[5],#32
	mov	$D2,@K[5]
	mov.32	@x[12],@d[6]
	mov	$C0,@K[2]
	lsr	@x[13],@d[6],#32
	mov	$C1,@K[2]
	mov.32	@x[14],@d[7]
	mov	$C2,@K[2]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#256
.Loop_neon:
	sub	$ctr,$ctr,#1
___
my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
my @thread3=&ROUND(0,4,8,12);

foreach (@thread0) {
	eval;			eval(shift(@thread3));
	eval(shift(@thread1));	eval(shift(@thread3));
	eval(shift(@thread2));	eval(shift(@thread3));
}

@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
@thread3=&ROUND(0,5,10,15);

foreach (@thread0) {
	eval;			eval(shift(@thread3));
	eval(shift(@thread1));	eval(shift(@thread3));
	eval(shift(@thread2));	eval(shift(@thread3));
}
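# One pass of .Loop_neon advances three 64-byte blocks held in NEON registers
# and a fourth block held in general-purpose registers, with the two
# instruction streams interleaved ("3xNEON+1xIALU" in the table above); each
# .Loop_outer_neon iteration therefore produces 256 bytes of keystream and
# bumps the block counter by 4.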
$code.=<<___;
	cbnz	$ctr,.Loop_neon

	add.32	@x[0],@x[0],@d[0]	// accumulate key block
	add	$A0,$A0,@K[0]
	add	@x[1],@x[1],@d[0],lsr#32
	add	$A1,$A1,@K[0]
	add.32	@x[2],@x[2],@d[1]
	add	$A2,$A2,@K[0]
	add	@x[3],@x[3],@d[1],lsr#32
	add	$C0,$C0,@K[2]
	add.32	@x[4],@x[4],@d[2]
	add	$C1,$C1,@K[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add	$C2,$C2,@K[2]
	add.32	@x[6],@x[6],@d[3]
	add	$D0,$D0,@K[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	$D1,$D1,@K[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	$D2,$D2,@K[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	$B0,$B0,@K[1]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	$B1,$B1,@K[1]
	add	@x[15],@x[15],@d[7],lsr#32
	add	$B2,$B2,@K[1]

	b.lo	.Ltail_neon

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef __ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	eor	$D0,$D0,$T3
	ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]	// store output
	add	@d[6],@d[6],#4		// increment counter
	stp	@x[4],@x[6],[$out,#16]
	add	@K[3],@K[3],$ONE	// += 4
	stp	@x[8],@x[10],[$out,#32]
	add	@K[4],@K[4],$ONE
	stp	@x[12],@x[14],[$out,#48]
	add	@K[5],@K[5],$ONE
	add	$out,$out,#64

	st1.8	{$A0-$D0},[$out],#64
	ld1.8	{$A0-$D0},[$inp],#64

	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	eor	$A2,$A2,$A0
	eor	$B2,$B2,$B0
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	b.hi	.Loop_outer_neon

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf		// autiasp
	ret

.Ltail_neon:
	add	$len,$len,#256
	cmp	$len,#64
	b.lo	.Less_than_64

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef __ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]	// store output
	add	@d[6],@d[6],#4		// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_128

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A0,$A0,$T0
	eor	$B0,$B0,$T1
	eor	$C0,$C0,$T2
	eor	$D0,$D0,$T3
	st1.8	{$A0-$D0},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_192

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64

	st1.8	{$A2-$D2},[sp]
	b	.Last_neon

.Less_than_128:
	st1.8	{$A0-$D0},[sp]
	b	.Last_neon
.Less_than_192:
	st1.8	{$A1-$D1},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

.Loop_tail_neon:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf		// autiasp
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
___
{
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));

$code.=<<___;
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	ldp	@d[0],@d[1],[@x[0]]	// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]	// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]	// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef __ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE	// += 1
	stp	@K[0],@K[1],[sp,#0]	// off-load key block, invariant part
	add	@K[3],@K[3],$ONE	// not typo
	str	@K[2],[sp,#32]
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	add	@K[6],@K[5],$ONE
	shl	$ONE,$ONE,#2		// 1 -> 4

	stp	d8,d9,[sp,#128+0]	// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]

	sub	$len,$len,#512		// not typo

.Loop_outer_512_neon:
	mov	$A0,@K[0]
	mov	$A1,@K[0]
	mov	$A2,@K[0]
	mov	$A3,@K[0]
	mov	$A4,@K[0]
	mov	$A5,@K[0]
	mov	$B0,@K[1]
	mov.32	@x[0],@d[0]		// unpack key block
	mov	$B1,@K[1]
	lsr	@x[1],@d[0],#32
	mov	$B2,@K[1]
	mov.32	@x[2],@d[1]
	mov	$B3,@K[1]
	lsr	@x[3],@d[1],#32
	mov	$B4,@K[1]
	mov.32	@x[4],@d[2]
	mov	$B5,@K[1]
	lsr	@x[5],@d[2],#32
	mov	$D0,@K[3]
	mov.32	@x[6],@d[3]
	mov	$D1,@K[4]
	lsr	@x[7],@d[3],#32
	mov	$D2,@K[5]
	mov.32	@x[8],@d[4]
	mov	$D3,@K[6]
	lsr	@x[9],@d[4],#32
	mov	$C0,@K[2]
	mov.32	@x[10],@d[5]
	mov	$C1,@K[2]
	lsr	@x[11],@d[5],#32
	add	$D4,$D0,$ONE		// +4
	mov.32	@x[12],@d[6]
	add	$D5,$D1,$ONE		// +4
	lsr	@x[13],@d[6],#32
	mov	$C2,@K[2]
	mov.32	@x[14],@d[7]
	mov	$C3,@K[2]
	lsr	@x[15],@d[7],#32
	mov	$C4,@K[2]
	stp	@K[3],@K[4],[sp,#48]	// off-load key block, variable part
	mov	$C5,@K[2]
	str	@K[5],[sp,#80]

	mov	$ctr,#5
	subs	$len,$len,#512
.Loop_upper_neon:
	sub	$ctr,$ctr,#1
___
my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
my $diff = ($#thread0+1)*6 - $#thread67 - 1;
my $i = 0;

foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}

@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}
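# In this 512-byte-and-up path six blocks live in NEON registers while two
# more are computed in general-purpose registers ("6xNEON+2xIALU"): the scalar
# block gets two double rounds per trip, so it finishes during .Loop_upper_neon
# and is written out, then a second scalar block is started for
# .Loop_lower_neon while the NEON blocks complete their remaining rounds; one
# .Loop_outer_512_neon iteration thus emits 8 blocks, i.e. 512 bytes.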
$code.=<<___;
	cbnz	$ctr,.Loop_upper_neon

	add.32	@x[0],@x[0],@d[0]	// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef __ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]	// store output
	add	@d[6],@d[6],#1		// increment counter
	mov.32	@x[0],@d[0]		// unpack key block
	lsr	@x[1],@d[0],#32
	stp	@x[4],@x[6],[$out,#16]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	stp	@x[8],@x[10],[$out,#32]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#5
.Loop_lower_neon:
	sub	$ctr,$ctr,#1
___
@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}

@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}
$code.=<<___;
	cbnz	$ctr,.Loop_lower_neon

	add.32	@x[0],@x[0],@d[0]	// accumulate key block
	ldp	@K[0],@K[1],[sp,#0]
	add	@x[1],@x[1],@d[0],lsr#32
	ldp	@K[2],@K[3],[sp,#32]
	add.32	@x[2],@x[2],@d[1]
	ldp	@K[4],@K[5],[sp,#64]
	add	@x[3],@x[3],@d[1],lsr#32
	add	$A0,$A0,@K[0]
	add.32	@x[4],@x[4],@d[2]
	add	$A1,$A1,@K[0]
	add	@x[5],@x[5],@d[2],lsr#32
	add	$A2,$A2,@K[0]
	add.32	@x[6],@x[6],@d[3]
	add	$A3,$A3,@K[0]
	add	@x[7],@x[7],@d[3],lsr#32
	add	$A4,$A4,@K[0]
	add.32	@x[8],@x[8],@d[4]
	add	$A5,$A5,@K[0]
	add	@x[9],@x[9],@d[4],lsr#32
	add	$C0,$C0,@K[2]
	add.32	@x[10],@x[10],@d[5]
	add	$C1,$C1,@K[2]
	add	@x[11],@x[11],@d[5],lsr#32
	add	$C2,$C2,@K[2]
	add.32	@x[12],@x[12],@d[6]
	add	$C3,$C3,@K[2]
	add	@x[13],@x[13],@d[6],lsr#32
	add	$C4,$C4,@K[2]
	add.32	@x[14],@x[14],@d[7]
	add	$C5,$C5,@K[2]
	add	@x[15],@x[15],@d[7],lsr#32
	add	$D4,$D4,$ONE		// +4
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	$D5,$D5,$ONE		// +4
	add	@x[2],@x[2],@x[3],lsl#32
	add	$D0,$D0,@K[3]
	ldp	@x[1],@x[3],[$inp,#0]	// load input
	add	$D1,$D1,@K[4]
	add	@x[4],@x[4],@x[5],lsl#32
	add	$D2,$D2,@K[5]
	add	@x[6],@x[6],@x[7],lsl#32
	add	$D3,$D3,@K[6]
	ldp	@x[5],@x[7],[$inp,#16]
	add	$D4,$D4,@K[3]
	add	@x[8],@x[8],@x[9],lsl#32
	add	$D5,$D5,@K[4]
	add	@x[10],@x[10],@x[11],lsl#32
	add	$B0,$B0,@K[1]
	ldp	@x[9],@x[11],[$inp,#32]
	add	$B1,$B1,@K[1]
	add	@x[12],@x[12],@x[13],lsl#32
	add	$B2,$B2,@K[1]
	add	@x[14],@x[14],@x[15],lsl#32
	add	$B3,$B3,@K[1]
	ldp	@x[13],@x[15],[$inp,#48]
	add	$B4,$B4,@K[1]
	add	$inp,$inp,#64
	add	$B5,$B5,@K[1]
#ifdef __ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	eor	$D0,$D0,$T3
	ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]	// store output
	add	@d[6],@d[6],#7		// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$A0-$D0},[$out],#64

	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	ld1.8	{$A1-$D1},[$inp],#64
	eor	$A2,$A2,$A0
	ldp	@K[0],@K[1],[sp,#0]
	eor	$B2,$B2,$B0
	ldp	@K[2],@K[3],[sp,#32]
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	ld1.8	{$A2-$D2},[$inp],#64
	eor	$A3,$A3,$A1
	eor	$B3,$B3,$B1
	eor	$C3,$C3,$C1
	eor	$D3,$D3,$D1
	st1.8	{$A3-$D3},[$out],#64

	ld1.8	{$A3-$D3},[$inp],#64
	eor	$A4,$A4,$A2
	eor	$B4,$B4,$B2
	eor	$C4,$C4,$C2
	eor	$D4,$D4,$D2
	st1.8	{$A4-$D4},[$out],#64

	shl	$A0,$ONE,#1		// 4 -> 8
	eor	$A5,$A5,$A3
	eor	$B5,$B5,$B3
	eor	$C5,$C5,$C3
	eor	$D5,$D5,$D3
	st1.8	{$A5-$D5},[$out],#64

	add	@K[3],@K[3],$A0		// += 8
	add	@K[4],@K[4],$A0
	add	@K[5],@K[5],$A0
	add	@K[6],@K[6],$A0

	b.hs	.Loop_outer_512_neon

	adds	$len,$len,#512
	ushr	$A0,$ONE,#2		// 4 -> 1

	ldp	d8,d9,[sp,#128+0]	// meet ABI requirements
	ldp	d10,d11,[sp,#128+16]
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	stp	@K[0],$ONE,[sp,#0]	// wipe off-load area
	stp	@K[0],$ONE,[sp,#32]
	stp	@K[0],$ONE,[sp,#64]

	b.eq	.Ldone_512_neon

	cmp	$len,#192
	sub	@K[3],@K[3],$A0		// -= 1
	sub	@K[4],@K[4],$A0
	sub	@K[5],@K[5],$A0
	add	sp,sp,#128
	b.hs	.Loop_outer_neon

	eor	@K[1],@K[1],@K[1]
	eor	@K[2],@K[2],@K[2]
	eor	@K[3],@K[3],@K[3]
	eor	@K[4],@K[4],@K[4]
	eor	@K[5],@K[5],@K[5]
	eor	@K[6],@K[6],@K[6]
	b	.Loop_outer

.Ldone_512_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf		// autiasp
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon
___
}
}}}
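# The pass below turns the perlasm idioms used above into real AArch64 syntax,
# roughly:
#   "add.32   x5,x5,x22"        -> "add    w5,w5,w22"       (32-bit scalar ops)
#   "mov      v0.4s,v24.4s"     -> "mov    v0.16b,v24.16b"  (eor/ext/mov are bitwise)
#   "ld1.8    {v16.4s},[x1]"    -> "ld1    {v16.16b},[x1]"
#   "stp      v24.4s,v25.4s,.." -> "stp    q24,q25,.."
#   "rev32.16 v3.4s,v3.4s"      -> "rev32  v3.8h,v3.8h"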
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1))	or
	(m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1))	or
	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1))	or
	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1))	or
	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));

	#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# flush