chacha-armv4.pl 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160
  1. #! /usr/bin/env perl
  2. # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # December 2014
  17. #
  18. # ChaCha20 for ARMv4.
  19. #
# Performance in cycles per byte out of large buffer.
#
#                   IALU/gcc-4.4     1xNEON    3xNEON+1xIALU
#
# Cortex-A5         19.3(*)/+95%     21.8      14.1
# Cortex-A8         10.5(*)/+160%    13.9      6.35
# Cortex-A9         12.9(**)/+110%   14.3      6.50
# Cortex-A15        11.0/+40%        16.0      5.00
# Snapdragon S4     11.5/+125%       13.6      4.90
#
# (*)   most "favourable" result for aligned data on little-endian
#       processor, result for misaligned data is 10-15% lower;
# (**)  this result is a trade-off: it can be improved by 20%,
#       but then Snapdragon S4 and Cortex-A8 results get
#       20-25% worse;
  35. $flavour = shift;
  36. if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
  37. else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
  38. if ($flavour && $flavour ne "void") {
  39. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  40. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  41. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  42. die "can't locate arm-xlate.pl";
  43. open STDOUT,"| \"$^X\" $xlate $flavour $output";
  44. } else {
  45. open STDOUT,">$output";
  46. }
  47. sub AUTOLOAD() # thunk [simplified] x86-style perlasm
  48. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  49. my $arg = pop;
  50. $arg = "#$arg" if ($arg*1 eq $arg);
  51. $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
  52. }
  53. my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
  54. my @t=map("r$_",(8..11));
  55. sub ROUND {
  56. my ($a0,$b0,$c0,$d0)=@_;
  57. my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
  58. my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
  59. my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
  60. my $odd = $d0&1;
  61. my ($xc,$xc_) = (@t[0..1]);
  62. my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
  63. my @ret;
  64. # Consider order in which variables are addressed by their
  65. # index:
  66. #
  67. # a b c d
  68. #
  69. # 0 4 8 12 < even round
  70. # 1 5 9 13
  71. # 2 6 10 14
  72. # 3 7 11 15
  73. # 0 5 10 15 < odd round
  74. # 1 6 11 12
  75. # 2 7 8 13
  76. # 3 4 9 14
  77. #
  78. # 'a', 'b' are permanently allocated in registers, @x[0..7],
  79. # while 'c's and pair of 'd's are maintained in memory. If
  80. # you observe 'c' column, you'll notice that pair of 'c's is
  81. # invariant between rounds. This means that we have to reload
  82. # them once per round, in the middle. This is why you'll see
  83. # bunch of 'c' stores and loads in the middle, but none in
  84. # the beginning or end. If you observe 'd' column, you'll
  85. # notice that 15 and 13 are reused in next pair of rounds.
  86. # This is why these two are chosen for offloading to memory,
  87. # to make loads count more.
  88. push @ret,(
  89. "&add (@x[$a0],@x[$a0],@x[$b0])",
  90. "&mov ($xd,$xd,'ror#16')",
  91. "&add (@x[$a1],@x[$a1],@x[$b1])",
  92. "&mov ($xd_,$xd_,'ror#16')",
  93. "&eor ($xd,$xd,@x[$a0],'ror#16')",
  94. "&eor ($xd_,$xd_,@x[$a1],'ror#16')",
  95. "&add ($xc,$xc,$xd)",
  96. "&mov (@x[$b0],@x[$b0],'ror#20')",
  97. "&add ($xc_,$xc_,$xd_)",
  98. "&mov (@x[$b1],@x[$b1],'ror#20')",
  99. "&eor (@x[$b0],@x[$b0],$xc,'ror#20')",
  100. "&eor (@x[$b1],@x[$b1],$xc_,'ror#20')",
  101. "&add (@x[$a0],@x[$a0],@x[$b0])",
  102. "&mov ($xd,$xd,'ror#24')",
  103. "&add (@x[$a1],@x[$a1],@x[$b1])",
  104. "&mov ($xd_,$xd_,'ror#24')",
  105. "&eor ($xd,$xd,@x[$a0],'ror#24')",
  106. "&eor ($xd_,$xd_,@x[$a1],'ror#24')",
  107. "&add ($xc,$xc,$xd)",
  108. "&mov (@x[$b0],@x[$b0],'ror#25')" );
  109. push @ret,(
  110. "&str ($xd,'[sp,#4*(16+$d0)]')",
  111. "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd);
  112. push @ret,(
  113. "&add ($xc_,$xc_,$xd_)",
  114. "&mov (@x[$b1],@x[$b1],'ror#25')" );
  115. push @ret,(
  116. "&str ($xd_,'[sp,#4*(16+$d1)]')",
  117. "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd);
  118. push @ret,(
  119. "&eor (@x[$b0],@x[$b0],$xc,'ror#25')",
  120. "&eor (@x[$b1],@x[$b1],$xc_,'ror#25')" );
  121. $xd=@x[$d2] if (!$odd);
  122. $xd_=@x[$d3] if ($odd);
  123. push @ret,(
  124. "&str ($xc,'[sp,#4*(16+$c0)]')",
  125. "&ldr ($xc,'[sp,#4*(16+$c2)]')",
  126. "&add (@x[$a2],@x[$a2],@x[$b2])",
  127. "&mov ($xd,$xd,'ror#16')",
  128. "&str ($xc_,'[sp,#4*(16+$c1)]')",
  129. "&ldr ($xc_,'[sp,#4*(16+$c3)]')",
  130. "&add (@x[$a3],@x[$a3],@x[$b3])",
  131. "&mov ($xd_,$xd_,'ror#16')",
  132. "&eor ($xd,$xd,@x[$a2],'ror#16')",
  133. "&eor ($xd_,$xd_,@x[$a3],'ror#16')",
  134. "&add ($xc,$xc,$xd)",
  135. "&mov (@x[$b2],@x[$b2],'ror#20')",
  136. "&add ($xc_,$xc_,$xd_)",
  137. "&mov (@x[$b3],@x[$b3],'ror#20')",
  138. "&eor (@x[$b2],@x[$b2],$xc,'ror#20')",
  139. "&eor (@x[$b3],@x[$b3],$xc_,'ror#20')",
  140. "&add (@x[$a2],@x[$a2],@x[$b2])",
  141. "&mov ($xd,$xd,'ror#24')",
  142. "&add (@x[$a3],@x[$a3],@x[$b3])",
  143. "&mov ($xd_,$xd_,'ror#24')",
  144. "&eor ($xd,$xd,@x[$a2],'ror#24')",
  145. "&eor ($xd_,$xd_,@x[$a3],'ror#24')",
  146. "&add ($xc,$xc,$xd)",
  147. "&mov (@x[$b2],@x[$b2],'ror#25')",
  148. "&add ($xc_,$xc_,$xd_)",
  149. "&mov (@x[$b3],@x[$b3],'ror#25')",
  150. "&eor (@x[$b2],@x[$b2],$xc,'ror#25')",
  151. "&eor (@x[$b3],@x[$b3],$xc_,'ror#25')" );
  152. @ret;
  153. }
# File header and entry of the integer-only ChaCha20_ctr32.
#
# Emits, in order:
#  - assembler mode selection (unified syntax / Thumb-2 / ARM) and the
#    ldrhsb alias used by the byte-wise path;
#  - the .Lsigma and .Lone constant tables and, when
#    __ARM_MAX_ARCH__>=7, the offset word used to reach OPENSSL_armcap_P;
#  - the function entry: the pointer to counter and nonce is pulled from
#    the caller's stack, r0-r2 and the callee-saved registers are pushed,
#    and a zero length returns immediately through .Lno_data;
#  - when __ARM_MAX_ARCH__>=7: lengths above 192 branch to
#    .LChaCha20_neon if the ARMV7_NEON capability bit is set;
#  - construction of the stack frame: sigma, key, counter and nonce are
#    copied to the stack below a 16-word off-load area;
#  - .Loop_outer (per-block re-entry, which saves len/inp/out) and the
#    head of the 10-iteration .Loop.
# @x[..] and @t[..] interpolate the register names of the integer-unit
# allocation tables.
$code.=<<___;
#include "arm_arch.h"
.text
#if defined(__thumb2__) || defined(__clang__)
.syntax unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code 32
#endif
#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb ldrbhs
#endif
.align 5
.Lsigma:
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
.Lone:
.long 1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word -1
#endif
.globl ChaCha20_ctr32
.type ChaCha20_ctr32,%function
.align 5
ChaCha20_ctr32:
.LChaCha20_ctr32:
ldr r12,[sp,#0] @ pull pointer to counter and nonce
stmdb sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
sub r14,pc,#16 @ ChaCha20_ctr32
#else
adr r14,.LChaCha20_ctr32
#endif
cmp r2,#0 @ len==0?
#ifdef __thumb2__
itt eq
#endif
addeq sp,sp,#4*3
beq .Lno_data
#if __ARM_MAX_ARCH__>=7
cmp r2,#192 @ test len
bls .Lshort
ldr r4,[r14,#-32]
ldr r4,[r14,r4]
# ifdef __APPLE__
ldr r4,[r4]
# endif
tst r4,#ARMV7_NEON
bne .LChaCha20_neon
.Lshort:
#endif
ldmia r12,{r4-r7} @ load counter and nonce
sub sp,sp,#4*(16) @ off-load area
sub r14,r14,#64 @ .Lsigma
stmdb sp!,{r4-r7} @ copy counter and nonce
ldmia r3,{r4-r11} @ load key
ldmia r14,{r0-r3} @ load sigma
stmdb sp!,{r4-r11} @ copy key
stmdb sp!,{r0-r3} @ copy sigma
str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
b .Loop_outer_enter
.align 4
.Loop_outer:
ldmia sp,{r0-r9} @ load key material
str @t[3],[sp,#4*(32+2)] @ save len
str r12, [sp,#4*(32+1)] @ save inp
str r14, [sp,#4*(32+0)] @ save out
.Loop_outer_enter:
ldr @t[3], [sp,#4*(15)]
ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
ldr @t[2], [sp,#4*(13)]
ldr @x[14],[sp,#4*(14)]
str @t[3], [sp,#4*(16+15)]
mov @t[3],#10
b .Loop
.align 4
.Loop:
subs @t[3],@t[3],#1
___
  238. foreach (&ROUND(0, 4, 8,12)) { eval; }
  239. foreach (&ROUND(0, 5,10,15)) { eval; }
# Close .Loop and emit the word-wise output path of ChaCha20_ctr32.
#
# After the ten iterations the words still held in scratch registers are
# written to the off-load area, so that the first half of the block is in
# @x[0..7] and the second half at sp+4*(16+8).  The remaining length is
# then compared with 64:
#  - at least 64 bytes left ("hs"): inp/out are reloaded, input words are
#    loaded and xor-ed with the key stream;
#  - fewer than 64 bytes ("lo"): inp and out are both pointed at sp, the
#    xor is skipped, and the bare key stream is stored on the stack for
#    .Ltail to consume.
# Each group of four words has the saved key material added to it (with a
# byte swap on big-endian ARMv6+) before being stored.  When more than 64
# bytes remain ("hi") the "@x[10]"/"@x[11]" copies are refreshed and the
# block counter at sp+4*(12) is incremented.  The code then loops to
# .Loop_outer, finishes at .Ldone, or falls into the tail handling.
#
# For __ARM_ARCH__<7 the word-wise path is used only if both pointers are
# 4-byte aligned; otherwise control goes to .Lunaligned.  The heredoc
# ends just inside "#if __ARM_ARCH__<7", which the following Perl loop
# fills in.
$code.=<<___;
bne .Loop
ldr @t[3],[sp,#4*(32+2)] @ load len
str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
str @t[1], [sp,#4*(16+9)]
str @x[12],[sp,#4*(16+12)]
str @t[2], [sp,#4*(16+13)]
str @x[14],[sp,#4*(16+14)]
@ at this point we have first half of 512-bit result in
@ @x[0-7] and second half at sp+4*(16+8)
cmp @t[3],#64 @ done yet?
#ifdef __thumb2__
itete lo
#endif
addlo r12,sp,#4*(0) @ shortcut or ...
ldrhs r12,[sp,#4*(32+1)] @ ... load inp
addlo r14,sp,#4*(0) @ shortcut or ...
ldrhs r14,[sp,#4*(32+0)] @ ... load out
ldr @t[0],[sp,#4*(0)] @ load key material
ldr @t[1],[sp,#4*(1)]
#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
orr @t[2],r12,r14
tst @t[2],#3 @ are input and output aligned?
ldr @t[2],[sp,#4*(2)]
bne .Lunaligned
cmp @t[3],#64 @ restore flags
# else
ldr @t[2],[sp,#4*(2)]
# endif
ldr @t[3],[sp,#4*(3)]
add @x[0],@x[0],@t[0] @ accumulate key material
add @x[1],@x[1],@t[1]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[0],[r12],#16 @ load input
ldrhs @t[1],[r12,#-12]
add @x[2],@x[2],@t[2]
add @x[3],@x[3],@t[3]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[2],[r12,#-8]
ldrhs @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
rev @x[0],@x[0]
rev @x[1],@x[1]
rev @x[2],@x[2]
rev @x[3],@x[3]
# endif
# ifdef __thumb2__
itt hs
# endif
eorhs @x[0],@x[0],@t[0] @ xor with input
eorhs @x[1],@x[1],@t[1]
add @t[0],sp,#4*(4)
str @x[0],[r14],#16 @ store output
# ifdef __thumb2__
itt hs
# endif
eorhs @x[2],@x[2],@t[2]
eorhs @x[3],@x[3],@t[3]
ldmia @t[0],{@t[0]-@t[3]} @ load key material
str @x[1],[r14,#-12]
str @x[2],[r14,#-8]
str @x[3],[r14,#-4]
add @x[4],@x[4],@t[0] @ accumulate key material
add @x[5],@x[5],@t[1]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[0],[r12],#16 @ load input
ldrhs @t[1],[r12,#-12]
add @x[6],@x[6],@t[2]
add @x[7],@x[7],@t[3]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[2],[r12,#-8]
ldrhs @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
rev @x[4],@x[4]
rev @x[5],@x[5]
rev @x[6],@x[6]
rev @x[7],@x[7]
# endif
# ifdef __thumb2__
itt hs
# endif
eorhs @x[4],@x[4],@t[0]
eorhs @x[5],@x[5],@t[1]
add @t[0],sp,#4*(8)
str @x[4],[r14],#16 @ store output
# ifdef __thumb2__
itt hs
# endif
eorhs @x[6],@x[6],@t[2]
eorhs @x[7],@x[7],@t[3]
str @x[5],[r14,#-12]
ldmia @t[0],{@t[0]-@t[3]} @ load key material
str @x[6],[r14,#-8]
add @x[0],sp,#4*(16+8)
str @x[7],[r14,#-4]
ldmia @x[0],{@x[0]-@x[7]} @ load second half
add @x[0],@x[0],@t[0] @ accumulate key material
add @x[1],@x[1],@t[1]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[0],[r12],#16 @ load input
ldrhs @t[1],[r12,#-12]
# ifdef __thumb2__
itt hi
# endif
strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
add @x[2],@x[2],@t[2]
add @x[3],@x[3],@t[3]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[2],[r12,#-8]
ldrhs @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
rev @x[0],@x[0]
rev @x[1],@x[1]
rev @x[2],@x[2]
rev @x[3],@x[3]
# endif
# ifdef __thumb2__
itt hs
# endif
eorhs @x[0],@x[0],@t[0]
eorhs @x[1],@x[1],@t[1]
add @t[0],sp,#4*(12)
str @x[0],[r14],#16 @ store output
# ifdef __thumb2__
itt hs
# endif
eorhs @x[2],@x[2],@t[2]
eorhs @x[3],@x[3],@t[3]
str @x[1],[r14,#-12]
ldmia @t[0],{@t[0]-@t[3]} @ load key material
str @x[2],[r14,#-8]
str @x[3],[r14,#-4]
add @x[4],@x[4],@t[0] @ accumulate key material
add @x[5],@x[5],@t[1]
# ifdef __thumb2__
itt hi
# endif
addhi @t[0],@t[0],#1 @ next counter value
strhi @t[0],[sp,#4*(12)] @ save next counter value
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[0],[r12],#16 @ load input
ldrhs @t[1],[r12,#-12]
add @x[6],@x[6],@t[2]
add @x[7],@x[7],@t[3]
# ifdef __thumb2__
itt hs
# endif
ldrhs @t[2],[r12,#-8]
ldrhs @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
rev @x[4],@x[4]
rev @x[5],@x[5]
rev @x[6],@x[6]
rev @x[7],@x[7]
# endif
# ifdef __thumb2__
itt hs
# endif
eorhs @x[4],@x[4],@t[0]
eorhs @x[5],@x[5],@t[1]
# ifdef __thumb2__
it ne
# endif
ldrne @t[0],[sp,#4*(32+2)] @ re-load len
# ifdef __thumb2__
itt hs
# endif
eorhs @x[6],@x[6],@t[2]
eorhs @x[7],@x[7],@t[3]
str @x[4],[r14],#16 @ store output
str @x[5],[r14,#-12]
# ifdef __thumb2__
it hs
# endif
subhs @t[3],@t[0],#64 @ len-=64
str @x[6],[r14,#-8]
str @x[7],[r14,#-4]
bhi .Loop_outer
beq .Ldone
# if __ARM_ARCH__<7
b .Ltail
.align 4
.Lunaligned: @ unaligned endian-neutral path
cmp @t[3],#64 @ restore flags
# endif
#endif
#if __ARM_ARCH__<7
ldr @t[3],[sp,#4*(3)]
___
# Unaligned, endian-neutral output path.  It is emitted inside the
# "#if __ARM_ARCH__<7" opened at the end of the preceding heredoc.
#
# The 64-byte block is handled as four groups of four words, one loop
# pass per group ($i = 0, 4, 8, 12).  $j = $i & 7 selects the registers:
# @x[0..3] for $i = 0 and 8, @x[4..7] for $i = 4 and 12.  Within a group,
# input is read and output is written one byte at a time (ldrhsb / strb),
# shifting each word right by 8 between bytes.  When fewer than 64 bytes
# remain ("lo") the input is replaced by zero, so the bare key stream is
# stored.
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;
# Second group: @x[0] is free by now, point it at the off-loaded half.
$code.=<<___ if ($i==4);
add @x[0],sp,#4*(16+8)
___
# Third group: pull the second half of the block into @x[0..7] and, when
# more than 64 bytes remain, refresh the "@x[10]"/"@x[11]" copies.
$code.=<<___ if ($i==8);
ldmia @x[0],{@x[0]-@x[7]} @ load second half
# ifdef __thumb2__
itt hi
# endif
strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]"
strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]"
___
$code.=<<___;
add @x[$j+0],@x[$j+0],@t[0] @ accumulate key material
___
# Last group: @t[0] holds the counter word at this point; advance and
# save it when more than 64 bytes remain.
$code.=<<___ if ($i==12);
# ifdef __thumb2__
itt hi
# endif
addhi @t[0],@t[0],#1 @ next counter value
strhi @t[0],[sp,#4*(12)] @ save next counter value
___
$code.=<<___;
add @x[$j+1],@x[$j+1],@t[1]
add @x[$j+2],@x[$j+2],@t[2]
# ifdef __thumb2__
itete lo
# endif
eorlo @t[0],@t[0],@t[0] @ zero or ...
ldrhsb @t[0],[r12],#16 @ ... load input
eorlo @t[1],@t[1],@t[1]
ldrhsb @t[1],[r12,#-12]
add @x[$j+3],@x[$j+3],@t[3]
# ifdef __thumb2__
itete lo
# endif
eorlo @t[2],@t[2],@t[2]
ldrhsb @t[2],[r12,#-8]
eorlo @t[3],@t[3],@t[3]
ldrhsb @t[3],[r12,#-4]
eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero)
eor @x[$j+1],@t[1],@x[$j+1]
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[0],[r12,#-15] @ load more input
ldrhsb @t[1],[r12,#-11]
eor @x[$j+2],@t[2],@x[$j+2]
strb @x[$j+0],[r14],#16 @ store output
eor @x[$j+3],@t[3],@x[$j+3]
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[2],[r12,#-7]
ldrhsb @t[3],[r12,#-3]
strb @x[$j+1],[r14,#-12]
eor @x[$j+0],@t[0],@x[$j+0],lsr#8
strb @x[$j+2],[r14,#-8]
eor @x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[0],[r12,#-14] @ load more input
ldrhsb @t[1],[r12,#-10]
strb @x[$j+3],[r14,#-4]
eor @x[$j+2],@t[2],@x[$j+2],lsr#8
strb @x[$j+0],[r14,#-15]
eor @x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[2],[r12,#-6]
ldrhsb @t[3],[r12,#-2]
strb @x[$j+1],[r14,#-11]
eor @x[$j+0],@t[0],@x[$j+0],lsr#8
strb @x[$j+2],[r14,#-7]
eor @x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[0],[r12,#-13] @ load more input
ldrhsb @t[1],[r12,#-9]
strb @x[$j+3],[r14,#-3]
eor @x[$j+2],@t[2],@x[$j+2],lsr#8
strb @x[$j+0],[r14,#-14]
eor @x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef __thumb2__
itt hs
# endif
ldrhsb @t[2],[r12,#-5]
ldrhsb @t[3],[r12,#-1]
strb @x[$j+1],[r14,#-10]
strb @x[$j+2],[r14,#-6]
eor @x[$j+0],@t[0],@x[$j+0],lsr#8
strb @x[$j+3],[r14,#-2]
eor @x[$j+1],@t[1],@x[$j+1],lsr#8
strb @x[$j+0],[r14,#-13]
eor @x[$j+2],@t[2],@x[$j+2],lsr#8
strb @x[$j+1],[r14,#-9]
eor @x[$j+3],@t[3],@x[$j+3],lsr#8
strb @x[$j+2],[r14,#-5]
strb @x[$j+3],[r14,#-1]
___
# All but the last group: fetch the saved key material for the next four
# words.
$code.=<<___ if ($i<12);
add @t[0],sp,#4*(4+$i)
ldmia @t[0],{@t[0]-@t[3]} @ load key material
___
}
# End of the unaligned path, tail handling and epilogue of ChaCha20_ctr32.
#
# The first part (up to "#endif") closes the "#if __ARM_ARCH__<7" section:
# len is re-loaded and reduced by 64, then control returns to .Loop_outer
# while more data remains, or goes to .Ldone when exactly 64 bytes were
# left.
# .Ltail handles a final partial block: inp and out are reloaded and the
# remaining bytes (count in @t[0]) are xor-ed one at a time with the key
# stream buffered at the bottom of the stack frame.
# .Ldone drops the 32-word frame plus the three saved argument words;
# .Lno_data restores the callee-saved registers and returns.
$code.=<<___;
# ifdef __thumb2__
it ne
# endif
ldrne @t[0],[sp,#4*(32+2)] @ re-load len
# ifdef __thumb2__
it hs
# endif
subhs @t[3],@t[0],#64 @ len-=64
bhi .Loop_outer
beq .Ldone
#endif
.Ltail:
ldr r12,[sp,#4*(32+1)] @ load inp
add @t[1],sp,#4*(0)
ldr r14,[sp,#4*(32+0)] @ load out
.Loop_tail:
ldrb @t[2],[@t[1]],#1 @ read buffer on stack
ldrb @t[3],[r12],#1 @ read input
subs @t[0],@t[0],#1
eor @t[3],@t[3],@t[2]
strb @t[3],[r14],#1 @ store output
bne .Loop_tail
.Ldone:
add sp,sp,#4*(32+3)
.Lno_data:
ldmia sp!,{r4-r11,pc}
.size ChaCha20_ctr32,.-ChaCha20_ctr32
___
  583. {{{
  584. my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
  585. map("q$_",(0..15));
  586. sub NEONROUND {
  587. my $odd = pop;
  588. my ($a,$b,$c,$d,$t)=@_;
  589. (
  590. "&vadd_i32 ($a,$a,$b)",
  591. "&veor ($d,$d,$a)",
  592. "&vrev32_16 ($d,$d)", # vrot ($d,16)
  593. "&vadd_i32 ($c,$c,$d)",
  594. "&veor ($t,$b,$c)",
  595. "&vshr_u32 ($b,$t,20)",
  596. "&vsli_32 ($b,$t,12)",
  597. "&vadd_i32 ($a,$a,$b)",
  598. "&veor ($t,$d,$a)",
  599. "&vshr_u32 ($d,$t,24)",
  600. "&vsli_32 ($d,$t,8)",
  601. "&vadd_i32 ($c,$c,$d)",
  602. "&veor ($t,$b,$c)",
  603. "&vshr_u32 ($b,$t,25)",
  604. "&vsli_32 ($b,$t,7)",
  605. "&vext_8 ($c,$c,$c,8)",
  606. "&vext_8 ($b,$b,$b,$odd?12:4)",
  607. "&vext_8 ($d,$d,$d,$odd?4:12)"
  608. );
  609. }
# Entry of the NEON code path (emitted only when __ARM_MAX_ARCH__>=7).
#
# ChaCha20_neon repeats the argument handling of ChaCha20_ctr32;
# .LChaCha20_neon is the label the integer entry branches to once those
# two instructions have already been executed.  The prologue saves
# d8-d15, builds the stack frame (key material copy plus off-load area)
# and loads sigma, key, counter and nonce into both the NEON registers
# and the integer registers.  The constants one, two and four (derived
# from the .Lone vector following .Lsigma) are parked at sp+4*(16+0),
# sp+4*(16+2) and sp+4*(16+4).
#
# Each pass of the outer loop runs three NEON states - counter,
# counter+1, counter+2 - alongside one integer-unit state at counter+3.
# At .Loop_neon_outer, once the remaining length is no more than 64*2,
# control leaves for .Lbreak_neon (integer-only processing).  The
# heredoc ends at the head of the 10-iteration .Loop_neon.
#
# $a0..$t3 interpolate q-register names; the "#lo" suffix is left in the
# text as-is here (its expansion is not part of this statement).
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.type ChaCha20_neon,%function
.align 5
ChaCha20_neon:
ldr r12,[sp,#0] @ pull pointer to counter and nonce
stmdb sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
adr r14,.Lsigma
vstmdb sp!,{d8-d15} @ ABI spec says so
stmdb sp!,{r0-r3}
vld1.32 {$b0-$c0},[r3] @ load key
ldmia r3,{r4-r11} @ load key
sub sp,sp,#4*(16+16)
vld1.32 {$d0},[r12] @ load counter and nonce
add r12,sp,#4*8
ldmia r14,{r0-r3} @ load sigma
vld1.32 {$a0},[r14]! @ load sigma
vld1.32 {$t0},[r14] @ one
vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce
vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key
str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
vshl.i32 $t1#lo,$t0#lo,#1 @ two
vstr $t0#lo,[sp,#4*(16+0)]
vshl.i32 $t2#lo,$t0#lo,#2 @ four
vstr $t1#lo,[sp,#4*(16+2)]
vmov $a1,$a0
vstr $t2#lo,[sp,#4*(16+4)]
vmov $a2,$a0
vmov $b1,$b0
vmov $b2,$b0
b .Loop_neon_enter
.align 4
.Loop_neon_outer:
ldmia sp,{r0-r9} @ load key material
cmp @t[3],#64*2 @ if len<=64*2
bls .Lbreak_neon @ switch to integer-only
vmov $a1,$a0
str @t[3],[sp,#4*(32+2)] @ save len
vmov $a2,$a0
str r12, [sp,#4*(32+1)] @ save inp
vmov $b1,$b0
str r14, [sp,#4*(32+0)] @ save out
vmov $b2,$b0
.Loop_neon_enter:
ldr @t[3], [sp,#4*(15)]
vadd.i32 $d1,$d0,$t0 @ counter+1
ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
vmov $c1,$c0
ldr @t[2], [sp,#4*(13)]
vmov $c2,$c0
ldr @x[14],[sp,#4*(14)]
vadd.i32 $d2,$d1,$t0 @ counter+2
str @t[3], [sp,#4*(16+15)]
mov @t[3],#10
add @x[12],@x[12],#3 @ counter+3
b .Loop_neon
.align 4
.Loop_neon:
subs @t[3],@t[3],#1
___
  674. my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
  675. my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
  676. my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
  677. my @thread3=&ROUND(0,4,8,12);
  678. foreach (@thread0) {
  679. eval; eval(shift(@thread3));
  680. eval(shift(@thread1)); eval(shift(@thread3));
  681. eval(shift(@thread2)); eval(shift(@thread3));
  682. }
  683. @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
  684. @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
  685. @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
  686. @thread3=&ROUND(0,5,10,15);
  687. foreach (@thread0) {
  688. eval; eval(shift(@thread3));
  689. eval(shift(@thread1)); eval(shift(@thread3));
  690. eval(shift(@thread2)); eval(shift(@thread3));
  691. }
  692. $code.=<<___;
  693. bne .Loop_neon
  694. add @t[3],sp,#32
  695. vld1.32 {$t0-$t1},[sp] @ load key material
  696. vld1.32 {$t2-$t3},[@t[3]]
  697. ldr @t[3],[sp,#4*(32+2)] @ load len
  698. str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
  699. str @t[1], [sp,#4*(16+9)]
  700. str @x[12],[sp,#4*(16+12)]
  701. str @t[2], [sp,#4*(16+13)]
  702. str @x[14],[sp,#4*(16+14)]
  703. @ at this point we have first half of 512-bit result in
  704. @ @x[0-7] and second half at sp+4*(16+8)
  705. ldr r12,[sp,#4*(32+1)] @ load inp
  706. ldr r14,[sp,#4*(32+0)] @ load out
  707. vadd.i32 $a0,$a0,$t0 @ accumulate key material
  708. vadd.i32 $a1,$a1,$t0
  709. vadd.i32 $a2,$a2,$t0
  710. vldr $t0#lo,[sp,#4*(16+0)] @ one
  711. vadd.i32 $b0,$b0,$t1
  712. vadd.i32 $b1,$b1,$t1
  713. vadd.i32 $b2,$b2,$t1
  714. vldr $t1#lo,[sp,#4*(16+2)] @ two
  715. vadd.i32 $c0,$c0,$t2
  716. vadd.i32 $c1,$c1,$t2
  717. vadd.i32 $c2,$c2,$t2
  718. vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1
  719. vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2
  720. vadd.i32 $d0,$d0,$t3
  721. vadd.i32 $d1,$d1,$t3
  722. vadd.i32 $d2,$d2,$t3
  723. cmp @t[3],#64*4
  724. blo .Ltail_neon
  725. vld1.8 {$t0-$t1},[r12]! @ load input
  726. mov @t[3],sp
  727. vld1.8 {$t2-$t3},[r12]!
  728. veor $a0,$a0,$t0 @ xor with input
  729. veor $b0,$b0,$t1
  730. vld1.8 {$t0-$t1},[r12]!
  731. veor $c0,$c0,$t2
  732. veor $d0,$d0,$t3
  733. vld1.8 {$t2-$t3},[r12]!
  734. veor $a1,$a1,$t0
  735. vst1.8 {$a0-$b0},[r14]! @ store output
  736. veor $b1,$b1,$t1
  737. vld1.8 {$t0-$t1},[r12]!
  738. veor $c1,$c1,$t2
  739. vst1.8 {$c0-$d0},[r14]!
  740. veor $d1,$d1,$t3
  741. vld1.8 {$t2-$t3},[r12]!
  742. veor $a2,$a2,$t0
  743. vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration
  744. veor $t0#hi,$t0#hi,$t0#hi
  745. vldr $t0#lo,[sp,#4*(16+4)] @ four
  746. veor $b2,$b2,$t1
  747. vld1.32 {$c0-$d0},[@t[3]]
  748. veor $c2,$c2,$t2
  749. vst1.8 {$a1-$b1},[r14]!
  750. veor $d2,$d2,$t3
  751. vst1.8 {$c1-$d1},[r14]!
  752. vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value
  753. vldr $t0#lo,[sp,#4*(16+0)] @ one
  754. ldmia sp,{@t[0]-@t[3]} @ load key material
  755. add @x[0],@x[0],@t[0] @ accumulate key material
  756. ldr @t[0],[r12],#16 @ load input
  757. vst1.8 {$a2-$b2},[r14]!
  758. add @x[1],@x[1],@t[1]
  759. ldr @t[1],[r12,#-12]
  760. vst1.8 {$c2-$d2},[r14]!
  761. add @x[2],@x[2],@t[2]
  762. ldr @t[2],[r12,#-8]
  763. add @x[3],@x[3],@t[3]
  764. ldr @t[3],[r12,#-4]
  765. # ifdef __ARMEB__
  766. rev @x[0],@x[0]
  767. rev @x[1],@x[1]
  768. rev @x[2],@x[2]
  769. rev @x[3],@x[3]
  770. # endif
  771. eor @x[0],@x[0],@t[0] @ xor with input
  772. add @t[0],sp,#4*(4)
  773. eor @x[1],@x[1],@t[1]
  774. str @x[0],[r14],#16 @ store output
  775. eor @x[2],@x[2],@t[2]
  776. str @x[1],[r14,#-12]
  777. eor @x[3],@x[3],@t[3]
  778. ldmia @t[0],{@t[0]-@t[3]} @ load key material
  779. str @x[2],[r14,#-8]
  780. str @x[3],[r14,#-4]
  781. add @x[4],@x[4],@t[0] @ accumulate key material
  782. ldr @t[0],[r12],#16 @ load input
  783. add @x[5],@x[5],@t[1]
  784. ldr @t[1],[r12,#-12]
  785. add @x[6],@x[6],@t[2]
  786. ldr @t[2],[r12,#-8]
  787. add @x[7],@x[7],@t[3]
  788. ldr @t[3],[r12,#-4]
  789. # ifdef __ARMEB__
  790. rev @x[4],@x[4]
  791. rev @x[5],@x[5]
  792. rev @x[6],@x[6]
  793. rev @x[7],@x[7]
  794. # endif
  795. eor @x[4],@x[4],@t[0]
  796. add @t[0],sp,#4*(8)
  797. eor @x[5],@x[5],@t[1]
  798. str @x[4],[r14],#16 @ store output
  799. eor @x[6],@x[6],@t[2]
  800. str @x[5],[r14,#-12]
  801. eor @x[7],@x[7],@t[3]
  802. ldmia @t[0],{@t[0]-@t[3]} @ load key material
  803. str @x[6],[r14,#-8]
  804. add @x[0],sp,#4*(16+8)
  805. str @x[7],[r14,#-4]
  806. ldmia @x[0],{@x[0]-@x[7]} @ load second half
  807. add @x[0],@x[0],@t[0] @ accumulate key material
  808. ldr @t[0],[r12],#16 @ load input
  809. add @x[1],@x[1],@t[1]
  810. ldr @t[1],[r12,#-12]
  811. # ifdef __thumb2__
  812. it hi
  813. # endif
  814. strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
  815. add @x[2],@x[2],@t[2]
  816. ldr @t[2],[r12,#-8]
  817. # ifdef __thumb2__
  818. it hi
  819. # endif
  820. strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
  821. add @x[3],@x[3],@t[3]
  822. ldr @t[3],[r12,#-4]
  823. # ifdef __ARMEB__
  824. rev @x[0],@x[0]
  825. rev @x[1],@x[1]
  826. rev @x[2],@x[2]
  827. rev @x[3],@x[3]
  828. # endif
  829. eor @x[0],@x[0],@t[0]
  830. add @t[0],sp,#4*(12)
  831. eor @x[1],@x[1],@t[1]
  832. str @x[0],[r14],#16 @ store output
  833. eor @x[2],@x[2],@t[2]
  834. str @x[1],[r14,#-12]
  835. eor @x[3],@x[3],@t[3]
  836. ldmia @t[0],{@t[0]-@t[3]} @ load key material
  837. str @x[2],[r14,#-8]
  838. str @x[3],[r14,#-4]
  839. add @x[4],@x[4],@t[0] @ accumulate key material
  840. add @t[0],@t[0],#4 @ next counter value
  841. add @x[5],@x[5],@t[1]
  842. str @t[0],[sp,#4*(12)] @ save next counter value
  843. ldr @t[0],[r12],#16 @ load input
  844. add @x[6],@x[6],@t[2]
  845. add @x[4],@x[4],#3 @ counter+3
  846. ldr @t[1],[r12,#-12]
  847. add @x[7],@x[7],@t[3]
  848. ldr @t[2],[r12,#-8]
  849. ldr @t[3],[r12,#-4]
  850. # ifdef __ARMEB__
  851. rev @x[4],@x[4]
  852. rev @x[5],@x[5]
  853. rev @x[6],@x[6]
  854. rev @x[7],@x[7]
  855. # endif
  856. eor @x[4],@x[4],@t[0]
  857. # ifdef __thumb2__
  858. it hi
  859. # endif
  860. ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
  861. eor @x[5],@x[5],@t[1]
  862. eor @x[6],@x[6],@t[2]
  863. str @x[4],[r14],#16 @ store output
  864. eor @x[7],@x[7],@t[3]
  865. str @x[5],[r14,#-12]
  866. sub @t[3],@t[0],#64*4 @ len-=64*4
  867. str @x[6],[r14,#-8]
  868. str @x[7],[r14,#-4]
  869. bhi .Loop_neon_outer
  870. b .Ldone_neon
  871. .align 4
  872. .Lbreak_neon:
  873. @ harmonize NEON and integer-only stack frames: load data
  874. @ from NEON frame, but save to integer-only one; distance
  875. @ between the two is 4*(32+4+16-32)=4*(20).
  876. str @t[3], [sp,#4*(20+32+2)] @ save len
  877. add @t[3],sp,#4*(32+4)
  878. str r12, [sp,#4*(20+32+1)] @ save inp
  879. str r14, [sp,#4*(20+32+0)] @ save out
  880. ldr @x[12],[sp,#4*(16+10)]
  881. ldr @x[14],[sp,#4*(16+11)]
  882. vldmia @t[3],{d8-d15} @ fulfill ABI requirement
  883. str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]"
  884. str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]"
  885. ldr @t[3], [sp,#4*(15)]
  886. ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
  887. ldr @t[2], [sp,#4*(13)]
  888. ldr @x[14],[sp,#4*(14)]
  889. str @t[3], [sp,#4*(20+16+15)]
  890. add @t[3],sp,#4*(20)
  891. vst1.32 {$a0-$b0},[@t[3]]! @ copy key
  892. add sp,sp,#4*(20) @ switch frame
  893. vst1.32 {$c0-$d0},[@t[3]]
  894. mov @t[3],#10
  895. b .Loop @ go integer-only
  896. .align 4
  897. .Ltail_neon:
  898. cmp @t[3],#64*3
  899. bhs .L192_or_more_neon
  900. cmp @t[3],#64*2
  901. bhs .L128_or_more_neon
  902. cmp @t[3],#64*1
  903. bhs .L64_or_more_neon
  904. add @t[0],sp,#4*(8)
  905. vst1.8 {$a0-$b0},[sp]
  906. add @t[2],sp,#4*(0)
  907. vst1.8 {$c0-$d0},[@t[0]]
  908. b .Loop_tail_neon
  909. .align 4
  910. .L64_or_more_neon:
  911. vld1.8 {$t0-$t1},[r12]!
  912. vld1.8 {$t2-$t3},[r12]!
  913. veor $a0,$a0,$t0
  914. veor $b0,$b0,$t1
  915. veor $c0,$c0,$t2
  916. veor $d0,$d0,$t3
  917. vst1.8 {$a0-$b0},[r14]!
  918. vst1.8 {$c0-$d0},[r14]!
  919. beq .Ldone_neon
  920. add @t[0],sp,#4*(8)
  921. vst1.8 {$a1-$b1},[sp]
  922. add @t[2],sp,#4*(0)
  923. vst1.8 {$c1-$d1},[@t[0]]
  924. sub @t[3],@t[3],#64*1 @ len-=64*1
  925. b .Loop_tail_neon
  926. .align 4
  927. .L128_or_more_neon:
  928. vld1.8 {$t0-$t1},[r12]!
  929. vld1.8 {$t2-$t3},[r12]!
  930. veor $a0,$a0,$t0
  931. veor $b0,$b0,$t1
  932. vld1.8 {$t0-$t1},[r12]!
  933. veor $c0,$c0,$t2
  934. veor $d0,$d0,$t3
  935. vld1.8 {$t2-$t3},[r12]!
  936. veor $a1,$a1,$t0
  937. veor $b1,$b1,$t1
  938. vst1.8 {$a0-$b0},[r14]!
  939. veor $c1,$c1,$t2
  940. vst1.8 {$c0-$d0},[r14]!
  941. veor $d1,$d1,$t3
  942. vst1.8 {$a1-$b1},[r14]!
  943. vst1.8 {$c1-$d1},[r14]!
  944. beq .Ldone_neon
  945. add @t[0],sp,#4*(8)
  946. vst1.8 {$a2-$b2},[sp]
  947. add @t[2],sp,#4*(0)
  948. vst1.8 {$c2-$d2},[@t[0]]
  949. sub @t[3],@t[3],#64*2 @ len-=64*2
  950. b .Loop_tail_neon
  951. .align 4
  952. .L192_or_more_neon:
  953. vld1.8 {$t0-$t1},[r12]!
  954. vld1.8 {$t2-$t3},[r12]!
  955. veor $a0,$a0,$t0
  956. veor $b0,$b0,$t1
  957. vld1.8 {$t0-$t1},[r12]!
  958. veor $c0,$c0,$t2
  959. veor $d0,$d0,$t3
  960. vld1.8 {$t2-$t3},[r12]!
  961. veor $a1,$a1,$t0
  962. veor $b1,$b1,$t1
  963. vld1.8 {$t0-$t1},[r12]!
  964. veor $c1,$c1,$t2
  965. vst1.8 {$a0-$b0},[r14]!
  966. veor $d1,$d1,$t3
  967. vld1.8 {$t2-$t3},[r12]!
  968. veor $a2,$a2,$t0
  969. vst1.8 {$c0-$d0},[r14]!
  970. veor $b2,$b2,$t1
  971. vst1.8 {$a1-$b1},[r14]!
  972. veor $c2,$c2,$t2
  973. vst1.8 {$c1-$d1},[r14]!
  974. veor $d2,$d2,$t3
  975. vst1.8 {$a2-$b2},[r14]!
  976. vst1.8 {$c2-$d2},[r14]!
  977. beq .Ldone_neon
  978. ldmia sp,{@t[0]-@t[3]} @ load key material
  979. add @x[0],@x[0],@t[0] @ accumulate key material
  980. add @t[0],sp,#4*(4)
  981. add @x[1],@x[1],@t[1]
  982. add @x[2],@x[2],@t[2]
  983. add @x[3],@x[3],@t[3]
  984. ldmia @t[0],{@t[0]-@t[3]} @ load key material
  985. add @x[4],@x[4],@t[0] @ accumulate key material
  986. add @t[0],sp,#4*(8)
  987. add @x[5],@x[5],@t[1]
  988. add @x[6],@x[6],@t[2]
  989. add @x[7],@x[7],@t[3]
  990. ldmia @t[0],{@t[0]-@t[3]} @ load key material
  991. # ifdef __ARMEB__
  992. rev @x[0],@x[0]
  993. rev @x[1],@x[1]
  994. rev @x[2],@x[2]
  995. rev @x[3],@x[3]
  996. rev @x[4],@x[4]
  997. rev @x[5],@x[5]
  998. rev @x[6],@x[6]
  999. rev @x[7],@x[7]
  1000. # endif
  1001. stmia sp,{@x[0]-@x[7]}
  1002. add @x[0],sp,#4*(16+8)
  1003. ldmia @x[0],{@x[0]-@x[7]} @ load second half
  1004. add @x[0],@x[0],@t[0] @ accumulate key material
  1005. add @t[0],sp,#4*(12)
  1006. add @x[1],@x[1],@t[1]
  1007. add @x[2],@x[2],@t[2]
  1008. add @x[3],@x[3],@t[3]
  1009. ldmia @t[0],{@t[0]-@t[3]} @ load key material
  1010. add @x[4],@x[4],@t[0] @ accumulate key material
  1011. add @t[0],sp,#4*(8)
  1012. add @x[5],@x[5],@t[1]
  1013. add @x[4],@x[4],#3 @ counter+3
  1014. add @x[6],@x[6],@t[2]
  1015. add @x[7],@x[7],@t[3]
  1016. ldr @t[3],[sp,#4*(32+2)] @ re-load len
  1017. # ifdef __ARMEB__
  1018. rev @x[0],@x[0]
  1019. rev @x[1],@x[1]
  1020. rev @x[2],@x[2]
  1021. rev @x[3],@x[3]
  1022. rev @x[4],@x[4]
  1023. rev @x[5],@x[5]
  1024. rev @x[6],@x[6]
  1025. rev @x[7],@x[7]
  1026. # endif
  1027. stmia @t[0],{@x[0]-@x[7]}
  1028. add @t[2],sp,#4*(0)
  1029. sub @t[3],@t[3],#64*3 @ len-=64*3
  1030. .Loop_tail_neon:
  1031. ldrb @t[0],[@t[2]],#1 @ read buffer on stack
  1032. ldrb @t[1],[r12],#1 @ read input
  1033. subs @t[3],@t[3],#1
  1034. eor @t[0],@t[0],@t[1]
  1035. strb @t[0],[r14],#1 @ store output
  1036. bne .Loop_tail_neon
  1037. .Ldone_neon:
  1038. add sp,sp,#4*(32+4)
  1039. vldmia sp,{d8-d15}
  1040. add sp,sp,#4*(16+3)
  1041. ldmia sp!,{r4-r11,pc}
  1042. .size ChaCha20_neon,.-ChaCha20_neon
  1043. .comm OPENSSL_armcap_P,4,4
  1044. #endif
  1045. ___
  1046. }}}
  # Post-process the accumulated assembly text one line at a time and write
  # the result to STDOUT.
  for (split(/\n/, $code)) {
      # Replace every `...` span with the value of the Perl expression
      # it encloses.
      s{`([^`]*)`}{eval $1}ge;

      # Rewrite half-register references of the form qN#lo / qN#hi as the
      # matching double register: d(2N) for "lo", d(2N+1) for "hi".
      s{\bq([0-9]+)#(lo|hi)}{sprintf "d%d", 2*$1 + ($2 eq "hi" ? 1 : 0)}ge;

      print $_, "\n";
  }

  # Buffered write failures are only reported when the handle is closed,
  # so the result of close is checked explicitly.
  close STDOUT or die "error closing STDOUT: $!";