# poly1305-sparcv9.pl
  1. #! /usr/bin/env perl
  2. # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # This module implements Poly1305 hash for SPARCv9, vanilla, as well
  17. # as VIS3 and FMA extensions.
  18. #
  19. # May, August 2015
  20. #
  21. # Numbers are cycles per processed byte with poly1305_blocks alone.
  22. #
  23. # IALU(*) FMA
  24. #
  25. # UltraSPARC III 12.3(**)
  26. # SPARC T3 7.92
  27. # SPARC T4 1.70(***) 6.55
  28. # SPARC64 X 5.60 3.64
  29. #
  30. # (*) Comparison to compiler-generated code is really problematic,
  31. # because latter's performance varies too much depending on too
  32. # many variables. For example, one can measure from 5x to 15x
  33. # improvement on T4 for gcc-4.6. Well, in T4 case it's a bit
  34. # unfair comparison, because compiler doesn't use VIS3, but
  35. # given same initial conditions coefficient varies from 3x to 9x.
  36. # (**) Pre-III performance should be even worse; floating-point
  37. # performance for UltraSPARC I-IV on the other hand is reported
  38. # to be 4.25 for hand-coded assembly, but they are just too old
  39. # to care about.
  40. # (***) Multi-process benchmark saturates at ~12.5x single-process
  41. # result on 8-core processor, or ~21GBps per 2.85GHz socket.
  42. my $output = pop;
  43. open STDOUT,">$output";
  44. my ($ctx,$inp,$len,$padbit,$shl,$shr) = map("%i$_",(0..5));
  45. my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4) = map("%l$_",(0..7));
  46. my ($h0,$h1,$h2,$h3, $t0,$t1,$t2) = map("%o$_",(0..5,7));
  47. my ($d0,$d1,$d2,$d3) = map("%g$_",(1..4));
  48. my $output = pop;
  49. open STDOUT,">$stdout";
# Vanilla SPARCv9 (32-bit IALU) code path.  poly1305_init probes CPU
# capabilities: pure-FMA CPUs branch to the FP implementation; when VIS3
# is present the function-pointer table at %i2 is redirected to the VIS3
# blocks/emit routines.  The heredoc content is emitted verbatim as
# assembly and must not be modified.
$code.=<<___;
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
# define STPTR stx
# define SIZE_T 8
#else
# define STPTR st
# define SIZE_T 4
#endif
#define LOCALS (STACK_BIAS+STACK_FRAME)
.section ".text",#alloc,#execinstr
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
.globl poly1305_init
.align 32
poly1305_init:
save %sp,-STACK_FRAME-16,%sp
nop
SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
ld [%g1],%g1
and %g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
cmp %g1,SPARCV9_FMADD
be .Lpoly1305_init_fma
nop
stx %g0,[$ctx+0]
stx %g0,[$ctx+8] ! zero hash value
brz,pn $inp,.Lno_key
stx %g0,[$ctx+16]
and $inp,7,$shr ! alignment factor
andn $inp,7,$inp
sll $shr,3,$shr ! *8
neg $shr,$shl
sethi %hi(0x0ffffffc),$t0
set 8,$h1
or $t0,%lo(0x0ffffffc),$t0
set 16,$h2
sllx $t0,32,$t1
or $t0,$t1,$t1 ! 0x0ffffffc0ffffffc
or $t1,3,$t0 ! 0x0ffffffc0fffffff
ldxa [$inp+%g0]0x88,$h0 ! load little-endian key
brz,pt $shr,.Lkey_aligned
ldxa [$inp+$h1]0x88,$h1
ldxa [$inp+$h2]0x88,$h2
srlx $h0,$shr,$h0
sllx $h1,$shl,$t2
srlx $h1,$shr,$h1
or $t2,$h0,$h0
sllx $h2,$shl,$h2
or $h2,$h1,$h1
.Lkey_aligned:
and $t0,$h0,$h0
and $t1,$h1,$h1
stx $h0,[$ctx+32+0] ! store key
stx $h1,[$ctx+32+8]
andcc %g1,SPARCV9_VIS3,%g0
be .Lno_key
nop
1: call .+8
add %o7,poly1305_blocks_vis3-1b,%o7
add %o7,poly1305_emit-poly1305_blocks_vis3,%o5
STPTR %o7,[%i2]
STPTR %o5,[%i2+SIZE_T]
ret
restore %g0,1,%o0 ! return 1
.Lno_key:
ret
restore %g0,%g0,%o0 ! return 0
.type poly1305_init,#function
.size poly1305_init,.-poly1305_init
.globl poly1305_blocks
.align 32
poly1305_blocks:
save %sp,-STACK_FRAME,%sp
srln $len,4,$len
brz,pn $len,.Lno_data
nop
ld [$ctx+32+0],$r1 ! load key
ld [$ctx+32+4],$r0
ld [$ctx+32+8],$r3
ld [$ctx+32+12],$r2
ld [$ctx+0],$h1 ! load hash value
ld [$ctx+4],$h0
ld [$ctx+8],$h3
ld [$ctx+12],$h2
ld [$ctx+16],$h4
and $inp,7,$shr ! alignment factor
andn $inp,7,$inp
set 8,$d1
sll $shr,3,$shr ! *8
set 16,$d2
neg $shr,$shl
srl $r1,2,$s1
srl $r2,2,$s2
add $r1,$s1,$s1
srl $r3,2,$s3
add $r2,$s2,$s2
add $r3,$s3,$s3
.Loop:
ldxa [$inp+%g0]0x88,$d0 ! load little-endian input
brz,pt $shr,.Linp_aligned
ldxa [$inp+$d1]0x88,$d1
ldxa [$inp+$d2]0x88,$d2
srlx $d0,$shr,$d0
sllx $d1,$shl,$t1
srlx $d1,$shr,$d1
or $t1,$d0,$d0
sllx $d2,$shl,$d2
or $d2,$d1,$d1
.Linp_aligned:
srlx $d0,32,$t0
addcc $d0,$h0,$h0 ! accumulate input
srlx $d1,32,$t1
addccc $t0,$h1,$h1
addccc $d1,$h2,$h2
addccc $t1,$h3,$h3
addc $padbit,$h4,$h4
umul $r0,$h0,$d0
umul $r1,$h0,$d1
umul $r2,$h0,$d2
umul $r3,$h0,$d3
sub $len,1,$len
add $inp,16,$inp
umul $s3,$h1,$t0
umul $r0,$h1,$t1
umul $r1,$h1,$t2
add $t0,$d0,$d0
add $t1,$d1,$d1
umul $r2,$h1,$t0
add $t2,$d2,$d2
add $t0,$d3,$d3
umul $s2,$h2,$t1
umul $s3,$h2,$t2
umul $r0,$h2,$t0
add $t1,$d0,$d0
add $t2,$d1,$d1
umul $r1,$h2,$t1
add $t0,$d2,$d2
add $t1,$d3,$d3
umul $s1,$h3,$t2
umul $s2,$h3,$t0
umul $s3,$h3,$t1
add $t2,$d0,$d0
add $t0,$d1,$d1
umul $r0,$h3,$t2
add $t1,$d2,$d2
add $t2,$d3,$d3
umul $s1,$h4,$t0
umul $s2,$h4,$t1
umul $s3,$h4,$t2
umul $r0,$h4,$h4
add $t0,$d1,$d1
add $t1,$d2,$d2
srlx $d0,32,$h1
add $t2,$d3,$d3
srlx $d1,32,$h2
addcc $d1,$h1,$h1
srlx $d2,32,$h3
set 8,$d1
addccc $d2,$h2,$h2
srlx $d3,32,$t0
set 16,$d2
addccc $d3,$h3,$h3
addc $t0,$h4,$h4
srl $h4,2,$t0 ! final reduction step
andn $h4,3,$t1
and $h4,3,$h4
add $t1,$t0,$t0
addcc $t0,$d0,$h0
addccc %g0,$h1,$h1
addccc %g0,$h2,$h2
addccc %g0,$h3,$h3
brnz,pt $len,.Loop
addc %g0,$h4,$h4
st $h1,[$ctx+0] ! store hash value
st $h0,[$ctx+4]
st $h3,[$ctx+8]
st $h2,[$ctx+12]
st $h4,[$ctx+16]
.Lno_data:
ret
restore
.type poly1305_blocks,#function
.size poly1305_blocks,.-poly1305_blocks
___
########################################################################
# VIS3 has umulxhi and addxc...
#
# 64x64->128 multiply via mulx/umulxhi with carry-chain adds; key and
# hash are handled as two 64-bit limbs plus a small top limb.  The
# heredoc is emitted verbatim as assembly and must not be modified.
{
my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));

$code.=<<___;
.align 32
poly1305_blocks_vis3:
save %sp,-STACK_FRAME,%sp
srln $len,4,$len
brz,pn $len,.Lno_data
nop
ldx [$ctx+32+0],$R0 ! load key
ldx [$ctx+32+8],$R1
ldx [$ctx+0],$H0 ! load hash value
ldx [$ctx+8],$H1
ld [$ctx+16],$H2
and $inp,7,$shr ! alignment factor
andn $inp,7,$inp
set 8,$r1
sll $shr,3,$shr ! *8
set 16,$r2
neg $shr,$shl
srlx $R1,2,$S1
b .Loop_vis3
add $R1,$S1,$S1
.Loop_vis3:
ldxa [$inp+%g0]0x88,$D0 ! load little-endian input
brz,pt $shr,.Linp_aligned_vis3
ldxa [$inp+$r1]0x88,$D1
ldxa [$inp+$r2]0x88,$D2
srlx $D0,$shr,$D0
sllx $D1,$shl,$T1
srlx $D1,$shr,$D1
or $T1,$D0,$D0
sllx $D2,$shl,$D2
or $D2,$D1,$D1
.Linp_aligned_vis3:
addcc $D0,$H0,$H0 ! accumulate input
sub $len,1,$len
addxccc $D1,$H1,$H1
add $inp,16,$inp
mulx $R0,$H0,$D0 ! r0*h0
addxc $padbit,$H2,$H2
umulxhi $R0,$H0,$D1
mulx $S1,$H1,$T0 ! s1*h1
umulxhi $S1,$H1,$T1
addcc $T0,$D0,$D0
mulx $R1,$H0,$T0 ! r1*h0
addxc $T1,$D1,$D1
umulxhi $R1,$H0,$D2
addcc $T0,$D1,$D1
mulx $R0,$H1,$T0 ! r0*h1
addxc %g0,$D2,$D2
umulxhi $R0,$H1,$T1
addcc $T0,$D1,$D1
mulx $S1,$H2,$T0 ! s1*h2
addxc $T1,$D2,$D2
mulx $R0,$H2,$T1 ! r0*h2
addcc $T0,$D1,$D1
addxc $T1,$D2,$D2
srlx $D2,2,$T0 ! final reduction step
andn $D2,3,$T1
and $D2,3,$H2
add $T1,$T0,$T0
addcc $T0,$D0,$H0
addxccc %g0,$D1,$H1
brnz,pt $len,.Loop_vis3
addxc %g0,$H2,$H2
stx $H0,[$ctx+0] ! store hash value
stx $H1,[$ctx+8]
st $H2,[$ctx+16]
ret
restore
.type poly1305_blocks_vis3,#function
.size poly1305_blocks_vis3,.-poly1305_blocks_vis3
___
}
# poly1305_emit reuses the $inp/$len argument registers as the mac
# (output buffer) and nonce pointers.  Final modular reduction is done
# by conditionally selecting h+5 via movnz, then the nonce is added and
# the 16-byte tag stored little-endian.  Heredoc emitted verbatim.
my ($mac,$nonce) = ($inp,$len);

$code.=<<___;
.globl poly1305_emit
.align 32
poly1305_emit:
save %sp,-STACK_FRAME,%sp
ld [$ctx+0],$h1 ! load hash value
ld [$ctx+4],$h0
ld [$ctx+8],$h3
ld [$ctx+12],$h2
ld [$ctx+16],$h4
addcc $h0,5,$r0 ! compare to modulus
addccc $h1,0,$r1
addccc $h2,0,$r2
addccc $h3,0,$r3
addc $h4,0,$h4
andcc $h4,4,%g0 ! did it carry/borrow?
movnz %icc,$r0,$h0
ld [$nonce+0],$r0 ! load nonce
movnz %icc,$r1,$h1
ld [$nonce+4],$r1
movnz %icc,$r2,$h2
ld [$nonce+8],$r2
movnz %icc,$r3,$h3
ld [$nonce+12],$r3
addcc $r0,$h0,$h0 ! accumulate nonce
addccc $r1,$h1,$h1
addccc $r2,$h2,$h2
addc $r3,$h3,$h3
srl $h0,8,$r0
stb $h0,[$mac+0] ! store little-endian result
srl $h0,16,$r1
stb $r0,[$mac+1]
srl $h0,24,$r2
stb $r1,[$mac+2]
stb $r2,[$mac+3]
srl $h1,8,$r0
stb $h1,[$mac+4]
srl $h1,16,$r1
stb $r0,[$mac+5]
srl $h1,24,$r2
stb $r1,[$mac+6]
stb $r2,[$mac+7]
srl $h2,8,$r0
stb $h2,[$mac+8]
srl $h2,16,$r1
stb $r0,[$mac+9]
srl $h2,24,$r2
stb $r1,[$mac+10]
stb $r2,[$mac+11]
srl $h3,8,$r0
stb $h3,[$mac+12]
srl $h3,16,$r1
stb $r0,[$mac+13]
srl $h3,24,$r2
stb $r1,[$mac+14]
stb $r2,[$mac+15]
ret
restore
.type poly1305_emit,#function
.size poly1305_emit,.-poly1305_emit
___
{
# FMA (floating-point) code path: limbs are kept in double-precision
# registers with 2^52-scaled exponent bias (see .Lconsts_fma), split
# into lo/hi halves so fmaddd products stay exact.  All heredocs below
# are emitted verbatim as assembly and must not be modified.
my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
my $i2=$step;
my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
$two0,$two32,$two64,$two96,$two130,$five_two130,
$r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
$s2lo,$s2hi,$s3lo,$s3hi,
$c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
# borrowings
my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);

$code.=<<___;
.align 32
poly1305_init_fma:
save %sp,-STACK_FRAME-16,%sp
nop
.Lpoly1305_init_fma:
1: call .+8
add %o7,.Lconsts_fma-1b,%o7
ldd [%o7+8*0],$two0 ! load constants
ldd [%o7+8*1],$two32
ldd [%o7+8*2],$two64
ldd [%o7+8*3],$two96
ldd [%o7+8*5],$five_two130
std $two0,[$ctx+8*0] ! initial hash value, biased 0
std $two32,[$ctx+8*1]
std $two64,[$ctx+8*2]
std $two96,[$ctx+8*3]
brz,pn $inp,.Lno_key_fma
nop
stx %fsr,[%sp+LOCALS] ! save original %fsr
ldx [%o7+8*6],%fsr ! load new %fsr
std $two0,[$ctx+8*4] ! key "template"
std $two32,[$ctx+8*5]
std $two64,[$ctx+8*6]
std $two96,[$ctx+8*7]
and $inp,7,$shr
andn $inp,7,$inp ! align pointer
mov 8,$i1
sll $shr,3,$shr
mov 16,$i2
neg $shr,$shl
ldxa [$inp+%g0]0x88,$in0 ! load little-endian key
ldxa [$inp+$i1]0x88,$in2
brz $shr,.Lkey_aligned_fma
sethi %hi(0xf0000000),$i1 ! 0xf0000000
ldxa [$inp+$i2]0x88,$in4
srlx $in0,$shr,$in0 ! align data
sllx $in2,$shl,$in1
srlx $in2,$shr,$in2
or $in1,$in0,$in0
sllx $in4,$shl,$in3
or $in3,$in2,$in2
.Lkey_aligned_fma:
or $i1,3,$i2 ! 0xf0000003
srlx $in0,32,$in1
andn $in0,$i1,$in0 ! &=0x0fffffff
andn $in1,$i2,$in1 ! &=0x0ffffffc
srlx $in2,32,$in3
andn $in2,$i2,$in2
andn $in3,$i2,$in3
st $in0,[$ctx+`8*4+4`] ! fill "template"
st $in1,[$ctx+`8*5+4`]
st $in2,[$ctx+`8*6+4`]
st $in3,[$ctx+`8*7+4`]
ldd [$ctx+8*4],$h0lo ! load [biased] key
ldd [$ctx+8*5],$h1lo
ldd [$ctx+8*6],$h2lo
ldd [$ctx+8*7],$h3lo
fsubd $h0lo,$two0, $h0lo ! r0
ldd [%o7+8*7],$two0 ! more constants
fsubd $h1lo,$two32,$h1lo ! r1
ldd [%o7+8*8],$two32
fsubd $h2lo,$two64,$h2lo ! r2
ldd [%o7+8*9],$two64
fsubd $h3lo,$two96,$h3lo ! r3
ldd [%o7+8*10],$two96
fmuld $five_two130,$h1lo,$s1lo ! s1
fmuld $five_two130,$h2lo,$s2lo ! s2
fmuld $five_two130,$h3lo,$s3lo ! s3
faddd $h0lo,$two0, $h0hi
faddd $h1lo,$two32,$h1hi
faddd $h2lo,$two64,$h2hi
faddd $h3lo,$two96,$h3hi
fsubd $h0hi,$two0, $h0hi
ldd [%o7+8*11],$two0 ! more constants
fsubd $h1hi,$two32,$h1hi
ldd [%o7+8*12],$two32
fsubd $h2hi,$two64,$h2hi
ldd [%o7+8*13],$two64
fsubd $h3hi,$two96,$h3hi
fsubd $h0lo,$h0hi,$h0lo
std $h0hi,[$ctx+8*5] ! r0hi
fsubd $h1lo,$h1hi,$h1lo
std $h1hi,[$ctx+8*7] ! r1hi
fsubd $h2lo,$h2hi,$h2lo
std $h2hi,[$ctx+8*9] ! r2hi
fsubd $h3lo,$h3hi,$h3lo
std $h3hi,[$ctx+8*11] ! r3hi
faddd $s1lo,$two0, $s1hi
faddd $s2lo,$two32,$s2hi
faddd $s3lo,$two64,$s3hi
fsubd $s1hi,$two0, $s1hi
fsubd $s2hi,$two32,$s2hi
fsubd $s3hi,$two64,$s3hi
fsubd $s1lo,$s1hi,$s1lo
fsubd $s2lo,$s2hi,$s2lo
fsubd $s3lo,$s3hi,$s3lo
ldx [%sp+LOCALS],%fsr ! restore %fsr
std $h0lo,[$ctx+8*4] ! r0lo
std $h1lo,[$ctx+8*6] ! r1lo
std $h2lo,[$ctx+8*8] ! r2lo
std $h3lo,[$ctx+8*10] ! r3lo
std $s1hi,[$ctx+8*13]
std $s2hi,[$ctx+8*15]
std $s3hi,[$ctx+8*17]
std $s1lo,[$ctx+8*12]
std $s2lo,[$ctx+8*14]
std $s3lo,[$ctx+8*16]
add %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
add %o7,poly1305_emit_fma-.Lconsts_fma,%o1
STPTR %o0,[%i2]
STPTR %o1,[%i2+SIZE_T]
ret
restore %g0,1,%o0 ! return 1
.Lno_key_fma:
ret
restore %g0,%g0,%o0 ! return 0
.type poly1305_init_fma,#function
.size poly1305_init_fma,.-poly1305_init_fma
.align 32
poly1305_blocks_fma:
save %sp,-STACK_FRAME-48,%sp
srln $len,4,$len
brz,pn $len,.Labort
sub $len,1,$len
1: call .+8
add %o7,.Lconsts_fma-1b,%o7
ldd [%o7+8*0],$two0 ! load constants
ldd [%o7+8*1],$two32
ldd [%o7+8*2],$two64
ldd [%o7+8*3],$two96
ldd [%o7+8*4],$two130
ldd [%o7+8*5],$five_two130
ldd [$ctx+8*0],$h0lo ! load [biased] hash value
ldd [$ctx+8*1],$h1lo
ldd [$ctx+8*2],$h2lo
ldd [$ctx+8*3],$h3lo
std $two0,[%sp+LOCALS+8*0] ! input "template"
sethi %hi((1023+52+96)<<20),$in3
std $two32,[%sp+LOCALS+8*1]
or $padbit,$in3,$in3
std $two64,[%sp+LOCALS+8*2]
st $in3,[%sp+LOCALS+8*3]
and $inp,7,$shr
andn $inp,7,$inp ! align pointer
mov 8,$i1
sll $shr,3,$shr
mov 16,$step
neg $shr,$shl
ldxa [$inp+%g0]0x88,$in0 ! load little-endian input
brz $shr,.Linp_aligned_fma
ldxa [$inp+$i1]0x88,$in2
ldxa [$inp+$step]0x88,$in4
add $inp,8,$inp
srlx $in0,$shr,$in0 ! align data
sllx $in2,$shl,$in1
srlx $in2,$shr,$in2
or $in1,$in0,$in0
sllx $in4,$shl,$in3
srlx $in4,$shr,$in4 ! pre-shift
or $in3,$in2,$in2
.Linp_aligned_fma:
srlx $in0,32,$in1
movrz $len,0,$step
srlx $in2,32,$in3
add $step,$inp,$inp ! conditional advance
st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
st $in1,[%sp+LOCALS+8*1+4]
st $in2,[%sp+LOCALS+8*2+4]
st $in3,[%sp+LOCALS+8*3+4]
ldd [$ctx+8*4],$r0lo ! load key
ldd [$ctx+8*5],$r0hi
ldd [$ctx+8*6],$r1lo
ldd [$ctx+8*7],$r1hi
ldd [$ctx+8*8],$r2lo
ldd [$ctx+8*9],$r2hi
ldd [$ctx+8*10],$r3lo
ldd [$ctx+8*11],$r3hi
ldd [$ctx+8*12],$s1lo
ldd [$ctx+8*13],$s1hi
ldd [$ctx+8*14],$s2lo
ldd [$ctx+8*15],$s2hi
ldd [$ctx+8*16],$s3lo
ldd [$ctx+8*17],$s3hi
stx %fsr,[%sp+LOCALS+8*4] ! save original %fsr
ldx [%o7+8*6],%fsr ! load new %fsr
subcc $len,1,$len
movrz $len,0,$step
ldd [%sp+LOCALS+8*0],$x0 ! load biased input
ldd [%sp+LOCALS+8*1],$x1
ldd [%sp+LOCALS+8*2],$x2
ldd [%sp+LOCALS+8*3],$x3
fsubd $h0lo,$two0, $h0lo ! de-bias hash value
fsubd $h1lo,$two32,$h1lo
ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
fsubd $h2lo,$two64,$h2lo
fsubd $h3lo,$two96,$h3lo
ldxa [$inp+$i1]0x88,$in2
fsubd $x0,$two0, $x0 ! de-bias input
fsubd $x1,$two32,$x1
fsubd $x2,$two64,$x2
fsubd $x3,$two96,$x3
brz $shr,.Linp_aligned_fma2
add $step,$inp,$inp ! conditional advance
sllx $in0,$shl,$in1 ! align data
srlx $in0,$shr,$in3
or $in1,$in4,$in0
sllx $in2,$shl,$in1
srlx $in2,$shr,$in4 ! pre-shift
or $in3,$in1,$in2
.Linp_aligned_fma2:
srlx $in0,32,$in1
srlx $in2,32,$in3
faddd $h0lo,$x0,$x0 ! accumulate input
stw $in0,[%sp+LOCALS+8*0+4]
faddd $h1lo,$x1,$x1
stw $in1,[%sp+LOCALS+8*1+4]
faddd $h2lo,$x2,$x2
stw $in2,[%sp+LOCALS+8*2+4]
faddd $h3lo,$x3,$x3
stw $in3,[%sp+LOCALS+8*3+4]
b .Lentry_fma
nop
.align 16
.Loop_fma:
ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
ldxa [$inp+$i1]0x88,$in2
movrz $len,0,$step
faddd $y0,$h0lo,$h0lo ! accumulate input
faddd $y1,$h0hi,$h0hi
faddd $y2,$h2lo,$h2lo
faddd $y3,$h2hi,$h2hi
brz,pn $shr,.Linp_aligned_fma3
add $step,$inp,$inp ! conditional advance
sllx $in0,$shl,$in1 ! align data
srlx $in0,$shr,$in3
or $in1,$in4,$in0
sllx $in2,$shl,$in1
srlx $in2,$shr,$in4 ! pre-shift
or $in3,$in1,$in2
.Linp_aligned_fma3:
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
faddd $two64,$h1lo,$c1lo
srlx $in0,32,$in1
faddd $two64,$h1hi,$c1hi
srlx $in2,32,$in3
faddd $two130,$h3lo,$c3lo
st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
faddd $two130,$h3hi,$c3hi
st $in1,[%sp+LOCALS+8*1+4]
faddd $two32,$h0lo,$c0lo
st $in2,[%sp+LOCALS+8*2+4]
faddd $two32,$h0hi,$c0hi
st $in3,[%sp+LOCALS+8*3+4]
faddd $two96,$h2lo,$c2lo
faddd $two96,$h2hi,$c2hi
fsubd $c1lo,$two64,$c1lo
fsubd $c1hi,$two64,$c1hi
fsubd $c3lo,$two130,$c3lo
fsubd $c3hi,$two130,$c3hi
fsubd $c0lo,$two32,$c0lo
fsubd $c0hi,$two32,$c0hi
fsubd $c2lo,$two96,$c2lo
fsubd $c2hi,$two96,$c2hi
fsubd $h1lo,$c1lo,$h1lo
fsubd $h1hi,$c1hi,$h1hi
fsubd $h3lo,$c3lo,$h3lo
fsubd $h3hi,$c3hi,$h3hi
fsubd $h2lo,$c2lo,$h2lo
fsubd $h2hi,$c2hi,$h2hi
fsubd $h0lo,$c0lo,$h0lo
fsubd $h0hi,$c0hi,$h0hi
faddd $h1lo,$c0lo,$h1lo
faddd $h1hi,$c0hi,$h1hi
faddd $h3lo,$c2lo,$h3lo
faddd $h3hi,$c2hi,$h3hi
faddd $h2lo,$c1lo,$h2lo
faddd $h2hi,$c1hi,$h2hi
fmaddd $five_two130,$c3lo,$h0lo,$h0lo
fmaddd $five_two130,$c3hi,$h0hi,$h0hi
faddd $h1lo,$h1hi,$x1
ldd [$ctx+8*12],$s1lo ! reload constants
faddd $h3lo,$h3hi,$x3
ldd [$ctx+8*13],$s1hi
faddd $h2lo,$h2hi,$x2
ldd [$ctx+8*10],$r3lo
faddd $h0lo,$h0hi,$x0
ldd [$ctx+8*11],$r3hi
.Lentry_fma:
fmuld $x1,$s3lo,$h0lo
fmuld $x1,$s3hi,$h0hi
fmuld $x1,$r1lo,$h2lo
fmuld $x1,$r1hi,$h2hi
fmuld $x1,$r0lo,$h1lo
fmuld $x1,$r0hi,$h1hi
fmuld $x1,$r2lo,$h3lo
fmuld $x1,$r2hi,$h3hi
fmaddd $x3,$s1lo,$h0lo,$h0lo
fmaddd $x3,$s1hi,$h0hi,$h0hi
fmaddd $x3,$s3lo,$h2lo,$h2lo
fmaddd $x3,$s3hi,$h2hi,$h2hi
fmaddd $x3,$s2lo,$h1lo,$h1lo
fmaddd $x3,$s2hi,$h1hi,$h1hi
fmaddd $x3,$r0lo,$h3lo,$h3lo
fmaddd $x3,$r0hi,$h3hi,$h3hi
fmaddd $x2,$s2lo,$h0lo,$h0lo
fmaddd $x2,$s2hi,$h0hi,$h0hi
fmaddd $x2,$r0lo,$h2lo,$h2lo
fmaddd $x2,$r0hi,$h2hi,$h2hi
fmaddd $x2,$s3lo,$h1lo,$h1lo
ldd [%sp+LOCALS+8*0],$y0 ! load [biased] input
fmaddd $x2,$s3hi,$h1hi,$h1hi
ldd [%sp+LOCALS+8*1],$y1
fmaddd $x2,$r1lo,$h3lo,$h3lo
ldd [%sp+LOCALS+8*2],$y2
fmaddd $x2,$r1hi,$h3hi,$h3hi
ldd [%sp+LOCALS+8*3],$y3
fmaddd $x0,$r0lo,$h0lo,$h0lo
fsubd $y0,$two0, $y0 ! de-bias input
fmaddd $x0,$r0hi,$h0hi,$h0hi
fsubd $y1,$two32,$y1
fmaddd $x0,$r2lo,$h2lo,$h2lo
fsubd $y2,$two64,$y2
fmaddd $x0,$r2hi,$h2hi,$h2hi
fsubd $y3,$two96,$y3
fmaddd $x0,$r1lo,$h1lo,$h1lo
fmaddd $x0,$r1hi,$h1hi,$h1hi
fmaddd $x0,$r3lo,$h3lo,$h3lo
fmaddd $x0,$r3hi,$h3hi,$h3hi
bcc SIZE_T_CC,.Loop_fma
subcc $len,1,$len
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
faddd $h0lo,$two32,$c0lo
faddd $h0hi,$two32,$c0hi
faddd $h2lo,$two96,$c2lo
faddd $h2hi,$two96,$c2hi
faddd $h1lo,$two64,$c1lo
faddd $h1hi,$two64,$c1hi
faddd $h3lo,$two130,$c3lo
faddd $h3hi,$two130,$c3hi
fsubd $c0lo,$two32,$c0lo
fsubd $c0hi,$two32,$c0hi
fsubd $c2lo,$two96,$c2lo
fsubd $c2hi,$two96,$c2hi
fsubd $c1lo,$two64,$c1lo
fsubd $c1hi,$two64,$c1hi
fsubd $c3lo,$two130,$c3lo
fsubd $c3hi,$two130,$c3hi
fsubd $h1lo,$c1lo,$h1lo
fsubd $h1hi,$c1hi,$h1hi
fsubd $h3lo,$c3lo,$h3lo
fsubd $h3hi,$c3hi,$h3hi
fsubd $h2lo,$c2lo,$h2lo
fsubd $h2hi,$c2hi,$h2hi
fsubd $h0lo,$c0lo,$h0lo
fsubd $h0hi,$c0hi,$h0hi
faddd $h1lo,$c0lo,$h1lo
faddd $h1hi,$c0hi,$h1hi
faddd $h3lo,$c2lo,$h3lo
faddd $h3hi,$c2hi,$h3hi
faddd $h2lo,$c1lo,$h2lo
faddd $h2hi,$c1hi,$h2hi
fmaddd $five_two130,$c3lo,$h0lo,$h0lo
fmaddd $five_two130,$c3hi,$h0hi,$h0hi
faddd $h1lo,$h1hi,$x1
faddd $h3lo,$h3hi,$x3
faddd $h2lo,$h2hi,$x2
faddd $h0lo,$h0hi,$x0
faddd $x1,$two32,$x1 ! bias
faddd $x3,$two96,$x3
faddd $x2,$two64,$x2
faddd $x0,$two0, $x0
ldx [%sp+LOCALS+8*4],%fsr ! restore saved %fsr
std $x1,[$ctx+8*1] ! store [biased] hash value
std $x3,[$ctx+8*3]
std $x2,[$ctx+8*2]
std $x0,[$ctx+8*0]
.Labort:
ret
restore
.type poly1305_blocks_fma,#function
.size poly1305_blocks_fma,.-poly1305_blocks_fma
___
{
# emit for the FMA path: the stored hash is still in biased FP format,
# so the IEEE exponent field is masked off and a final partial
# reduction performed before adding the nonce.
my ($mac,$nonce)=($inp,$len);
my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
) = (map("%l$_",(0..5)),map("%o$_",(0..4)));

$code.=<<___;
.align 32
poly1305_emit_fma:
save %sp,-STACK_FRAME,%sp
ld [$ctx+8*0+0],$d0 ! load hash
ld [$ctx+8*0+4],$h0
ld [$ctx+8*1+0],$d1
ld [$ctx+8*1+4],$h1
ld [$ctx+8*2+0],$d2
ld [$ctx+8*2+4],$h2
ld [$ctx+8*3+0],$d3
ld [$ctx+8*3+4],$h3
sethi %hi(0xfff00000),$mask
andn $d0,$mask,$d0 ! mask exponent
andn $d1,$mask,$d1
andn $d2,$mask,$d2
andn $d3,$mask,$d3 ! can be partially reduced...
mov 3,$mask
srl $d3,2,$padbit ! ... so reduce
and $d3,$mask,$h4
andn $d3,$mask,$d3
add $padbit,$d3,$d3
addcc $d3,$h0,$h0
addccc $d0,$h1,$h1
addccc $d1,$h2,$h2
addccc $d2,$h3,$h3
addc %g0,$h4,$h4
addcc $h0,5,$d0 ! compare to modulus
addccc $h1,0,$d1
addccc $h2,0,$d2
addccc $h3,0,$d3
addc $h4,0,$mask
srl $mask,2,$mask ! did it carry/borrow?
neg $mask,$mask
sra $mask,31,$mask ! mask
andn $h0,$mask,$h0
and $d0,$mask,$d0
andn $h1,$mask,$h1
and $d1,$mask,$d1
or $d0,$h0,$h0
ld [$nonce+0],$d0 ! load nonce
andn $h2,$mask,$h2
and $d2,$mask,$d2
or $d1,$h1,$h1
ld [$nonce+4],$d1
andn $h3,$mask,$h3
and $d3,$mask,$d3
or $d2,$h2,$h2
ld [$nonce+8],$d2
or $d3,$h3,$h3
ld [$nonce+12],$d3
addcc $d0,$h0,$h0 ! accumulate nonce
addccc $d1,$h1,$h1
addccc $d2,$h2,$h2
addc $d3,$h3,$h3
stb $h0,[$mac+0] ! write little-endian result
srl $h0,8,$h0
stb $h1,[$mac+4]
srl $h1,8,$h1
stb $h2,[$mac+8]
srl $h2,8,$h2
stb $h3,[$mac+12]
srl $h3,8,$h3
stb $h0,[$mac+1]
srl $h0,8,$h0
stb $h1,[$mac+5]
srl $h1,8,$h1
stb $h2,[$mac+9]
srl $h2,8,$h2
stb $h3,[$mac+13]
srl $h3,8,$h3
stb $h0,[$mac+2]
srl $h0,8,$h0
stb $h1,[$mac+6]
srl $h1,8,$h1
stb $h2,[$mac+10]
srl $h2,8,$h2
stb $h3,[$mac+14]
srl $h3,8,$h3
stb $h0,[$mac+3]
stb $h1,[$mac+7]
stb $h2,[$mac+11]
stb $h3,[$mac+15]
ret
restore
.type poly1305_emit_fma,#function
.size poly1305_emit_fma,.-poly1305_emit_fma
___
}
# Bias constants (2^52-scaled powers of two) and the %fsr word that
# selects round-toward-zero with exceptions masked.
$code.=<<___;
.align 64
.Lconsts_fma:
.word 0x43300000,0x00000000 ! 2^(52+0)
.word 0x45300000,0x00000000 ! 2^(52+32)
.word 0x47300000,0x00000000 ! 2^(52+64)
.word 0x49300000,0x00000000 ! 2^(52+96)
.word 0x4b500000,0x00000000 ! 2^(52+130)
.word 0x37f40000,0x00000000 ! 5/2^130
.word 0,1<<30 ! fsr: truncate, no exceptions
.word 0x44300000,0x00000000 ! 2^(52+16+0)
.word 0x46300000,0x00000000 ! 2^(52+16+32)
.word 0x48300000,0x00000000 ! 2^(52+16+64)
.word 0x4a300000,0x00000000 ! 2^(52+16+96)
.word 0x3e300000,0x00000000 ! 2^(52+16+0-96)
.word 0x40300000,0x00000000 ! 2^(52+16+32-96)
.word 0x42300000,0x00000000 ! 2^(52+16+64-96)
.asciz "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
}
  888. # Purpose of these subroutines is to explicitly encode VIS instructions,
  889. # so that one can compile the module without having to specify VIS
  890. # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
  891. # Idea is to reserve for option to produce "universal" binary and let
  892. # programmer detect if current CPU is VIS capable at run-time.
  893. sub unvis3 {
  894. my ($mnemonic,$rs1,$rs2,$rd)=@_;
  895. my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
  896. my ($ref,$opf);
  897. my %visopf = ( "addxc" => 0x011,
  898. "addxccc" => 0x013,
  899. "umulxhi" => 0x016 );
  900. $ref = "$mnemonic\t$rs1,$rs2,$rd";
  901. if ($opf=$visopf{$mnemonic}) {
  902. foreach ($rs1,$rs2,$rd) {
  903. return $ref if (!/%([goli])([0-9])/);
  904. $_=$bias{$1}+$2;
  905. }
  906. return sprintf ".word\t0x%08x !%s",
  907. 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
  908. $ref;
  909. } else {
  910. return $ref;
  911. }
  912. }
  913. sub unfma {
  914. my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
  915. my ($ref,$opf);
  916. my %fmaopf = ( "fmadds" => 0x1,
  917. "fmaddd" => 0x2,
  918. "fmsubs" => 0x5,
  919. "fmsubd" => 0x6 );
  920. $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
  921. if ($opf=$fmaopf{$mnemonic}) {
  922. foreach ($rs1,$rs2,$rs3,$rd) {
  923. return $ref if (!/%f([0-9]{1,2})/);
  924. $_=$1;
  925. if ($1>=32) {
  926. return $ref if ($1&1);
  927. # re-encode for upper double register addressing
  928. $_=($1|$1>>5)&31;
  929. }
  930. }
  931. return sprintf ".word\t0x%08x !%s",
  932. 0x81b80000|$rd<<25|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
  933. $ref;
  934. } else {
  935. return $ref;
  936. }
  937. }
# Post-process the accumulated assembly line by line: evaluate `...`
# arithmetic embedded in memory offsets, then rewrite VIS3 and FMA
# mnemonics into raw .word encodings (via unvis3/unfma above) so the
# output assembles even with a non-VIS-aware assembler.
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
# the `or` chains the two rewrites: a line is either VIS3 or FMA, never both
s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
&unvis3($1,$2,$3,$4)
/ge or
s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
&unfma($1,$2,$3,$4,$5)
/ge;
print $_,"\n";
}
# STDOUT is the generated .s file; a failed close means truncated output.
close STDOUT or die "error closing STDOUT: $!";