# poly1305-ppcfp.pl
# NOTE(review): stray line-number residue from a web-rendered copy of this
# file was removed here; it was not part of the original source.
  1. #! /usr/bin/env perl
  2. # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # This module implements Poly1305 hash for PowerPC FPU.
  17. #
  18. # June 2015
  19. #
  20. # Numbers are cycles per processed byte with poly1305_blocks alone,
  21. # and improvement coefficients relative to gcc-generated code.
  22. #
  23. # Freescale e300 9.78/+30%
  24. # PPC74x0 6.92/+50%
  25. # PPC970 6.03/+80%
  26. # POWER7 3.50/+30%
  27. # POWER8 3.75/+10%
  28. $flavour = shift;
  29. if ($flavour =~ /64/) {
  30. $SIZE_T =8;
  31. $LRSAVE =2*$SIZE_T;
  32. $UCMP ="cmpld";
  33. $STU ="stdu";
  34. $POP ="ld";
  35. $PUSH ="std";
  36. } elsif ($flavour =~ /32/) {
  37. $SIZE_T =4;
  38. $LRSAVE =$SIZE_T;
  39. $UCMP ="cmplw";
  40. $STU ="stwu";
  41. $POP ="lwz";
  42. $PUSH ="stw";
  43. } else { die "nonsense $flavour"; }
  44. $LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;
  45. $LWXLE = $LITTLE_ENDIAN ? "lwzx" : "lwbrx";
  46. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  47. ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  48. ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  49. die "can't locate ppc-xlate.pl";
  50. open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
  51. $LOCALS=6*$SIZE_T;
  52. $FRAME=$LOCALS+6*8+18*8;
  53. my $sp="r1";
  54. my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
  55. my ($in0,$in1,$in2,$in3,$i1,$i2,$i3) = map("r$_",(7..12,6));
  56. my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
  57. $two0,$two32,$two64,$two96,$two130,$five_two130,
  58. $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
  59. $s2lo,$s2hi,$s3lo,$s3hi,
  60. $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("f$_",(0..31));
  61. # borrowings
  62. my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
  63. my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
  64. my ($y0,$y1,$y2,$y3) = ($c3lo,$c3hi,$c1lo,$c1hi);
  65. $code.=<<___;
  66. .machine "any"
  67. .text
  68. .globl .poly1305_init_fpu
  69. .align 6
  70. .poly1305_init_fpu:
  71. $STU $sp,-$LOCALS($sp) # minimal frame
  72. mflr $padbit
  73. $PUSH $padbit,`$LOCALS+$LRSAVE`($sp)
  74. bl LPICmeup
  75. xor r0,r0,r0
  76. mtlr $padbit # restore lr
  77. lfd $two0,8*0($len) # load constants
  78. lfd $two32,8*1($len)
  79. lfd $two64,8*2($len)
  80. lfd $two96,8*3($len)
  81. lfd $two130,8*4($len)
  82. lfd $five_two130,8*5($len)
  83. stfd $two0,8*0($ctx) # initial hash value, biased 0
  84. stfd $two32,8*1($ctx)
  85. stfd $two64,8*2($ctx)
  86. stfd $two96,8*3($ctx)
  87. $UCMP $inp,r0
  88. beq- Lno_key
  89. lfd $h3lo,8*13($len) # new fpscr
  90. mffs $h3hi # old fpscr
  91. stfd $two0,8*4($ctx) # key "template"
  92. stfd $two32,8*5($ctx)
  93. stfd $two64,8*6($ctx)
  94. stfd $two96,8*7($ctx)
  95. li $in1,4
  96. li $in2,8
  97. li $in3,12
  98. $LWXLE $in0,0,$inp # load key
  99. $LWXLE $in1,$in1,$inp
  100. $LWXLE $in2,$in2,$inp
  101. $LWXLE $in3,$in3,$inp
  102. lis $i1,0xf000 # 0xf0000000
  103. ori $i2,$i1,3 # 0xf0000003
  104. andc $in0,$in0,$i1 # &=0x0fffffff
  105. andc $in1,$in1,$i2 # &=0x0ffffffc
  106. andc $in2,$in2,$i2
  107. andc $in3,$in3,$i2
  108. stw $in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx) # fill "template"
  109. stw $in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
  110. stw $in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
  111. stw $in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)
  112. mtfsf 255,$h3lo # fpscr
  113. stfd $two0,8*18($ctx) # copy constants to context
  114. stfd $two32,8*19($ctx)
  115. stfd $two64,8*20($ctx)
  116. stfd $two96,8*21($ctx)
  117. stfd $two130,8*22($ctx)
  118. stfd $five_two130,8*23($ctx)
  119. lfd $h0lo,8*4($ctx) # load [biased] key
  120. lfd $h1lo,8*5($ctx)
  121. lfd $h2lo,8*6($ctx)
  122. lfd $h3lo,8*7($ctx)
  123. fsub $h0lo,$h0lo,$two0 # r0
  124. fsub $h1lo,$h1lo,$two32 # r1
  125. fsub $h2lo,$h2lo,$two64 # r2
  126. fsub $h3lo,$h3lo,$two96 # r3
  127. lfd $two0,8*6($len) # more constants
  128. lfd $two32,8*7($len)
  129. lfd $two64,8*8($len)
  130. lfd $two96,8*9($len)
  131. fmul $h1hi,$h1lo,$five_two130 # s1
  132. fmul $h2hi,$h2lo,$five_two130 # s2
  133. stfd $h3hi,8*15($ctx) # borrow slot for original fpscr
  134. fmul $h3hi,$h3lo,$five_two130 # s3
  135. fadd $h0hi,$h0lo,$two0
  136. stfd $h1hi,8*12($ctx) # put aside for now
  137. fadd $h1hi,$h1lo,$two32
  138. stfd $h2hi,8*13($ctx)
  139. fadd $h2hi,$h2lo,$two64
  140. stfd $h3hi,8*14($ctx)
  141. fadd $h3hi,$h3lo,$two96
  142. fsub $h0hi,$h0hi,$two0
  143. fsub $h1hi,$h1hi,$two32
  144. fsub $h2hi,$h2hi,$two64
  145. fsub $h3hi,$h3hi,$two96
  146. lfd $two0,8*10($len) # more constants
  147. lfd $two32,8*11($len)
  148. lfd $two64,8*12($len)
  149. fsub $h0lo,$h0lo,$h0hi
  150. fsub $h1lo,$h1lo,$h1hi
  151. fsub $h2lo,$h2lo,$h2hi
  152. fsub $h3lo,$h3lo,$h3hi
  153. stfd $h0hi,8*5($ctx) # r0hi
  154. stfd $h1hi,8*7($ctx) # r1hi
  155. stfd $h2hi,8*9($ctx) # r2hi
  156. stfd $h3hi,8*11($ctx) # r3hi
  157. stfd $h0lo,8*4($ctx) # r0lo
  158. stfd $h1lo,8*6($ctx) # r1lo
  159. stfd $h2lo,8*8($ctx) # r2lo
  160. stfd $h3lo,8*10($ctx) # r3lo
  161. lfd $h1lo,8*12($ctx) # s1
  162. lfd $h2lo,8*13($ctx) # s2
  163. lfd $h3lo,8*14($ctx) # s3
  164. lfd $h0lo,8*15($ctx) # pull original fpscr
  165. fadd $h1hi,$h1lo,$two0
  166. fadd $h2hi,$h2lo,$two32
  167. fadd $h3hi,$h3lo,$two64
  168. fsub $h1hi,$h1hi,$two0
  169. fsub $h2hi,$h2hi,$two32
  170. fsub $h3hi,$h3hi,$two64
  171. fsub $h1lo,$h1lo,$h1hi
  172. fsub $h2lo,$h2lo,$h2hi
  173. fsub $h3lo,$h3lo,$h3hi
  174. stfd $h1hi,8*13($ctx) # s1hi
  175. stfd $h2hi,8*15($ctx) # s2hi
  176. stfd $h3hi,8*17($ctx) # s3hi
  177. stfd $h1lo,8*12($ctx) # s1lo
  178. stfd $h2lo,8*14($ctx) # s2lo
  179. stfd $h3lo,8*16($ctx) # s3lo
  180. mtfsf 255,$h0lo # restore fpscr
  181. Lno_key:
  182. xor r3,r3,r3
  183. addi $sp,$sp,$LOCALS
  184. blr
  185. .long 0
  186. .byte 0,12,4,1,0x80,0,2,0
  187. .size .poly1305_init_fpu,.-.poly1305_init_fpu
  188. .globl .poly1305_blocks_fpu
  189. .align 4
  190. .poly1305_blocks_fpu:
  191. srwi. $len,$len,4
  192. beq- Labort
  193. $STU $sp,-$FRAME($sp)
  194. mflr r0
  195. stfd f14,`$FRAME-8*18`($sp)
  196. stfd f15,`$FRAME-8*17`($sp)
  197. stfd f16,`$FRAME-8*16`($sp)
  198. stfd f17,`$FRAME-8*15`($sp)
  199. stfd f18,`$FRAME-8*14`($sp)
  200. stfd f19,`$FRAME-8*13`($sp)
  201. stfd f20,`$FRAME-8*12`($sp)
  202. stfd f21,`$FRAME-8*11`($sp)
  203. stfd f22,`$FRAME-8*10`($sp)
  204. stfd f23,`$FRAME-8*9`($sp)
  205. stfd f24,`$FRAME-8*8`($sp)
  206. stfd f25,`$FRAME-8*7`($sp)
  207. stfd f26,`$FRAME-8*6`($sp)
  208. stfd f27,`$FRAME-8*5`($sp)
  209. stfd f28,`$FRAME-8*4`($sp)
  210. stfd f29,`$FRAME-8*3`($sp)
  211. stfd f30,`$FRAME-8*2`($sp)
  212. stfd f31,`$FRAME-8*1`($sp)
  213. $PUSH r0,`$FRAME+$LRSAVE`($sp)
  214. xor r0,r0,r0
  215. li $in3,1
  216. mtctr $len
  217. neg $len,$len
  218. stw r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
  219. stw $in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)
  220. lfd $two0,8*18($ctx) # load constants
  221. lfd $two32,8*19($ctx)
  222. lfd $two64,8*20($ctx)
  223. lfd $two96,8*21($ctx)
  224. lfd $two130,8*22($ctx)
  225. lfd $five_two130,8*23($ctx)
  226. lfd $h0lo,8*0($ctx) # load [biased] hash value
  227. lfd $h1lo,8*1($ctx)
  228. lfd $h2lo,8*2($ctx)
  229. lfd $h3lo,8*3($ctx)
  230. stfd $two0,`$LOCALS+8*0`($sp) # input "template"
  231. oris $in3,$padbit,`(1023+52+96)<<4`
  232. stfd $two32,`$LOCALS+8*1`($sp)
  233. stfd $two64,`$LOCALS+8*2`($sp)
  234. stw $in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)
  235. li $i1,4
  236. li $i2,8
  237. li $i3,12
  238. $LWXLE $in0,0,$inp # load input
  239. $LWXLE $in1,$i1,$inp
  240. $LWXLE $in2,$i2,$inp
  241. $LWXLE $in3,$i3,$inp
  242. addi $inp,$inp,16
  243. stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
  244. stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
  245. stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
  246. stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
  247. mffs $x0 # original fpscr
  248. lfd $x1,`$LOCALS+8*4`($sp) # new fpscr
  249. lfd $r0lo,8*4($ctx) # load key
  250. lfd $r0hi,8*5($ctx)
  251. lfd $r1lo,8*6($ctx)
  252. lfd $r1hi,8*7($ctx)
  253. lfd $r2lo,8*8($ctx)
  254. lfd $r2hi,8*9($ctx)
  255. lfd $r3lo,8*10($ctx)
  256. lfd $r3hi,8*11($ctx)
  257. lfd $s1lo,8*12($ctx)
  258. lfd $s1hi,8*13($ctx)
  259. lfd $s2lo,8*14($ctx)
  260. lfd $s2hi,8*15($ctx)
  261. lfd $s3lo,8*16($ctx)
  262. lfd $s3hi,8*17($ctx)
  263. stfd $x0,`$LOCALS+8*4`($sp) # save original fpscr
  264. mtfsf 255,$x1
  265. addic $len,$len,1
  266. addze r0,r0
  267. slwi. r0,r0,4
  268. sub $inp,$inp,r0 # conditional rewind
  269. lfd $x0,`$LOCALS+8*0`($sp)
  270. lfd $x1,`$LOCALS+8*1`($sp)
  271. lfd $x2,`$LOCALS+8*2`($sp)
  272. lfd $x3,`$LOCALS+8*3`($sp)
  273. fsub $h0lo,$h0lo,$two0 # de-bias hash value
  274. $LWXLE $in0,0,$inp # modulo-scheduled input load
  275. fsub $h1lo,$h1lo,$two32
  276. $LWXLE $in1,$i1,$inp
  277. fsub $h2lo,$h2lo,$two64
  278. $LWXLE $in2,$i2,$inp
  279. fsub $h3lo,$h3lo,$two96
  280. $LWXLE $in3,$i3,$inp
  281. fsub $x0,$x0,$two0 # de-bias input
  282. addi $inp,$inp,16
  283. fsub $x1,$x1,$two32
  284. fsub $x2,$x2,$two64
  285. fsub $x3,$x3,$two96
  286. fadd $x0,$x0,$h0lo # accumulate input
  287. stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
  288. fadd $x1,$x1,$h1lo
  289. stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
  290. fadd $x2,$x2,$h2lo
  291. stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
  292. fadd $x3,$x3,$h3lo
  293. stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
  294. b Lentry
  295. .align 4
  296. Loop:
  297. fsub $y0,$y0,$two0 # de-bias input
  298. addic $len,$len,1
  299. fsub $y1,$y1,$two32
  300. addze r0,r0
  301. fsub $y2,$y2,$two64
  302. slwi. r0,r0,4
  303. fsub $y3,$y3,$two96
  304. sub $inp,$inp,r0 # conditional rewind
  305. fadd $h0lo,$h0lo,$y0 # accumulate input
  306. fadd $h0hi,$h0hi,$y1
  307. fadd $h2lo,$h2lo,$y2
  308. fadd $h2hi,$h2hi,$y3
  309. ######################################### base 2^48 -> base 2^32
  310. fadd $c1lo,$h1lo,$two64
  311. $LWXLE $in0,0,$inp # modulo-scheduled input load
  312. fadd $c1hi,$h1hi,$two64
  313. $LWXLE $in1,$i1,$inp
  314. fadd $c3lo,$h3lo,$two130
  315. $LWXLE $in2,$i2,$inp
  316. fadd $c3hi,$h3hi,$two130
  317. $LWXLE $in3,$i3,$inp
  318. fadd $c0lo,$h0lo,$two32
  319. addi $inp,$inp,16
  320. fadd $c0hi,$h0hi,$two32
  321. fadd $c2lo,$h2lo,$two96
  322. fadd $c2hi,$h2hi,$two96
  323. fsub $c1lo,$c1lo,$two64
  324. stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
  325. fsub $c1hi,$c1hi,$two64
  326. stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
  327. fsub $c3lo,$c3lo,$two130
  328. stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
  329. fsub $c3hi,$c3hi,$two130
  330. stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
  331. fsub $c0lo,$c0lo,$two32
  332. fsub $c0hi,$c0hi,$two32
  333. fsub $c2lo,$c2lo,$two96
  334. fsub $c2hi,$c2hi,$two96
  335. fsub $h1lo,$h1lo,$c1lo
  336. fsub $h1hi,$h1hi,$c1hi
  337. fsub $h3lo,$h3lo,$c3lo
  338. fsub $h3hi,$h3hi,$c3hi
  339. fsub $h2lo,$h2lo,$c2lo
  340. fsub $h2hi,$h2hi,$c2hi
  341. fsub $h0lo,$h0lo,$c0lo
  342. fsub $h0hi,$h0hi,$c0hi
  343. fadd $h1lo,$h1lo,$c0lo
  344. fadd $h1hi,$h1hi,$c0hi
  345. fadd $h3lo,$h3lo,$c2lo
  346. fadd $h3hi,$h3hi,$c2hi
  347. fadd $h2lo,$h2lo,$c1lo
  348. fadd $h2hi,$h2hi,$c1hi
  349. fmadd $h0lo,$c3lo,$five_two130,$h0lo
  350. fmadd $h0hi,$c3hi,$five_two130,$h0hi
  351. fadd $x1,$h1lo,$h1hi
  352. lfd $s1lo,8*12($ctx) # reload constants
  353. fadd $x3,$h3lo,$h3hi
  354. lfd $s1hi,8*13($ctx)
  355. fadd $x2,$h2lo,$h2hi
  356. lfd $r3lo,8*10($ctx)
  357. fadd $x0,$h0lo,$h0hi
  358. lfd $r3hi,8*11($ctx)
  359. Lentry:
  360. fmul $h0lo,$s3lo,$x1
  361. fmul $h0hi,$s3hi,$x1
  362. fmul $h2lo,$r1lo,$x1
  363. fmul $h2hi,$r1hi,$x1
  364. fmul $h1lo,$r0lo,$x1
  365. fmul $h1hi,$r0hi,$x1
  366. fmul $h3lo,$r2lo,$x1
  367. fmul $h3hi,$r2hi,$x1
  368. fmadd $h0lo,$s1lo,$x3,$h0lo
  369. fmadd $h0hi,$s1hi,$x3,$h0hi
  370. fmadd $h2lo,$s3lo,$x3,$h2lo
  371. fmadd $h2hi,$s3hi,$x3,$h2hi
  372. fmadd $h1lo,$s2lo,$x3,$h1lo
  373. fmadd $h1hi,$s2hi,$x3,$h1hi
  374. fmadd $h3lo,$r0lo,$x3,$h3lo
  375. fmadd $h3hi,$r0hi,$x3,$h3hi
  376. fmadd $h0lo,$s2lo,$x2,$h0lo
  377. fmadd $h0hi,$s2hi,$x2,$h0hi
  378. fmadd $h2lo,$r0lo,$x2,$h2lo
  379. fmadd $h2hi,$r0hi,$x2,$h2hi
  380. fmadd $h1lo,$s3lo,$x2,$h1lo
  381. fmadd $h1hi,$s3hi,$x2,$h1hi
  382. fmadd $h3lo,$r1lo,$x2,$h3lo
  383. fmadd $h3hi,$r1hi,$x2,$h3hi
  384. fmadd $h0lo,$r0lo,$x0,$h0lo
  385. lfd $y0,`$LOCALS+8*0`($sp) # load [biased] input
  386. fmadd $h0hi,$r0hi,$x0,$h0hi
  387. lfd $y1,`$LOCALS+8*1`($sp)
  388. fmadd $h2lo,$r2lo,$x0,$h2lo
  389. lfd $y2,`$LOCALS+8*2`($sp)
  390. fmadd $h2hi,$r2hi,$x0,$h2hi
  391. lfd $y3,`$LOCALS+8*3`($sp)
  392. fmadd $h1lo,$r1lo,$x0,$h1lo
  393. fmadd $h1hi,$r1hi,$x0,$h1hi
  394. fmadd $h3lo,$r3lo,$x0,$h3lo
  395. fmadd $h3hi,$r3hi,$x0,$h3hi
  396. bdnz Loop
  397. ######################################### base 2^48 -> base 2^32
  398. fadd $c0lo,$h0lo,$two32
  399. fadd $c0hi,$h0hi,$two32
  400. fadd $c2lo,$h2lo,$two96
  401. fadd $c2hi,$h2hi,$two96
  402. fadd $c1lo,$h1lo,$two64
  403. fadd $c1hi,$h1hi,$two64
  404. fadd $c3lo,$h3lo,$two130
  405. fadd $c3hi,$h3hi,$two130
  406. fsub $c0lo,$c0lo,$two32
  407. fsub $c0hi,$c0hi,$two32
  408. fsub $c2lo,$c2lo,$two96
  409. fsub $c2hi,$c2hi,$two96
  410. fsub $c1lo,$c1lo,$two64
  411. fsub $c1hi,$c1hi,$two64
  412. fsub $c3lo,$c3lo,$two130
  413. fsub $c3hi,$c3hi,$two130
  414. fsub $h1lo,$h1lo,$c1lo
  415. fsub $h1hi,$h1hi,$c1hi
  416. fsub $h3lo,$h3lo,$c3lo
  417. fsub $h3hi,$h3hi,$c3hi
  418. fsub $h2lo,$h2lo,$c2lo
  419. fsub $h2hi,$h2hi,$c2hi
  420. fsub $h0lo,$h0lo,$c0lo
  421. fsub $h0hi,$h0hi,$c0hi
  422. fadd $h1lo,$h1lo,$c0lo
  423. fadd $h1hi,$h1hi,$c0hi
  424. fadd $h3lo,$h3lo,$c2lo
  425. fadd $h3hi,$h3hi,$c2hi
  426. fadd $h2lo,$h2lo,$c1lo
  427. fadd $h2hi,$h2hi,$c1hi
  428. fmadd $h0lo,$c3lo,$five_two130,$h0lo
  429. fmadd $h0hi,$c3hi,$five_two130,$h0hi
  430. fadd $x1,$h1lo,$h1hi
  431. fadd $x3,$h3lo,$h3hi
  432. fadd $x2,$h2lo,$h2hi
  433. fadd $x0,$h0lo,$h0hi
  434. lfd $h0lo,`$LOCALS+8*4`($sp) # pull saved fpscr
  435. fadd $x1,$x1,$two32 # bias
  436. fadd $x3,$x3,$two96
  437. fadd $x2,$x2,$two64
  438. fadd $x0,$x0,$two0
  439. stfd $x1,8*1($ctx) # store [biased] hash value
  440. stfd $x3,8*3($ctx)
  441. stfd $x2,8*2($ctx)
  442. stfd $x0,8*0($ctx)
  443. mtfsf 255,$h0lo # restore original fpscr
  444. lfd f14,`$FRAME-8*18`($sp)
  445. lfd f15,`$FRAME-8*17`($sp)
  446. lfd f16,`$FRAME-8*16`($sp)
  447. lfd f17,`$FRAME-8*15`($sp)
  448. lfd f18,`$FRAME-8*14`($sp)
  449. lfd f19,`$FRAME-8*13`($sp)
  450. lfd f20,`$FRAME-8*12`($sp)
  451. lfd f21,`$FRAME-8*11`($sp)
  452. lfd f22,`$FRAME-8*10`($sp)
  453. lfd f23,`$FRAME-8*9`($sp)
  454. lfd f24,`$FRAME-8*8`($sp)
  455. lfd f25,`$FRAME-8*7`($sp)
  456. lfd f26,`$FRAME-8*6`($sp)
  457. lfd f27,`$FRAME-8*5`($sp)
  458. lfd f28,`$FRAME-8*4`($sp)
  459. lfd f29,`$FRAME-8*3`($sp)
  460. lfd f30,`$FRAME-8*2`($sp)
  461. lfd f31,`$FRAME-8*1`($sp)
  462. addi $sp,$sp,$FRAME
  463. Labort:
  464. blr
  465. .long 0
  466. .byte 0,12,4,1,0x80,0,4,0
  467. .size .poly1305_blocks_fpu,.-.poly1305_blocks_fpu
  468. ___
  469. {
  470. my ($mac,$nonce)=($inp,$len);
  471. my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3
  472. ) = map("r$_",(7..11,28..31));
  473. my $mask = "r0";
  474. my $FRAME = (6+4)*$SIZE_T;
  475. $code.=<<___;
  476. .globl .poly1305_emit_fpu
  477. .align 4
  478. .poly1305_emit_fpu:
  479. $STU $sp,-$FRAME($sp)
  480. mflr r0
  481. $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
  482. $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
  483. $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
  484. $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
  485. $PUSH r0,`$FRAME+$LRSAVE`($sp)
  486. lwz $d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx) # load hash
  487. lwz $h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
  488. lwz $d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
  489. lwz $h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
  490. lwz $d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
  491. lwz $h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
  492. lwz $d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
  493. lwz $h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)
  494. lis $mask,0xfff0
  495. andc $d0,$d0,$mask # mask exponent
  496. andc $d1,$d1,$mask
  497. andc $d2,$d2,$mask
  498. andc $d3,$d3,$mask # can be partially reduced...
  499. li $mask,3
  500. srwi $padbit,$d3,2 # ... so reduce
  501. and $h4,$d3,$mask
  502. andc $d3,$d3,$mask
  503. add $d3,$d3,$padbit
  504. ___
  505. if ($SIZE_T==4) {
  506. $code.=<<___;
  507. addc $h0,$h0,$d3
  508. adde $h1,$h1,$d0
  509. adde $h2,$h2,$d1
  510. adde $h3,$h3,$d2
  511. addze $h4,$h4
  512. addic $d0,$h0,5 # compare to modulus
  513. addze $d1,$h1
  514. addze $d2,$h2
  515. addze $d3,$h3
  516. addze $mask,$h4
  517. srwi $mask,$mask,2 # did it carry/borrow?
  518. neg $mask,$mask
  519. srawi $mask,$mask,31 # mask
  520. andc $h0,$h0,$mask
  521. and $d0,$d0,$mask
  522. andc $h1,$h1,$mask
  523. and $d1,$d1,$mask
  524. or $h0,$h0,$d0
  525. lwz $d0,0($nonce) # load nonce
  526. andc $h2,$h2,$mask
  527. and $d2,$d2,$mask
  528. or $h1,$h1,$d1
  529. lwz $d1,4($nonce)
  530. andc $h3,$h3,$mask
  531. and $d3,$d3,$mask
  532. or $h2,$h2,$d2
  533. lwz $d2,8($nonce)
  534. or $h3,$h3,$d3
  535. lwz $d3,12($nonce)
  536. addc $h0,$h0,$d0 # accumulate nonce
  537. adde $h1,$h1,$d1
  538. adde $h2,$h2,$d2
  539. adde $h3,$h3,$d3
  540. ___
  541. } else {
  542. $code.=<<___;
  543. add $h0,$h0,$d3
  544. add $h1,$h1,$d0
  545. add $h2,$h2,$d1
  546. add $h3,$h3,$d2
  547. srdi $d0,$h0,32
  548. add $h1,$h1,$d0
  549. srdi $d1,$h1,32
  550. add $h2,$h2,$d1
  551. srdi $d2,$h2,32
  552. add $h3,$h3,$d2
  553. srdi $d3,$h3,32
  554. add $h4,$h4,$d3
  555. insrdi $h0,$h1,32,0
  556. insrdi $h2,$h3,32,0
  557. addic $d0,$h0,5 # compare to modulus
  558. addze $d1,$h2
  559. addze $d2,$h4
  560. srdi $mask,$d2,2 # did it carry/borrow?
  561. neg $mask,$mask
  562. sradi $mask,$mask,63 # mask
  563. ld $d2,0($nonce) # load nonce
  564. ld $d3,8($nonce)
  565. andc $h0,$h0,$mask
  566. and $d0,$d0,$mask
  567. andc $h2,$h2,$mask
  568. and $d1,$d1,$mask
  569. or $h0,$h0,$d0
  570. or $h2,$h2,$d1
  571. ___
  572. $code.=<<___ if (!$LITTLE_ENDIAN);
  573. rotldi $d2,$d2,32 # flip nonce words
  574. rotldi $d3,$d3,32
  575. ___
  576. $code.=<<___;
  577. addc $h0,$h0,$d2 # accumulate nonce
  578. adde $h2,$h2,$d3
  579. srdi $h1,$h0,32
  580. srdi $h3,$h2,32
  581. ___
  582. }
  583. $code.=<<___ if ($LITTLE_ENDIAN);
  584. stw $h0,0($mac) # write result
  585. stw $h1,4($mac)
  586. stw $h2,8($mac)
  587. stw $h3,12($mac)
  588. ___
  589. $code.=<<___ if (!$LITTLE_ENDIAN);
  590. li $d1,4
  591. stwbrx $h0,0,$mac # write result
  592. li $d2,8
  593. stwbrx $h1,$d1,$mac
  594. li $d3,12
  595. stwbrx $h2,$d2,$mac
  596. stwbrx $h3,$d3,$mac
  597. ___
  598. $code.=<<___;
  599. $POP r28,`$FRAME-$SIZE_T*4`($sp)
  600. $POP r29,`$FRAME-$SIZE_T*3`($sp)
  601. $POP r30,`$FRAME-$SIZE_T*2`($sp)
  602. $POP r31,`$FRAME-$SIZE_T*1`($sp)
  603. addi $sp,$sp,$FRAME
  604. blr
  605. .long 0
  606. .byte 0,12,4,1,0x80,4,3,0
  607. .size .poly1305_emit_fpu,.-.poly1305_emit_fpu
  608. ___
  609. }
  610. # Ugly hack here, because PPC assembler syntax seem to vary too
  611. # much from platforms to platform...
  612. $code.=<<___;
  613. .align 6
  614. LPICmeup:
  615. mflr r0
  616. bcl 20,31,\$+4
  617. mflr $len # vvvvvv "distance" between . and 1st data entry
  618. addi $len,$len,`64-8` # borrow $len
  619. mtlr r0
  620. blr
  621. .long 0
  622. .byte 0,12,0x14,0,0,0,0,0
  623. .space `64-9*4`
  624. .quad 0x4330000000000000 # 2^(52+0)
  625. .quad 0x4530000000000000 # 2^(52+32)
  626. .quad 0x4730000000000000 # 2^(52+64)
  627. .quad 0x4930000000000000 # 2^(52+96)
  628. .quad 0x4b50000000000000 # 2^(52+130)
  629. .quad 0x37f4000000000000 # 5/2^130
  630. .quad 0x4430000000000000 # 2^(52+16+0)
  631. .quad 0x4630000000000000 # 2^(52+16+32)
  632. .quad 0x4830000000000000 # 2^(52+16+64)
  633. .quad 0x4a30000000000000 # 2^(52+16+96)
  634. .quad 0x3e30000000000000 # 2^(52+16+0-96)
  635. .quad 0x4030000000000000 # 2^(52+16+32-96)
  636. .quad 0x4230000000000000 # 2^(52+16+64-96)
  637. .quad 0x0000000000000001 # fpscr: truncate, no exceptions
  638. .asciz "Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
  639. .align 4
  640. ___
  641. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  642. print $code;
  643. close STDOUT or die "error closing STDOUT: $!";