2
0

poly1305-mips.pl 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437
  1. #! /usr/bin/env perl
  2. # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. # project. The module is, however, dual licensed under OpenSSL and
  11. # CRYPTOGAMS licenses depending on where you obtain it. For further
  12. # details see http://www.openssl.org/~appro/cryptogams/.
  13. # ====================================================================
  14. # Poly1305 hash for MIPS64.
  15. #
  16. # May 2016
  17. #
  18. # Numbers are cycles per processed byte with poly1305_blocks alone.
  19. #
  20. # IALU/gcc
  21. # R1x000 5.64/+120% (big-endian)
  22. # Octeon II 3.80/+280% (little-endian)
  23. ######################################################################
  24. # There are a number of MIPS ABIs in use; O32 and N32/64 are the most
  25. # widely used. Then there is a new contender: NUBI. It appears that if
  26. # one picks the latter, it's possible to arrange code in ABI neutral
  27. # manner. Therefore let's stick to NUBI register layout:
  28. #
  # Each assignment below binds a Perl variable to the literal operand string
  # "$N" for a numbered register, so that the heredocs further down can be
  # written with symbolic names and still emit plain register numbers.
  29. ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); # $0, $1, $2, $24, $25
  30. ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); # $4..$11, the eight argument registers
  31. ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); # $12..$23
  32. ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); # $3, then $28..$31
  33. #
  34. # The return value is placed in $a0. The following coding rules facilitate
  35. # interoperability:
  36. #
  37. # - never ever touch $tp, "thread pointer", former $gp [o32 can be
  38. # excluded from the rule, because it's specified volatile];
  39. # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
  40. # old code];
  41. # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
  42. #
  43. # For reference here is register layout for N32/64 MIPS ABIs:
  44. #
  45. # ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
  46. # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
  47. # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
  48. # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
  49. # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
  50. #
  51. # <appro@openssl.org>
  52. #
  53. ######################################################################
  # First command-line argument selects the ABI flavour.
  54. $flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
  # NOTE(review): only flavours matching /64|n32/i pass the check below, so the
  # "o32" default (and "nubi32") dies here; in practice a flavour such as n32,
  # 64 or nubi64 has to be passed explicitly. Confirm whether that is intended.
  55. die "MIPS64 only" unless ($flavour =~ /64|n32/i);
  56. $v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; # return-value register: $a0 for NUBI, otherwise $t0, i.e. $2
  57. $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; # operand of .mask in poly1305_blocks_internal: bits 12-17 for NUBI, bits 16-17 otherwise
  58. ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); # roles of the first four argument registers
  59. ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); # input words and temporaries; $tmp2 is $at, hence ".set noat" in the emitted code
  # Emits the file preamble and poly1305_init.
  #
  # poly1305_init takes the context pointer in $ctx and the key pointer in $inp:
  #   - the three 64-bit words at ctx+0, ctx+8 and ctx+16 are always zeroed;
  #   - if $inp is zero the key fields are left untouched;
  #   - otherwise 16 key bytes are loaded as two 64-bit words (byte-swapped when
  #     MIPSEB is defined), masked with 0x0ffffffc0fffffff and
  #     0x0ffffffc0ffffffc respectively, and stored at ctx+24 (r0) and ctx+32
  #     (r1), followed by s1 = r1 + (r1 >> 2) at ctx+40;
  #   - the routine always returns 0 in $v0.
  # MSB/LSB are the byte offsets used by the ldl/ldr pairs for the two byte
  # orders. When _MIPS_ARCH_MIPS64R6 is defined plain ld is used instead.
  60. $code.=<<___;
  61. #include "mips_arch.h"
  62. #ifdef MIPSEB
  63. # define MSB 0
  64. # define LSB 7
  65. #else
  66. # define MSB 7
  67. # define LSB 0
  68. #endif
  69. .text
  70. .set noat
  71. .set noreorder
  72. .align 5
  73. .globl poly1305_init
  74. .ent poly1305_init
  75. poly1305_init:
  76. .frame $sp,0,$ra
  77. .set reorder
  78. sd $zero,0($ctx) # clear the three hash words at ctx+0, ctx+8, ctx+16
  79. sd $zero,8($ctx)
  80. sd $zero,16($ctx)
  81. beqz $inp,.Lno_key # zero key pointer: skip key setup
  82. #if defined(_MIPS_ARCH_MIPS64R6)
  83. ld $in0,0($inp) # load the 16 key bytes as two 64-bit words
  84. ld $in1,8($inp)
  85. #else
  86. ldl $in0,0+MSB($inp) # load the 16 key bytes with left/right partial loads
  87. ldl $in1,8+MSB($inp)
  88. ldr $in0,0+LSB($inp)
  89. ldr $in1,8+LSB($inp)
  90. #endif
  91. #ifdef MIPSEB
  92. # if defined(_MIPS_ARCH_MIPS64R2)
  93. dsbh $in0,$in0 # byte swap
  94. dsbh $in1,$in1
  95. dshd $in0,$in0
  96. dshd $in1,$in1
  97. # else
  98. ori $tmp0,$zero,0xFF
  99. dsll $tmp2,$tmp0,32
  100. or $tmp0,$tmp2 # 0x000000FF000000FF
  101. and $tmp1,$in0,$tmp0 # byte swap
  102. and $tmp3,$in1,$tmp0
  103. dsrl $tmp2,$in0,24
  104. dsrl $tmp4,$in1,24
  105. dsll $tmp1,24
  106. dsll $tmp3,24
  107. and $tmp2,$tmp0
  108. and $tmp4,$tmp0
  109. dsll $tmp0,8 # 0x0000FF000000FF00
  110. or $tmp1,$tmp2
  111. or $tmp3,$tmp4
  112. and $tmp2,$in0,$tmp0
  113. and $tmp4,$in1,$tmp0
  114. dsrl $in0,8
  115. dsrl $in1,8
  116. dsll $tmp2,8
  117. dsll $tmp4,8
  118. and $in0,$tmp0
  119. and $in1,$tmp0
  120. or $tmp1,$tmp2
  121. or $tmp3,$tmp4
  122. or $in0,$tmp1
  123. or $in1,$tmp3
  124. dsrl $tmp1,$in0,32 # exchange the two 32-bit halves
  125. dsrl $tmp3,$in1,32
  126. dsll $in0,32
  127. dsll $in1,32
  128. or $in0,$tmp1
  129. or $in1,$tmp3
  130. # endif
  131. #endif
  132. li $tmp0,1 # build the clamp mask step by step
  133. dsll $tmp0,32
  134. daddiu $tmp0,-63
  135. dsll $tmp0,28
  136. daddiu $tmp0,-1 # 0ffffffc0fffffff
  137. and $in0,$tmp0 # clamp r0
  138. daddiu $tmp0,-3 # 0ffffffc0ffffffc
  139. and $in1,$tmp0 # clamp r1
  140. sd $in0,24($ctx) # store r0
  141. dsrl $tmp0,$in1,2
  142. sd $in1,32($ctx) # store r1
  143. daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
  144. sd $tmp0,40($ctx) # store s1
  145. .Lno_key:
  146. li $v0,0 # return 0
  147. jr $ra
  148. .end poly1305_init
  149. ___
  150. {
  151. my ($h0,$h1,$h2,$r0,$r1,$s1,$d0,$d1,$d2) =
  152. ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
  # Register roles inside this block: h0..h2 hold the running hash, r0/r1/s1 the
  # key material written by poly1305_init, and d0..d2 the limbs of the product.
  # Note that the lexical $s1 declared above (bound to register $s5) shadows
  # the global $s1 register name everywhere between here and the closing brace.
  #
  # Two routines are emitted:
  #   poly1305_blocks          - exported entry point; shifts $len right by 4
  #                              to get the count of complete 16-byte blocks and
  #                              returns at once if that is zero, otherwise
  #                              branches to the internal routine.
  #   poly1305_blocks_internal - not declared .globl; sets up a 48-byte frame,
  #                              saves $s4/$s5 always and $s0-$s3 only for NUBI
  #                              flavours, then for each block adds the 16 input
  #                              bytes and $padbit to the hash, multiplies by
  #                              the key and folds the bits above 2^130 back in
  #                              multiplied by 5. The hash is written back to
  #                              ctx+0/8/16 once, after the last block.
  153. $code.=<<___;
  154. .align 5
  155. .globl poly1305_blocks
  156. .ent poly1305_blocks
  157. poly1305_blocks:
  158. .set noreorder
  159. dsrl $len,4 # number of complete blocks
  160. bnez $len,poly1305_blocks_internal
  161. nop # branch delay slot
  162. jr $ra # nothing to process
  163. nop # branch delay slot
  164. .end poly1305_blocks
  165. .align 5
  166. .ent poly1305_blocks_internal
  167. poly1305_blocks_internal:
  168. .frame $sp,6*8,$ra
  169. .mask $SAVED_REGS_MASK,-8
  170. .set noreorder
  171. dsubu $sp,6*8 # allocate six 8-byte save slots
  172. sd $s5,40($sp)
  173. sd $s4,32($sp)
  174. ___
  175. $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
  176. sd $s3,24($sp)
  177. sd $s2,16($sp)
  178. sd $s1,8($sp)
  179. sd $s0,0($sp)
  180. ___
  181. $code.=<<___;
  182. .set reorder
  183. ld $h0,0($ctx) # load hash value
  184. ld $h1,8($ctx)
  185. ld $h2,16($ctx)
  186. ld $r0,24($ctx) # load key
  187. ld $r1,32($ctx)
  188. ld $s1,40($ctx)
  189. .Loop:
  190. #if defined(_MIPS_ARCH_MIPS64R6)
  191. ld $in0,0($inp) # load input
  192. ld $in1,8($inp)
  193. #else
  194. ldl $in0,0+MSB($inp) # load input
  195. ldl $in1,8+MSB($inp)
  196. ldr $in0,0+LSB($inp)
  197. ldr $in1,8+LSB($inp)
  198. #endif
  199. daddiu $len,-1 # one block fewer to go
  200. daddiu $inp,16 # advance to the next block
  201. #ifdef MIPSEB
  202. # if defined(_MIPS_ARCH_MIPS64R2)
  203. dsbh $in0,$in0 # byte swap
  204. dsbh $in1,$in1
  205. dshd $in0,$in0
  206. dshd $in1,$in1
  207. # else
  208. ori $tmp0,$zero,0xFF
  209. dsll $tmp2,$tmp0,32
  210. or $tmp0,$tmp2 # 0x000000FF000000FF
  211. and $tmp1,$in0,$tmp0 # byte swap
  212. and $tmp3,$in1,$tmp0
  213. dsrl $tmp2,$in0,24
  214. dsrl $tmp4,$in1,24
  215. dsll $tmp1,24
  216. dsll $tmp3,24
  217. and $tmp2,$tmp0
  218. and $tmp4,$tmp0
  219. dsll $tmp0,8 # 0x0000FF000000FF00
  220. or $tmp1,$tmp2
  221. or $tmp3,$tmp4
  222. and $tmp2,$in0,$tmp0
  223. and $tmp4,$in1,$tmp0
  224. dsrl $in0,8
  225. dsrl $in1,8
  226. dsll $tmp2,8
  227. dsll $tmp4,8
  228. and $in0,$tmp0
  229. and $in1,$tmp0
  230. or $tmp1,$tmp2
  231. or $tmp3,$tmp4
  232. or $in0,$tmp1
  233. or $in1,$tmp3
  234. dsrl $tmp1,$in0,32 # exchange the two 32-bit halves
  235. dsrl $tmp3,$in1,32
  236. dsll $in0,32
  237. dsll $in1,32
  238. or $in0,$tmp1
  239. or $in1,$tmp3
  240. # endif
  241. #endif
  242. daddu $h0,$in0 # accumulate input
  243. daddu $h1,$in1
  244. sltu $tmp0,$h0,$in0 # carry out of the low word
  245. sltu $tmp1,$h1,$in1 # carry out of the middle word
  246. daddu $h1,$tmp0
  247. dmultu ($r0,$h0) # h0*r0
  248. daddu $h2,$padbit # add the pad bit to the top word
  249. sltu $tmp0,$h1,$tmp0
  250. mflo ($d0,$r0,$h0)
  251. mfhi ($d1,$r0,$h0)
  252. dmultu ($s1,$h1) # h1*5*r1
  253. daddu $tmp0,$tmp1
  254. daddu $h2,$tmp0 # fold both carries into the top word
  255. mflo ($tmp0,$s1,$h1)
  256. mfhi ($tmp1,$s1,$h1)
  257. dmultu ($r1,$h0) # h0*r1
  258. daddu $d0,$tmp0
  259. daddu $d1,$tmp1
  260. mflo ($tmp2,$r1,$h0)
  261. mfhi ($d2,$r1,$h0)
  262. sltu $tmp0,$d0,$tmp0
  263. daddu $d1,$tmp0
  264. dmultu ($r0,$h1) # h1*r0
  265. daddu $d1,$tmp2
  266. sltu $tmp2,$d1,$tmp2
  267. mflo ($tmp0,$r0,$h1)
  268. mfhi ($tmp1,$r0,$h1)
  269. daddu $d2,$tmp2
  270. dmultu ($s1,$h2) # h2*5*r1
  271. daddu $d1,$tmp0
  272. daddu $d2,$tmp1
  273. mflo ($tmp2,$s1,$h2)
  274. dmultu ($r0,$h2) # h2*r0
  275. sltu $tmp0,$d1,$tmp0
  276. daddu $d2,$tmp0
  277. mflo ($tmp3,$r0,$h2)
  278. daddu $d1,$tmp2
  279. daddu $d2,$tmp3
  280. sltu $tmp2,$d1,$tmp2
  281. daddu $d2,$tmp2
  282. li $tmp0,-4 # final reduction
  283. and $tmp0,$d2 # d2 with its low two bits cleared
  284. dsrl $tmp1,$d2,2 # d2 >> 2
  285. andi $h2,$d2,3 # low two bits stay in the top word
  286. daddu $tmp0,$tmp1 # sum of the two is 5 * (d2 >> 2)
  287. daddu $h0,$d0,$tmp0
  288. sltu $tmp0,$h0,$tmp0
  289. daddu $h1,$d1,$tmp0
  290. sltu $tmp0,$h1,$tmp0
  291. daddu $h2,$h2,$tmp0
  292. bnez $len,.Loop # more blocks to process
  293. sd $h0,0($ctx) # store hash value
  294. sd $h1,8($ctx)
  295. sd $h2,16($ctx)
  296. .set noreorder
  297. ld $s5,40($sp) # epilogue
  298. ld $s4,32($sp)
  299. ___
  300. $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
  301. ld $s3,24($sp)
  302. ld $s2,16($sp)
  303. ld $s1,8($sp)
  304. ld $s0,0($sp)
  305. ___
  306. $code.=<<___;
  307. jr $ra
  308. daddu $sp,6*8 # release the frame in the branch delay slot
  309. .end poly1305_blocks_internal
  310. ___
  311. }
  312. {
  313. my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
  # Emits poly1305_emit plus the trailing identification string.
  #
  # poly1305_emit takes the context in $ctx, the output buffer in $mac and the
  # nonce pointer in $nonce. It:
  #   1. loads the three hash words from ctx+0/8/16;
  #   2. computes hash+5 and, without branching, keeps that sum when the
  #      addition carries into bit 2 of the top word, or the original hash
  #      otherwise (only the low 128 bits are kept either way);
  #   3. adds the nonce, read as four 32-bit words with lwu and combined into
  #      two 64-bit words, propagating the carry from the low to the high word;
  #   4. stores the 16 result bytes to $mac one at a time, least significant
  #      byte first, so no alignment is required of $mac.
  # NOTE(review): the select mask is all-ones only if (top word + carry) >> 2
  # is exactly 1; this presumably relies on the top word staying small after
  # poly1305_blocks - confirm against the reduction there.
  314. $code.=<<___;
  315. .align 5
  316. .globl poly1305_emit
  317. .ent poly1305_emit
  318. poly1305_emit:
  319. .frame $sp,0,$ra
  320. .set reorder
  321. ld $tmp0,0($ctx) # load the three hash words
  322. ld $tmp1,8($ctx)
  323. ld $tmp2,16($ctx)
  324. daddiu $in0,$tmp0,5 # compare to modulus
  325. sltiu $tmp3,$in0,5 # carry out of the low word
  326. daddu $in1,$tmp1,$tmp3
  327. sltu $tmp3,$in1,$tmp3 # carry out of the middle word
  328. daddu $tmp2,$tmp2,$tmp3
  329. dsrl $tmp2,2 # see if it carried/borrowed
  330. dsubu $tmp2,$zero,$tmp2 # negate into a select mask
  331. nor $tmp3,$zero,$tmp2 # complement of the select mask
  332. and $in0,$tmp2 # hash+5 survives where the mask is set
  333. and $tmp0,$tmp3 # original hash survives where it is clear
  334. and $in1,$tmp2
  335. and $tmp1,$tmp3
  336. or $in0,$tmp0 # merge the two candidates
  337. or $in1,$tmp1
  338. lwu $tmp0,0($nonce) # load nonce
  339. lwu $tmp1,4($nonce)
  340. lwu $tmp2,8($nonce)
  341. lwu $tmp3,12($nonce)
  342. dsll $tmp1,32 # second and fourth words become the upper halves
  343. dsll $tmp3,32
  344. or $tmp0,$tmp1
  345. or $tmp2,$tmp3
  346. daddu $in0,$tmp0 # accumulate nonce
  347. daddu $in1,$tmp2
  348. sltu $tmp0,$in0,$tmp0 # carry from the low into the high word
  349. daddu $in1,$tmp0
  350. dsrl $tmp0,$in0,8 # write mac value
  351. dsrl $tmp1,$in0,16
  352. dsrl $tmp2,$in0,24
  353. sb $in0,0($mac)
  354. dsrl $tmp3,$in0,32
  355. sb $tmp0,1($mac)
  356. dsrl $tmp0,$in0,40
  357. sb $tmp1,2($mac)
  358. dsrl $tmp1,$in0,48
  359. sb $tmp2,3($mac)
  360. dsrl $tmp2,$in0,56
  361. sb $tmp3,4($mac)
  362. dsrl $tmp3,$in1,8
  363. sb $tmp0,5($mac)
  364. dsrl $tmp0,$in1,16
  365. sb $tmp1,6($mac)
  366. dsrl $tmp1,$in1,24
  367. sb $tmp2,7($mac)
  368. sb $in1,8($mac)
  369. dsrl $tmp2,$in1,32
  370. sb $tmp3,9($mac)
  371. dsrl $tmp3,$in1,40
  372. sb $tmp0,10($mac)
  373. dsrl $tmp0,$in1,48
  374. sb $tmp1,11($mac)
  375. dsrl $tmp1,$in1,56
  376. sb $tmp2,12($mac)
  377. sb $tmp3,13($mac)
  378. sb $tmp0,14($mac)
  379. sb $tmp1,15($mac)
  380. jr $ra
  381. .end poly1305_emit
  382. .rdata
  383. .asciiz "Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>"
  384. .align 2
  385. ___
  386. }
  # Write the generated assembly. The last remaining command-line argument, if
  # any, names the output file; with no argument the code goes to the inherited
  # STDOUT. A failed open is now fatal instead of silently falling back to
  # STDOUT, and the three-argument form of open keeps the file name from being
  # parsed for mode characters.
  387. $output=pop and (open STDOUT,">",$output or die "can't open $output: $!");
  388. print $code;
  389. close STDOUT or die "error closing STDOUT: $!"; # surfaces buffered write errors such as a full disk