ghash-alpha.pl 7.8 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # March 2010
  17. #
  18. # The module implements "4-bit" GCM GHASH function and underlying
  19. # single multiplication operation in GF(2^128). "4-bit" means that it
  20. # uses 256 bytes per-key table [+128 bytes shared table]. Even though
  21. # loops are aggressively modulo-scheduled with respect to references to
  22. # Htbl and Z.hi updates for 8 cycles per byte, measured performance is
  23. # ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
  24. # scheduling "glitch," because uprofile(1) indicates uniform sample
  25. # distribution, as if all instruction bundles execute in 1.5 cycles.
  26. # Meaning that it could have been even faster, yet 12 cycles is ~60%
  27. # better than gcc-generated code and ~80% better than code generated by
  28. # the vendor compiler.
  # Symbolic names for the Alpha registers referenced by the generated
  # code.  A trailing "$n" note gives the hardware register number of the
  # first name assigned on that line.

  # Temporaries.
  ($cnt, $t0, $t1, $t2)        = qw(v0 t0 t1 t2);    # $0
  ($Thi0, $Tlo0, $Thi1, $Tlo1) = qw(t3 t4 t5 t6);    # $4
  $rem                         = "t7";               # $8

  #################
  # Input argument block.
  ($Xi, $Htbl, $inp, $len)     = qw(a0 a1 a2 a3);    # $16

  # Nibble offsets, the Z accumulator, the X operand and table pointers.
  ($nlo, $nhi, $Zhi, $Zlo)     = qw(a4 a5 t8 t9);    # $20
  ($Xhi, $Xlo, $remp)          = qw(t10 t11 t12);    # $24
  $rem_4bit                    = "AT";               # $28
  # loop() appends one fully scheduled copy of the multiplication core to
  # $code.  It takes no arguments (empty prototype; callers use &loop()).
  #
  # $N is private to this block and is bumped on every call so that the
  # .Looplo$N / .Loophi$N labels are unique in each emitted copy.
  #
  # What the emitted instructions do:
  #   - inputs:  $Xlo/$Xhi hold the operand, $Htbl is the base of a table
  #     addressed in 16-byte entries (quadwords at offsets 0 and 8), and
  #     $rem_4bit is the base of a table addressed in 8-byte entries;
  #   - the operand is consumed one byte at a time, from byte 7 of $Xlo
  #     down to byte 0 and then from byte 7 of $Xhi down to byte 0; each
  #     byte is split into a low-nibble offset ($nlo) and a high-nibble
  #     offset ($nhi), both already scaled by 16;
  #   - for every nibble, Z ($Zhi:$Zlo) is shifted right by four bits, the
  #     four bits shifted out select the $rem_4bit entry that is xored
  #     into $Zhi, and the $Htbl entry for the nibble is xored into Z;
  #   - output:  the result is left in $Zhi:$Zlo;
  #   - $cnt, $t0, $nlo, $nhi, $remp, $rem, $Thi0, $Tlo0, $Thi1 and $Tlo1
  #     are overwritten.
  #
  # Everything between the here-doc markers is emitted verbatim, so notes
  # are kept out here rather than placed between the instructions.  The
  # instruction order is the hand-tuned schedule described in the file
  # header; do not reorder it.
  51. { my $N;
  52. sub loop() {
  53. $N++;
  # Append (never overwrite): callers have already emitted the prologue.
  54. $code.=<<___;
  55. .align 4
  56. extbl $Xlo,7,$nlo
  57. and $nlo,0xf0,$nhi
  58. sll $nlo,4,$nlo
  59. and $nlo,0xf0,$nlo
  60. addq $nlo,$Htbl,$nlo
  61. ldq $Zlo,8($nlo)
  62. addq $nhi,$Htbl,$nhi
  63. ldq $Zhi,0($nlo)
  64. and $Zlo,0x0f,$remp
  65. sll $Zhi,60,$t0
  66. lda $cnt,6(zero)
  67. extbl $Xlo,6,$nlo
  68. ldq $Tlo1,8($nhi)
  69. s8addq $remp,$rem_4bit,$remp
  70. ldq $Thi1,0($nhi)
  71. srl $Zlo,4,$Zlo
  72. ldq $rem,0($remp)
  73. srl $Zhi,4,$Zhi
  74. xor $t0,$Zlo,$Zlo
  75. and $nlo,0xf0,$nhi
  76. xor $Tlo1,$Zlo,$Zlo
  77. sll $nlo,4,$nlo
  78. xor $Thi1,$Zhi,$Zhi
  79. and $nlo,0xf0,$nlo
  80. addq $nlo,$Htbl,$nlo
  81. ldq $Tlo0,8($nlo)
  82. addq $nhi,$Htbl,$nhi
  83. ldq $Thi0,0($nlo)
  84. .Looplo$N:
  85. and $Zlo,0x0f,$remp
  86. sll $Zhi,60,$t0
  87. subq $cnt,1,$cnt
  88. srl $Zlo,4,$Zlo
  89. ldq $Tlo1,8($nhi)
  90. xor $rem,$Zhi,$Zhi
  91. ldq $Thi1,0($nhi)
  92. s8addq $remp,$rem_4bit,$remp
  93. ldq $rem,0($remp)
  94. srl $Zhi,4,$Zhi
  95. xor $t0,$Zlo,$Zlo
  96. extbl $Xlo,$cnt,$nlo
  97. and $nlo,0xf0,$nhi
  98. xor $Thi0,$Zhi,$Zhi
  99. xor $Tlo0,$Zlo,$Zlo
  100. sll $nlo,4,$nlo
  101. and $Zlo,0x0f,$remp
  102. sll $Zhi,60,$t0
  103. and $nlo,0xf0,$nlo
  104. srl $Zlo,4,$Zlo
  105. s8addq $remp,$rem_4bit,$remp
  106. xor $rem,$Zhi,$Zhi
  107. addq $nlo,$Htbl,$nlo
  108. addq $nhi,$Htbl,$nhi
  109. ldq $rem,0($remp)
  110. srl $Zhi,4,$Zhi
  111. ldq $Tlo0,8($nlo)
  112. xor $t0,$Zlo,$Zlo
  113. xor $Tlo1,$Zlo,$Zlo
  114. xor $Thi1,$Zhi,$Zhi
  115. ldq $Thi0,0($nlo)
  116. bne $cnt,.Looplo$N
  117. and $Zlo,0x0f,$remp
  118. sll $Zhi,60,$t0
  119. lda $cnt,7(zero)
  120. srl $Zlo,4,$Zlo
  121. ldq $Tlo1,8($nhi)
  122. xor $rem,$Zhi,$Zhi
  123. ldq $Thi1,0($nhi)
  124. s8addq $remp,$rem_4bit,$remp
  125. ldq $rem,0($remp)
  126. srl $Zhi,4,$Zhi
  127. xor $t0,$Zlo,$Zlo
  128. extbl $Xhi,$cnt,$nlo
  129. and $nlo,0xf0,$nhi
  130. xor $Thi0,$Zhi,$Zhi
  131. xor $Tlo0,$Zlo,$Zlo
  132. sll $nlo,4,$nlo
  133. and $Zlo,0x0f,$remp
  134. sll $Zhi,60,$t0
  135. and $nlo,0xf0,$nlo
  136. srl $Zlo,4,$Zlo
  137. s8addq $remp,$rem_4bit,$remp
  138. xor $rem,$Zhi,$Zhi
  139. addq $nlo,$Htbl,$nlo
  140. addq $nhi,$Htbl,$nhi
  141. ldq $rem,0($remp)
  142. srl $Zhi,4,$Zhi
  143. ldq $Tlo0,8($nlo)
  144. xor $t0,$Zlo,$Zlo
  145. xor $Tlo1,$Zlo,$Zlo
  146. xor $Thi1,$Zhi,$Zhi
  147. ldq $Thi0,0($nlo)
  148. unop
  149. .Loophi$N:
  150. and $Zlo,0x0f,$remp
  151. sll $Zhi,60,$t0
  152. subq $cnt,1,$cnt
  153. srl $Zlo,4,$Zlo
  154. ldq $Tlo1,8($nhi)
  155. xor $rem,$Zhi,$Zhi
  156. ldq $Thi1,0($nhi)
  157. s8addq $remp,$rem_4bit,$remp
  158. ldq $rem,0($remp)
  159. srl $Zhi,4,$Zhi
  160. xor $t0,$Zlo,$Zlo
  161. extbl $Xhi,$cnt,$nlo
  162. and $nlo,0xf0,$nhi
  163. xor $Thi0,$Zhi,$Zhi
  164. xor $Tlo0,$Zlo,$Zlo
  165. sll $nlo,4,$nlo
  166. and $Zlo,0x0f,$remp
  167. sll $Zhi,60,$t0
  168. and $nlo,0xf0,$nlo
  169. srl $Zlo,4,$Zlo
  170. s8addq $remp,$rem_4bit,$remp
  171. xor $rem,$Zhi,$Zhi
  172. addq $nlo,$Htbl,$nlo
  173. addq $nhi,$Htbl,$nhi
  174. ldq $rem,0($remp)
  175. srl $Zhi,4,$Zhi
  176. ldq $Tlo0,8($nlo)
  177. xor $t0,$Zlo,$Zlo
  178. xor $Tlo1,$Zlo,$Zlo
  179. xor $Thi1,$Zhi,$Zhi
  180. ldq $Thi0,0($nlo)
  181. bne $cnt,.Loophi$N
  182. and $Zlo,0x0f,$remp
  183. sll $Zhi,60,$t0
  184. srl $Zlo,4,$Zlo
  185. ldq $Tlo1,8($nhi)
  186. xor $rem,$Zhi,$Zhi
  187. ldq $Thi1,0($nhi)
  188. s8addq $remp,$rem_4bit,$remp
  189. ldq $rem,0($remp)
  190. srl $Zhi,4,$Zhi
  191. xor $t0,$Zlo,$Zlo
  192. xor $Tlo0,$Zlo,$Zlo
  193. xor $Thi0,$Zhi,$Zhi
  194. and $Zlo,0x0f,$remp
  195. sll $Zhi,60,$t0
  196. srl $Zlo,4,$Zlo
  197. s8addq $remp,$rem_4bit,$remp
  198. xor $rem,$Zhi,$Zhi
  199. ldq $rem,0($remp)
  200. srl $Zhi,4,$Zhi
  201. xor $Tlo1,$Zlo,$Zlo
  202. xor $Thi1,$Zhi,$Zhi
  203. xor $t0,$Zlo,$Zlo
  204. xor $rem,$Zhi,$Zhi
  205. ___
  206. }}
  # Start of the generated output.  This is a plain assignment, so $code
  # is (re)initialised here; every later piece is appended with ".=".
  #
  # The text emitted below is the platform-dependent include preamble
  # followed by gcm_gmult_4bit, a frameless routine that
  #   - loads the two quadwords of Xi (offset 8 into $Xlo, offset 0 into
  #     $Xhi),
  #   - calls picmeup with $t0 as the link register so that $rem_4bit
  #     ends up holding the address of the rem_4bit table.
  207. $code=<<___;
  208. #ifdef __linux__
  209. #include <asm/regdef.h>
  210. #else
  211. #include <asm.h>
  212. #include <regdef.h>
  213. #endif
  214. .text
  215. .set noat
  216. .set noreorder
  217. .globl gcm_gmult_4bit
  218. .align 4
  219. .ent gcm_gmult_4bit
  220. gcm_gmult_4bit:
  221. .frame sp,0,ra
  222. .prologue 0
  223. ldq $Xlo,8($Xi)
  224. ldq $Xhi,0($Xi)
  225. bsr $t0,picmeup
  226. nop
  227. ___
  # Multiplication core; leaves the product in $Zhi:$Zlo.
  228. &loop();
  # Epilogue: reverse the byte order of $Zlo and $Zhi (bytes are swapped
  # within each 32-bit word via the shift/zapnot groups, then the two
  # words are exchanged), store them back to Xi at offsets 8 and 0, and
  # return.
  229. $code.=<<___;
  230. srl $Zlo,24,$t0 # byte swap
  231. srl $Zlo,8,$t1
  232. sll $Zlo,8,$t2
  233. sll $Zlo,24,$Zlo
  234. zapnot $t0,0x11,$t0
  235. zapnot $t1,0x22,$t1
  236. zapnot $Zlo,0x88,$Zlo
  237. or $t0,$t1,$t0
  238. zapnot $t2,0x44,$t2
  239. or $Zlo,$t0,$Zlo
  240. srl $Zhi,24,$t0
  241. srl $Zhi,8,$t1
  242. or $Zlo,$t2,$Zlo
  243. sll $Zhi,8,$t2
  244. sll $Zhi,24,$Zhi
  245. srl $Zlo,32,$Xlo
  246. sll $Zlo,32,$Zlo
  247. zapnot $t0,0x11,$t0
  248. zapnot $t1,0x22,$t1
  249. or $Zlo,$Xlo,$Xlo
  250. zapnot $Zhi,0x88,$Zhi
  251. or $t0,$t1,$t0
  252. zapnot $t2,0x44,$t2
  253. or $Zhi,$t0,$Zhi
  254. or $Zhi,$t2,$Zhi
  255. srl $Zhi,32,$Xhi
  256. sll $Zhi,32,$Zhi
  257. or $Zhi,$Xhi,$Xhi
  258. stq $Xlo,8($Xi)
  259. stq $Xhi,0($Xi)
  260. ret (ra)
  261. .end gcm_gmult_4bit
  262. ___
  # Two further register names, used only by gcm_ghash_4bit, for the
  # current 16-byte input block.  The emitted prologue saves s0 and s1 on
  # the stack and the epilogue restores them.
  263. $inhi="s0";
  264. $inlo="s1";
  # gcm_ghash_4bit, part 1: entry and top of the per-block loop.
  #   - allocates a 32-byte frame and saves ra, s0 and s1;
  #   - starts the ldq_u loads for the first block and loads Xi;
  #   - calls picmeup (link register $t0) to set up $rem_4bit;
  #   - at .Louter, combines the ldq_u results with extql/extqh into
  #     $inhi/$inlo, advances $inp by 16, subtracts 16 from $len and xors
  #     the block into $Xhi:$Xlo.
  265. $code.=<<___;
  266. .globl gcm_ghash_4bit
  267. .align 4
  268. .ent gcm_ghash_4bit
  269. gcm_ghash_4bit:
  270. lda sp,-32(sp)
  271. stq ra,0(sp)
  272. stq s0,8(sp)
  273. stq s1,16(sp)
  274. .mask 0x04000600,-32
  275. .frame sp,32,ra
  276. .prologue 0
  277. ldq_u $inhi,0($inp)
  278. ldq_u $Thi0,7($inp)
  279. ldq_u $inlo,8($inp)
  280. ldq_u $Tlo0,15($inp)
  281. ldq $Xhi,0($Xi)
  282. ldq $Xlo,8($Xi)
  283. bsr $t0,picmeup
  284. nop
  285. .Louter:
  286. extql $inhi,$inp,$inhi
  287. extqh $Thi0,$inp,$Thi0
  288. or $inhi,$Thi0,$inhi
  289. lda $inp,16($inp)
  290. extql $inlo,$inp,$inlo
  291. extqh $Tlo0,$inp,$Tlo0
  292. or $inlo,$Tlo0,$inlo
  293. subq $len,16,$len
  294. xor $Xlo,$inlo,$Xlo
  295. xor $Xhi,$inhi,$Xhi
  296. ___
  # Multiplication core (second expansion, so its labels get a new $N).
  297. &loop();
  # gcm_ghash_4bit, part 2, plus the shared helper and data.
  #   - The byte swap of Z is the same sequence as in gcm_gmult_4bit, but
  #     split by "beq $len,.Ldone": the fall-through path interleaves the
  #     ldq_u loads for the next block and branches back to .Louter, while
  #     .Ldone finishes the swap, stores Xi, restores s0/s1 and returns.
  #   - NOTE(review): the only exit test is $len == 0 after subtracting
  #     16, so callers presumably always pass a non-zero multiple of 16;
  #     confirm against the C caller.
  #   - The reload of ra is commented out in the emitted text; ra is not
  #     written by this routine (picmeup is called with $t0 as the link
  #     register), which is presumably why.
  #   - picmeup: "br" captures the address of .Lpic in $rem_4bit and the
  #     following "lda" adds 12, i.e. skips the lda, ret and nop
  #     instructions, so $rem_4bit points at the rem_4bit label.
  #   - rem_4bit: sixteen entries of two .long values each (8 bytes per
  #     entry), matching the s8addq indexing used by loop().
  # Do not add '#' lines inside the here-doc: they would be emitted.
  298. $code.=<<___;
  299. srl $Zlo,24,$t0 # byte swap
  300. srl $Zlo,8,$t1
  301. sll $Zlo,8,$t2
  302. sll $Zlo,24,$Zlo
  303. zapnot $t0,0x11,$t0
  304. zapnot $t1,0x22,$t1
  305. zapnot $Zlo,0x88,$Zlo
  306. or $t0,$t1,$t0
  307. zapnot $t2,0x44,$t2
  308. or $Zlo,$t0,$Zlo
  309. srl $Zhi,24,$t0
  310. srl $Zhi,8,$t1
  311. or $Zlo,$t2,$Zlo
  312. sll $Zhi,8,$t2
  313. sll $Zhi,24,$Zhi
  314. srl $Zlo,32,$Xlo
  315. sll $Zlo,32,$Zlo
  316. beq $len,.Ldone
  317. zapnot $t0,0x11,$t0
  318. zapnot $t1,0x22,$t1
  319. or $Zlo,$Xlo,$Xlo
  320. ldq_u $inhi,0($inp)
  321. zapnot $Zhi,0x88,$Zhi
  322. or $t0,$t1,$t0
  323. zapnot $t2,0x44,$t2
  324. ldq_u $Thi0,7($inp)
  325. or $Zhi,$t0,$Zhi
  326. or $Zhi,$t2,$Zhi
  327. ldq_u $inlo,8($inp)
  328. ldq_u $Tlo0,15($inp)
  329. srl $Zhi,32,$Xhi
  330. sll $Zhi,32,$Zhi
  331. or $Zhi,$Xhi,$Xhi
  332. br zero,.Louter
  333. .Ldone:
  334. zapnot $t0,0x11,$t0
  335. zapnot $t1,0x22,$t1
  336. or $Zlo,$Xlo,$Xlo
  337. zapnot $Zhi,0x88,$Zhi
  338. or $t0,$t1,$t0
  339. zapnot $t2,0x44,$t2
  340. or $Zhi,$t0,$Zhi
  341. or $Zhi,$t2,$Zhi
  342. srl $Zhi,32,$Xhi
  343. sll $Zhi,32,$Zhi
  344. or $Zhi,$Xhi,$Xhi
  345. stq $Xlo,8($Xi)
  346. stq $Xhi,0($Xi)
  347. .set noreorder
  348. /*ldq ra,0(sp)*/
  349. ldq s0,8(sp)
  350. ldq s1,16(sp)
  351. lda sp,32(sp)
  352. ret (ra)
  353. .end gcm_ghash_4bit
  354. .align 4
  355. .ent picmeup
  356. picmeup:
  357. .frame sp,0,$t0
  358. .prologue 0
  359. br $rem_4bit,.Lpic
  360. .Lpic: lda $rem_4bit,12($rem_4bit)
  361. ret ($t0)
  362. .end picmeup
  363. nop
  364. rem_4bit:
  365. .long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
  366. .long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
  367. .long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
  368. .long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
  369. .ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
  370. .align 4
  371. ___
  # Write the generated assembly.  The last command-line argument, when
  # present, names the output file; without it the text goes to the
  # inherited STDOUT.
  $output = pop;
  if (defined $output && length $output) {
      # Three-argument open: the file name is never parsed as part of the
      # mode, so names containing '>', '|' or leading blanks are taken
      # literally.  A failure to create the file is reported immediately
      # rather than being ignored.
      open STDOUT, '>', $output
          or die "can't open $output: $!";
  }
  print STDOUT $code or die "error writing output: $!";
  # Buffered write errors surface at close time, so keep checking it.
  close STDOUT or die "error closing STDOUT: $!";