#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [plus a 128-byte shared
# table]. Streamed GHASH performance was measured at 6.67 cycles per
# processed byte on Itanium 2, which is >90% better than Microsoft
# compiler-generated code. To anchor this to something else, the
# sha1-ia64.pl module processes one byte in 5.7 cycles. On original
# Itanium GHASH should run at ~8.5 cycles per byte.
#
# September 2010
#
# It was originally thought that implementing the "528B" variant on
# Itanium 2 made little sense, for the following reason: because the
# number of functional units is naturally limited, it appeared
# impossible to implement the "528B" loop in 4 cycles, only in 5,
# which would mean that the theoretical performance improvement
# couldn't be more than 20%. But occasionally you prove yourself
# wrong :-) I figured out a way to fold a couple of instructions and
# freed yet another instruction slot by unrolling the loop... The
# resulting performance is 4.45 cycles per processed byte, 50% better
# than the "256B" version. On original Itanium performance should
# remain the same as for the "256B" version, i.e. ~8.5 cycles per byte.
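#
# As an illustrative sketch only (not used by this generator), the
# per-nibble step that the "256B" code path implements can be modelled
# in plain Perl as below. It assumes a 64-bit perl; Htable[nibble] is
# passed in as ($Hhi,$Hlo), and @rem_4bit holds the same pre-shifted
# 16-bit<<48 constants as the rem_4bit table emitted further down. The
# sub name is hypothetical and exists purely for exposition.

sub _ghash_4bit_step_sketch {
	my ($Zhi,$Zlo,$Hhi,$Hlo,@rem_4bit) = @_;
	my $rem = $Zlo & 0xf;				# bits about to be shifted out
	$Zlo = (($Zhi & 0xf) << 60) | ($Zlo >> 4);	# Z >>= 4 (128-bit shift)
	$Zhi = ($Zhi >> 4) ^ $rem_4bit[$rem];		# fold reduction back into Z.hi
	return ($Zhi ^ $Hhi, $Zlo ^ $Hlo);		# xor in Htable[nibble]
}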

$output=pop and (open STDOUT,">$output" or die "can't open $output: $!");

if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
for (@ARGV) {  $big_endian=1 if (/\-DB_ENDIAN/);
               $big_endian=0 if (/\-DL_ENDIAN/);  }
if (!defined($big_endian))
  { $big_endian=(unpack('L',pack('N',1))==1); }
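
# The test above is a run-time endianness probe: 1 is packed as a
# big-endian 32-bit value and read back in native order, so it compares
# equal to 1 only on a big-endian host.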

sub loop() {
my $label=shift;
my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17");	# mask references to inp

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
#
# Special note about the 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works because rem_4bit is aligned at a
# 128-byte boundary and the lower 7 bits of its address are therefore
# guaranteed to be zero.
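#
# Concretely (an illustrative identity, not something the generator
# relies on), 'dep rem=Zlo,rem_4bitp,3,4' computes
#
#	rem = (rem_4bitp & ~0x78) | ((Zlo & 0xf) << 3)
#
# which, with the low 7 bits of rem_4bitp known to be zero, is exactly
# rem_4bit + 8*(Zlo & 0xf), i.e. &rem_4bit[Zlo & 0xf].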
$code.=<<___;
$label:
{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p19)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p19)	xor	Zhi=Zhi,Hhi
	($p17)	xor	xi[1]=xi[1],in[1]	};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p19)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p19)	ld8	rem=[rem]
	(p18)	and	Hi[1]=mask0xf0,xi[2]	};;
{ .mmi;	($p16)	ld1	in[0]=[inp],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p19)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p19)	xor	Hhi=Hhi,rem
	(p18)	add	Hi[1]=Htbl,Hi[1]	};;

{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p18)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p17)	shladd	Hi[0]=xi[1],4,r0
	(p18)	xor	Zhi=Zhi,Hhi		};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p18)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p18)	ld8	rem=[rem]
	(p17)	and	Hi[0]=mask0xf0,Hi[0]	};;
{ .mmi;	(p16)	ld1	xi[0]=[Xi],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p18)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p18)	xor	Hhi=Hhi,rem
	(p17)	add	Hi[0]=Htbl,Hi[0]
	br.ctop.sptk	$label			};;
___
}

$code=<<___;
.explicit
.text

prevfs=r2;	prevlc=r3;	prevpr=r8;
mask0xf0=r21;
rem=r22;	rem_4bitp=r23;
Xi=r24;		Htbl=r25;
inp=r26;	end=r27;
Hhi=r28;	Hlo=r29;
Zhi=r30;	Zlo=r31;

.align	128
.skip	16					// aligns loop body
.global	gcm_gmult_4bit#
.proc	gcm_gmult_4bit#
gcm_gmult_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,2,6,0,8
	$ADDP	Xi=15,in0			// &Xi[15]
	mov	rem_4bitp=ip		}
{ .mii;	$ADDP	Htbl=8,in1			// &Htbl[0].lo
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc
	.save	pr,prevpr
	mov	prevpr=pr		};;

	.body
	.rotr	in[3],xi[3],Hi[2]

{ .mib;	ld1	xi[2]=[Xi],-1			// Xi[15]
	mov	mask0xf0=0xf0
	brp.loop.imp	.Loop1,.Lend1-16	};;
{ .mmi;	ld1	xi[1]=[Xi],-1			// Xi[14]
					};;
{ .mii;	shladd	Hi[1]=xi[2],4,r0
	mov	pr.rot=0x7<<16
	mov	ar.lc=13		};;
{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
	mov	ar.ec=3
	xor	Zlo=Zlo,Zlo		};;
{ .mii;	add	Hi[1]=Htbl,Hi[1]		// &Htbl[nlo].lo
	add	rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
	xor	Zhi=Zhi,Zhi		};;
___
&loop	(".Loop1",1);
$code.=<<___;
.Lend1:
{ .mib;	xor	Zhi=Zhi,Hhi		};;	// modulo-scheduling artefact
{ .mib;	mux1	Zlo=Zlo,\@rev		};;
{ .mib;	mux1	Zhi=Zhi,\@rev		};;
{ .mmi;	add	Hlo=9,Xi;;			// ;; is here to prevent
	add	Hhi=1,Xi		};;	// pipeline flush on Itanium
{ .mib;	st8	[Hlo]=Zlo
	mov	pr=prevpr,0x1ffff	};;
{ .mib;	st8	[Hhi]=Zhi
	mov	ar.lc=prevlc
	br.ret.sptk.many	b0	};;
.endp	gcm_gmult_4bit#
___

######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
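# In this variant the whole Htable[] is copied to the stack and a
# second, 4-bit-right-shifted copy (Hshr4[], built below) is placed
# next to it, so that one input byte is absorbed per iteration: the
# low nibble indexes Htable[], the high nibble indexes Hshr4[], and a
# shared 512-byte rem_8bit[] table folds the reduction back in. Per
# processed byte the loop effectively computes (a sketch mirroring the
# per-instruction comments in the loop body below):
#
#	Z ^= Htable[nlo];				// 128-bit xor
#	rem  = (Z.lo ^ (Htable[nhi].lo<<4)) & 0xff;
#	Z.lo = (Z.hi<<56)|(Z.lo>>8);	Z.hi >>= 8;	// Z >>= 8
#	Z.lo ^= Hshr4[nhi].lo;		Z.hi ^= Hshr4[nhi].hi;
#	Z.hi ^= rem_8bit[rem]<<48;
#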
$Xip="in0";
$Htbl="in1";
$inp="in2";
$len="in3";
$rem_8bit="loc0";
$mask0xff="loc1";
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
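
# load_htable() emits code that pulls all 16 Htable[] entries in: the
# first eight into general registers r16-r31 and the remaining eight,
# as raw 8-byte loads, into f32-f47, interleaving the instructions
# passed as arguments into the otherwise unused slots.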
sub load_htable() {
    for (my $i=0;$i<8;$i++) {
	$code.=<<___;
{ .mmi;	ld8	r`16+2*$i+1`=[r8],16		// Htable[$i].hi
	ld8	r`16+2*$i`=[r9],16	}	// Htable[$i].lo
{ .mmi;	ldf8	f`32+2*$i+1`=[r10],16		// Htable[`8+$i`].hi
	ldf8	f`32+2*$i`=[r11],16		// Htable[`8+$i`].lo
___
	$code.=shift	if (($i+$#_)==7);
	$code.="\t};;\n"
    }
}

$code.=<<___;
prevsp=r3;

.align	32
.skip	16					// aligns loop body
.global	gcm_ghash_4bit#
.proc	gcm_ghash_4bit#
gcm_ghash_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,4,2,0,0
	.vframe	prevsp
	mov	prevsp=sp
	mov	$rem_8bit=ip		};;
	.body
{ .mfi;	$ADDP	r8=0+0,$Htbl
	$ADDP	r9=0+8,$Htbl		}
{ .mfi;	$ADDP	r10=128+0,$Htbl
	$ADDP	r11=128+8,$Htbl		};;
___
&load_htable(
	"	$ADDP	$Xip=15,$Xip",		# &Xi[15]
	"	$ADDP	$len=$len,$inp",	# &inp[len]
	"	$ADDP	$inp=15,$inp",		# &inp[15]
	"	mov	$mask0xff=0xff",
	"	add	sp=-512,sp",
	"	andcm	sp=sp,$mask0xff",	# align stack frame
	"	add	r14=0,sp",
	"	add	r15=8,sp");
$code.=<<___;
{ .mmi;	$sum	1<<1				// go big-endian
	add	r8=256+0,sp
	add	r9=256+8,sp		}
{ .mmi;	add	r10=256+128+0,sp
	add	r11=256+128+8,sp
	add	$len=-17,$len		};;
___
for($i=0;$i<8;$i++) {	# generate first half of Hshr4[]
my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
$code.=<<___;
{ .mmi;	st8	[r8]=$rlo,16		// Htable[$i].lo
	st8	[r9]=$rhi,16		// Htable[$i].hi
	shrp	$rlo=$rhi,$rlo,4	}//;;
{ .mmi;	stf8	[r10]=f`32+2*$i`,16	// Htable[`8+$i`].lo
	stf8	[r11]=f`32+2*$i+1`,16	// Htable[`8+$i`].hi
	shr.u	$rhi=$rhi,4		};;
{ .mmi;	st8	[r14]=$rlo,16		// Htable[$i].lo>>4
	st8	[r15]=$rhi,16		}//;;	// Htable[$i].hi>>4
___
}
$code.=<<___;
{ .mmi;	ld8	r16=[r8],16		// Htable[8].lo
	ld8	r17=[r9],16		};;	// Htable[8].hi
{ .mmi;	ld8	r18=[r8],16		// Htable[9].lo
	ld8	r19=[r9],16		}	// Htable[9].hi
{ .mmi;	rum	1<<5			// clear um.mfh
	shrp	r16=r17,r16,4		};;
___
for($i=0;$i<6;$i++) {	# generate second half of Hshr4[]
$code.=<<___;
{ .mmi;	ld8	r`20+2*$i`=[r8],16	// Htable[`10+$i`].lo
	ld8	r`20+2*$i+1`=[r9],16	// Htable[`10+$i`].hi
	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16	// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16	// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
___
}
$code.=<<___;
{ .mmi;	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16	// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16	// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
{ .mmi;	add	$Htbl=256,sp		// &Htable[0]
	add	$rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
	shr.u	r`18+2*$i+1`=r`18+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`18+2*$i`	// Htable[`8+$i`].lo>>4
	st8	[r15]=r`18+2*$i+1`	}	// Htable[`8+$i`].hi>>4
___

$in="r15";
@xi=("r16","r17");
@rem=("r18","r19");
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
($Atbl,$Btbl)=("r26","r27");

$code.=<<___;	# (p16)
{ .mmi;	ld1	$in=[$inp],-1			//(p16) *inp--
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	cmp.eq	p0,p6=r0,r0		};;	// clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p16),(p17)
{ .mmi;	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mii;	ld1	$in=[$inp],-1			//(p16) *inp--
	dep	$Atbl=$xi[1],$Htbl,4,4		//(p17) &Htable[nlo].lo
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
.align	32
.LOOP:
{ .mmi;
(p6)	st8	[$Xip]=$Zhi,13
	xor	$Zlo=$Zlo,$Zlo
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p16),(p17),(p18)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mfi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo		};;	//(p18) Z.lo^=Htable[nlo].lo
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1		}	//(p16) *inp--
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	mov	$Zhi=$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

for ($i=1;$i<14;$i++) {
# Above and below fragments are derived from this one by removing
# unsuitable (p??) instructions.
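#
# Stage mapping used throughout (mirroring the //(p1x) comments): p16
# fetches the next input and Xi bytes, p17 xors them and forms the two
# table pointers, p18 xors in Htable[nlo] and starts the byte-wise
# shift of Z, and p19 finishes the shift and folds in Hshr4[nhi] and
# the rem_8bit reduction. The fragments emitted before this steady-state
# loop fill the software pipeline; the ones after it drain it.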
$code.=<<___;	# (p16),(p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1			//(p16) *inp--
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
}
$code.=<<___;	# (p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	};;	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
$code.=<<___;	# (p18),(p19)
{ .mfi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mfi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo		};;	//(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	xor	$Zlo=$Zlo,$Alo		}	//(p18) Z.lo^=Htable[nlo].lo
{ .mfi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi;	ld8	$Blo=[$Btbl],8			//(p18) Htable[nhi].lo,&Htable[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mfi;	shladd	$rem[0]=$Zlo,4,r0		//(p18) Z.lo<<4
	xor	$Zhi=$Zhi,$Ahi		};;	//(p18) Z.hi^=Htable[nlo].hi
{ .mfi;	ld8	$Bhi=[$Btbl]			//(p18) Htable[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,4	}	//(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]	};;	//(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
$code.=<<___;	# (p19)
{ .mmi;	cmp.ltu	p6,p0=$inp,$len
	add	$inp=32,$inp
	shr.u	$Zhi=$Zhi,4		}	//(p19) Z.hi>>=4
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	add	$Xip=9,$Xip		};;	// &Xi.lo
{ .mmi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
(p6)	extr.u	$xi[1]=$Zlo,8,8		}	//[p17] Xi[14]
{ .mmi;	xor	$Zhi=$Zhi,$Bhi			//(p19) Z.hi^=Hshr4[nhi].hi
(p6)	and	$xi[0]=$Zlo,$mask0xff	};;	//[p16] Xi[15]
{ .mmi;	st8	[$Xip]=$Zlo,-8
(p6)	xor	$xi[0]=$xi[0],$in		//[p17] xi=$xi[i]^inp[i]
	shl	$rem[1]=$rem[1],48	};;	//(p19) rem_8bit[rem]<<48
{ .mmi;
(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
(p6)	dep	$Atbl=$xi[0],$Htbl,4,4	}	//[p17] &Htable[nlo].lo
{ .mib;
(p6)	and	$xi[0]=-16,$xi[0]		//[p17] nhi=xi&0xf0
(p6)	br.cond.dptk.many	.LOOP	};;

{ .mib;	st8	[$Xip]=$Zhi		};;
{ .mib;	$rum	1<<1				// return to little-endian
	.restore	sp
	mov	sp=prevsp
	br.ret.sptk.many	b0	};;
.endp	gcm_ghash_4bit#
___
$code.=<<___;
.align	128
.type	rem_4bit#,\@object
rem_4bit:
	data8	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
	data8	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
	data8	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
	data8	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size	rem_4bit#,128
.type	rem_8bit#,\@object
rem_8bit:
	data1	0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
	data1	0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
	data1	0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
	data1	0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
	data1	0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
	data1	0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
	data1	0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
	data1	0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
	data1	0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
	data1	0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
	data1	0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
	data1	0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
	data1	0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
	data1	0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
	data1	0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
	data1	0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
	data1	0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
	data1	0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
	data1	0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
	data1	0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
	data1	0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
	data1	0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
	data1	0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
	data1	0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
	data1	0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
	data1	0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
	data1	0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
	data1	0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
	data1	0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
	data1	0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
	data1	0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
	data1	0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size	rem_8bit#,512
stringz	"GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm	if ($big_endian);
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";