  1. #! /usr/bin/env perl
  2. # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # March, June 2010
  17. #
# The module implements the "4-bit" GCM GHASH function and the underlying
# single multiplication operation in GF(2^128). "4-bit" means that it uses
# a 256-byte per-key table [+128 bytes shared table]; a commented reference
# sketch of the operation is given after the performance notes below. The
# GHASH function also features a so-called "528B" variant utilizing an
# additional 256+16 bytes of per-key storage [+512 bytes shared table].
# Performance results are for this streamed GHASH subroutine and are
# expressed in cycles per processed byte, less is better:
  25. #
#                       gcc 3.4.x(*)    assembler
#
# P4                    28.6            14.0            +100%
# Opteron               19.3             7.7            +150%
# Core2                 17.8             8.1(**)        +120%
# Atom                  31.6            16.8            +88%
# VIA Nano              21.8            10.1            +115%
  33. #
# (*)  the comparison is not completely fair, because the C results are
#      for the vanilla "256B" implementation, while the assembler results
#      are for "528B";-)
# (**) it's a mystery [to me] why the Core2 result is not the same as
#      Opteron's;
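#
# For reference, both the table-driven and the PCLMULQDQ code below
# implement the GF(2^128) product in GCM's bit-reflected convention
# (NIST SP 800-38D): shift right one bit at a time and, whenever a bit
# falls off the 128-bit block, xor in the 0xE1 reduction constant. The
# commented Perl below is only an illustrative bit-at-a-time sketch and
# is not part of this module; the sub name gf128_mul and the (hi,lo)
# 64-bit-half representation are ad hoc.
#
#   sub gf128_mul {                     # Z = X*Y in GF(2^128)
#       my ($Xhi,$Xlo,$Yhi,$Ylo) = @_;  # blocks as two 64-bit halves,
#                                       # hi = bytes 0..7, bit 0 = MSB of hi
#       my ($Zhi,$Zlo) = (0,0);
#       my ($Vhi,$Vlo) = ($Yhi,$Ylo);
#       for my $i (0..127) {
#           my $bit = $i<64 ? ($Xhi>>(63-$i))&1 : ($Xlo>>(127-$i))&1;
#           if ($bit) { $Zhi ^= $Vhi; $Zlo ^= $Vlo; }
#           my $carry = $Vlo & 1;       # bit falling off the block
#           $Vlo = ($Vlo>>1) | (($Vhi&1)<<63);
#           $Vhi >>= 1;
#           $Vhi ^= 0xe100000000000000 if ($carry); # x^128+x^7+x^2+x+1
#       }
#       return ($Zhi,$Zlo);
#   }
#
# The "4-bit" code below consumes a nibble of Xi per step instead of a
# single bit: the 256-byte per-key table holds the sixteen products i*H
# for all 4-bit values i, and the shared rem_4bit/rem_8bit tables supply
# the reduction constants for the bits shifted out in one go.
#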
  39. # May 2010
  40. #
  41. # Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
  42. # See ghash-x86.pl for background information and details about coding
  43. # techniques.
  44. #
  45. # Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.
#
# December 2012
  48. #
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase the reduction aggregation factor to 4x (the
# aggregated identity is spelled out after the performance list below).
# As for the latter, ghash-x86.pl argues that it makes little sense to
# increase the aggregation factor. Then why increase it here? The
# critical path consists of 3 independent pclmulqdq instructions,
# Karatsuba post-processing and reduction. "On top" of this we lay down
# aggregated multiplication operations, triplets of independent
# pclmulqdq's. As the issue rate for pclmulqdq is limited, it makes
# little sense to aggregate more multiplications than it takes to
# perform the remaining non-multiplication operations. 2x is a
# near-optimal factor for contemporary Intel CPUs (hence the modest
# improvement), but not for Bulldozer, whose logical SIMD operations
# are twice as slow as Intel's, so that the critical path is longer.
# A CPU with a higher pclmulqdq issue rate would also benefit from a
# higher aggregation factor...
  64. #
  65. # Westmere 1.78(+13%)
  66. # Sandy Bridge 1.80(+8%)
  67. # Ivy Bridge 1.80(+7%)
  68. # Haswell 0.55(+93%) (if system doesn't support AVX)
  69. # Broadwell 0.45(+110%)(if system doesn't support AVX)
  70. # Skylake 0.44(+110%)(if system doesn't support AVX)
  71. # Bulldozer 1.49(+27%)
  72. # Silvermont 2.88(+13%)
  73. # Knights L 2.12(-) (if system doesn't support AVX)
  74. # Goldmont 1.08(+24%)
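#
# For the record, "4x aggregation" means that four blocks are multiplied
# per reduction, using the identity that also appears as a comment in
# the PCLMULQDQ path further down:
#
#   Xi+4 = [(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
#
# i.e. the per-block multiplications are mutually independent and can be
# interleaved, while only one reduction modulo P is performed per group
# of four (or, in the AVX path below, eight, with powers up to H^8).
#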
  75. # March 2013
  76. #
# ... the 8x-aggregation-factor AVX code path uses the reduction
# algorithm suggested by Shay Gueron[1]. Even though contemporary
# AVX-capable CPUs such as Sandy and Ivy Bridge can execute it, the code
# performs sub-optimally in comparison to the above-mentioned version.
# But thanks to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew
# that it performs at 0.41 cycles per byte on Haswell, 0.29 on Broadwell,
# and 0.36 on Skylake.
  84. #
  85. # Knights Landing achieves 1.09 cpb.
  86. #
  87. # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
  88. $flavour = shift;
  89. $output = shift;
  90. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  91. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  92. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  93. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  94. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  95. die "can't locate x86_64-xlate.pl";
  96. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  97. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  98. $avx = ($1>=2.20) + ($1>=2.22);
  99. }
  100. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  101. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  102. $avx = ($1>=2.09) + ($1>=2.10);
  103. }
  104. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  105. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  106. $avx = ($1>=10) + ($1>=11);
  107. }
  108. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  109. $avx = ($2>=3.0) + ($2>3.0);
  110. }
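# $avx ends up 0, 1 or 2 depending on how recent the assembler/compiler
# is (the same convention as in other perlasm modules); this file only
# distinguishes zero from non-zero when deciding whether to emit the
# AVX code paths below.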
  111. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  112. *STDOUT=*OUT;
  113. $do4xaggr=1;
  114. # common register layout
  115. $nlo="%rax";
  116. $nhi="%rbx";
  117. $Zlo="%r8";
  118. $Zhi="%r9";
  119. $tmp="%r10";
  120. $rem_4bit = "%r11";
  121. $Xi="%rdi";
  122. $Htbl="%rsi";
  123. # per-function register layout
  124. $cnt="%rcx";
  125. $rem="%rdx";
  126. sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
  127. $r =~ s/%[er]([sd]i)/%\1l/ or
  128. $r =~ s/%[er](bp)/%\1l/ or
  129. $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
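# LB() maps a register name to its low-byte alias, e.g. %rax/%eax -> %al,
# %rsi -> %sil, %rbp -> %bpl, %r10/%r10d -> %r10b.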
  130. sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
  131. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  132. my $arg = pop;
  133. $arg = "\$$arg" if ($arg*1 eq $arg);
  134. $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  135. }
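# Thanks to AUTOLOAD, a call such as &mov("%rdx","8($inp)") emits
# "mov 8($inp),%rdx": arguments are written destination-first (32-bit
# perlasm style) and reversed into AT&T order, and a bare numeric last
# argument is turned into an immediate ("$" prefix).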
  136. { my $N;
  137. sub loop() {
  138. my $inp = shift;
  139. $N++;
  140. $code.=<<___;
  141. xor $nlo,$nlo
  142. xor $nhi,$nhi
  143. mov `&LB("$Zlo")`,`&LB("$nlo")`
  144. mov `&LB("$Zlo")`,`&LB("$nhi")`
  145. shl \$4,`&LB("$nlo")`
  146. mov \$14,$cnt
  147. mov 8($Htbl,$nlo),$Zlo
  148. mov ($Htbl,$nlo),$Zhi
  149. and \$0xf0,`&LB("$nhi")`
  150. mov $Zlo,$rem
  151. jmp .Loop$N
  152. .align 16
  153. .Loop$N:
  154. shr \$4,$Zlo
  155. and \$0xf,$rem
  156. mov $Zhi,$tmp
  157. mov ($inp,$cnt),`&LB("$nlo")`
  158. shr \$4,$Zhi
  159. xor 8($Htbl,$nhi),$Zlo
  160. shl \$60,$tmp
  161. xor ($Htbl,$nhi),$Zhi
  162. mov `&LB("$nlo")`,`&LB("$nhi")`
  163. xor ($rem_4bit,$rem,8),$Zhi
  164. mov $Zlo,$rem
  165. shl \$4,`&LB("$nlo")`
  166. xor $tmp,$Zlo
  167. dec $cnt
  168. js .Lbreak$N
  169. shr \$4,$Zlo
  170. and \$0xf,$rem
  171. mov $Zhi,$tmp
  172. shr \$4,$Zhi
  173. xor 8($Htbl,$nlo),$Zlo
  174. shl \$60,$tmp
  175. xor ($Htbl,$nlo),$Zhi
  176. and \$0xf0,`&LB("$nhi")`
  177. xor ($rem_4bit,$rem,8),$Zhi
  178. mov $Zlo,$rem
  179. xor $tmp,$Zlo
  180. jmp .Loop$N
  181. .align 16
  182. .Lbreak$N:
  183. shr \$4,$Zlo
  184. and \$0xf,$rem
  185. mov $Zhi,$tmp
  186. shr \$4,$Zhi
  187. xor 8($Htbl,$nlo),$Zlo
  188. shl \$60,$tmp
  189. xor ($Htbl,$nlo),$Zhi
  190. and \$0xf0,`&LB("$nhi")`
  191. xor ($rem_4bit,$rem,8),$Zhi
  192. mov $Zlo,$rem
  193. xor $tmp,$Zlo
  194. shr \$4,$Zlo
  195. and \$0xf,$rem
  196. mov $Zhi,$tmp
  197. shr \$4,$Zhi
  198. xor 8($Htbl,$nhi),$Zlo
  199. shl \$60,$tmp
  200. xor ($Htbl,$nhi),$Zhi
  201. xor $tmp,$Zlo
  202. xor ($rem_4bit,$rem,8),$Zhi
  203. bswap $Zlo
  204. bswap $Zhi
  205. ___
  206. }}
  207. $code=<<___;
  208. .text
  209. .extern OPENSSL_ia32cap_P
  210. .globl gcm_gmult_4bit
  211. .type gcm_gmult_4bit,\@function,2
  212. .align 16
  213. gcm_gmult_4bit:
  214. .cfi_startproc
  215. push %rbx
  216. .cfi_push %rbx
  217. push %rbp # %rbp and others are pushed exclusively in
  218. .cfi_push %rbp
  219. push %r12 # order to reuse Win64 exception handler...
  220. .cfi_push %r12
  221. push %r13
  222. .cfi_push %r13
  223. push %r14
  224. .cfi_push %r14
  225. push %r15
  226. .cfi_push %r15
  227. sub \$280,%rsp
  228. .cfi_adjust_cfa_offset 280
  229. .Lgmult_prologue:
  230. movzb 15($Xi),$Zlo
  231. lea .Lrem_4bit(%rip),$rem_4bit
  232. ___
  233. &loop ($Xi);
  234. $code.=<<___;
  235. mov $Zlo,8($Xi)
  236. mov $Zhi,($Xi)
  237. lea 280+48(%rsp),%rsi
  238. .cfi_def_cfa %rsi,8
  239. mov -8(%rsi),%rbx
  240. .cfi_restore %rbx
  241. lea (%rsi),%rsp
  242. .cfi_def_cfa_register %rsp
  243. .Lgmult_epilogue:
  244. ret
  245. .cfi_endproc
  246. .size gcm_gmult_4bit,.-gcm_gmult_4bit
  247. ___
  248. # per-function register layout
  249. $inp="%rdx";
  250. $len="%rcx";
  251. $rem_8bit=$rem_4bit;
  252. $code.=<<___;
  253. .globl gcm_ghash_4bit
  254. .type gcm_ghash_4bit,\@function,4
  255. .align 16
  256. gcm_ghash_4bit:
  257. .cfi_startproc
  258. push %rbx
  259. .cfi_push %rbx
  260. push %rbp
  261. .cfi_push %rbp
  262. push %r12
  263. .cfi_push %r12
  264. push %r13
  265. .cfi_push %r13
  266. push %r14
  267. .cfi_push %r14
  268. push %r15
  269. .cfi_push %r15
  270. sub \$280,%rsp
  271. .cfi_adjust_cfa_offset 280
  272. .Lghash_prologue:
  273. mov $inp,%r14 # reassign couple of args
  274. mov $len,%r15
  275. ___
  276. { my $inp="%r14";
  277. my $dat="%edx";
  278. my $len="%r15";
  279. my @nhi=("%ebx","%ecx");
  280. my @rem=("%r12","%r13");
  281. my $Hshr4="%rbp";
  282. &sub ($Htbl,-128); # size optimization
  283. &lea ($Hshr4,"16+128(%rsp)");
  284. { my @lo =($nlo,$nhi);
  285. my @hi =($Zlo,$Zhi);
  286. &xor ($dat,$dat);
  287. for ($i=0,$j=-2;$i<18;$i++,$j++) {
  288. &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
  289. &or ($lo[0],$tmp) if ($i>1);
  290. &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
  291. &shr ($lo[1],4) if ($i>0 && $i<17);
  292. &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
  293. &shr ($hi[1],4) if ($i>0 && $i<17);
  294. &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
  295. &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
  296. &shl (&LB($dat),4) if ($i>0 && $i<17);
  297. &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
  298. &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
  299. &shl ($tmp,60) if ($i>0 && $i<17);
  300. push (@lo,shift(@lo));
  301. push (@hi,shift(@hi));
  302. }
  303. }
  304. &add ($Htbl,-128);
  305. &mov ($Zlo,"8($Xi)");
  306. &mov ($Zhi,"0($Xi)");
  307. &add ($len,$inp); # pointer to the end of data
  308. &lea ($rem_8bit,".Lrem_8bit(%rip)");
  309. &jmp (".Louter_loop");
  310. $code.=".align 16\n.Louter_loop:\n";
  311. &xor ($Zhi,"($inp)");
  312. &mov ("%rdx","8($inp)");
  313. &lea ($inp,"16($inp)");
  314. &xor ("%rdx",$Zlo);
  315. &mov ("($Xi)",$Zhi);
  316. &mov ("8($Xi)","%rdx");
  317. &shr ("%rdx",32);
  318. &xor ($nlo,$nlo);
  319. &rol ($dat,8);
  320. &mov (&LB($nlo),&LB($dat));
  321. &movz ($nhi[0],&LB($dat));
  322. &shl (&LB($nlo),4);
  323. &shr ($nhi[0],4);
  324. for ($j=11,$i=0;$i<15;$i++) {
  325. &rol ($dat,8);
  326. &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
  327. &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
  328. &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
  329. &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
  330. &mov (&LB($nlo),&LB($dat));
  331. &xor ($Zlo,$tmp) if ($i>0);
  332. &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
  333. &movz ($nhi[1],&LB($dat));
  334. &shl (&LB($nlo),4);
  335. &movzb ($rem[0],"(%rsp,$nhi[0])");
  336. &shr ($nhi[1],4) if ($i<14);
  337. &and ($nhi[1],0xf0) if ($i==14);
  338. &shl ($rem[1],48) if ($i>0);
  339. &xor ($rem[0],$Zlo);
  340. &mov ($tmp,$Zhi);
  341. &xor ($Zhi,$rem[1]) if ($i>0);
  342. &shr ($Zlo,8);
  343. &movz ($rem[0],&LB($rem[0]));
  344. &mov ($dat,"$j($Xi)") if (--$j%4==0);
  345. &shr ($Zhi,8);
  346. &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
  347. &shl ($tmp,56);
  348. &xor ($Zhi,"($Hshr4,$nhi[0],8)");
  349. unshift (@nhi,pop(@nhi)); # "rotate" registers
  350. unshift (@rem,pop(@rem));
  351. }
  352. &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
  353. &xor ($Zlo,"8($Htbl,$nlo)");
  354. &xor ($Zhi,"($Htbl,$nlo)");
  355. &shl ($rem[1],48);
  356. &xor ($Zlo,$tmp);
  357. &xor ($Zhi,$rem[1]);
  358. &movz ($rem[0],&LB($Zlo));
  359. &shr ($Zlo,4);
  360. &mov ($tmp,$Zhi);
  361. &shl (&LB($rem[0]),4);
  362. &shr ($Zhi,4);
  363. &xor ($Zlo,"8($Htbl,$nhi[0])");
  364. &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
  365. &shl ($tmp,60);
  366. &xor ($Zhi,"($Htbl,$nhi[0])");
  367. &xor ($Zlo,$tmp);
  368. &shl ($rem[0],48);
  369. &bswap ($Zlo);
  370. &xor ($Zhi,$rem[0]);
  371. &bswap ($Zhi);
  372. &cmp ($inp,$len);
  373. &jb (".Louter_loop");
  374. }
  375. $code.=<<___;
  376. mov $Zlo,8($Xi)
  377. mov $Zhi,($Xi)
  378. lea 280+48(%rsp),%rsi
  379. .cfi_def_cfa %rsi,8
  380. mov -48(%rsi),%r15
  381. .cfi_restore %r15
  382. mov -40(%rsi),%r14
  383. .cfi_restore %r14
  384. mov -32(%rsi),%r13
  385. .cfi_restore %r13
  386. mov -24(%rsi),%r12
  387. .cfi_restore %r12
  388. mov -16(%rsi),%rbp
  389. .cfi_restore %rbp
  390. mov -8(%rsi),%rbx
  391. .cfi_restore %rbx
  392. lea 0(%rsi),%rsp
  393. .cfi_def_cfa_register %rsp
  394. .Lghash_epilogue:
  395. ret
  396. .cfi_endproc
  397. .size gcm_ghash_4bit,.-gcm_ghash_4bit
  398. ___
  399. ######################################################################
  400. # PCLMULQDQ version.
  401. @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
  402. ("%rdi","%rsi","%rdx","%rcx"); # Unix order
  403. ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
  404. ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
  405. sub clmul64x64_T2 { # minimal register pressure
  406. my ($Xhi,$Xi,$Hkey,$HK)=@_;
  407. if (!defined($HK)) { $HK = $T2;
  408. $code.=<<___;
  409. movdqa $Xi,$Xhi #
  410. pshufd \$0b01001110,$Xi,$T1
  411. pshufd \$0b01001110,$Hkey,$T2
  412. pxor $Xi,$T1 #
  413. pxor $Hkey,$T2
  414. ___
  415. } else {
  416. $code.=<<___;
  417. movdqa $Xi,$Xhi #
  418. pshufd \$0b01001110,$Xi,$T1
  419. pxor $Xi,$T1 #
  420. ___
  421. }
  422. $code.=<<___;
  423. pclmulqdq \$0x00,$Hkey,$Xi #######
  424. pclmulqdq \$0x11,$Hkey,$Xhi #######
  425. pclmulqdq \$0x00,$HK,$T1 #######
  426. pxor $Xi,$T1 #
  427. pxor $Xhi,$T1 #
  428. movdqa $T1,$T2 #
  429. psrldq \$8,$T1
  430. pslldq \$8,$T2 #
  431. pxor $T1,$Xhi
  432. pxor $T2,$Xi #
  433. ___
  434. }
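# clmul64x64_T2 above is the textbook Karatsuba trick for a carry-less
# 128x128-bit multiplication: with A = A1*2^64 + A0, B = B1*2^64 + B0
# and addition being xor in GF(2)[x],
#
#   A*B = A1*B1*2^128
#       + [(A1^A0)*(B1^B0) ^ A1*B1 ^ A0*B0]*2^64
#       + A0*B0
#
# so three pclmulqdq instructions suffice instead of four, at the cost
# of the extra xors that the "aggregated Karatsuba post-processing"
# elsewhere in this file amortizes over several blocks.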
  435. sub reduction_alg9 { # 17/11 times faster than Intel version
  436. my ($Xhi,$Xi) = @_;
  437. $code.=<<___;
  438. # 1st phase
  439. movdqa $Xi,$T2 #
  440. movdqa $Xi,$T1
  441. psllq \$5,$Xi
  442. pxor $Xi,$T1 #
  443. psllq \$1,$Xi
  444. pxor $T1,$Xi #
  445. psllq \$57,$Xi #
  446. movdqa $Xi,$T1 #
  447. pslldq \$8,$Xi
  448. psrldq \$8,$T1 #
  449. pxor $T2,$Xi
  450. pxor $T1,$Xhi #
  451. # 2nd phase
  452. movdqa $Xi,$T2
  453. psrlq \$1,$Xi
  454. pxor $T2,$Xhi #
  455. pxor $Xi,$T2
  456. psrlq \$5,$Xi
  457. pxor $T2,$Xi #
  458. psrlq \$1,$Xi #
  459. pxor $Xhi,$Xi #
  460. ___
  461. }
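# reduction_alg9 folds the 256-bit carry-less product ($Xhi:$Xi) back to
# 128 bits modulo the GHASH polynomial x^128+x^7+x^2+x+1, expressed in
# GCM's bit-reflected representation, using only shifts and xors;
# judging by the shift counts, the first phase multiplies by
# x^57+x^62+x^63 and the second by 1+x+x^2+x^7.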
  462. { my ($Htbl,$Xip)=@_4args;
  463. my $HK="%xmm6";
  464. $code.=<<___;
  465. .globl gcm_init_clmul
  466. .type gcm_init_clmul,\@abi-omnipotent
  467. .align 16
  468. gcm_init_clmul:
  469. .cfi_startproc
  470. .L_init_clmul:
  471. ___
  472. $code.=<<___ if ($win64);
  473. .LSEH_begin_gcm_init_clmul:
  474. # I can't trust assembler to use specific encoding:-(
  475. .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
  476. .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
  477. ___
  478. $code.=<<___;
  479. movdqu ($Xip),$Hkey
  480. pshufd \$0b01001110,$Hkey,$Hkey # dword swap
  481. # <<1 twist
  482. pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
  483. movdqa $Hkey,$T1
  484. psllq \$1,$Hkey
  485. pxor $T3,$T3 #
  486. psrlq \$63,$T1
  487. pcmpgtd $T2,$T3 # broadcast carry bit
  488. pslldq \$8,$T1
  489. por $T1,$Hkey # H<<=1
  490. # magic reduction
  491. pand .L0x1c2_polynomial(%rip),$T3
  492. pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
  493. # calculate H^2
  494. pshufd \$0b01001110,$Hkey,$HK
  495. movdqa $Hkey,$Xi
  496. pxor $Hkey,$HK
  497. ___
  498. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
  499. &reduction_alg9 ($Xhi,$Xi);
  500. $code.=<<___;
  501. pshufd \$0b01001110,$Hkey,$T1
  502. pshufd \$0b01001110,$Xi,$T2
  503. pxor $Hkey,$T1 # Karatsuba pre-processing
  504. movdqu $Hkey,0x00($Htbl) # save H
  505. pxor $Xi,$T2 # Karatsuba pre-processing
  506. movdqu $Xi,0x10($Htbl) # save H^2
  507. palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
  508. movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
  509. ___
  510. if ($do4xaggr) {
  511. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
  512. &reduction_alg9 ($Xhi,$Xi);
  513. $code.=<<___;
  514. movdqa $Xi,$T3
  515. ___
  516. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
  517. &reduction_alg9 ($Xhi,$Xi);
  518. $code.=<<___;
  519. pshufd \$0b01001110,$T3,$T1
  520. pshufd \$0b01001110,$Xi,$T2
  521. pxor $T3,$T1 # Karatsuba pre-processing
  522. movdqu $T3,0x30($Htbl) # save H^3
  523. pxor $Xi,$T2 # Karatsuba pre-processing
  524. movdqu $Xi,0x40($Htbl) # save H^4
  525. palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
  526. movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
  527. ___
  528. }
  529. $code.=<<___ if ($win64);
  530. movaps (%rsp),%xmm6
  531. lea 0x18(%rsp),%rsp
  532. .LSEH_end_gcm_init_clmul:
  533. ___
  534. $code.=<<___;
  535. ret
  536. .cfi_endproc
  537. .size gcm_init_clmul,.-gcm_init_clmul
  538. ___
  539. }
  540. { my ($Xip,$Htbl)=@_4args;
  541. $code.=<<___;
  542. .globl gcm_gmult_clmul
  543. .type gcm_gmult_clmul,\@abi-omnipotent
  544. .align 16
  545. gcm_gmult_clmul:
  546. .cfi_startproc
  547. .L_gmult_clmul:
  548. movdqu ($Xip),$Xi
  549. movdqa .Lbswap_mask(%rip),$T3
  550. movdqu ($Htbl),$Hkey
  551. movdqu 0x20($Htbl),$T2
  552. pshufb $T3,$Xi
  553. ___
  554. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
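# The condition on the heredoc below is deliberately false: it calls
# &reduction_alg9 (which emits the real reduction code as a side effect)
# and then evaluates to 0, so the experimental sequence that follows is
# never assembled.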
  555. $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
# experimental alternative. the special thing about it is that there
# is no dependency between the two multiplications...
  558. mov \$`0xE1<<1`,%eax
  559. mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
  560. mov \$0x07,%r11d
  561. movq %rax,$T1
  562. movq %r10,$T2
  563. movq %r11,$T3 # borrow $T3
  564. pand $Xi,$T3
  565. pshufb $T3,$T2 # ($Xi&7)·0xE0
  566. movq %rax,$T3
  567. pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
  568. pxor $Xi,$T2
  569. pslldq \$15,$T2
  570. paddd $T2,$T2 # <<(64+56+1)
  571. pxor $T2,$Xi
  572. pclmulqdq \$0x01,$T3,$Xi
  573. movdqa .Lbswap_mask(%rip),$T3 # reload $T3
  574. psrldq \$1,$T1
  575. pxor $T1,$Xhi
  576. pslldq \$7,$Xi
  577. pxor $Xhi,$Xi
  578. ___
  579. $code.=<<___;
  580. pshufb $T3,$Xi
  581. movdqu $Xi,($Xip)
  582. ret
  583. .cfi_endproc
  584. .size gcm_gmult_clmul,.-gcm_gmult_clmul
  585. ___
  586. }
  587. { my ($Xip,$Htbl,$inp,$len)=@_4args;
  588. my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  589. my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
  590. $code.=<<___;
  591. .globl gcm_ghash_clmul
  592. .type gcm_ghash_clmul,\@abi-omnipotent
  593. .align 32
  594. gcm_ghash_clmul:
  595. .cfi_startproc
  596. .L_ghash_clmul:
  597. ___
  598. $code.=<<___ if ($win64);
  599. lea -0x88(%rsp),%rax
  600. .LSEH_begin_gcm_ghash_clmul:
  601. # I can't trust assembler to use specific encoding:-(
  602. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  603. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
  604. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
  605. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
  606. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
  607. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
  608. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
  609. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
  610. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
  611. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
  612. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
  613. ___
  614. $code.=<<___;
  615. movdqa .Lbswap_mask(%rip),$T3
  616. movdqu ($Xip),$Xi
  617. movdqu ($Htbl),$Hkey
  618. movdqu 0x20($Htbl),$HK
  619. pshufb $T3,$Xi
  620. sub \$0x10,$len
  621. jz .Lodd_tail
  622. movdqu 0x10($Htbl),$Hkey2
  623. ___
  624. if ($do4xaggr) {
  625. my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
  626. $code.=<<___;
  627. mov OPENSSL_ia32cap_P+4(%rip),%eax
  628. cmp \$0x30,$len
  629. jb .Lskip4x
  630. and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
  631. cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
  632. je .Lskip4x
  633. sub \$0x30,$len
  634. mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
  635. movdqu 0x30($Htbl),$Hkey3
  636. movdqu 0x40($Htbl),$Hkey4
  637. #######
  638. # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
  639. #
  640. movdqu 0x30($inp),$Xln
  641. movdqu 0x20($inp),$Xl
  642. pshufb $T3,$Xln
  643. pshufb $T3,$Xl
  644. movdqa $Xln,$Xhn
  645. pshufd \$0b01001110,$Xln,$Xmn
  646. pxor $Xln,$Xmn
  647. pclmulqdq \$0x00,$Hkey,$Xln
  648. pclmulqdq \$0x11,$Hkey,$Xhn
  649. pclmulqdq \$0x00,$HK,$Xmn
  650. movdqa $Xl,$Xh
  651. pshufd \$0b01001110,$Xl,$Xm
  652. pxor $Xl,$Xm
  653. pclmulqdq \$0x00,$Hkey2,$Xl
  654. pclmulqdq \$0x11,$Hkey2,$Xh
  655. pclmulqdq \$0x10,$HK,$Xm
  656. xorps $Xl,$Xln
  657. xorps $Xh,$Xhn
  658. movups 0x50($Htbl),$HK
  659. xorps $Xm,$Xmn
  660. movdqu 0x10($inp),$Xl
  661. movdqu 0($inp),$T1
  662. pshufb $T3,$Xl
  663. pshufb $T3,$T1
  664. movdqa $Xl,$Xh
  665. pshufd \$0b01001110,$Xl,$Xm
  666. pxor $T1,$Xi
  667. pxor $Xl,$Xm
  668. pclmulqdq \$0x00,$Hkey3,$Xl
  669. movdqa $Xi,$Xhi
  670. pshufd \$0b01001110,$Xi,$T1
  671. pxor $Xi,$T1
  672. pclmulqdq \$0x11,$Hkey3,$Xh
  673. pclmulqdq \$0x00,$HK,$Xm
  674. xorps $Xl,$Xln
  675. xorps $Xh,$Xhn
  676. lea 0x40($inp),$inp
  677. sub \$0x40,$len
  678. jc .Ltail4x
  679. jmp .Lmod4_loop
  680. .align 32
  681. .Lmod4_loop:
  682. pclmulqdq \$0x00,$Hkey4,$Xi
  683. xorps $Xm,$Xmn
  684. movdqu 0x30($inp),$Xl
  685. pshufb $T3,$Xl
  686. pclmulqdq \$0x11,$Hkey4,$Xhi
  687. xorps $Xln,$Xi
  688. movdqu 0x20($inp),$Xln
  689. movdqa $Xl,$Xh
  690. pclmulqdq \$0x10,$HK,$T1
  691. pshufd \$0b01001110,$Xl,$Xm
  692. xorps $Xhn,$Xhi
  693. pxor $Xl,$Xm
  694. pshufb $T3,$Xln
  695. movups 0x20($Htbl),$HK
  696. xorps $Xmn,$T1
  697. pclmulqdq \$0x00,$Hkey,$Xl
  698. pshufd \$0b01001110,$Xln,$Xmn
  699. pxor $Xi,$T1 # aggregated Karatsuba post-processing
  700. movdqa $Xln,$Xhn
  701. pxor $Xhi,$T1 #
  702. pxor $Xln,$Xmn
  703. movdqa $T1,$T2 #
  704. pclmulqdq \$0x11,$Hkey,$Xh
  705. pslldq \$8,$T1
  706. psrldq \$8,$T2 #
  707. pxor $T1,$Xi
  708. movdqa .L7_mask(%rip),$T1
  709. pxor $T2,$Xhi #
  710. movq %rax,$T2
  711. pand $Xi,$T1 # 1st phase
  712. pshufb $T1,$T2 #
  713. pxor $Xi,$T2 #
  714. pclmulqdq \$0x00,$HK,$Xm
  715. psllq \$57,$T2 #
  716. movdqa $T2,$T1 #
  717. pslldq \$8,$T2
  718. pclmulqdq \$0x00,$Hkey2,$Xln
  719. psrldq \$8,$T1 #
  720. pxor $T2,$Xi
  721. pxor $T1,$Xhi #
  722. movdqu 0($inp),$T1
  723. movdqa $Xi,$T2 # 2nd phase
  724. psrlq \$1,$Xi
  725. pclmulqdq \$0x11,$Hkey2,$Xhn
  726. xorps $Xl,$Xln
  727. movdqu 0x10($inp),$Xl
  728. pshufb $T3,$Xl
  729. pclmulqdq \$0x10,$HK,$Xmn
  730. xorps $Xh,$Xhn
  731. movups 0x50($Htbl),$HK
  732. pshufb $T3,$T1
  733. pxor $T2,$Xhi #
  734. pxor $Xi,$T2
  735. psrlq \$5,$Xi
  736. movdqa $Xl,$Xh
  737. pxor $Xm,$Xmn
  738. pshufd \$0b01001110,$Xl,$Xm
  739. pxor $T2,$Xi #
  740. pxor $T1,$Xhi
  741. pxor $Xl,$Xm
  742. pclmulqdq \$0x00,$Hkey3,$Xl
  743. psrlq \$1,$Xi #
  744. pxor $Xhi,$Xi #
  745. movdqa $Xi,$Xhi
  746. pclmulqdq \$0x11,$Hkey3,$Xh
  747. xorps $Xl,$Xln
  748. pshufd \$0b01001110,$Xi,$T1
  749. pxor $Xi,$T1
  750. pclmulqdq \$0x00,$HK,$Xm
  751. xorps $Xh,$Xhn
  752. lea 0x40($inp),$inp
  753. sub \$0x40,$len
  754. jnc .Lmod4_loop
  755. .Ltail4x:
  756. pclmulqdq \$0x00,$Hkey4,$Xi
  757. pclmulqdq \$0x11,$Hkey4,$Xhi
  758. pclmulqdq \$0x10,$HK,$T1
  759. xorps $Xm,$Xmn
  760. xorps $Xln,$Xi
  761. xorps $Xhn,$Xhi
  762. pxor $Xi,$Xhi # aggregated Karatsuba post-processing
  763. pxor $Xmn,$T1
  764. pxor $Xhi,$T1 #
  765. pxor $Xi,$Xhi
  766. movdqa $T1,$T2 #
  767. psrldq \$8,$T1
  768. pslldq \$8,$T2 #
  769. pxor $T1,$Xhi
  770. pxor $T2,$Xi #
  771. ___
  772. &reduction_alg9($Xhi,$Xi);
  773. $code.=<<___;
  774. add \$0x40,$len
  775. jz .Ldone
  776. movdqu 0x20($Htbl),$HK
  777. sub \$0x10,$len
  778. jz .Lodd_tail
  779. .Lskip4x:
  780. ___
  781. }
  782. $code.=<<___;
  783. #######
  784. # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
  785. # [(H*Ii+1) + (H*Xi+1)] mod P =
  786. # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
  787. #
  788. movdqu ($inp),$T1 # Ii
  789. movdqu 16($inp),$Xln # Ii+1
  790. pshufb $T3,$T1
  791. pshufb $T3,$Xln
  792. pxor $T1,$Xi # Ii+Xi
  793. movdqa $Xln,$Xhn
  794. pshufd \$0b01001110,$Xln,$Xmn
  795. pxor $Xln,$Xmn
  796. pclmulqdq \$0x00,$Hkey,$Xln
  797. pclmulqdq \$0x11,$Hkey,$Xhn
  798. pclmulqdq \$0x00,$HK,$Xmn
  799. lea 32($inp),$inp # i+=2
  800. nop
  801. sub \$0x20,$len
  802. jbe .Leven_tail
  803. nop
  804. jmp .Lmod_loop
  805. .align 32
  806. .Lmod_loop:
  807. movdqa $Xi,$Xhi
  808. movdqa $Xmn,$T1
  809. pshufd \$0b01001110,$Xi,$Xmn #
  810. pxor $Xi,$Xmn #
  811. pclmulqdq \$0x00,$Hkey2,$Xi
  812. pclmulqdq \$0x11,$Hkey2,$Xhi
  813. pclmulqdq \$0x10,$HK,$Xmn
  814. pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
  815. pxor $Xhn,$Xhi
  816. movdqu ($inp),$T2 # Ii
  817. pxor $Xi,$T1 # aggregated Karatsuba post-processing
  818. pshufb $T3,$T2
  819. movdqu 16($inp),$Xln # Ii+1
  820. pxor $Xhi,$T1
  821. pxor $T2,$Xhi # "Ii+Xi", consume early
  822. pxor $T1,$Xmn
  823. pshufb $T3,$Xln
  824. movdqa $Xmn,$T1 #
  825. psrldq \$8,$T1
  826. pslldq \$8,$Xmn #
  827. pxor $T1,$Xhi
  828. pxor $Xmn,$Xi #
  829. movdqa $Xln,$Xhn #
  830. movdqa $Xi,$T2 # 1st phase
  831. movdqa $Xi,$T1
  832. psllq \$5,$Xi
  833. pxor $Xi,$T1 #
  834. pclmulqdq \$0x00,$Hkey,$Xln #######
  835. psllq \$1,$Xi
  836. pxor $T1,$Xi #
  837. psllq \$57,$Xi #
  838. movdqa $Xi,$T1 #
  839. pslldq \$8,$Xi
  840. psrldq \$8,$T1 #
  841. pxor $T2,$Xi
  842. pshufd \$0b01001110,$Xhn,$Xmn
  843. pxor $T1,$Xhi #
  844. pxor $Xhn,$Xmn #
  845. movdqa $Xi,$T2 # 2nd phase
  846. psrlq \$1,$Xi
  847. pclmulqdq \$0x11,$Hkey,$Xhn #######
  848. pxor $T2,$Xhi #
  849. pxor $Xi,$T2
  850. psrlq \$5,$Xi
  851. pxor $T2,$Xi #
  852. lea 32($inp),$inp
  853. psrlq \$1,$Xi #
  854. pclmulqdq \$0x00,$HK,$Xmn #######
  855. pxor $Xhi,$Xi #
  856. sub \$0x20,$len
  857. ja .Lmod_loop
  858. .Leven_tail:
  859. movdqa $Xi,$Xhi
  860. movdqa $Xmn,$T1
  861. pshufd \$0b01001110,$Xi,$Xmn #
  862. pxor $Xi,$Xmn #
  863. pclmulqdq \$0x00,$Hkey2,$Xi
  864. pclmulqdq \$0x11,$Hkey2,$Xhi
  865. pclmulqdq \$0x10,$HK,$Xmn
  866. pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
  867. pxor $Xhn,$Xhi
  868. pxor $Xi,$T1
  869. pxor $Xhi,$T1
  870. pxor $T1,$Xmn
  871. movdqa $Xmn,$T1 #
  872. psrldq \$8,$T1
  873. pslldq \$8,$Xmn #
  874. pxor $T1,$Xhi
  875. pxor $Xmn,$Xi #
  876. ___
  877. &reduction_alg9 ($Xhi,$Xi);
  878. $code.=<<___;
  879. test $len,$len
  880. jnz .Ldone
  881. .Lodd_tail:
  882. movdqu ($inp),$T1 # Ii
  883. pshufb $T3,$T1
  884. pxor $T1,$Xi # Ii+Xi
  885. ___
  886. &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
  887. &reduction_alg9 ($Xhi,$Xi);
  888. $code.=<<___;
  889. .Ldone:
  890. pshufb $T3,$Xi
  891. movdqu $Xi,($Xip)
  892. ___
  893. $code.=<<___ if ($win64);
  894. movaps (%rsp),%xmm6
  895. movaps 0x10(%rsp),%xmm7
  896. movaps 0x20(%rsp),%xmm8
  897. movaps 0x30(%rsp),%xmm9
  898. movaps 0x40(%rsp),%xmm10
  899. movaps 0x50(%rsp),%xmm11
  900. movaps 0x60(%rsp),%xmm12
  901. movaps 0x70(%rsp),%xmm13
  902. movaps 0x80(%rsp),%xmm14
  903. movaps 0x90(%rsp),%xmm15
  904. lea 0xa8(%rsp),%rsp
  905. .LSEH_end_gcm_ghash_clmul:
  906. ___
  907. $code.=<<___;
  908. ret
  909. .cfi_endproc
  910. .size gcm_ghash_clmul,.-gcm_ghash_clmul
  911. ___
  912. }
  913. $code.=<<___;
  914. .globl gcm_init_avx
  915. .type gcm_init_avx,\@abi-omnipotent
  916. .align 32
  917. gcm_init_avx:
  918. .cfi_startproc
  919. ___
  920. if ($avx) {
  921. my ($Htbl,$Xip)=@_4args;
  922. my $HK="%xmm6";
  923. $code.=<<___ if ($win64);
  924. .LSEH_begin_gcm_init_avx:
  925. # I can't trust assembler to use specific encoding:-(
  926. .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
  927. .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
  928. ___
  929. $code.=<<___;
  930. vzeroupper
  931. vmovdqu ($Xip),$Hkey
  932. vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
  933. # <<1 twist
  934. vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
  935. vpsrlq \$63,$Hkey,$T1
  936. vpsllq \$1,$Hkey,$Hkey
  937. vpxor $T3,$T3,$T3 #
  938. vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
  939. vpslldq \$8,$T1,$T1
  940. vpor $T1,$Hkey,$Hkey # H<<=1
  941. # magic reduction
  942. vpand .L0x1c2_polynomial(%rip),$T3,$T3
  943. vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
  944. vpunpckhqdq $Hkey,$Hkey,$HK
  945. vmovdqa $Hkey,$Xi
  946. vpxor $Hkey,$HK,$HK
  947. mov \$4,%r10 # up to H^8
  948. jmp .Linit_start_avx
  949. ___
  950. sub clmul64x64_avx {
  951. my ($Xhi,$Xi,$Hkey,$HK)=@_;
  952. if (!defined($HK)) { $HK = $T2;
  953. $code.=<<___;
  954. vpunpckhqdq $Xi,$Xi,$T1
  955. vpunpckhqdq $Hkey,$Hkey,$T2
  956. vpxor $Xi,$T1,$T1 #
  957. vpxor $Hkey,$T2,$T2
  958. ___
  959. } else {
  960. $code.=<<___;
  961. vpunpckhqdq $Xi,$Xi,$T1
  962. vpxor $Xi,$T1,$T1 #
  963. ___
  964. }
  965. $code.=<<___;
  966. vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
  967. vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
  968. vpclmulqdq \$0x00,$HK,$T1,$T1 #######
  969. vpxor $Xi,$Xhi,$T2 #
  970. vpxor $T2,$T1,$T1 #
  971. vpslldq \$8,$T1,$T2 #
  972. vpsrldq \$8,$T1,$T1
  973. vpxor $T2,$Xi,$Xi #
  974. vpxor $T1,$Xhi,$Xhi
  975. ___
  976. }
  977. sub reduction_avx {
  978. my ($Xhi,$Xi) = @_;
  979. $code.=<<___;
  980. vpsllq \$57,$Xi,$T1 # 1st phase
  981. vpsllq \$62,$Xi,$T2
  982. vpxor $T1,$T2,$T2 #
  983. vpsllq \$63,$Xi,$T1
  984. vpxor $T1,$T2,$T2 #
  985. vpslldq \$8,$T2,$T1 #
  986. vpsrldq \$8,$T2,$T2
  987. vpxor $T1,$Xi,$Xi #
  988. vpxor $T2,$Xhi,$Xhi
  989. vpsrlq \$1,$Xi,$T2 # 2nd phase
  990. vpxor $Xi,$Xhi,$Xhi
  991. vpxor $T2,$Xi,$Xi #
  992. vpsrlq \$5,$T2,$T2
  993. vpxor $T2,$Xi,$Xi #
  994. vpsrlq \$1,$Xi,$Xi #
  995. vpxor $Xhi,$Xi,$Xi #
  996. ___
  997. }
  998. $code.=<<___;
  999. .align 32
  1000. .Linit_loop_avx:
  1001. vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
  1002. vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
  1003. ___
  1004. &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
  1005. &reduction_avx ($Xhi,$Xi);
  1006. $code.=<<___;
  1007. .Linit_start_avx:
  1008. vmovdqa $Xi,$T3
  1009. ___
  1010. &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
  1011. &reduction_avx ($Xhi,$Xi);
  1012. $code.=<<___;
  1013. vpshufd \$0b01001110,$T3,$T1
  1014. vpshufd \$0b01001110,$Xi,$T2
  1015. vpxor $T3,$T1,$T1 # Karatsuba pre-processing
  1016. vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
  1017. vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
  1018. vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
  1019. lea 0x30($Htbl),$Htbl
  1020. sub \$1,%r10
  1021. jnz .Linit_loop_avx
  1022. vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
  1023. vmovdqu $T3,-0x10($Htbl)
  1024. vzeroupper
  1025. ___
  1026. $code.=<<___ if ($win64);
  1027. movaps (%rsp),%xmm6
  1028. lea 0x18(%rsp),%rsp
  1029. .LSEH_end_gcm_init_avx:
  1030. ___
  1031. $code.=<<___;
  1032. ret
  1033. .cfi_endproc
  1034. .size gcm_init_avx,.-gcm_init_avx
  1035. ___
  1036. } else {
  1037. $code.=<<___;
  1038. jmp .L_init_clmul
  1039. .cfi_endproc
  1040. .size gcm_init_avx,.-gcm_init_avx
  1041. ___
  1042. }
  1043. $code.=<<___;
  1044. .globl gcm_gmult_avx
  1045. .type gcm_gmult_avx,\@abi-omnipotent
  1046. .align 32
  1047. gcm_gmult_avx:
  1048. .cfi_startproc
  1049. jmp .L_gmult_clmul
  1050. .cfi_endproc
  1051. .size gcm_gmult_avx,.-gcm_gmult_avx
  1052. ___
  1053. $code.=<<___;
  1054. .globl gcm_ghash_avx
  1055. .type gcm_ghash_avx,\@abi-omnipotent
  1056. .align 32
  1057. gcm_ghash_avx:
  1058. .cfi_startproc
  1059. ___
  1060. if ($avx) {
  1061. my ($Xip,$Htbl,$inp,$len)=@_4args;
  1062. my ($Xlo,$Xhi,$Xmi,
  1063. $Zlo,$Zhi,$Zmi,
  1064. $Hkey,$HK,$T1,$T2,
  1065. $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
  1066. $code.=<<___ if ($win64);
  1067. lea -0x88(%rsp),%rax
  1068. .LSEH_begin_gcm_ghash_avx:
  1069. # I can't trust assembler to use specific encoding:-(
  1070. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  1071. .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
  1072. .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
  1073. .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
  1074. .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
  1075. .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
  1076. .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
  1077. .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
  1078. .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
  1079. .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
  1080. .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
  1081. ___
  1082. $code.=<<___;
  1083. vzeroupper
  1084. vmovdqu ($Xip),$Xi # load $Xi
  1085. lea .L0x1c2_polynomial(%rip),%r10
  1086. lea 0x40($Htbl),$Htbl # size optimization
  1087. vmovdqu .Lbswap_mask(%rip),$bswap
  1088. vpshufb $bswap,$Xi,$Xi
  1089. cmp \$0x80,$len
  1090. jb .Lshort_avx
  1091. sub \$0x80,$len
  1092. vmovdqu 0x70($inp),$Ii # I[7]
  1093. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1094. vpshufb $bswap,$Ii,$Ii
  1095. vmovdqu 0x20-0x40($Htbl),$HK
  1096. vpunpckhqdq $Ii,$Ii,$T2
  1097. vmovdqu 0x60($inp),$Ij # I[6]
  1098. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1099. vpxor $Ii,$T2,$T2
  1100. vpshufb $bswap,$Ij,$Ij
  1101. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1102. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1103. vpunpckhqdq $Ij,$Ij,$T1
  1104. vmovdqu 0x50($inp),$Ii # I[5]
  1105. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1106. vpxor $Ij,$T1,$T1
  1107. vpshufb $bswap,$Ii,$Ii
  1108. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1109. vpunpckhqdq $Ii,$Ii,$T2
  1110. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1111. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1112. vpxor $Ii,$T2,$T2
  1113. vmovdqu 0x40($inp),$Ij # I[4]
  1114. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1115. vmovdqu 0x50-0x40($Htbl),$HK
  1116. vpshufb $bswap,$Ij,$Ij
  1117. vpxor $Xlo,$Zlo,$Zlo
  1118. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1119. vpxor $Xhi,$Zhi,$Zhi
  1120. vpunpckhqdq $Ij,$Ij,$T1
  1121. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1122. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1123. vpxor $Xmi,$Zmi,$Zmi
  1124. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1125. vpxor $Ij,$T1,$T1
  1126. vmovdqu 0x30($inp),$Ii # I[3]
  1127. vpxor $Zlo,$Xlo,$Xlo
  1128. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1129. vpxor $Zhi,$Xhi,$Xhi
  1130. vpshufb $bswap,$Ii,$Ii
  1131. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1132. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1133. vpxor $Zmi,$Xmi,$Xmi
  1134. vpunpckhqdq $Ii,$Ii,$T2
  1135. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1136. vmovdqu 0x80-0x40($Htbl),$HK
  1137. vpxor $Ii,$T2,$T2
  1138. vmovdqu 0x20($inp),$Ij # I[2]
  1139. vpxor $Xlo,$Zlo,$Zlo
  1140. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1141. vpxor $Xhi,$Zhi,$Zhi
  1142. vpshufb $bswap,$Ij,$Ij
  1143. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1144. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1145. vpxor $Xmi,$Zmi,$Zmi
  1146. vpunpckhqdq $Ij,$Ij,$T1
  1147. vpclmulqdq \$0x00,$HK,$T2,$Xmi
  1148. vpxor $Ij,$T1,$T1
  1149. vmovdqu 0x10($inp),$Ii # I[1]
  1150. vpxor $Zlo,$Xlo,$Xlo
  1151. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1152. vpxor $Zhi,$Xhi,$Xhi
  1153. vpshufb $bswap,$Ii,$Ii
  1154. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1155. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1156. vpxor $Zmi,$Xmi,$Xmi
  1157. vpunpckhqdq $Ii,$Ii,$T2
  1158. vpclmulqdq \$0x10,$HK,$T1,$Zmi
  1159. vmovdqu 0xb0-0x40($Htbl),$HK
  1160. vpxor $Ii,$T2,$T2
  1161. vmovdqu ($inp),$Ij # I[0]
  1162. vpxor $Xlo,$Zlo,$Zlo
  1163. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1164. vpxor $Xhi,$Zhi,$Zhi
  1165. vpshufb $bswap,$Ij,$Ij
  1166. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1167. vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
  1168. vpxor $Xmi,$Zmi,$Zmi
  1169. vpclmulqdq \$0x10,$HK,$T2,$Xmi
  1170. lea 0x80($inp),$inp
  1171. cmp \$0x80,$len
  1172. jb .Ltail_avx
  1173. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1174. sub \$0x80,$len
  1175. jmp .Loop8x_avx
  1176. .align 32
  1177. .Loop8x_avx:
  1178. vpunpckhqdq $Ij,$Ij,$T1
  1179. vmovdqu 0x70($inp),$Ii # I[7]
  1180. vpxor $Xlo,$Zlo,$Zlo
  1181. vpxor $Ij,$T1,$T1
  1182. vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
  1183. vpshufb $bswap,$Ii,$Ii
  1184. vpxor $Xhi,$Zhi,$Zhi
  1185. vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
  1186. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1187. vpunpckhqdq $Ii,$Ii,$T2
  1188. vpxor $Xmi,$Zmi,$Zmi
  1189. vpclmulqdq \$0x00,$HK,$T1,$Tred
  1190. vmovdqu 0x20-0x40($Htbl),$HK
  1191. vpxor $Ii,$T2,$T2
  1192. vmovdqu 0x60($inp),$Ij # I[6]
  1193. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1194. vpxor $Zlo,$Xi,$Xi # collect result
  1195. vpshufb $bswap,$Ij,$Ij
  1196. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1197. vxorps $Zhi,$Xo,$Xo
  1198. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1199. vpunpckhqdq $Ij,$Ij,$T1
  1200. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1201. vpxor $Zmi,$Tred,$Tred
  1202. vxorps $Ij,$T1,$T1
  1203. vmovdqu 0x50($inp),$Ii # I[5]
  1204. vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
  1205. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1206. vpxor $Xo,$Tred,$Tred
  1207. vpslldq \$8,$Tred,$T2
  1208. vpxor $Xlo,$Zlo,$Zlo
  1209. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1210. vpsrldq \$8,$Tred,$Tred
  1211. vpxor $T2, $Xi, $Xi
  1212. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1213. vpshufb $bswap,$Ii,$Ii
  1214. vxorps $Tred,$Xo, $Xo
  1215. vpxor $Xhi,$Zhi,$Zhi
  1216. vpunpckhqdq $Ii,$Ii,$T2
  1217. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1218. vmovdqu 0x50-0x40($Htbl),$HK
  1219. vpxor $Ii,$T2,$T2
  1220. vpxor $Xmi,$Zmi,$Zmi
  1221. vmovdqu 0x40($inp),$Ij # I[4]
  1222. vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
  1223. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1224. vpshufb $bswap,$Ij,$Ij
  1225. vpxor $Zlo,$Xlo,$Xlo
  1226. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1227. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1228. vpunpckhqdq $Ij,$Ij,$T1
  1229. vpxor $Zhi,$Xhi,$Xhi
  1230. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1231. vxorps $Ij,$T1,$T1
  1232. vpxor $Zmi,$Xmi,$Xmi
  1233. vmovdqu 0x30($inp),$Ii # I[3]
  1234. vpclmulqdq \$0x10,(%r10),$Xi,$Xi
  1235. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1236. vpshufb $bswap,$Ii,$Ii
  1237. vpxor $Xlo,$Zlo,$Zlo
  1238. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1239. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1240. vpunpckhqdq $Ii,$Ii,$T2
  1241. vpxor $Xhi,$Zhi,$Zhi
  1242. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1243. vmovdqu 0x80-0x40($Htbl),$HK
  1244. vpxor $Ii,$T2,$T2
  1245. vpxor $Xmi,$Zmi,$Zmi
  1246. vmovdqu 0x20($inp),$Ij # I[2]
  1247. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1248. vpshufb $bswap,$Ij,$Ij
  1249. vpxor $Zlo,$Xlo,$Xlo
  1250. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1251. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1252. vpunpckhqdq $Ij,$Ij,$T1
  1253. vpxor $Zhi,$Xhi,$Xhi
  1254. vpclmulqdq \$0x00,$HK, $T2,$Xmi
  1255. vpxor $Ij,$T1,$T1
  1256. vpxor $Zmi,$Xmi,$Xmi
  1257. vxorps $Tred,$Xi,$Xi
  1258. vmovdqu 0x10($inp),$Ii # I[1]
  1259. vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
  1260. vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
  1261. vpshufb $bswap,$Ii,$Ii
  1262. vpxor $Xlo,$Zlo,$Zlo
  1263. vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
  1264. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1265. vpclmulqdq \$0x10,(%r10),$Xi,$Xi
  1266. vxorps $Xo,$Tred,$Tred
  1267. vpunpckhqdq $Ii,$Ii,$T2
  1268. vpxor $Xhi,$Zhi,$Zhi
  1269. vpclmulqdq \$0x10,$HK, $T1,$Zmi
  1270. vmovdqu 0xb0-0x40($Htbl),$HK
  1271. vpxor $Ii,$T2,$T2
  1272. vpxor $Xmi,$Zmi,$Zmi
  1273. vmovdqu ($inp),$Ij # I[0]
  1274. vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
  1275. vpshufb $bswap,$Ij,$Ij
  1276. vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
  1277. vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
  1278. vpxor $Tred,$Ij,$Ij
  1279. vpclmulqdq \$0x10,$HK, $T2,$Xmi
  1280. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1281. lea 0x80($inp),$inp
  1282. sub \$0x80,$len
  1283. jnc .Loop8x_avx
  1284. add \$0x80,$len
  1285. jmp .Ltail_no_xor_avx
  1286. .align 32
  1287. .Lshort_avx:
  1288. vmovdqu -0x10($inp,$len),$Ii # very last word
  1289. lea ($inp,$len),$inp
  1290. vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
  1291. vmovdqu 0x20-0x40($Htbl),$HK
  1292. vpshufb $bswap,$Ii,$Ij
  1293. vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
  1294. vmovdqa $Xhi,$Zhi # $Zhi and
  1295. vmovdqa $Xmi,$Zmi # $Zmi
  1296. sub \$0x10,$len
  1297. jz .Ltail_avx
  1298. vpunpckhqdq $Ij,$Ij,$T1
  1299. vpxor $Xlo,$Zlo,$Zlo
  1300. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1301. vpxor $Ij,$T1,$T1
  1302. vmovdqu -0x20($inp),$Ii
  1303. vpxor $Xhi,$Zhi,$Zhi
  1304. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1305. vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
  1306. vpshufb $bswap,$Ii,$Ij
  1307. vpxor $Xmi,$Zmi,$Zmi
  1308. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1309. vpsrldq \$8,$HK,$HK
  1310. sub \$0x10,$len
  1311. jz .Ltail_avx
  1312. vpunpckhqdq $Ij,$Ij,$T1
  1313. vpxor $Xlo,$Zlo,$Zlo
  1314. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1315. vpxor $Ij,$T1,$T1
  1316. vmovdqu -0x30($inp),$Ii
  1317. vpxor $Xhi,$Zhi,$Zhi
  1318. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1319. vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
  1320. vpshufb $bswap,$Ii,$Ij
  1321. vpxor $Xmi,$Zmi,$Zmi
  1322. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1323. vmovdqu 0x50-0x40($Htbl),$HK
  1324. sub \$0x10,$len
  1325. jz .Ltail_avx
  1326. vpunpckhqdq $Ij,$Ij,$T1
  1327. vpxor $Xlo,$Zlo,$Zlo
  1328. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1329. vpxor $Ij,$T1,$T1
  1330. vmovdqu -0x40($inp),$Ii
  1331. vpxor $Xhi,$Zhi,$Zhi
  1332. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1333. vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
  1334. vpshufb $bswap,$Ii,$Ij
  1335. vpxor $Xmi,$Zmi,$Zmi
  1336. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1337. vpsrldq \$8,$HK,$HK
  1338. sub \$0x10,$len
  1339. jz .Ltail_avx
  1340. vpunpckhqdq $Ij,$Ij,$T1
  1341. vpxor $Xlo,$Zlo,$Zlo
  1342. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1343. vpxor $Ij,$T1,$T1
  1344. vmovdqu -0x50($inp),$Ii
  1345. vpxor $Xhi,$Zhi,$Zhi
  1346. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1347. vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
  1348. vpshufb $bswap,$Ii,$Ij
  1349. vpxor $Xmi,$Zmi,$Zmi
  1350. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1351. vmovdqu 0x80-0x40($Htbl),$HK
  1352. sub \$0x10,$len
  1353. jz .Ltail_avx
  1354. vpunpckhqdq $Ij,$Ij,$T1
  1355. vpxor $Xlo,$Zlo,$Zlo
  1356. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1357. vpxor $Ij,$T1,$T1
  1358. vmovdqu -0x60($inp),$Ii
  1359. vpxor $Xhi,$Zhi,$Zhi
  1360. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1361. vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
  1362. vpshufb $bswap,$Ii,$Ij
  1363. vpxor $Xmi,$Zmi,$Zmi
  1364. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1365. vpsrldq \$8,$HK,$HK
  1366. sub \$0x10,$len
  1367. jz .Ltail_avx
  1368. vpunpckhqdq $Ij,$Ij,$T1
  1369. vpxor $Xlo,$Zlo,$Zlo
  1370. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1371. vpxor $Ij,$T1,$T1
  1372. vmovdqu -0x70($inp),$Ii
  1373. vpxor $Xhi,$Zhi,$Zhi
  1374. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1375. vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
  1376. vpshufb $bswap,$Ii,$Ij
  1377. vpxor $Xmi,$Zmi,$Zmi
  1378. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1379. vmovq 0xb8-0x40($Htbl),$HK
  1380. sub \$0x10,$len
  1381. jmp .Ltail_avx
  1382. .align 32
  1383. .Ltail_avx:
  1384. vpxor $Xi,$Ij,$Ij # accumulate $Xi
  1385. .Ltail_no_xor_avx:
  1386. vpunpckhqdq $Ij,$Ij,$T1
  1387. vpxor $Xlo,$Zlo,$Zlo
  1388. vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
  1389. vpxor $Ij,$T1,$T1
  1390. vpxor $Xhi,$Zhi,$Zhi
  1391. vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
  1392. vpxor $Xmi,$Zmi,$Zmi
  1393. vpclmulqdq \$0x00,$HK,$T1,$Xmi
  1394. vmovdqu (%r10),$Tred
  1395. vpxor $Xlo,$Zlo,$Xi
  1396. vpxor $Xhi,$Zhi,$Xo
  1397. vpxor $Xmi,$Zmi,$Zmi
  1398. vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
  1399. vpxor $Xo, $Zmi,$Zmi
  1400. vpslldq \$8, $Zmi,$T2
  1401. vpsrldq \$8, $Zmi,$Zmi
  1402. vpxor $T2, $Xi, $Xi
  1403. vpxor $Zmi,$Xo, $Xo
  1404. vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
  1405. vpalignr \$8,$Xi,$Xi,$Xi
  1406. vpxor $T2,$Xi,$Xi
  1407. vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
  1408. vpalignr \$8,$Xi,$Xi,$Xi
  1409. vpxor $Xo,$Xi,$Xi
  1410. vpxor $T2,$Xi,$Xi
  1411. cmp \$0,$len
  1412. jne .Lshort_avx
  1413. vpshufb $bswap,$Xi,$Xi
  1414. vmovdqu $Xi,($Xip)
  1415. vzeroupper
  1416. ___
  1417. $code.=<<___ if ($win64);
  1418. movaps (%rsp),%xmm6
  1419. movaps 0x10(%rsp),%xmm7
  1420. movaps 0x20(%rsp),%xmm8
  1421. movaps 0x30(%rsp),%xmm9
  1422. movaps 0x40(%rsp),%xmm10
  1423. movaps 0x50(%rsp),%xmm11
  1424. movaps 0x60(%rsp),%xmm12
  1425. movaps 0x70(%rsp),%xmm13
  1426. movaps 0x80(%rsp),%xmm14
  1427. movaps 0x90(%rsp),%xmm15
  1428. lea 0xa8(%rsp),%rsp
  1429. .LSEH_end_gcm_ghash_avx:
  1430. ___
  1431. $code.=<<___;
  1432. ret
  1433. .cfi_endproc
  1434. .size gcm_ghash_avx,.-gcm_ghash_avx
  1435. ___
  1436. } else {
  1437. $code.=<<___;
  1438. jmp .L_ghash_clmul
  1439. .cfi_endproc
  1440. .size gcm_ghash_avx,.-gcm_ghash_avx
  1441. ___
  1442. }
  1443. $code.=<<___;
  1444. .align 64
  1445. .Lbswap_mask:
  1446. .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
  1447. .L0x1c2_polynomial:
  1448. .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
  1449. .L7_mask:
  1450. .long 7,0,7,0
  1451. .L7_mask_poly:
  1452. .long 7,0,`0xE1<<1`,0
  1453. .align 64
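# .Lrem_4bit[i] is the carry-less product i*0x1C2 shifted up to the top
# of its 64-bit entry (i*0x1C2 << 52); .Lrem_8bit[i] is the plain 16-bit
# carry-less product i*0x1C2. They supply the reduction terms for the 4
# (resp. 8) bits shifted out per step in the table-driven code above.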
  1454. .type .Lrem_4bit,\@object
  1455. .Lrem_4bit:
  1456. .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
  1457. .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
  1458. .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
  1459. .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
  1460. .type .Lrem_8bit,\@object
  1461. .Lrem_8bit:
  1462. .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
  1463. .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
  1464. .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
  1465. .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
  1466. .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
  1467. .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
  1468. .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
  1469. .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
  1470. .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
  1471. .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
  1472. .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
  1473. .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
  1474. .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
  1475. .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
  1476. .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
  1477. .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
  1478. .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
  1479. .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
  1480. .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
  1481. .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
  1482. .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
  1483. .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
  1484. .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
  1485. .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
  1486. .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
  1487. .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
  1488. .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
  1489. .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
  1490. .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
  1491. .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
  1492. .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
  1493. .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
  1494. .asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  1495. .align 64
  1496. ___
  1497. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1498. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  1499. if ($win64) {
  1500. $rec="%rcx";
  1501. $frame="%rdx";
  1502. $context="%r8";
  1503. $disp="%r9";
  1504. $code.=<<___;
  1505. .extern __imp_RtlVirtualUnwind
  1506. .type se_handler,\@abi-omnipotent
  1507. .align 16
  1508. se_handler:
  1509. push %rsi
  1510. push %rdi
  1511. push %rbx
  1512. push %rbp
  1513. push %r12
  1514. push %r13
  1515. push %r14
  1516. push %r15
  1517. pushfq
  1518. sub \$64,%rsp
  1519. mov 120($context),%rax # pull context->Rax
  1520. mov 248($context),%rbx # pull context->Rip
  1521. mov 8($disp),%rsi # disp->ImageBase
  1522. mov 56($disp),%r11 # disp->HandlerData
  1523. mov 0(%r11),%r10d # HandlerData[0]
  1524. lea (%rsi,%r10),%r10 # prologue label
  1525. cmp %r10,%rbx # context->Rip<prologue label
  1526. jb .Lin_prologue
  1527. mov 152($context),%rax # pull context->Rsp
  1528. mov 4(%r11),%r10d # HandlerData[1]
  1529. lea (%rsi,%r10),%r10 # epilogue label
  1530. cmp %r10,%rbx # context->Rip>=epilogue label
  1531. jae .Lin_prologue
  1532. lea 48+280(%rax),%rax # adjust "rsp"
  1533. mov -8(%rax),%rbx
  1534. mov -16(%rax),%rbp
  1535. mov -24(%rax),%r12
  1536. mov -32(%rax),%r13
  1537. mov -40(%rax),%r14
  1538. mov -48(%rax),%r15
  1539. mov %rbx,144($context) # restore context->Rbx
  1540. mov %rbp,160($context) # restore context->Rbp
  1541. mov %r12,216($context) # restore context->R12
  1542. mov %r13,224($context) # restore context->R13
  1543. mov %r14,232($context) # restore context->R14
  1544. mov %r15,240($context) # restore context->R15
  1545. .Lin_prologue:
  1546. mov 8(%rax),%rdi
  1547. mov 16(%rax),%rsi
  1548. mov %rax,152($context) # restore context->Rsp
  1549. mov %rsi,168($context) # restore context->Rsi
  1550. mov %rdi,176($context) # restore context->Rdi
  1551. mov 40($disp),%rdi # disp->ContextRecord
  1552. mov $context,%rsi # context
  1553. mov \$`1232/8`,%ecx # sizeof(CONTEXT)
  1554. .long 0xa548f3fc # cld; rep movsq
  1555. mov $disp,%rsi
  1556. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1557. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1558. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1559. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1560. mov 40(%rsi),%r10 # disp->ContextRecord
  1561. lea 56(%rsi),%r11 # &disp->HandlerData
  1562. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1563. mov %r10,32(%rsp) # arg5
  1564. mov %r11,40(%rsp) # arg6
  1565. mov %r12,48(%rsp) # arg7
  1566. mov %rcx,56(%rsp) # arg8, (NULL)
  1567. call *__imp_RtlVirtualUnwind(%rip)
  1568. mov \$1,%eax # ExceptionContinueSearch
  1569. add \$64,%rsp
  1570. popfq
  1571. pop %r15
  1572. pop %r14
  1573. pop %r13
  1574. pop %r12
  1575. pop %rbp
  1576. pop %rbx
  1577. pop %rdi
  1578. pop %rsi
  1579. ret
  1580. .size se_handler,.-se_handler
  1581. .section .pdata
  1582. .align 4
  1583. .rva .LSEH_begin_gcm_gmult_4bit
  1584. .rva .LSEH_end_gcm_gmult_4bit
  1585. .rva .LSEH_info_gcm_gmult_4bit
  1586. .rva .LSEH_begin_gcm_ghash_4bit
  1587. .rva .LSEH_end_gcm_ghash_4bit
  1588. .rva .LSEH_info_gcm_ghash_4bit
  1589. .rva .LSEH_begin_gcm_init_clmul
  1590. .rva .LSEH_end_gcm_init_clmul
  1591. .rva .LSEH_info_gcm_init_clmul
  1592. .rva .LSEH_begin_gcm_ghash_clmul
  1593. .rva .LSEH_end_gcm_ghash_clmul
  1594. .rva .LSEH_info_gcm_ghash_clmul
  1595. ___
  1596. $code.=<<___ if ($avx);
  1597. .rva .LSEH_begin_gcm_init_avx
  1598. .rva .LSEH_end_gcm_init_avx
  1599. .rva .LSEH_info_gcm_init_clmul
  1600. .rva .LSEH_begin_gcm_ghash_avx
  1601. .rva .LSEH_end_gcm_ghash_avx
  1602. .rva .LSEH_info_gcm_ghash_clmul
  1603. ___
  1604. $code.=<<___;
  1605. .section .xdata
  1606. .align 8
  1607. .LSEH_info_gcm_gmult_4bit:
  1608. .byte 9,0,0,0
  1609. .rva se_handler
  1610. .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
  1611. .LSEH_info_gcm_ghash_4bit:
  1612. .byte 9,0,0,0
  1613. .rva se_handler
  1614. .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
  1615. .LSEH_info_gcm_init_clmul:
  1616. .byte 0x01,0x08,0x03,0x00
  1617. .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
  1618. .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
  1619. .LSEH_info_gcm_ghash_clmul:
  1620. .byte 0x01,0x33,0x16,0x00
  1621. .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
  1622. .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
  1623. .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
  1624. .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
  1625. .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
  1626. .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
  1627. .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
  1628. .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
  1629. .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
  1630. .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
  1631. .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
  1632. ___
  1633. }
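# expand the `...` constructs used throughout the code (e.g. `0xE1<<1`
# or `&LB("$nlo")`) by eval'ing their contents, then emit everything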
  1634. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  1635. print $code;
  1636. close STDOUT or die "error closing STDOUT: $!";