  1. #! /usr/bin/env perl
  2. # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # This module implements Poly1305 hash for x86_64.
  17. #
  18. # March 2015
  19. #
  20. # Initial release.
  21. #
  22. # December 2016
  23. #
  24. # Add AVX512F+VL+BW code path.
  25. #
  26. # November 2017
  27. #
  28. # Convert the AVX512F+VL+BW code path to pure AVX512F, so that it can be
  29. # executed even on Knights Landing. The trigger for this modification was
  30. # the observation that AVX512 code paths can negatively affect overall
  31. # Skylake-X system performance. Since we are likely to suppress the
  32. # AVX512F capability flag [at least on Skylake-X], the conversion serves
  33. # as a kind of "investment protection". Note that the next *lake processor,
  34. # Cannon Lake, has an AVX512IFMA code path to execute...
  35. #
  36. # Numbers are cycles per processed byte with poly1305_blocks alone,
  37. # measured with rdtsc at fixed clock frequency.
  38. #
  39. #               IALU/gcc-4.8(*) AVX(**) AVX2  AVX-512
  40. # P4            4.46/+120%      -
  41. # Core 2        2.41/+90%       -
  42. # Westmere      1.88/+120%      -
  43. # Sandy Bridge  1.39/+140%      1.10
  44. # Haswell       1.14/+175%      1.11    0.65
  45. # Skylake[-X]   1.13/+120%      0.96    0.51  [0.35]
  46. # Silvermont    2.83/+95%       -
  47. # Knights L     3.60/?          1.65    1.10  0.41(***)
  48. # Goldmont      1.70/+180%      -
  49. # VIA Nano      1.82/+150%      -
  50. # Sledgehammer  1.38/+160%      -
  51. # Bulldozer     2.30/+130%      0.97
  52. # Ryzen         1.15/+200%      1.08    1.18
  53. #
  54. # (*) improvement coefficients relative to clang are more modest and
  55. # are ~50% on most processors; in both cases we are comparing to
  56. # __int128 code;
  57. # (**) an SSE2 implementation was attempted, but among non-AVX processors
  58. # it was faster than the integer-only code only on older Intel P4 and
  59. # Core processors, by 30-50% (less so the newer the processor), but slower
  60. # on contemporary ones, for example almost 2x slower on Atom; as the
  61. # former are naturally disappearing, SSE2 is deemed unnecessary;
  62. # (***) strangely enough, performance seems to vary from core to core;
  63. # the listed result is the best case;
  64. $flavour = shift;
  65. $output = shift;
  66. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  67. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  68. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  69. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  70. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  71. die "can't locate x86_64-xlate.pl";
  72. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  73. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  74. $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
  75. }
  76. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  77. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
  78. $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
  79. $avx += 2 if ($1==2.11 && $2>=8);
  80. }
  81. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  82. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  83. $avx = ($1>=10) + ($1>=12);
  84. }
  85. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  86. $avx = ($2>=3.0) + ($2>3.0);
  87. }
  88. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  89. *STDOUT=*OUT;
  90. my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
  91. my ($mac,$nonce)=($inp,$len); # *_emit arguments
  92. my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
  93. my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
  94. sub poly1305_iteration {
  95. # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
  96. # output: $h0-$h2 *= $r0-$r1
  97. $code.=<<___;
  98. mulq $h0 # h0*r1
  99. mov %rax,$d2
  100. mov $r0,%rax
  101. mov %rdx,$d3
  102. mulq $h0 # h0*r0
  103. mov %rax,$h0 # future $h0
  104. mov $r0,%rax
  105. mov %rdx,$d1
  106. mulq $h1 # h1*r0
  107. add %rax,$d2
  108. mov $s1,%rax
  109. adc %rdx,$d3
  110. mulq $h1 # h1*s1
  111. mov $h2,$h1 # borrow $h1
  112. add %rax,$h0
  113. adc %rdx,$d1
  114. imulq $s1,$h1 # h2*s1
  115. add $h1,$d2
  116. mov $d1,$h1
  117. adc \$0,$d3
  118. imulq $r0,$h2 # h2*r0
  119. add $d2,$h1
  120. mov \$-4,%rax # mask value
  121. adc $h2,$d3
  122. and $d3,%rax # last reduction step
  123. mov $d3,$h2
  124. shr \$2,$d3
  125. and \$3,$h2
  126. add $d3,%rax
  127. add %rax,$h0
  128. adc \$0,$h1
  129. adc \$0,$h2
  130. ___
  131. }
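# The sequence above keeps the accumulator in three 64-bit limbs and performs
# only a partial reduction (the top limb retains up to two bits plus a small
# carry); the fully reduced value is produced only in poly1305_emit. As an
# illustration, a minimal big-number sketch of what one iteration computes
# (never called by this generator; the sub name below is hypothetical):
sub poly1305_iteration_ref {
	use Math::BigInt;
	my ($h,$r) = @_;			# Math::BigInt accumulator and key power
	my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);	# p = 2^130-5
	return $h->copy()->bmul($r)->bmod($p);	# h = h*r mod p
}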
  132. ########################################################################
  133. # The layout of the opaque area is as follows.
  134. #
  135. # unsigned __int64 h[3]; # current hash value base 2^64
  136. # unsigned __int64 r[2]; # key value base 2^64
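# (in this base 2^64 view h occupies bytes 0..23 of the context and r bytes
#  24..39, matching the 0/8/16 and 24/32 offsets used by the code below)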
  137. $code.=<<___;
  138. .text
  139. .extern OPENSSL_ia32cap_P
  140. .globl poly1305_init
  141. .hidden poly1305_init
  142. .globl poly1305_blocks
  143. .hidden poly1305_blocks
  144. .globl poly1305_emit
  145. .hidden poly1305_emit
  146. .type poly1305_init,\@function,3
  147. .align 32
  148. poly1305_init:
  149. .cfi_startproc
  150. xor %rax,%rax
  151. mov %rax,0($ctx) # initialize hash value
  152. mov %rax,8($ctx)
  153. mov %rax,16($ctx)
  154. cmp \$0,$inp
  155. je .Lno_key
  156. lea poly1305_blocks(%rip),%r10
  157. lea poly1305_emit(%rip),%r11
  158. ___
  159. $code.=<<___ if ($avx);
  160. mov OPENSSL_ia32cap_P+4(%rip),%r9
  161. lea poly1305_blocks_avx(%rip),%rax
  162. lea poly1305_emit_avx(%rip),%rcx
  163. bt \$`60-32`,%r9 # AVX?
  164. cmovc %rax,%r10
  165. cmovc %rcx,%r11
  166. ___
  167. $code.=<<___ if ($avx>1);
  168. lea poly1305_blocks_avx2(%rip),%rax
  169. bt \$`5+32`,%r9 # AVX2?
  170. cmovc %rax,%r10
  171. ___
  172. $code.=<<___ if ($avx>3);
  173. mov \$`(1<<31|1<<21|1<<16)`,%rax
  174. shr \$32,%r9
  175. and %rax,%r9
  176. cmp %rax,%r9
  177. je .Linit_base2_44
  178. ___
  179. $code.=<<___;
  180. mov \$0x0ffffffc0fffffff,%rax
  181. mov \$0x0ffffffc0ffffffc,%rcx
  182. and 0($inp),%rax
  183. and 8($inp),%rcx
  184. mov %rax,24($ctx)
  185. mov %rcx,32($ctx)
  186. ___
  187. $code.=<<___ if ($flavour !~ /elf32/);
  188. mov %r10,0(%rdx)
  189. mov %r11,8(%rdx)
  190. ___
  191. $code.=<<___ if ($flavour =~ /elf32/);
  192. mov %r10d,0(%rdx)
  193. mov %r11d,4(%rdx)
  194. ___
  195. $code.=<<___;
  196. mov \$1,%eax
  197. .Lno_key:
  198. ret
  199. .cfi_endproc
  200. .size poly1305_init,.-poly1305_init
  201. .type poly1305_blocks,\@function,4
  202. .align 32
  203. poly1305_blocks:
  204. .cfi_startproc
  205. .Lblocks:
  206. shr \$4,$len
  207. jz .Lno_data # too short
  208. push %rbx
  209. .cfi_push %rbx
  210. push %rbp
  211. .cfi_push %rbp
  212. push %r12
  213. .cfi_push %r12
  214. push %r13
  215. .cfi_push %r13
  216. push %r14
  217. .cfi_push %r14
  218. push %r15
  219. .cfi_push %r15
  220. .Lblocks_body:
  221. mov $len,%r15 # reassign $len
  222. mov 24($ctx),$r0 # load r
  223. mov 32($ctx),$s1
  224. mov 0($ctx),$h0 # load hash value
  225. mov 8($ctx),$h1
  226. mov 16($ctx),$h2
  227. mov $s1,$r1
  228. shr \$2,$s1
  229. mov $r1,%rax
  230. add $r1,$s1 # s1 = r1 + (r1 >> 2)
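# (s1 equals 5*r1/4: the key clamping makes r1 divisible by 4, and since
#  2^130 = 5 mod 2^130-5, r1 products that would land one 64-bit word too
#  high are folded back in as multiples of s1 instead)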
  231. jmp .Loop
  232. .align 32
  233. .Loop:
  234. add 0($inp),$h0 # accumulate input
  235. adc 8($inp),$h1
  236. lea 16($inp),$inp
  237. adc $padbit,$h2
  238. ___
  239. &poly1305_iteration();
  240. $code.=<<___;
  241. mov $r1,%rax
  242. dec %r15 # len-=16
  243. jnz .Loop
  244. mov $h0,0($ctx) # store hash value
  245. mov $h1,8($ctx)
  246. mov $h2,16($ctx)
  247. mov 0(%rsp),%r15
  248. .cfi_restore %r15
  249. mov 8(%rsp),%r14
  250. .cfi_restore %r14
  251. mov 16(%rsp),%r13
  252. .cfi_restore %r13
  253. mov 24(%rsp),%r12
  254. .cfi_restore %r12
  255. mov 32(%rsp),%rbp
  256. .cfi_restore %rbp
  257. mov 40(%rsp),%rbx
  258. .cfi_restore %rbx
  259. lea 48(%rsp),%rsp
  260. .cfi_adjust_cfa_offset -48
  261. .Lno_data:
  262. .Lblocks_epilogue:
  263. ret
  264. .cfi_endproc
  265. .size poly1305_blocks,.-poly1305_blocks
  266. .type poly1305_emit,\@function,3
  267. .align 32
  268. poly1305_emit:
  269. .cfi_startproc
  270. .Lemit:
  271. mov 0($ctx),%r8 # load hash value
  272. mov 8($ctx),%r9
  273. mov 16($ctx),%r10
  274. mov %r8,%rax
  275. add \$5,%r8 # compare to modulus
  276. mov %r9,%rcx
  277. adc \$0,%r9
  278. adc \$0,%r10
  279. shr \$2,%r10 # did 130-bit value overflow?
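# (h is only partially reduced on entry, so at most one subtraction of
#  p = 2^130-5 is needed: if h is at least p, adding 5 carries into bit 130
#  and the low 128 bits of h+5 equal h-p, i.e. the fully reduced value)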
  280. cmovnz %r8,%rax
  281. cmovnz %r9,%rcx
  282. add 0($nonce),%rax # accumulate nonce
  283. adc 8($nonce),%rcx
  284. mov %rax,0($mac) # write result
  285. mov %rcx,8($mac)
  286. ret
  287. .cfi_endproc
  288. .size poly1305_emit,.-poly1305_emit
  289. ___
  290. if ($avx) {
  291. ########################################################################
  292. # The layout of the opaque area is as follows.
  293. #
  294. # unsigned __int32 h[5]; # current hash value base 2^26
  295. # unsigned __int32 is_base2_26;
  296. # unsigned __int64 r[2]; # key value base 2^64
  297. # unsigned __int64 pad;
  298. # struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
  299. #
  300. # where r^n are the base 2^26 digits of the powers of the multiplier key.
  301. # There are 5 digits, but the last four are interleaved with their multiples
  302. # of 5, for a total of 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
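# Illustrative sketch (never called by this generator; the sub name below is
# hypothetical) of how one power of r is split into five base 2^26 digits and
# interleaved with its multiples of 5 in the order listed above:
sub base2_26_row_ref {
	use Math::BigInt;
	my ($r) = @_;				# Math::BigInt, 0 <= r < 2^130
	my @d = map { ($r >> (26*$_)) & 0x3ffffff } (0..4);
	return ($d[0], map { ($d[$_], 5*$d[$_]) } (1..4));	# r0,r1,5*r1,...,r4,5*r4
}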
  303. my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
  304. map("%xmm$_",(0..15));
  305. $code.=<<___;
  306. .type __poly1305_block,\@abi-omnipotent
  307. .align 32
  308. __poly1305_block:
  309. .cfi_startproc
  310. ___
  311. &poly1305_iteration();
  312. $code.=<<___;
  313. ret
  314. .cfi_endproc
  315. .size __poly1305_block,.-__poly1305_block
  316. .type __poly1305_init_avx,\@abi-omnipotent
  317. .align 32
  318. __poly1305_init_avx:
  319. .cfi_startproc
  320. mov $r0,$h0
  321. mov $r1,$h1
  322. xor $h2,$h2
  323. lea 48+64($ctx),$ctx # size optimization
  324. mov $r1,%rax
  325. call __poly1305_block # r^2
  326. mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
  327. mov \$0x3ffffff,%edx
  328. mov $h0,$d1
  329. and $h0#d,%eax
  330. mov $r0,$d2
  331. and $r0#d,%edx
  332. mov %eax,`16*0+0-64`($ctx)
  333. shr \$26,$d1
  334. mov %edx,`16*0+4-64`($ctx)
  335. shr \$26,$d2
  336. mov \$0x3ffffff,%eax
  337. mov \$0x3ffffff,%edx
  338. and $d1#d,%eax
  339. and $d2#d,%edx
  340. mov %eax,`16*1+0-64`($ctx)
  341. lea (%rax,%rax,4),%eax # *5
  342. mov %edx,`16*1+4-64`($ctx)
  343. lea (%rdx,%rdx,4),%edx # *5
  344. mov %eax,`16*2+0-64`($ctx)
  345. shr \$26,$d1
  346. mov %edx,`16*2+4-64`($ctx)
  347. shr \$26,$d2
  348. mov $h1,%rax
  349. mov $r1,%rdx
  350. shl \$12,%rax
  351. shl \$12,%rdx
  352. or $d1,%rax
  353. or $d2,%rdx
  354. and \$0x3ffffff,%eax
  355. and \$0x3ffffff,%edx
  356. mov %eax,`16*3+0-64`($ctx)
  357. lea (%rax,%rax,4),%eax # *5
  358. mov %edx,`16*3+4-64`($ctx)
  359. lea (%rdx,%rdx,4),%edx # *5
  360. mov %eax,`16*4+0-64`($ctx)
  361. mov $h1,$d1
  362. mov %edx,`16*4+4-64`($ctx)
  363. mov $r1,$d2
  364. mov \$0x3ffffff,%eax
  365. mov \$0x3ffffff,%edx
  366. shr \$14,$d1
  367. shr \$14,$d2
  368. and $d1#d,%eax
  369. and $d2#d,%edx
  370. mov %eax,`16*5+0-64`($ctx)
  371. lea (%rax,%rax,4),%eax # *5
  372. mov %edx,`16*5+4-64`($ctx)
  373. lea (%rdx,%rdx,4),%edx # *5
  374. mov %eax,`16*6+0-64`($ctx)
  375. shr \$26,$d1
  376. mov %edx,`16*6+4-64`($ctx)
  377. shr \$26,$d2
  378. mov $h2,%rax
  379. shl \$24,%rax
  380. or %rax,$d1
  381. mov $d1#d,`16*7+0-64`($ctx)
  382. lea ($d1,$d1,4),$d1 # *5
  383. mov $d2#d,`16*7+4-64`($ctx)
  384. lea ($d2,$d2,4),$d2 # *5
  385. mov $d1#d,`16*8+0-64`($ctx)
  386. mov $d2#d,`16*8+4-64`($ctx)
  387. mov $r1,%rax
  388. call __poly1305_block # r^3
  389. mov \$0x3ffffff,%eax # save r^3 base 2^26
  390. mov $h0,$d1
  391. and $h0#d,%eax
  392. shr \$26,$d1
  393. mov %eax,`16*0+12-64`($ctx)
  394. mov \$0x3ffffff,%edx
  395. and $d1#d,%edx
  396. mov %edx,`16*1+12-64`($ctx)
  397. lea (%rdx,%rdx,4),%edx # *5
  398. shr \$26,$d1
  399. mov %edx,`16*2+12-64`($ctx)
  400. mov $h1,%rax
  401. shl \$12,%rax
  402. or $d1,%rax
  403. and \$0x3ffffff,%eax
  404. mov %eax,`16*3+12-64`($ctx)
  405. lea (%rax,%rax,4),%eax # *5
  406. mov $h1,$d1
  407. mov %eax,`16*4+12-64`($ctx)
  408. mov \$0x3ffffff,%edx
  409. shr \$14,$d1
  410. and $d1#d,%edx
  411. mov %edx,`16*5+12-64`($ctx)
  412. lea (%rdx,%rdx,4),%edx # *5
  413. shr \$26,$d1
  414. mov %edx,`16*6+12-64`($ctx)
  415. mov $h2,%rax
  416. shl \$24,%rax
  417. or %rax,$d1
  418. mov $d1#d,`16*7+12-64`($ctx)
  419. lea ($d1,$d1,4),$d1 # *5
  420. mov $d1#d,`16*8+12-64`($ctx)
  421. mov $r1,%rax
  422. call __poly1305_block # r^4
  423. mov \$0x3ffffff,%eax # save r^4 base 2^26
  424. mov $h0,$d1
  425. and $h0#d,%eax
  426. shr \$26,$d1
  427. mov %eax,`16*0+8-64`($ctx)
  428. mov \$0x3ffffff,%edx
  429. and $d1#d,%edx
  430. mov %edx,`16*1+8-64`($ctx)
  431. lea (%rdx,%rdx,4),%edx # *5
  432. shr \$26,$d1
  433. mov %edx,`16*2+8-64`($ctx)
  434. mov $h1,%rax
  435. shl \$12,%rax
  436. or $d1,%rax
  437. and \$0x3ffffff,%eax
  438. mov %eax,`16*3+8-64`($ctx)
  439. lea (%rax,%rax,4),%eax # *5
  440. mov $h1,$d1
  441. mov %eax,`16*4+8-64`($ctx)
  442. mov \$0x3ffffff,%edx
  443. shr \$14,$d1
  444. and $d1#d,%edx
  445. mov %edx,`16*5+8-64`($ctx)
  446. lea (%rdx,%rdx,4),%edx # *5
  447. shr \$26,$d1
  448. mov %edx,`16*6+8-64`($ctx)
  449. mov $h2,%rax
  450. shl \$24,%rax
  451. or %rax,$d1
  452. mov $d1#d,`16*7+8-64`($ctx)
  453. lea ($d1,$d1,4),$d1 # *5
  454. mov $d1#d,`16*8+8-64`($ctx)
  455. lea -48-64($ctx),$ctx # size [de-]optimization
  456. ret
  457. .cfi_endproc
  458. .size __poly1305_init_avx,.-__poly1305_init_avx
  459. .type poly1305_blocks_avx,\@function,4
  460. .align 32
  461. poly1305_blocks_avx:
  462. .cfi_startproc
  463. mov 20($ctx),%r8d # is_base2_26
  464. cmp \$128,$len
  465. jae .Lblocks_avx
  466. test %r8d,%r8d
  467. jz .Lblocks
  468. .Lblocks_avx:
  469. and \$-16,$len
  470. jz .Lno_data_avx
  471. vzeroupper
  472. test %r8d,%r8d
  473. jz .Lbase2_64_avx
  474. test \$31,$len
  475. jz .Leven_avx
  476. push %rbx
  477. .cfi_push %rbx
  478. push %rbp
  479. .cfi_push %rbp
  480. push %r12
  481. .cfi_push %r12
  482. push %r13
  483. .cfi_push %r13
  484. push %r14
  485. .cfi_push %r14
  486. push %r15
  487. .cfi_push %r15
  488. .Lblocks_avx_body:
  489. mov $len,%r15 # reassign $len
  490. mov 0($ctx),$d1 # load hash value
  491. mov 8($ctx),$d2
  492. mov 16($ctx),$h2#d
  493. mov 24($ctx),$r0 # load r
  494. mov 32($ctx),$s1
  495. ################################# base 2^26 -> base 2^64
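# (the five 26-bit digits loaded above are repacked as
#  h = h[0] + h[1]*2^26 + h[2]*2^52 + h[3]*2^78 + h[4]*2^104,
#  spread over three 64-bit registers)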
  496. mov $d1#d,$h0#d
  497. and \$`-1*(1<<31)`,$d1
  498. mov $d2,$r1 # borrow $r1
  499. mov $d2#d,$h1#d
  500. and \$`-1*(1<<31)`,$d2
  501. shr \$6,$d1
  502. shl \$52,$r1
  503. add $d1,$h0
  504. shr \$12,$h1
  505. shr \$18,$d2
  506. add $r1,$h0
  507. adc $d2,$h1
  508. mov $h2,$d1
  509. shl \$40,$d1
  510. shr \$24,$h2
  511. add $d1,$h1
  512. adc \$0,$h2 # can be partially reduced...
  513. mov \$-4,$d2 # ... so reduce
  514. mov $h2,$d1
  515. and $h2,$d2
  516. shr \$2,$d1
  517. and \$3,$h2
  518. add $d2,$d1 # =*5
  519. add $d1,$h0
  520. adc \$0,$h1
  521. adc \$0,$h2
  522. mov $s1,$r1
  523. mov $s1,%rax
  524. shr \$2,$s1
  525. add $r1,$s1 # s1 = r1 + (r1 >> 2)
  526. add 0($inp),$h0 # accumulate input
  527. adc 8($inp),$h1
  528. lea 16($inp),$inp
  529. adc $padbit,$h2
  530. call __poly1305_block
  531. test $padbit,$padbit # if $padbit is zero,
  532. jz .Lstore_base2_64_avx # store hash in base 2^64 format
  533. ################################# base 2^64 -> base 2^26
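# (split the 130-bit value back into five 26-bit digits: bits 0-25, 26-51,
#  52-77, 78-103 and 104-129 of h)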
  534. mov $h0,%rax
  535. mov $h0,%rdx
  536. shr \$52,$h0
  537. mov $h1,$r0
  538. mov $h1,$r1
  539. shr \$26,%rdx
  540. and \$0x3ffffff,%rax # h[0]
  541. shl \$12,$r0
  542. and \$0x3ffffff,%rdx # h[1]
  543. shr \$14,$h1
  544. or $r0,$h0
  545. shl \$24,$h2
  546. and \$0x3ffffff,$h0 # h[2]
  547. shr \$40,$r1
  548. and \$0x3ffffff,$h1 # h[3]
  549. or $r1,$h2 # h[4]
  550. sub \$16,%r15
  551. jz .Lstore_base2_26_avx
  552. vmovd %rax#d,$H0
  553. vmovd %rdx#d,$H1
  554. vmovd $h0#d,$H2
  555. vmovd $h1#d,$H3
  556. vmovd $h2#d,$H4
  557. jmp .Lproceed_avx
  558. .align 32
  559. .Lstore_base2_64_avx:
  560. mov $h0,0($ctx)
  561. mov $h1,8($ctx)
  562. mov $h2,16($ctx) # note that is_base2_26 is zeroed
  563. jmp .Ldone_avx
  564. .align 16
  565. .Lstore_base2_26_avx:
  566. mov %rax#d,0($ctx) # store hash value base 2^26
  567. mov %rdx#d,4($ctx)
  568. mov $h0#d,8($ctx)
  569. mov $h1#d,12($ctx)
  570. mov $h2#d,16($ctx)
  571. .align 16
  572. .Ldone_avx:
  573. mov 0(%rsp),%r15
  574. .cfi_restore %r15
  575. mov 8(%rsp),%r14
  576. .cfi_restore %r14
  577. mov 16(%rsp),%r13
  578. .cfi_restore %r13
  579. mov 24(%rsp),%r12
  580. .cfi_restore %r12
  581. mov 32(%rsp),%rbp
  582. .cfi_restore %rbp
  583. mov 40(%rsp),%rbx
  584. .cfi_restore %rbx
  585. lea 48(%rsp),%rsp
  586. .cfi_adjust_cfa_offset -48
  587. .Lno_data_avx:
  588. .Lblocks_avx_epilogue:
  589. ret
  590. .cfi_endproc
  591. .align 32
  592. .Lbase2_64_avx:
  593. .cfi_startproc
  594. push %rbx
  595. .cfi_push %rbx
  596. push %rbp
  597. .cfi_push %rbp
  598. push %r12
  599. .cfi_push %r12
  600. push %r13
  601. .cfi_push %r13
  602. push %r14
  603. .cfi_push %r14
  604. push %r15
  605. .cfi_push %r15
  606. .Lbase2_64_avx_body:
  607. mov $len,%r15 # reassign $len
  608. mov 24($ctx),$r0 # load r
  609. mov 32($ctx),$s1
  610. mov 0($ctx),$h0 # load hash value
  611. mov 8($ctx),$h1
  612. mov 16($ctx),$h2#d
  613. mov $s1,$r1
  614. mov $s1,%rax
  615. shr \$2,$s1
  616. add $r1,$s1 # s1 = r1 + (r1 >> 2)
  617. test \$31,$len
  618. jz .Linit_avx
  619. add 0($inp),$h0 # accumulate input
  620. adc 8($inp),$h1
  621. lea 16($inp),$inp
  622. adc $padbit,$h2
  623. sub \$16,%r15
  624. call __poly1305_block
  625. .Linit_avx:
  626. ################################# base 2^64 -> base 2^26
  627. mov $h0,%rax
  628. mov $h0,%rdx
  629. shr \$52,$h0
  630. mov $h1,$d1
  631. mov $h1,$d2
  632. shr \$26,%rdx
  633. and \$0x3ffffff,%rax # h[0]
  634. shl \$12,$d1
  635. and \$0x3ffffff,%rdx # h[1]
  636. shr \$14,$h1
  637. or $d1,$h0
  638. shl \$24,$h2
  639. and \$0x3ffffff,$h0 # h[2]
  640. shr \$40,$d2
  641. and \$0x3ffffff,$h1 # h[3]
  642. or $d2,$h2 # h[4]
  643. vmovd %rax#d,$H0
  644. vmovd %rdx#d,$H1
  645. vmovd $h0#d,$H2
  646. vmovd $h1#d,$H3
  647. vmovd $h2#d,$H4
  648. movl \$1,20($ctx) # set is_base2_26
  649. call __poly1305_init_avx
  650. .Lproceed_avx:
  651. mov %r15,$len
  652. mov 0(%rsp),%r15
  653. .cfi_restore %r15
  654. mov 8(%rsp),%r14
  655. .cfi_restore %r14
  656. mov 16(%rsp),%r13
  657. .cfi_restore %r13
  658. mov 24(%rsp),%r12
  659. .cfi_restore %r12
  660. mov 32(%rsp),%rbp
  661. .cfi_restore %rbp
  662. mov 40(%rsp),%rbx
  663. .cfi_restore %rbx
  664. lea 48(%rsp),%rax
  665. lea 48(%rsp),%rsp
  666. .cfi_adjust_cfa_offset -48
  667. .Lbase2_64_avx_epilogue:
  668. jmp .Ldo_avx
  669. .cfi_endproc
  670. .align 32
  671. .Leven_avx:
  672. .cfi_startproc
  673. vmovd 4*0($ctx),$H0 # load hash value
  674. vmovd 4*1($ctx),$H1
  675. vmovd 4*2($ctx),$H2
  676. vmovd 4*3($ctx),$H3
  677. vmovd 4*4($ctx),$H4
  678. .Ldo_avx:
  679. ___
  680. $code.=<<___ if (!$win64);
  681. lea -0x58(%rsp),%r11
  682. .cfi_def_cfa %r11,0x60
  683. sub \$0x178,%rsp
  684. ___
  685. $code.=<<___ if ($win64);
  686. lea -0xf8(%rsp),%r11
  687. sub \$0x218,%rsp
  688. vmovdqa %xmm6,0x50(%r11)
  689. vmovdqa %xmm7,0x60(%r11)
  690. vmovdqa %xmm8,0x70(%r11)
  691. vmovdqa %xmm9,0x80(%r11)
  692. vmovdqa %xmm10,0x90(%r11)
  693. vmovdqa %xmm11,0xa0(%r11)
  694. vmovdqa %xmm12,0xb0(%r11)
  695. vmovdqa %xmm13,0xc0(%r11)
  696. vmovdqa %xmm14,0xd0(%r11)
  697. vmovdqa %xmm15,0xe0(%r11)
  698. .Ldo_avx_body:
  699. ___
  700. $code.=<<___;
  701. sub \$64,$len
  702. lea -32($inp),%rax
  703. cmovc %rax,$inp
  704. vmovdqu `16*3`($ctx),$D4 # preload r0^2
  705. lea `16*3+64`($ctx),$ctx # size optimization
  706. lea .Lconst(%rip),%rcx
  707. ################################################################
  708. # load input
  709. vmovdqu 16*2($inp),$T0
  710. vmovdqu 16*3($inp),$T1
  711. vmovdqa 64(%rcx),$MASK # .Lmask26
  712. vpsrldq \$6,$T0,$T2 # splat input
  713. vpsrldq \$6,$T1,$T3
  714. vpunpckhqdq $T1,$T0,$T4 # 4
  715. vpunpcklqdq $T1,$T0,$T0 # 0:1
  716. vpunpcklqdq $T3,$T2,$T3 # 2:3
  717. vpsrlq \$40,$T4,$T4 # 4
  718. vpsrlq \$26,$T0,$T1
  719. vpand $MASK,$T0,$T0 # 0
  720. vpsrlq \$4,$T3,$T2
  721. vpand $MASK,$T1,$T1 # 1
  722. vpsrlq \$30,$T3,$T3
  723. vpand $MASK,$T2,$T2 # 2
  724. vpand $MASK,$T3,$T3 # 3
  725. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
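# (each 16-byte block is split into five 26-bit limbs across vector lanes;
#  the constant OR-ed in above sets bit 128 of every block, i.e. the
#  Poly1305 pad bit, which is always 1 for full blocks)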
  726. jbe .Lskip_loop_avx
  727. # expand and copy pre-calculated table to stack
  728. vmovdqu `16*1-64`($ctx),$D1
  729. vmovdqu `16*2-64`($ctx),$D2
  730. vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
  731. vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
  732. vmovdqa $D3,-0x90(%r11)
  733. vmovdqa $D0,0x00(%rsp)
  734. vpshufd \$0xEE,$D1,$D4
  735. vmovdqu `16*3-64`($ctx),$D0
  736. vpshufd \$0x44,$D1,$D1
  737. vmovdqa $D4,-0x80(%r11)
  738. vmovdqa $D1,0x10(%rsp)
  739. vpshufd \$0xEE,$D2,$D3
  740. vmovdqu `16*4-64`($ctx),$D1
  741. vpshufd \$0x44,$D2,$D2
  742. vmovdqa $D3,-0x70(%r11)
  743. vmovdqa $D2,0x20(%rsp)
  744. vpshufd \$0xEE,$D0,$D4
  745. vmovdqu `16*5-64`($ctx),$D2
  746. vpshufd \$0x44,$D0,$D0
  747. vmovdqa $D4,-0x60(%r11)
  748. vmovdqa $D0,0x30(%rsp)
  749. vpshufd \$0xEE,$D1,$D3
  750. vmovdqu `16*6-64`($ctx),$D0
  751. vpshufd \$0x44,$D1,$D1
  752. vmovdqa $D3,-0x50(%r11)
  753. vmovdqa $D1,0x40(%rsp)
  754. vpshufd \$0xEE,$D2,$D4
  755. vmovdqu `16*7-64`($ctx),$D1
  756. vpshufd \$0x44,$D2,$D2
  757. vmovdqa $D4,-0x40(%r11)
  758. vmovdqa $D2,0x50(%rsp)
  759. vpshufd \$0xEE,$D0,$D3
  760. vmovdqu `16*8-64`($ctx),$D2
  761. vpshufd \$0x44,$D0,$D0
  762. vmovdqa $D3,-0x30(%r11)
  763. vmovdqa $D0,0x60(%rsp)
  764. vpshufd \$0xEE,$D1,$D4
  765. vpshufd \$0x44,$D1,$D1
  766. vmovdqa $D4,-0x20(%r11)
  767. vmovdqa $D1,0x70(%rsp)
  768. vpshufd \$0xEE,$D2,$D3
  769. vmovdqa 0x00(%rsp),$D4 # preload r0^2
  770. vpshufd \$0x44,$D2,$D2
  771. vmovdqa $D3,-0x10(%r11)
  772. vmovdqa $D2,0x80(%rsp)
  773. jmp .Loop_avx
  774. .align 32
  775. .Loop_avx:
  776. ################################################################
  777. # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
  778. # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
  779. # \___________________/
  780. # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
  781. # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
  782. # \___________________/ \____________________/
  783. #
  784. # Note that we start with inp[2:3]*r^2. This is because it
  785. # doesn't depend on the reduction in the previous iteration.
  786. ################################################################
  787. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  788. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  789. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  790. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  791. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  792. #
  793. # though note that $Tx and $Hx are "reversed" in this section,
  794. # and $D4 is preloaded with r0^2...
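# (the 5*r terms appear because limb i carries weight 2^(26*i) and
#  2^130 = 5 mod 2^130-5, so any product term of weight 2^130 or more
#  folds back multiplied by 5)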
  795. vpmuludq $T0,$D4,$D0 # d0 = h0*r0
  796. vpmuludq $T1,$D4,$D1 # d1 = h1*r0
  797. vmovdqa $H2,0x20(%r11) # offload hash
  798. vpmuludq $T2,$D4,$D2 # d2 = h2*r0
  799. vmovdqa 0x10(%rsp),$H2 # r1^2
  800. vpmuludq $T3,$D4,$D3 # d3 = h3*r0
  801. vpmuludq $T4,$D4,$D4 # d4 = h4*r0
  802. vmovdqa $H0,0x00(%r11) #
  803. vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
  804. vmovdqa $H1,0x10(%r11) #
  805. vpmuludq $T3,$H2,$H1 # h3*r1
  806. vpaddq $H0,$D0,$D0 # d0 += h4*s1
  807. vpaddq $H1,$D4,$D4 # d4 += h3*r1
  808. vmovdqa $H3,0x30(%r11) #
  809. vpmuludq $T2,$H2,$H0 # h2*r1
  810. vpmuludq $T1,$H2,$H1 # h1*r1
  811. vpaddq $H0,$D3,$D3 # d3 += h2*r1
  812. vmovdqa 0x30(%rsp),$H3 # r2^2
  813. vpaddq $H1,$D2,$D2 # d2 += h1*r1
  814. vmovdqa $H4,0x40(%r11) #
  815. vpmuludq $T0,$H2,$H2 # h0*r1
  816. vpmuludq $T2,$H3,$H0 # h2*r2
  817. vpaddq $H2,$D1,$D1 # d1 += h0*r1
  818. vmovdqa 0x40(%rsp),$H4 # s2^2
  819. vpaddq $H0,$D4,$D4 # d4 += h2*r2
  820. vpmuludq $T1,$H3,$H1 # h1*r2
  821. vpmuludq $T0,$H3,$H3 # h0*r2
  822. vpaddq $H1,$D3,$D3 # d3 += h1*r2
  823. vmovdqa 0x50(%rsp),$H2 # r3^2
  824. vpaddq $H3,$D2,$D2 # d2 += h0*r2
  825. vpmuludq $T4,$H4,$H0 # h4*s2
  826. vpmuludq $T3,$H4,$H4 # h3*s2
  827. vpaddq $H0,$D1,$D1 # d1 += h4*s2
  828. vmovdqa 0x60(%rsp),$H3 # s3^2
  829. vpaddq $H4,$D0,$D0 # d0 += h3*s2
  830. vmovdqa 0x80(%rsp),$H4 # s4^2
  831. vpmuludq $T1,$H2,$H1 # h1*r3
  832. vpmuludq $T0,$H2,$H2 # h0*r3
  833. vpaddq $H1,$D4,$D4 # d4 += h1*r3
  834. vpaddq $H2,$D3,$D3 # d3 += h0*r3
  835. vpmuludq $T4,$H3,$H0 # h4*s3
  836. vpmuludq $T3,$H3,$H1 # h3*s3
  837. vpaddq $H0,$D2,$D2 # d2 += h4*s3
  838. vmovdqu 16*0($inp),$H0 # load input
  839. vpaddq $H1,$D1,$D1 # d1 += h3*s3
  840. vpmuludq $T2,$H3,$H3 # h2*s3
  841. vpmuludq $T2,$H4,$T2 # h2*s4
  842. vpaddq $H3,$D0,$D0 # d0 += h2*s3
  843. vmovdqu 16*1($inp),$H1 #
  844. vpaddq $T2,$D1,$D1 # d1 += h2*s4
  845. vpmuludq $T3,$H4,$T3 # h3*s4
  846. vpmuludq $T4,$H4,$T4 # h4*s4
  847. vpsrldq \$6,$H0,$H2 # splat input
  848. vpaddq $T3,$D2,$D2 # d2 += h3*s4
  849. vpaddq $T4,$D3,$D3 # d3 += h4*s4
  850. vpsrldq \$6,$H1,$H3 #
  851. vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
  852. vpmuludq $T1,$H4,$T0 # h1*s4
  853. vpunpckhqdq $H1,$H0,$H4 # 4
  854. vpaddq $T4,$D4,$D4 # d4 += h0*r4
  855. vmovdqa -0x90(%r11),$T4 # r0^4
  856. vpaddq $T0,$D0,$D0 # d0 += h1*s4
  857. vpunpcklqdq $H1,$H0,$H0 # 0:1
  858. vpunpcklqdq $H3,$H2,$H3 # 2:3
  859. #vpsrlq \$40,$H4,$H4 # 4
  860. vpsrldq \$`40/8`,$H4,$H4 # 4
  861. vpsrlq \$26,$H0,$H1
  862. vpand $MASK,$H0,$H0 # 0
  863. vpsrlq \$4,$H3,$H2
  864. vpand $MASK,$H1,$H1 # 1
  865. vpand 0(%rcx),$H4,$H4 # .Lmask24
  866. vpsrlq \$30,$H3,$H3
  867. vpand $MASK,$H2,$H2 # 2
  868. vpand $MASK,$H3,$H3 # 3
  869. vpor 32(%rcx),$H4,$H4 # padbit, yes, always
  870. vpaddq 0x00(%r11),$H0,$H0 # add hash value
  871. vpaddq 0x10(%r11),$H1,$H1
  872. vpaddq 0x20(%r11),$H2,$H2
  873. vpaddq 0x30(%r11),$H3,$H3
  874. vpaddq 0x40(%r11),$H4,$H4
  875. lea 16*2($inp),%rax
  876. lea 16*4($inp),$inp
  877. sub \$64,$len
  878. cmovc %rax,$inp
  879. ################################################################
  880. # Now we accumulate (inp[0:1]+hash)*r^4
  881. ################################################################
  882. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  883. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  884. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  885. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  886. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  887. vpmuludq $H0,$T4,$T0 # h0*r0
  888. vpmuludq $H1,$T4,$T1 # h1*r0
  889. vpaddq $T0,$D0,$D0
  890. vpaddq $T1,$D1,$D1
  891. vmovdqa -0x80(%r11),$T2 # r1^4
  892. vpmuludq $H2,$T4,$T0 # h2*r0
  893. vpmuludq $H3,$T4,$T1 # h3*r0
  894. vpaddq $T0,$D2,$D2
  895. vpaddq $T1,$D3,$D3
  896. vpmuludq $H4,$T4,$T4 # h4*r0
  897. vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
  898. vpaddq $T4,$D4,$D4
  899. vpaddq $T0,$D0,$D0 # d0 += h4*s1
  900. vpmuludq $H2,$T2,$T1 # h2*r1
  901. vpmuludq $H3,$T2,$T0 # h3*r1
  902. vpaddq $T1,$D3,$D3 # d3 += h2*r1
  903. vmovdqa -0x60(%r11),$T3 # r2^4
  904. vpaddq $T0,$D4,$D4 # d4 += h3*r1
  905. vpmuludq $H1,$T2,$T1 # h1*r1
  906. vpmuludq $H0,$T2,$T2 # h0*r1
  907. vpaddq $T1,$D2,$D2 # d2 += h1*r1
  908. vpaddq $T2,$D1,$D1 # d1 += h0*r1
  909. vmovdqa -0x50(%r11),$T4 # s2^4
  910. vpmuludq $H2,$T3,$T0 # h2*r2
  911. vpmuludq $H1,$T3,$T1 # h1*r2
  912. vpaddq $T0,$D4,$D4 # d4 += h2*r2
  913. vpaddq $T1,$D3,$D3 # d3 += h1*r2
  914. vmovdqa -0x40(%r11),$T2 # r3^4
  915. vpmuludq $H0,$T3,$T3 # h0*r2
  916. vpmuludq $H4,$T4,$T0 # h4*s2
  917. vpaddq $T3,$D2,$D2 # d2 += h0*r2
  918. vpaddq $T0,$D1,$D1 # d1 += h4*s2
  919. vmovdqa -0x30(%r11),$T3 # s3^4
  920. vpmuludq $H3,$T4,$T4 # h3*s2
  921. vpmuludq $H1,$T2,$T1 # h1*r3
  922. vpaddq $T4,$D0,$D0 # d0 += h3*s2
  923. vmovdqa -0x10(%r11),$T4 # s4^4
  924. vpaddq $T1,$D4,$D4 # d4 += h1*r3
  925. vpmuludq $H0,$T2,$T2 # h0*r3
  926. vpmuludq $H4,$T3,$T0 # h4*s3
  927. vpaddq $T2,$D3,$D3 # d3 += h0*r3
  928. vpaddq $T0,$D2,$D2 # d2 += h4*s3
  929. vmovdqu 16*2($inp),$T0 # load input
  930. vpmuludq $H3,$T3,$T2 # h3*s3
  931. vpmuludq $H2,$T3,$T3 # h2*s3
  932. vpaddq $T2,$D1,$D1 # d1 += h3*s3
  933. vmovdqu 16*3($inp),$T1 #
  934. vpaddq $T3,$D0,$D0 # d0 += h2*s3
  935. vpmuludq $H2,$T4,$H2 # h2*s4
  936. vpmuludq $H3,$T4,$H3 # h3*s4
  937. vpsrldq \$6,$T0,$T2 # splat input
  938. vpaddq $H2,$D1,$D1 # d1 += h2*s4
  939. vpmuludq $H4,$T4,$H4 # h4*s4
  940. vpsrldq \$6,$T1,$T3 #
  941. vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
  942. vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
  943. vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
  944. vpmuludq $H1,$T4,$H0
  945. vpunpckhqdq $T1,$T0,$T4 # 4
  946. vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
  947. vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
  948. vpunpcklqdq $T1,$T0,$T0 # 0:1
  949. vpunpcklqdq $T3,$T2,$T3 # 2:3
  950. #vpsrlq \$40,$T4,$T4 # 4
  951. vpsrldq \$`40/8`,$T4,$T4 # 4
  952. vpsrlq \$26,$T0,$T1
  953. vmovdqa 0x00(%rsp),$D4 # preload r0^2
  954. vpand $MASK,$T0,$T0 # 0
  955. vpsrlq \$4,$T3,$T2
  956. vpand $MASK,$T1,$T1 # 1
  957. vpand 0(%rcx),$T4,$T4 # .Lmask24
  958. vpsrlq \$30,$T3,$T3
  959. vpand $MASK,$T2,$T2 # 2
  960. vpand $MASK,$T3,$T3 # 3
  961. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  962. ################################################################
  963. # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
  964. # and P. Schwabe
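# (each limb's carry is added into the next limb without full normalization;
#  the carry out of h4 wraps around to h0 multiplied by 5, i.e. as
#  carry + 4*carry, which the add / shift-left-by-2 / add sequence below
#  implements)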
  965. vpsrlq \$26,$H3,$D3
  966. vpand $MASK,$H3,$H3
  967. vpaddq $D3,$H4,$H4 # h3 -> h4
  968. vpsrlq \$26,$H0,$D0
  969. vpand $MASK,$H0,$H0
  970. vpaddq $D0,$D1,$H1 # h0 -> h1
  971. vpsrlq \$26,$H4,$D0
  972. vpand $MASK,$H4,$H4
  973. vpsrlq \$26,$H1,$D1
  974. vpand $MASK,$H1,$H1
  975. vpaddq $D1,$H2,$H2 # h1 -> h2
  976. vpaddq $D0,$H0,$H0
  977. vpsllq \$2,$D0,$D0
  978. vpaddq $D0,$H0,$H0 # h4 -> h0
  979. vpsrlq \$26,$H2,$D2
  980. vpand $MASK,$H2,$H2
  981. vpaddq $D2,$H3,$H3 # h2 -> h3
  982. vpsrlq \$26,$H0,$D0
  983. vpand $MASK,$H0,$H0
  984. vpaddq $D0,$H1,$H1 # h0 -> h1
  985. vpsrlq \$26,$H3,$D3
  986. vpand $MASK,$H3,$H3
  987. vpaddq $D3,$H4,$H4 # h3 -> h4
  988. ja .Loop_avx
  989. .Lskip_loop_avx:
  990. ################################################################
  991. # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
  992. vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
  993. add \$32,$len
  994. jnz .Long_tail_avx
  995. vpaddq $H2,$T2,$T2
  996. vpaddq $H0,$T0,$T0
  997. vpaddq $H1,$T1,$T1
  998. vpaddq $H3,$T3,$T3
  999. vpaddq $H4,$T4,$T4
  1000. .Long_tail_avx:
  1001. vmovdqa $H2,0x20(%r11)
  1002. vmovdqa $H0,0x00(%r11)
  1003. vmovdqa $H1,0x10(%r11)
  1004. vmovdqa $H3,0x30(%r11)
  1005. vmovdqa $H4,0x40(%r11)
  1006. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  1007. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  1008. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  1009. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  1010. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  1011. vpmuludq $T2,$D4,$D2 # d2 = h2*r0
  1012. vpmuludq $T0,$D4,$D0 # d0 = h0*r0
  1013. vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
  1014. vpmuludq $T1,$D4,$D1 # d1 = h1*r0
  1015. vpmuludq $T3,$D4,$D3 # d3 = h3*r0
  1016. vpmuludq $T4,$D4,$D4 # d4 = h4*r0
  1017. vpmuludq $T3,$H2,$H0 # h3*r1
  1018. vpaddq $H0,$D4,$D4 # d4 += h3*r1
  1019. vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
  1020. vpmuludq $T2,$H2,$H1 # h2*r1
  1021. vpaddq $H1,$D3,$D3 # d3 += h2*r1
  1022. vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
  1023. vpmuludq $T1,$H2,$H0 # h1*r1
  1024. vpaddq $H0,$D2,$D2 # d2 += h1*r1
  1025. vpmuludq $T0,$H2,$H2 # h0*r1
  1026. vpaddq $H2,$D1,$D1 # d1 += h0*r1
  1027. vpmuludq $T4,$H3,$H3 # h4*s1
  1028. vpaddq $H3,$D0,$D0 # d0 += h4*s1
  1029. vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
  1030. vpmuludq $T2,$H4,$H1 # h2*r2
  1031. vpaddq $H1,$D4,$D4 # d4 += h2*r2
  1032. vpmuludq $T1,$H4,$H0 # h1*r2
  1033. vpaddq $H0,$D3,$D3 # d3 += h1*r2
  1034. vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
  1035. vpmuludq $T0,$H4,$H4 # h0*r2
  1036. vpaddq $H4,$D2,$D2 # d2 += h0*r2
  1037. vpmuludq $T4,$H2,$H1 # h4*s2
  1038. vpaddq $H1,$D1,$D1 # d1 += h4*s2
  1039. vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
  1040. vpmuludq $T3,$H2,$H2 # h3*s2
  1041. vpaddq $H2,$D0,$D0 # d0 += h3*s2
  1042. vpmuludq $T1,$H3,$H0 # h1*r3
  1043. vpaddq $H0,$D4,$D4 # d4 += h1*r3
  1044. vpmuludq $T0,$H3,$H3 # h0*r3
  1045. vpaddq $H3,$D3,$D3 # d3 += h0*r3
  1046. vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
  1047. vpmuludq $T4,$H4,$H1 # h4*s3
  1048. vpaddq $H1,$D2,$D2 # d2 += h4*s3
  1049. vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
  1050. vpmuludq $T3,$H4,$H0 # h3*s3
  1051. vpaddq $H0,$D1,$D1 # d1 += h3*s3
  1052. vpmuludq $T2,$H4,$H4 # h2*s3
  1053. vpaddq $H4,$D0,$D0 # d0 += h2*s3
  1054. vpmuludq $T0,$H2,$H2 # h0*r4
  1055. vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
  1056. vpmuludq $T4,$H3,$H1 # h4*s4
  1057. vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
  1058. vpmuludq $T3,$H3,$H0 # h3*s4
  1059. vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
  1060. vpmuludq $T2,$H3,$H1 # h2*s4
  1061. vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
  1062. vpmuludq $T1,$H3,$H3 # h1*s4
  1063. vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
  1064. jz .Lshort_tail_avx
  1065. vmovdqu 16*0($inp),$H0 # load input
  1066. vmovdqu 16*1($inp),$H1
  1067. vpsrldq \$6,$H0,$H2 # splat input
  1068. vpsrldq \$6,$H1,$H3
  1069. vpunpckhqdq $H1,$H0,$H4 # 4
  1070. vpunpcklqdq $H1,$H0,$H0 # 0:1
  1071. vpunpcklqdq $H3,$H2,$H3 # 2:3
  1072. vpsrlq \$40,$H4,$H4 # 4
  1073. vpsrlq \$26,$H0,$H1
  1074. vpand $MASK,$H0,$H0 # 0
  1075. vpsrlq \$4,$H3,$H2
  1076. vpand $MASK,$H1,$H1 # 1
  1077. vpsrlq \$30,$H3,$H3
  1078. vpand $MASK,$H2,$H2 # 2
  1079. vpand $MASK,$H3,$H3 # 3
  1080. vpor 32(%rcx),$H4,$H4 # padbit, yes, always
  1081. vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
  1082. vpaddq 0x00(%r11),$H0,$H0
  1083. vpaddq 0x10(%r11),$H1,$H1
  1084. vpaddq 0x20(%r11),$H2,$H2
  1085. vpaddq 0x30(%r11),$H3,$H3
  1086. vpaddq 0x40(%r11),$H4,$H4
  1087. ################################################################
  1088. # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
  1089. vpmuludq $H0,$T4,$T0 # h0*r0
  1090. vpaddq $T0,$D0,$D0 # d0 += h0*r0
  1091. vpmuludq $H1,$T4,$T1 # h1*r0
  1092. vpaddq $T1,$D1,$D1 # d1 += h1*r0
  1093. vpmuludq $H2,$T4,$T0 # h2*r0
  1094. vpaddq $T0,$D2,$D2 # d2 += h2*r0
  1095. vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
  1096. vpmuludq $H3,$T4,$T1 # h3*r0
  1097. vpaddq $T1,$D3,$D3 # d3 += h3*r0
  1098. vpmuludq $H4,$T4,$T4 # h4*r0
  1099. vpaddq $T4,$D4,$D4 # d4 += h4*r0
  1100. vpmuludq $H3,$T2,$T0 # h3*r1
  1101. vpaddq $T0,$D4,$D4 # d4 += h3*r1
  1102. vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
  1103. vpmuludq $H2,$T2,$T1 # h2*r1
  1104. vpaddq $T1,$D3,$D3 # d3 += h2*r1
  1105. vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
  1106. vpmuludq $H1,$T2,$T0 # h1*r1
  1107. vpaddq $T0,$D2,$D2 # d2 += h1*r1
  1108. vpmuludq $H0,$T2,$T2 # h0*r1
  1109. vpaddq $T2,$D1,$D1 # d1 += h0*r1
  1110. vpmuludq $H4,$T3,$T3 # h4*s1
  1111. vpaddq $T3,$D0,$D0 # d0 += h4*s1
  1112. vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
  1113. vpmuludq $H2,$T4,$T1 # h2*r2
  1114. vpaddq $T1,$D4,$D4 # d4 += h2*r2
  1115. vpmuludq $H1,$T4,$T0 # h1*r2
  1116. vpaddq $T0,$D3,$D3 # d3 += h1*r2
  1117. vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
  1118. vpmuludq $H0,$T4,$T4 # h0*r2
  1119. vpaddq $T4,$D2,$D2 # d2 += h0*r2
  1120. vpmuludq $H4,$T2,$T1 # h4*s2
  1121. vpaddq $T1,$D1,$D1 # d1 += h4*s2
  1122. vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
  1123. vpmuludq $H3,$T2,$T2 # h3*s2
  1124. vpaddq $T2,$D0,$D0 # d0 += h3*s2
  1125. vpmuludq $H1,$T3,$T0 # h1*r3
  1126. vpaddq $T0,$D4,$D4 # d4 += h1*r3
  1127. vpmuludq $H0,$T3,$T3 # h0*r3
  1128. vpaddq $T3,$D3,$D3 # d3 += h0*r3
  1129. vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
  1130. vpmuludq $H4,$T4,$T1 # h4*s3
  1131. vpaddq $T1,$D2,$D2 # d2 += h4*s3
  1132. vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
  1133. vpmuludq $H3,$T4,$T0 # h3*s3
  1134. vpaddq $T0,$D1,$D1 # d1 += h3*s3
  1135. vpmuludq $H2,$T4,$T4 # h2*s3
  1136. vpaddq $T4,$D0,$D0 # d0 += h2*s3
  1137. vpmuludq $H0,$T2,$T2 # h0*r4
  1138. vpaddq $T2,$D4,$D4 # d4 += h0*r4
  1139. vpmuludq $H4,$T3,$T1 # h4*s4
  1140. vpaddq $T1,$D3,$D3 # d3 += h4*s4
  1141. vpmuludq $H3,$T3,$T0 # h3*s4
  1142. vpaddq $T0,$D2,$D2 # d2 += h3*s4
  1143. vpmuludq $H2,$T3,$T1 # h2*s4
  1144. vpaddq $T1,$D1,$D1 # d1 += h2*s4
  1145. vpmuludq $H1,$T3,$T3 # h1*s4
  1146. vpaddq $T3,$D0,$D0 # d0 += h1*s4
  1147. .Lshort_tail_avx:
  1148. ################################################################
  1149. # horizontal addition
  1150. vpsrldq \$8,$D4,$T4
  1151. vpsrldq \$8,$D3,$T3
  1152. vpsrldq \$8,$D1,$T1
  1153. vpsrldq \$8,$D0,$T0
  1154. vpsrldq \$8,$D2,$T2
  1155. vpaddq $T3,$D3,$D3
  1156. vpaddq $T4,$D4,$D4
  1157. vpaddq $T0,$D0,$D0
  1158. vpaddq $T1,$D1,$D1
  1159. vpaddq $T2,$D2,$D2
  1160. ################################################################
  1161. # lazy reduction
  1162. vpsrlq \$26,$D3,$H3
  1163. vpand $MASK,$D3,$D3
  1164. vpaddq $H3,$D4,$D4 # h3 -> h4
  1165. vpsrlq \$26,$D0,$H0
  1166. vpand $MASK,$D0,$D0
  1167. vpaddq $H0,$D1,$D1 # h0 -> h1
  1168. vpsrlq \$26,$D4,$H4
  1169. vpand $MASK,$D4,$D4
  1170. vpsrlq \$26,$D1,$H1
  1171. vpand $MASK,$D1,$D1
  1172. vpaddq $H1,$D2,$D2 # h1 -> h2
  1173. vpaddq $H4,$D0,$D0
  1174. vpsllq \$2,$H4,$H4
  1175. vpaddq $H4,$D0,$D0 # h4 -> h0
  1176. vpsrlq \$26,$D2,$H2
  1177. vpand $MASK,$D2,$D2
  1178. vpaddq $H2,$D3,$D3 # h2 -> h3
  1179. vpsrlq \$26,$D0,$H0
  1180. vpand $MASK,$D0,$D0
  1181. vpaddq $H0,$D1,$D1 # h0 -> h1
  1182. vpsrlq \$26,$D3,$H3
  1183. vpand $MASK,$D3,$D3
  1184. vpaddq $H3,$D4,$D4 # h3 -> h4
  1185. vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
  1186. vmovd $D1,`4*1-48-64`($ctx)
  1187. vmovd $D2,`4*2-48-64`($ctx)
  1188. vmovd $D3,`4*3-48-64`($ctx)
  1189. vmovd $D4,`4*4-48-64`($ctx)
  1190. ___
  1191. $code.=<<___ if ($win64);
  1192. vmovdqa 0x50(%r11),%xmm6
  1193. vmovdqa 0x60(%r11),%xmm7
  1194. vmovdqa 0x70(%r11),%xmm8
  1195. vmovdqa 0x80(%r11),%xmm9
  1196. vmovdqa 0x90(%r11),%xmm10
  1197. vmovdqa 0xa0(%r11),%xmm11
  1198. vmovdqa 0xb0(%r11),%xmm12
  1199. vmovdqa 0xc0(%r11),%xmm13
  1200. vmovdqa 0xd0(%r11),%xmm14
  1201. vmovdqa 0xe0(%r11),%xmm15
  1202. lea 0xf8(%r11),%rsp
  1203. .Ldo_avx_epilogue:
  1204. ___
  1205. $code.=<<___ if (!$win64);
  1206. lea 0x58(%r11),%rsp
  1207. .cfi_def_cfa %rsp,8
  1208. ___
  1209. $code.=<<___;
  1210. vzeroupper
  1211. ret
  1212. .cfi_endproc
  1213. .size poly1305_blocks_avx,.-poly1305_blocks_avx
  1214. .type poly1305_emit_avx,\@function,3
  1215. .align 32
  1216. poly1305_emit_avx:
  1217. .cfi_startproc
  1218. cmpl \$0,20($ctx) # is_base2_26?
  1219. je .Lemit
  1220. mov 0($ctx),%eax # load hash value base 2^26
  1221. mov 4($ctx),%ecx
  1222. mov 8($ctx),%r8d
  1223. mov 12($ctx),%r11d
  1224. mov 16($ctx),%r10d
  1225. shl \$26,%rcx # base 2^26 -> base 2^64
  1226. mov %r8,%r9
  1227. shl \$52,%r8
  1228. add %rcx,%rax
  1229. shr \$12,%r9
  1230. add %rax,%r8 # h0
  1231. adc \$0,%r9
  1232. shl \$14,%r11
  1233. mov %r10,%rax
  1234. shr \$24,%r10
  1235. add %r11,%r9
  1236. shl \$40,%rax
  1237. add %rax,%r9 # h1
  1238. adc \$0,%r10 # h2
  1239. mov %r10,%rax # could be partially reduced, so reduce
  1240. mov %r10,%rcx
  1241. and \$3,%r10
  1242. shr \$2,%rax
  1243. and \$-4,%rcx
  1244. add %rcx,%rax
  1245. add %rax,%r8
  1246. adc \$0,%r9
  1247. adc \$0,%r10
  1248. mov %r8,%rax
  1249. add \$5,%r8 # compare to modulus
  1250. mov %r9,%rcx
  1251. adc \$0,%r9
  1252. adc \$0,%r10
  1253. shr \$2,%r10 # did 130-bit value overflow?
  1254. cmovnz %r8,%rax
  1255. cmovnz %r9,%rcx
  1256. add 0($nonce),%rax # accumulate nonce
  1257. adc 8($nonce),%rcx
  1258. mov %rax,0($mac) # write result
  1259. mov %rcx,8($mac)
  1260. ret
  1261. .cfi_endproc
  1262. .size poly1305_emit_avx,.-poly1305_emit_avx
  1263. ___
  1264. if ($avx>1) {
  1265. my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
  1266. map("%ymm$_",(0..15));
  1267. my $S4=$MASK;
  1268. $code.=<<___;
  1269. .type poly1305_blocks_avx2,\@function,4
  1270. .align 32
  1271. poly1305_blocks_avx2:
  1272. .cfi_startproc
  1273. mov 20($ctx),%r8d # is_base2_26
  1274. cmp \$128,$len
  1275. jae .Lblocks_avx2
  1276. test %r8d,%r8d
  1277. jz .Lblocks
  1278. .Lblocks_avx2:
  1279. and \$-16,$len
  1280. jz .Lno_data_avx2
  1281. vzeroupper
  1282. test %r8d,%r8d
  1283. jz .Lbase2_64_avx2
  1284. test \$63,$len
  1285. jz .Leven_avx2
  1286. push %rbx
  1287. .cfi_push %rbx
  1288. push %rbp
  1289. .cfi_push %rbp
  1290. push %r12
  1291. .cfi_push %r12
  1292. push %r13
  1293. .cfi_push %r13
  1294. push %r14
  1295. .cfi_push %r14
  1296. push %r15
  1297. .cfi_push %r15
  1298. .Lblocks_avx2_body:
  1299. mov $len,%r15 # reassign $len
  1300. mov 0($ctx),$d1 # load hash value
  1301. mov 8($ctx),$d2
  1302. mov 16($ctx),$h2#d
  1303. mov 24($ctx),$r0 # load r
  1304. mov 32($ctx),$s1
  1305. ################################# base 2^26 -> base 2^64
  1306. mov $d1#d,$h0#d
  1307. and \$`-1*(1<<31)`,$d1
  1308. mov $d2,$r1 # borrow $r1
  1309. mov $d2#d,$h1#d
  1310. and \$`-1*(1<<31)`,$d2
  1311. shr \$6,$d1
  1312. shl \$52,$r1
  1313. add $d1,$h0
  1314. shr \$12,$h1
  1315. shr \$18,$d2
  1316. add $r1,$h0
  1317. adc $d2,$h1
  1318. mov $h2,$d1
  1319. shl \$40,$d1
  1320. shr \$24,$h2
  1321. add $d1,$h1
  1322. adc \$0,$h2 # can be partially reduced...
  1323. mov \$-4,$d2 # ... so reduce
  1324. mov $h2,$d1
  1325. and $h2,$d2
  1326. shr \$2,$d1
  1327. and \$3,$h2
  1328. add $d2,$d1 # =*5
  1329. add $d1,$h0
  1330. adc \$0,$h1
  1331. adc \$0,$h2
  1332. mov $s1,$r1
  1333. mov $s1,%rax
  1334. shr \$2,$s1
  1335. add $r1,$s1 # s1 = r1 + (r1 >> 2)
  1336. .Lbase2_26_pre_avx2:
  1337. add 0($inp),$h0 # accumulate input
  1338. adc 8($inp),$h1
  1339. lea 16($inp),$inp
  1340. adc $padbit,$h2
  1341. sub \$16,%r15
  1342. call __poly1305_block
  1343. mov $r1,%rax
  1344. test \$63,%r15
  1345. jnz .Lbase2_26_pre_avx2
  1346. test $padbit,$padbit # if $padbit is zero,
  1347. jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
  1348. ################################# base 2^64 -> base 2^26
  1349. mov $h0,%rax
  1350. mov $h0,%rdx
  1351. shr \$52,$h0
  1352. mov $h1,$r0
  1353. mov $h1,$r1
  1354. shr \$26,%rdx
  1355. and \$0x3ffffff,%rax # h[0]
  1356. shl \$12,$r0
  1357. and \$0x3ffffff,%rdx # h[1]
  1358. shr \$14,$h1
  1359. or $r0,$h0
  1360. shl \$24,$h2
  1361. and \$0x3ffffff,$h0 # h[2]
  1362. shr \$40,$r1
  1363. and \$0x3ffffff,$h1 # h[3]
  1364. or $r1,$h2 # h[4]
  1365. test %r15,%r15
  1366. jz .Lstore_base2_26_avx2
  1367. vmovd %rax#d,%x#$H0
  1368. vmovd %rdx#d,%x#$H1
  1369. vmovd $h0#d,%x#$H2
  1370. vmovd $h1#d,%x#$H3
  1371. vmovd $h2#d,%x#$H4
  1372. jmp .Lproceed_avx2
  1373. .align 32
  1374. .Lstore_base2_64_avx2:
  1375. mov $h0,0($ctx)
  1376. mov $h1,8($ctx)
  1377. mov $h2,16($ctx) # note that is_base2_26 is zeroed
  1378. jmp .Ldone_avx2
  1379. .align 16
  1380. .Lstore_base2_26_avx2:
  1381. mov %rax#d,0($ctx) # store hash value base 2^26
  1382. mov %rdx#d,4($ctx)
  1383. mov $h0#d,8($ctx)
  1384. mov $h1#d,12($ctx)
  1385. mov $h2#d,16($ctx)
  1386. .align 16
  1387. .Ldone_avx2:
  1388. mov 0(%rsp),%r15
  1389. .cfi_restore %r15
  1390. mov 8(%rsp),%r14
  1391. .cfi_restore %r14
  1392. mov 16(%rsp),%r13
  1393. .cfi_restore %r13
  1394. mov 24(%rsp),%r12
  1395. .cfi_restore %r12
  1396. mov 32(%rsp),%rbp
  1397. .cfi_restore %rbp
  1398. mov 40(%rsp),%rbx
  1399. .cfi_restore %rbx
  1400. lea 48(%rsp),%rsp
  1401. .cfi_adjust_cfa_offset -48
  1402. .Lno_data_avx2:
  1403. .Lblocks_avx2_epilogue:
  1404. ret
  1405. .cfi_endproc
  1406. .align 32
  1407. .Lbase2_64_avx2:
  1408. .cfi_startproc
  1409. push %rbx
  1410. .cfi_push %rbx
  1411. push %rbp
  1412. .cfi_push %rbp
  1413. push %r12
  1414. .cfi_push %r12
  1415. push %r13
  1416. .cfi_push %r13
  1417. push %r14
  1418. .cfi_push %r14
  1419. push %r15
  1420. .cfi_push %r15
  1421. .Lbase2_64_avx2_body:
  1422. mov $len,%r15 # reassign $len
  1423. mov 24($ctx),$r0 # load r
  1424. mov 32($ctx),$s1
  1425. mov 0($ctx),$h0 # load hash value
  1426. mov 8($ctx),$h1
  1427. mov 16($ctx),$h2#d
  1428. mov $s1,$r1
  1429. mov $s1,%rax
  1430. shr \$2,$s1
  1431. add $r1,$s1 # s1 = r1 + (r1 >> 2)
  1432. test \$63,$len
  1433. jz .Linit_avx2
  1434. .Lbase2_64_pre_avx2:
  1435. add 0($inp),$h0 # accumulate input
  1436. adc 8($inp),$h1
  1437. lea 16($inp),$inp
  1438. adc $padbit,$h2
  1439. sub \$16,%r15
  1440. call __poly1305_block
  1441. mov $r1,%rax
  1442. test \$63,%r15
  1443. jnz .Lbase2_64_pre_avx2
  1444. .Linit_avx2:
  1445. ################################# base 2^64 -> base 2^26
  1446. mov $h0,%rax
  1447. mov $h0,%rdx
  1448. shr \$52,$h0
  1449. mov $h1,$d1
  1450. mov $h1,$d2
  1451. shr \$26,%rdx
  1452. and \$0x3ffffff,%rax # h[0]
  1453. shl \$12,$d1
  1454. and \$0x3ffffff,%rdx # h[1]
  1455. shr \$14,$h1
  1456. or $d1,$h0
  1457. shl \$24,$h2
  1458. and \$0x3ffffff,$h0 # h[2]
  1459. shr \$40,$d2
  1460. and \$0x3ffffff,$h1 # h[3]
  1461. or $d2,$h2 # h[4]
  1462. vmovd %rax#d,%x#$H0
  1463. vmovd %rdx#d,%x#$H1
  1464. vmovd $h0#d,%x#$H2
  1465. vmovd $h1#d,%x#$H3
  1466. vmovd $h2#d,%x#$H4
  1467. movl \$1,20($ctx) # set is_base2_26
  1468. call __poly1305_init_avx
  1469. .Lproceed_avx2:
  1470. mov %r15,$len # restore $len
  1471. mov OPENSSL_ia32cap_P+8(%rip),%r10d
  1472. mov \$`(1<<31|1<<30|1<<16)`,%r11d
  1473. mov 0(%rsp),%r15
  1474. .cfi_restore %r15
  1475. mov 8(%rsp),%r14
  1476. .cfi_restore %r14
  1477. mov 16(%rsp),%r13
  1478. .cfi_restore %r13
  1479. mov 24(%rsp),%r12
  1480. .cfi_restore %r12
  1481. mov 32(%rsp),%rbp
  1482. .cfi_restore %rbp
  1483. mov 40(%rsp),%rbx
  1484. .cfi_restore %rbx
  1485. lea 48(%rsp),%rax
  1486. lea 48(%rsp),%rsp
  1487. .cfi_adjust_cfa_offset -48
  1488. .Lbase2_64_avx2_epilogue:
  1489. jmp .Ldo_avx2
  1490. .cfi_endproc
  1491. .align 32
  1492. .Leven_avx2:
  1493. .cfi_startproc
  1494. mov OPENSSL_ia32cap_P+8(%rip),%r10d
  1495. vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
  1496. vmovd 4*1($ctx),%x#$H1
  1497. vmovd 4*2($ctx),%x#$H2
  1498. vmovd 4*3($ctx),%x#$H3
  1499. vmovd 4*4($ctx),%x#$H4
  1500. .Ldo_avx2:
  1501. ___
  1502. $code.=<<___ if ($avx>2);
  1503. cmp \$512,$len
  1504. jb .Lskip_avx512
  1505. and %r11d,%r10d
  1506. test \$`1<<16`,%r10d # check for AVX512F
  1507. jnz .Lblocks_avx512
  1508. .Lskip_avx512:
  1509. ___
  1510. $code.=<<___ if (!$win64);
  1511. lea -8(%rsp),%r11
  1512. .cfi_def_cfa %r11,16
  1513. sub \$0x128,%rsp
  1514. ___
  1515. $code.=<<___ if ($win64);
  1516. lea -0xf8(%rsp),%r11
  1517. sub \$0x1c8,%rsp
  1518. vmovdqa %xmm6,0x50(%r11)
  1519. vmovdqa %xmm7,0x60(%r11)
  1520. vmovdqa %xmm8,0x70(%r11)
  1521. vmovdqa %xmm9,0x80(%r11)
  1522. vmovdqa %xmm10,0x90(%r11)
  1523. vmovdqa %xmm11,0xa0(%r11)
  1524. vmovdqa %xmm12,0xb0(%r11)
  1525. vmovdqa %xmm13,0xc0(%r11)
  1526. vmovdqa %xmm14,0xd0(%r11)
  1527. vmovdqa %xmm15,0xe0(%r11)
  1528. .Ldo_avx2_body:
  1529. ___
  1530. $code.=<<___;
  1531. lea .Lconst(%rip),%rcx
  1532. lea 48+64($ctx),$ctx # size optimization
  1533. vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
  1534. # expand and copy pre-calculated table to stack
  1535. vmovdqu `16*0-64`($ctx),%x#$T2
  1536. and \$-512,%rsp
  1537. vmovdqu `16*1-64`($ctx),%x#$T3
  1538. vmovdqu `16*2-64`($ctx),%x#$T4
  1539. vmovdqu `16*3-64`($ctx),%x#$D0
  1540. vmovdqu `16*4-64`($ctx),%x#$D1
  1541. vmovdqu `16*5-64`($ctx),%x#$D2
  1542. lea 0x90(%rsp),%rax # size optimization
  1543. vmovdqu `16*6-64`($ctx),%x#$D3
  1544. vpermd $T2,$T0,$T2 # 00003412 -> 14243444
  1545. vmovdqu `16*7-64`($ctx),%x#$D4
  1546. vpermd $T3,$T0,$T3
  1547. vmovdqu `16*8-64`($ctx),%x#$MASK
  1548. vpermd $T4,$T0,$T4
  1549. vmovdqa $T2,0x00(%rsp)
  1550. vpermd $D0,$T0,$D0
  1551. vmovdqa $T3,0x20-0x90(%rax)
  1552. vpermd $D1,$T0,$D1
  1553. vmovdqa $T4,0x40-0x90(%rax)
  1554. vpermd $D2,$T0,$D2
  1555. vmovdqa $D0,0x60-0x90(%rax)
  1556. vpermd $D3,$T0,$D3
  1557. vmovdqa $D1,0x80-0x90(%rax)
  1558. vpermd $D4,$T0,$D4
  1559. vmovdqa $D2,0xa0-0x90(%rax)
  1560. vpermd $MASK,$T0,$MASK
  1561. vmovdqa $D3,0xc0-0x90(%rax)
  1562. vmovdqa $D4,0xe0-0x90(%rax)
  1563. vmovdqa $MASK,0x100-0x90(%rax)
  1564. vmovdqa 64(%rcx),$MASK # .Lmask26
  1565. ################################################################
  1566. # load input
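# (Illustration only: per 16-byte block with little-endian 64-bit halves
#  lo/hi, the five 26-bit limbs extracted below are, with M = 2^26-1:
#	t0 =  lo               & M;	t1 = (lo >> 26) & M;
#	t2 = (lo>>52 | hi<<12) & M;	t3 = (hi >> 14) & M;
#	t4 = (hi >> 40) | 1<<24;	/* bit 128, the padbit */
#  the vpsrldq/vpunpck sequence just does this for four blocks at once.)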
  1567. vmovdqu 16*0($inp),%x#$T0
  1568. vmovdqu 16*1($inp),%x#$T1
  1569. vinserti128 \$1,16*2($inp),$T0,$T0
  1570. vinserti128 \$1,16*3($inp),$T1,$T1
  1571. lea 16*4($inp),$inp
  1572. vpsrldq \$6,$T0,$T2 # splat input
  1573. vpsrldq \$6,$T1,$T3
  1574. vpunpckhqdq $T1,$T0,$T4 # 4
  1575. vpunpcklqdq $T3,$T2,$T2 # 2:3
  1576. vpunpcklqdq $T1,$T0,$T0 # 0:1
  1577. vpsrlq \$30,$T2,$T3
  1578. vpsrlq \$4,$T2,$T2
  1579. vpsrlq \$26,$T0,$T1
  1580. vpsrlq \$40,$T4,$T4 # 4
  1581. vpand $MASK,$T2,$T2 # 2
  1582. vpand $MASK,$T0,$T0 # 0
  1583. vpand $MASK,$T1,$T1 # 1
  1584. vpand $MASK,$T3,$T3 # 3
  1585. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  1586. vpaddq $H2,$T2,$H2 # accumulate input
  1587. sub \$64,$len
  1588. jz .Ltail_avx2
  1589. jmp .Loop_avx2
  1590. .align 32
  1591. .Loop_avx2:
  1592. ################################################################
  1593. # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
  1594. # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
  1595. # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
  1596. # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
  1597. # \________/\__________/
  1598. ################################################################
  1599. #vpaddq $H2,$T2,$H2 # accumulate input
  1600. vpaddq $H0,$T0,$H0
  1601. vmovdqa `32*0`(%rsp),$T0 # r0^4
  1602. vpaddq $H1,$T1,$H1
  1603. vmovdqa `32*1`(%rsp),$T1 # r1^4
  1604. vpaddq $H3,$T3,$H3
  1605. vmovdqa `32*3`(%rsp),$T2 # r2^4
  1606. vpaddq $H4,$T4,$H4
  1607. vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
  1608. vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
  1609. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  1610. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  1611. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  1612. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  1613. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  1614. #
1615. # however, as h2 is "chronologically" the first one available, pull the
1616. # corresponding operations up, so it's
  1617. #
  1618. # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
  1619. # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
  1620. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  1621. # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
  1622. # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
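# (The s terms above are 5*r: e.g. the h4*r4 partial product carries weight
#  2^(4*26) * 2^(4*26) = 2^78 * 2^130, which reduces to 5*2^78 mod p, i.e.
#  the h4*5*r4 contribution to d3 listed above.)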
  1623. vpmuludq $H2,$T0,$D2 # d2 = h2*r0
  1624. vpmuludq $H2,$T1,$D3 # d3 = h2*r1
  1625. vpmuludq $H2,$T2,$D4 # d4 = h2*r2
  1626. vpmuludq $H2,$T3,$D0 # d0 = h2*s3
  1627. vpmuludq $H2,$S4,$D1 # d1 = h2*s4
  1628. vpmuludq $H0,$T1,$T4 # h0*r1
  1629. vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
  1630. vpaddq $T4,$D1,$D1 # d1 += h0*r1
  1631. vpaddq $H2,$D2,$D2 # d2 += h1*r1
  1632. vpmuludq $H3,$T1,$T4 # h3*r1
  1633. vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
  1634. vpaddq $T4,$D4,$D4 # d4 += h3*r1
  1635. vpaddq $H2,$D0,$D0 # d0 += h4*s1
  1636. vmovdqa `32*4-0x90`(%rax),$T1 # s2
  1637. vpmuludq $H0,$T0,$T4 # h0*r0
  1638. vpmuludq $H1,$T0,$H2 # h1*r0
  1639. vpaddq $T4,$D0,$D0 # d0 += h0*r0
  1640. vpaddq $H2,$D1,$D1 # d1 += h1*r0
  1641. vpmuludq $H3,$T0,$T4 # h3*r0
  1642. vpmuludq $H4,$T0,$H2 # h4*r0
  1643. vmovdqu 16*0($inp),%x#$T0 # load input
  1644. vpaddq $T4,$D3,$D3 # d3 += h3*r0
  1645. vpaddq $H2,$D4,$D4 # d4 += h4*r0
  1646. vinserti128 \$1,16*2($inp),$T0,$T0
  1647. vpmuludq $H3,$T1,$T4 # h3*s2
  1648. vpmuludq $H4,$T1,$H2 # h4*s2
  1649. vmovdqu 16*1($inp),%x#$T1
  1650. vpaddq $T4,$D0,$D0 # d0 += h3*s2
  1651. vpaddq $H2,$D1,$D1 # d1 += h4*s2
  1652. vmovdqa `32*5-0x90`(%rax),$H2 # r3
  1653. vpmuludq $H1,$T2,$T4 # h1*r2
  1654. vpmuludq $H0,$T2,$T2 # h0*r2
  1655. vpaddq $T4,$D3,$D3 # d3 += h1*r2
  1656. vpaddq $T2,$D2,$D2 # d2 += h0*r2
  1657. vinserti128 \$1,16*3($inp),$T1,$T1
  1658. lea 16*4($inp),$inp
  1659. vpmuludq $H1,$H2,$T4 # h1*r3
  1660. vpmuludq $H0,$H2,$H2 # h0*r3
  1661. vpsrldq \$6,$T0,$T2 # splat input
  1662. vpaddq $T4,$D4,$D4 # d4 += h1*r3
  1663. vpaddq $H2,$D3,$D3 # d3 += h0*r3
  1664. vpmuludq $H3,$T3,$T4 # h3*s3
  1665. vpmuludq $H4,$T3,$H2 # h4*s3
  1666. vpsrldq \$6,$T1,$T3
  1667. vpaddq $T4,$D1,$D1 # d1 += h3*s3
  1668. vpaddq $H2,$D2,$D2 # d2 += h4*s3
  1669. vpunpckhqdq $T1,$T0,$T4 # 4
  1670. vpmuludq $H3,$S4,$H3 # h3*s4
  1671. vpmuludq $H4,$S4,$H4 # h4*s4
  1672. vpunpcklqdq $T1,$T0,$T0 # 0:1
  1673. vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
  1674. vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
  1675. vpunpcklqdq $T3,$T2,$T3 # 2:3
  1676. vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
  1677. vpmuludq $H1,$S4,$H0 # h1*s4
  1678. vmovdqa 64(%rcx),$MASK # .Lmask26
  1679. vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
  1680. vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
  1681. ################################################################
  1682. # lazy reduction (interleaved with tail of input splat)
  1683. vpsrlq \$26,$H3,$D3
  1684. vpand $MASK,$H3,$H3
  1685. vpaddq $D3,$H4,$H4 # h3 -> h4
  1686. vpsrlq \$26,$H0,$D0
  1687. vpand $MASK,$H0,$H0
  1688. vpaddq $D0,$D1,$H1 # h0 -> h1
  1689. vpsrlq \$26,$H4,$D4
  1690. vpand $MASK,$H4,$H4
  1691. vpsrlq \$4,$T3,$T2
  1692. vpsrlq \$26,$H1,$D1
  1693. vpand $MASK,$H1,$H1
  1694. vpaddq $D1,$H2,$H2 # h1 -> h2
  1695. vpaddq $D4,$H0,$H0
  1696. vpsllq \$2,$D4,$D4
  1697. vpaddq $D4,$H0,$H0 # h4 -> h0
  1698. vpand $MASK,$T2,$T2 # 2
  1699. vpsrlq \$26,$T0,$T1
  1700. vpsrlq \$26,$H2,$D2
  1701. vpand $MASK,$H2,$H2
  1702. vpaddq $D2,$H3,$H3 # h2 -> h3
  1703. vpaddq $T2,$H2,$H2 # modulo-scheduled
  1704. vpsrlq \$30,$T3,$T3
  1705. vpsrlq \$26,$H0,$D0
  1706. vpand $MASK,$H0,$H0
  1707. vpaddq $D0,$H1,$H1 # h0 -> h1
  1708. vpsrlq \$40,$T4,$T4 # 4
  1709. vpsrlq \$26,$H3,$D3
  1710. vpand $MASK,$H3,$H3
  1711. vpaddq $D3,$H4,$H4 # h3 -> h4
  1712. vpand $MASK,$T0,$T0 # 0
  1713. vpand $MASK,$T1,$T1 # 1
  1714. vpand $MASK,$T3,$T3 # 3
  1715. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  1716. sub \$64,$len
  1717. jnz .Loop_avx2
  1718. .byte 0x66,0x90
  1719. .Ltail_avx2:
  1720. ################################################################
1721. # while the above multiplications were by r^4 in all lanes, in the last
1722. # iteration we multiply the least significant lane by r^4 and the most
1723. # significant one by r, so this is a copy of the above except that
1724. # references to the precomputed table are displaced by 4...
  1725. #vpaddq $H2,$T2,$H2 # accumulate input
  1726. vpaddq $H0,$T0,$H0
  1727. vmovdqu `32*0+4`(%rsp),$T0 # r0^4
  1728. vpaddq $H1,$T1,$H1
  1729. vmovdqu `32*1+4`(%rsp),$T1 # r1^4
  1730. vpaddq $H3,$T3,$H3
  1731. vmovdqu `32*3+4`(%rsp),$T2 # r2^4
  1732. vpaddq $H4,$T4,$H4
  1733. vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
  1734. vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
  1735. vpmuludq $H2,$T0,$D2 # d2 = h2*r0
  1736. vpmuludq $H2,$T1,$D3 # d3 = h2*r1
  1737. vpmuludq $H2,$T2,$D4 # d4 = h2*r2
  1738. vpmuludq $H2,$T3,$D0 # d0 = h2*s3
  1739. vpmuludq $H2,$S4,$D1 # d1 = h2*s4
  1740. vpmuludq $H0,$T1,$T4 # h0*r1
  1741. vpmuludq $H1,$T1,$H2 # h1*r1
  1742. vpaddq $T4,$D1,$D1 # d1 += h0*r1
  1743. vpaddq $H2,$D2,$D2 # d2 += h1*r1
  1744. vpmuludq $H3,$T1,$T4 # h3*r1
  1745. vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
  1746. vpaddq $T4,$D4,$D4 # d4 += h3*r1
  1747. vpaddq $H2,$D0,$D0 # d0 += h4*s1
  1748. vpmuludq $H0,$T0,$T4 # h0*r0
  1749. vpmuludq $H1,$T0,$H2 # h1*r0
  1750. vpaddq $T4,$D0,$D0 # d0 += h0*r0
  1751. vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
  1752. vpaddq $H2,$D1,$D1 # d1 += h1*r0
  1753. vpmuludq $H3,$T0,$T4 # h3*r0
  1754. vpmuludq $H4,$T0,$H2 # h4*r0
  1755. vpaddq $T4,$D3,$D3 # d3 += h3*r0
  1756. vpaddq $H2,$D4,$D4 # d4 += h4*r0
  1757. vpmuludq $H3,$T1,$T4 # h3*s2
  1758. vpmuludq $H4,$T1,$H2 # h4*s2
  1759. vpaddq $T4,$D0,$D0 # d0 += h3*s2
  1760. vpaddq $H2,$D1,$D1 # d1 += h4*s2
  1761. vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
  1762. vpmuludq $H1,$T2,$T4 # h1*r2
  1763. vpmuludq $H0,$T2,$T2 # h0*r2
  1764. vpaddq $T4,$D3,$D3 # d3 += h1*r2
  1765. vpaddq $T2,$D2,$D2 # d2 += h0*r2
  1766. vpmuludq $H1,$H2,$T4 # h1*r3
  1767. vpmuludq $H0,$H2,$H2 # h0*r3
  1768. vpaddq $T4,$D4,$D4 # d4 += h1*r3
  1769. vpaddq $H2,$D3,$D3 # d3 += h0*r3
  1770. vpmuludq $H3,$T3,$T4 # h3*s3
  1771. vpmuludq $H4,$T3,$H2 # h4*s3
  1772. vpaddq $T4,$D1,$D1 # d1 += h3*s3
  1773. vpaddq $H2,$D2,$D2 # d2 += h4*s3
  1774. vpmuludq $H3,$S4,$H3 # h3*s4
  1775. vpmuludq $H4,$S4,$H4 # h4*s4
  1776. vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
  1777. vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
  1778. vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
  1779. vpmuludq $H1,$S4,$H0 # h1*s4
  1780. vmovdqa 64(%rcx),$MASK # .Lmask26
  1781. vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
  1782. vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
  1783. ################################################################
  1784. # horizontal addition
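# (Each accumulator holds four 64-bit partial sums, one per 64-bit lane;
#  they are folded as (lane0+lane1)+(lane2+lane3): the byte shifts add
#  lane1/lane3 into lane0/lane2 within each 128-bit half, then vpermq
#  brings the upper half's sum down so lane0 ends up with the total.)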
  1785. vpsrldq \$8,$D1,$T1
  1786. vpsrldq \$8,$H2,$T2
  1787. vpsrldq \$8,$H3,$T3
  1788. vpsrldq \$8,$H4,$T4
  1789. vpsrldq \$8,$H0,$T0
  1790. vpaddq $T1,$D1,$D1
  1791. vpaddq $T2,$H2,$H2
  1792. vpaddq $T3,$H3,$H3
  1793. vpaddq $T4,$H4,$H4
  1794. vpaddq $T0,$H0,$H0
  1795. vpermq \$0x2,$H3,$T3
  1796. vpermq \$0x2,$H4,$T4
  1797. vpermq \$0x2,$H0,$T0
  1798. vpermq \$0x2,$D1,$T1
  1799. vpermq \$0x2,$H2,$T2
  1800. vpaddq $T3,$H3,$H3
  1801. vpaddq $T4,$H4,$H4
  1802. vpaddq $T0,$H0,$H0
  1803. vpaddq $T1,$D1,$D1
  1804. vpaddq $T2,$H2,$H2
  1805. ################################################################
  1806. # lazy reduction
  1807. vpsrlq \$26,$H3,$D3
  1808. vpand $MASK,$H3,$H3
  1809. vpaddq $D3,$H4,$H4 # h3 -> h4
  1810. vpsrlq \$26,$H0,$D0
  1811. vpand $MASK,$H0,$H0
  1812. vpaddq $D0,$D1,$H1 # h0 -> h1
  1813. vpsrlq \$26,$H4,$D4
  1814. vpand $MASK,$H4,$H4
  1815. vpsrlq \$26,$H1,$D1
  1816. vpand $MASK,$H1,$H1
  1817. vpaddq $D1,$H2,$H2 # h1 -> h2
  1818. vpaddq $D4,$H0,$H0
  1819. vpsllq \$2,$D4,$D4
  1820. vpaddq $D4,$H0,$H0 # h4 -> h0
  1821. vpsrlq \$26,$H2,$D2
  1822. vpand $MASK,$H2,$H2
  1823. vpaddq $D2,$H3,$H3 # h2 -> h3
  1824. vpsrlq \$26,$H0,$D0
  1825. vpand $MASK,$H0,$H0
  1826. vpaddq $D0,$H1,$H1 # h0 -> h1
  1827. vpsrlq \$26,$H3,$D3
  1828. vpand $MASK,$H3,$H3
  1829. vpaddq $D3,$H4,$H4 # h3 -> h4
  1830. vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
  1831. vmovd %x#$H1,`4*1-48-64`($ctx)
  1832. vmovd %x#$H2,`4*2-48-64`($ctx)
  1833. vmovd %x#$H3,`4*3-48-64`($ctx)
  1834. vmovd %x#$H4,`4*4-48-64`($ctx)
  1835. ___
  1836. $code.=<<___ if ($win64);
  1837. vmovdqa 0x50(%r11),%xmm6
  1838. vmovdqa 0x60(%r11),%xmm7
  1839. vmovdqa 0x70(%r11),%xmm8
  1840. vmovdqa 0x80(%r11),%xmm9
  1841. vmovdqa 0x90(%r11),%xmm10
  1842. vmovdqa 0xa0(%r11),%xmm11
  1843. vmovdqa 0xb0(%r11),%xmm12
  1844. vmovdqa 0xc0(%r11),%xmm13
  1845. vmovdqa 0xd0(%r11),%xmm14
  1846. vmovdqa 0xe0(%r11),%xmm15
  1847. lea 0xf8(%r11),%rsp
  1848. .Ldo_avx2_epilogue:
  1849. ___
  1850. $code.=<<___ if (!$win64);
  1851. lea 8(%r11),%rsp
  1852. .cfi_def_cfa %rsp,8
  1853. ___
  1854. $code.=<<___;
  1855. vzeroupper
  1856. ret
  1857. .cfi_endproc
  1858. .size poly1305_blocks_avx2,.-poly1305_blocks_avx2
  1859. ___
  1860. #######################################################################
  1861. if ($avx>2) {
1862. # On entry we have input length divisible by 64. But since the inner loop
1863. # processes 128 bytes per iteration, cases when the length is not divisible
1864. # by 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
1865. # reason the stack layout is kept identical to poly1305_blocks_avx2. If not
1866. # for this tail, we wouldn't even have to allocate a stack frame...
  1867. my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
  1868. my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
  1869. my $PADBIT="%zmm30";
  1870. map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
  1871. map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
  1872. map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
  1873. map(s/%y/%z/,($MASK));
  1874. $code.=<<___;
  1875. .type poly1305_blocks_avx512,\@function,4
  1876. .align 32
  1877. poly1305_blocks_avx512:
  1878. .cfi_startproc
  1879. .Lblocks_avx512:
  1880. mov \$15,%eax
  1881. kmovw %eax,%k2
  1882. ___
  1883. $code.=<<___ if (!$win64);
  1884. lea -8(%rsp),%r11
  1885. .cfi_def_cfa %r11,16
  1886. sub \$0x128,%rsp
  1887. ___
  1888. $code.=<<___ if ($win64);
  1889. lea -0xf8(%rsp),%r11
  1890. sub \$0x1c8,%rsp
  1891. vmovdqa %xmm6,0x50(%r11)
  1892. vmovdqa %xmm7,0x60(%r11)
  1893. vmovdqa %xmm8,0x70(%r11)
  1894. vmovdqa %xmm9,0x80(%r11)
  1895. vmovdqa %xmm10,0x90(%r11)
  1896. vmovdqa %xmm11,0xa0(%r11)
  1897. vmovdqa %xmm12,0xb0(%r11)
  1898. vmovdqa %xmm13,0xc0(%r11)
  1899. vmovdqa %xmm14,0xd0(%r11)
  1900. vmovdqa %xmm15,0xe0(%r11)
  1901. .Ldo_avx512_body:
  1902. ___
  1903. $code.=<<___;
  1904. lea .Lconst(%rip),%rcx
  1905. lea 48+64($ctx),$ctx # size optimization
  1906. vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
  1907. # expand pre-calculated table
  1908. vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
  1909. and \$-512,%rsp
  1910. vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
  1911. mov \$0x20,%rax
  1912. vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
  1913. vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
  1914. vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
  1915. vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
  1916. vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
  1917. vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
  1918. vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
  1919. vpermd $D0,$T2,$R0 # 00003412 -> 14243444
  1920. vpbroadcastq 64(%rcx),$MASK # .Lmask26
  1921. vpermd $D1,$T2,$R1
  1922. vpermd $T0,$T2,$S1
  1923. vpermd $D2,$T2,$R2
  1924. vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
  1925. vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
  1926. vpermd $T1,$T2,$S2
  1927. vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
  1928. vpsrlq \$32,$R1,$T1
  1929. vpermd $D3,$T2,$R3
  1930. vmovdqa64 $S1,0x40(%rsp){%k2}
  1931. vpermd $T3,$T2,$S3
  1932. vpermd $D4,$T2,$R4
  1933. vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
  1934. vpermd $T4,$T2,$S4
  1935. vmovdqa64 $S2,0x80(%rsp){%k2}
  1936. vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
  1937. vmovdqa64 $S3,0xc0(%rsp){%k2}
  1938. vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
  1939. vmovdqa64 $S4,0x100(%rsp){%k2}
  1940. ################################################################
  1941. # calculate 5th through 8th powers of the key
  1942. #
  1943. # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
  1944. # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
  1945. # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
  1946. # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
  1947. # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
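# (r0'..r4' above denote the limbs of r^4, taken from the upper 32-bit
#  halves of $R0..$R4 by the 32-bit right shifts interleaved below, while
#  the lower halves hold r^1..r^4; lane k of d therefore comes out as
#  r^4 * r^k = r^(4+k), i.e. the 05060708 powers referred to further down.)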
  1948. vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
  1949. vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
  1950. vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
  1951. vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
  1952. vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
  1953. vpsrlq \$32,$R2,$T2
  1954. vpmuludq $T1,$S4,$M0
  1955. vpmuludq $T1,$R0,$M1
  1956. vpmuludq $T1,$R1,$M2
  1957. vpmuludq $T1,$R2,$M3
  1958. vpmuludq $T1,$R3,$M4
  1959. vpsrlq \$32,$R3,$T3
  1960. vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
  1961. vpaddq $M1,$D1,$D1 # d1 += r1'*r0
  1962. vpaddq $M2,$D2,$D2 # d2 += r1'*r1
  1963. vpaddq $M3,$D3,$D3 # d3 += r1'*r2
  1964. vpaddq $M4,$D4,$D4 # d4 += r1'*r3
  1965. vpmuludq $T2,$S3,$M0
  1966. vpmuludq $T2,$S4,$M1
  1967. vpmuludq $T2,$R1,$M3
  1968. vpmuludq $T2,$R2,$M4
  1969. vpmuludq $T2,$R0,$M2
  1970. vpsrlq \$32,$R4,$T4
  1971. vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
  1972. vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
  1973. vpaddq $M3,$D3,$D3 # d3 += r2'*r1
  1974. vpaddq $M4,$D4,$D4 # d4 += r2'*r2
  1975. vpaddq $M2,$D2,$D2 # d2 += r2'*r0
  1976. vpmuludq $T3,$S2,$M0
  1977. vpmuludq $T3,$R0,$M3
  1978. vpmuludq $T3,$R1,$M4
  1979. vpmuludq $T3,$S3,$M1
  1980. vpmuludq $T3,$S4,$M2
  1981. vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
  1982. vpaddq $M3,$D3,$D3 # d3 += r3'*r0
  1983. vpaddq $M4,$D4,$D4 # d4 += r3'*r1
  1984. vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
  1985. vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
  1986. vpmuludq $T4,$S4,$M3
  1987. vpmuludq $T4,$R0,$M4
  1988. vpmuludq $T4,$S1,$M0
  1989. vpmuludq $T4,$S2,$M1
  1990. vpmuludq $T4,$S3,$M2
1991. vpaddq $M3,$D3,$D3 # d3 += r4'*5*r4
1992. vpaddq $M4,$D4,$D4 # d4 += r4'*r0
1993. vpaddq $M0,$D0,$D0 # d0 += r4'*5*r1
1994. vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2
1995. vpaddq $M2,$D2,$D2 # d2 += r4'*5*r3
  1996. ################################################################
  1997. # load input
  1998. vmovdqu64 16*0($inp),%z#$T3
  1999. vmovdqu64 16*4($inp),%z#$T4
  2000. lea 16*8($inp),$inp
  2001. ################################################################
  2002. # lazy reduction
  2003. vpsrlq \$26,$D3,$M3
  2004. vpandq $MASK,$D3,$D3
  2005. vpaddq $M3,$D4,$D4 # d3 -> d4
  2006. vpsrlq \$26,$D0,$M0
  2007. vpandq $MASK,$D0,$D0
  2008. vpaddq $M0,$D1,$D1 # d0 -> d1
  2009. vpsrlq \$26,$D4,$M4
  2010. vpandq $MASK,$D4,$D4
  2011. vpsrlq \$26,$D1,$M1
  2012. vpandq $MASK,$D1,$D1
  2013. vpaddq $M1,$D2,$D2 # d1 -> d2
  2014. vpaddq $M4,$D0,$D0
  2015. vpsllq \$2,$M4,$M4
  2016. vpaddq $M4,$D0,$D0 # d4 -> d0
  2017. vpsrlq \$26,$D2,$M2
  2018. vpandq $MASK,$D2,$D2
  2019. vpaddq $M2,$D3,$D3 # d2 -> d3
  2020. vpsrlq \$26,$D0,$M0
  2021. vpandq $MASK,$D0,$D0
  2022. vpaddq $M0,$D1,$D1 # d0 -> d1
  2023. vpsrlq \$26,$D3,$M3
  2024. vpandq $MASK,$D3,$D3
  2025. vpaddq $M3,$D4,$D4 # d3 -> d4
  2026. ################################################################
  2027. # at this point we have 14243444 in $R0-$S4 and 05060708 in
  2028. # $D0-$D4, ...
  2029. vpunpcklqdq $T4,$T3,$T0 # transpose input
  2030. vpunpckhqdq $T4,$T3,$T4
  2031. # ... since input 64-bit lanes are ordered as 73625140, we could
  2032. # "vperm" it to 76543210 (here and in each loop iteration), *or*
  2033. # we could just flow along, hence the goal for $R0-$S4 is
  2034. # 1858286838784888 ...
  2035. vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
  2036. mov \$0x7777,%eax
  2037. kmovw %eax,%k1
  2038. vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
  2039. vpermd $R1,$M0,$R1
  2040. vpermd $R2,$M0,$R2
  2041. vpermd $R3,$M0,$R3
  2042. vpermd $R4,$M0,$R4
  2043. vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
  2044. vpermd $D1,$M0,${R1}{%k1}
  2045. vpermd $D2,$M0,${R2}{%k1}
  2046. vpermd $D3,$M0,${R3}{%k1}
  2047. vpermd $D4,$M0,${R4}{%k1}
  2048. vpslld \$2,$R1,$S1 # *5
  2049. vpslld \$2,$R2,$S2
  2050. vpslld \$2,$R3,$S3
  2051. vpslld \$2,$R4,$S4
  2052. vpaddd $R1,$S1,$S1
  2053. vpaddd $R2,$S2,$S2
  2054. vpaddd $R3,$S3,$S3
  2055. vpaddd $R4,$S4,$S4
  2056. vpbroadcastq 32(%rcx),$PADBIT # .L129
  2057. vpsrlq \$52,$T0,$T2 # splat input
  2058. vpsllq \$12,$T4,$T3
  2059. vporq $T3,$T2,$T2
  2060. vpsrlq \$26,$T0,$T1
  2061. vpsrlq \$14,$T4,$T3
  2062. vpsrlq \$40,$T4,$T4 # 4
  2063. vpandq $MASK,$T2,$T2 # 2
  2064. vpandq $MASK,$T0,$T0 # 0
  2065. #vpandq $MASK,$T1,$T1 # 1
  2066. #vpandq $MASK,$T3,$T3 # 3
  2067. #vporq $PADBIT,$T4,$T4 # padbit, yes, always
  2068. vpaddq $H2,$T2,$H2 # accumulate input
  2069. sub \$192,$len
  2070. jbe .Ltail_avx512
  2071. jmp .Loop_avx512
  2072. .align 32
  2073. .Loop_avx512:
  2074. ################################################################
  2075. # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
  2076. # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
  2077. # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
  2078. # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
  2079. # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
  2080. # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
  2081. # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
  2082. # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
  2083. # \________/\___________/
  2084. ################################################################
  2085. #vpaddq $H2,$T2,$H2 # accumulate input
  2086. # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
  2087. # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
  2088. # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
  2089. # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
  2090. # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
  2091. #
2092. # however, as h2 is "chronologically" the first one available, pull the
2093. # corresponding operations up, so it's
  2094. #
  2095. # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
  2096. # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
  2097. # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
  2098. # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
  2099. # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
  2100. vpmuludq $H2,$R1,$D3 # d3 = h2*r1
  2101. vpaddq $H0,$T0,$H0
  2102. vpmuludq $H2,$R2,$D4 # d4 = h2*r2
  2103. vpandq $MASK,$T1,$T1 # 1
  2104. vpmuludq $H2,$S3,$D0 # d0 = h2*s3
  2105. vpandq $MASK,$T3,$T3 # 3
  2106. vpmuludq $H2,$S4,$D1 # d1 = h2*s4
  2107. vporq $PADBIT,$T4,$T4 # padbit, yes, always
  2108. vpmuludq $H2,$R0,$D2 # d2 = h2*r0
  2109. vpaddq $H1,$T1,$H1 # accumulate input
  2110. vpaddq $H3,$T3,$H3
  2111. vpaddq $H4,$T4,$H4
  2112. vmovdqu64 16*0($inp),$T3 # load input
  2113. vmovdqu64 16*4($inp),$T4
  2114. lea 16*8($inp),$inp
  2115. vpmuludq $H0,$R3,$M3
  2116. vpmuludq $H0,$R4,$M4
  2117. vpmuludq $H0,$R0,$M0
  2118. vpmuludq $H0,$R1,$M1
  2119. vpaddq $M3,$D3,$D3 # d3 += h0*r3
  2120. vpaddq $M4,$D4,$D4 # d4 += h0*r4
  2121. vpaddq $M0,$D0,$D0 # d0 += h0*r0
  2122. vpaddq $M1,$D1,$D1 # d1 += h0*r1
  2123. vpmuludq $H1,$R2,$M3
  2124. vpmuludq $H1,$R3,$M4
  2125. vpmuludq $H1,$S4,$M0
  2126. vpmuludq $H0,$R2,$M2
  2127. vpaddq $M3,$D3,$D3 # d3 += h1*r2
  2128. vpaddq $M4,$D4,$D4 # d4 += h1*r3
  2129. vpaddq $M0,$D0,$D0 # d0 += h1*s4
  2130. vpaddq $M2,$D2,$D2 # d2 += h0*r2
  2131. vpunpcklqdq $T4,$T3,$T0 # transpose input
  2132. vpunpckhqdq $T4,$T3,$T4
  2133. vpmuludq $H3,$R0,$M3
  2134. vpmuludq $H3,$R1,$M4
  2135. vpmuludq $H1,$R0,$M1
  2136. vpmuludq $H1,$R1,$M2
  2137. vpaddq $M3,$D3,$D3 # d3 += h3*r0
  2138. vpaddq $M4,$D4,$D4 # d4 += h3*r1
  2139. vpaddq $M1,$D1,$D1 # d1 += h1*r0
  2140. vpaddq $M2,$D2,$D2 # d2 += h1*r1
  2141. vpmuludq $H4,$S4,$M3
  2142. vpmuludq $H4,$R0,$M4
  2143. vpmuludq $H3,$S2,$M0
  2144. vpmuludq $H3,$S3,$M1
  2145. vpaddq $M3,$D3,$D3 # d3 += h4*s4
  2146. vpmuludq $H3,$S4,$M2
  2147. vpaddq $M4,$D4,$D4 # d4 += h4*r0
  2148. vpaddq $M0,$D0,$D0 # d0 += h3*s2
  2149. vpaddq $M1,$D1,$D1 # d1 += h3*s3
  2150. vpaddq $M2,$D2,$D2 # d2 += h3*s4
  2151. vpmuludq $H4,$S1,$M0
  2152. vpmuludq $H4,$S2,$M1
  2153. vpmuludq $H4,$S3,$M2
  2154. vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2155. vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2156. vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
  2157. ################################################################
  2158. # lazy reduction (interleaved with input splat)
  2159. vpsrlq \$52,$T0,$T2 # splat input
  2160. vpsllq \$12,$T4,$T3
  2161. vpsrlq \$26,$D3,$H3
  2162. vpandq $MASK,$D3,$D3
  2163. vpaddq $H3,$D4,$H4 # h3 -> h4
  2164. vporq $T3,$T2,$T2
  2165. vpsrlq \$26,$H0,$D0
  2166. vpandq $MASK,$H0,$H0
  2167. vpaddq $D0,$H1,$H1 # h0 -> h1
  2168. vpandq $MASK,$T2,$T2 # 2
  2169. vpsrlq \$26,$H4,$D4
  2170. vpandq $MASK,$H4,$H4
  2171. vpsrlq \$26,$H1,$D1
  2172. vpandq $MASK,$H1,$H1
  2173. vpaddq $D1,$H2,$H2 # h1 -> h2
  2174. vpaddq $D4,$H0,$H0
  2175. vpsllq \$2,$D4,$D4
  2176. vpaddq $D4,$H0,$H0 # h4 -> h0
  2177. vpaddq $T2,$H2,$H2 # modulo-scheduled
  2178. vpsrlq \$26,$T0,$T1
  2179. vpsrlq \$26,$H2,$D2
  2180. vpandq $MASK,$H2,$H2
  2181. vpaddq $D2,$D3,$H3 # h2 -> h3
  2182. vpsrlq \$14,$T4,$T3
  2183. vpsrlq \$26,$H0,$D0
  2184. vpandq $MASK,$H0,$H0
  2185. vpaddq $D0,$H1,$H1 # h0 -> h1
  2186. vpsrlq \$40,$T4,$T4 # 4
  2187. vpsrlq \$26,$H3,$D3
  2188. vpandq $MASK,$H3,$H3
  2189. vpaddq $D3,$H4,$H4 # h3 -> h4
  2190. vpandq $MASK,$T0,$T0 # 0
  2191. #vpandq $MASK,$T1,$T1 # 1
  2192. #vpandq $MASK,$T3,$T3 # 3
  2193. #vporq $PADBIT,$T4,$T4 # padbit, yes, always
  2194. sub \$128,$len
  2195. ja .Loop_avx512
  2196. .Ltail_avx512:
  2197. ################################################################
2198. # while the above multiplications were by r^8 in all lanes, in the last
2199. # iteration we multiply the least significant lane by r^8 and the most
2200. # significant one by r, that's why the table gets shifted...
  2201. vpsrlq \$32,$R0,$R0 # 0105020603070408
  2202. vpsrlq \$32,$R1,$R1
  2203. vpsrlq \$32,$R2,$R2
  2204. vpsrlq \$32,$S3,$S3
  2205. vpsrlq \$32,$S4,$S4
  2206. vpsrlq \$32,$R3,$R3
  2207. vpsrlq \$32,$R4,$R4
  2208. vpsrlq \$32,$S1,$S1
  2209. vpsrlq \$32,$S2,$S2
  2210. ################################################################
2211. # load either the next or the last 64 bytes of input
  2212. lea ($inp,$len),$inp
  2213. #vpaddq $H2,$T2,$H2 # accumulate input
  2214. vpaddq $H0,$T0,$H0
  2215. vpmuludq $H2,$R1,$D3 # d3 = h2*r1
  2216. vpmuludq $H2,$R2,$D4 # d4 = h2*r2
  2217. vpmuludq $H2,$S3,$D0 # d0 = h2*s3
  2218. vpandq $MASK,$T1,$T1 # 1
  2219. vpmuludq $H2,$S4,$D1 # d1 = h2*s4
  2220. vpandq $MASK,$T3,$T3 # 3
  2221. vpmuludq $H2,$R0,$D2 # d2 = h2*r0
  2222. vporq $PADBIT,$T4,$T4 # padbit, yes, always
  2223. vpaddq $H1,$T1,$H1 # accumulate input
  2224. vpaddq $H3,$T3,$H3
  2225. vpaddq $H4,$T4,$H4
  2226. vmovdqu 16*0($inp),%x#$T0
  2227. vpmuludq $H0,$R3,$M3
  2228. vpmuludq $H0,$R4,$M4
  2229. vpmuludq $H0,$R0,$M0
  2230. vpmuludq $H0,$R1,$M1
  2231. vpaddq $M3,$D3,$D3 # d3 += h0*r3
  2232. vpaddq $M4,$D4,$D4 # d4 += h0*r4
  2233. vpaddq $M0,$D0,$D0 # d0 += h0*r0
  2234. vpaddq $M1,$D1,$D1 # d1 += h0*r1
  2235. vmovdqu 16*1($inp),%x#$T1
  2236. vpmuludq $H1,$R2,$M3
  2237. vpmuludq $H1,$R3,$M4
  2238. vpmuludq $H1,$S4,$M0
  2239. vpmuludq $H0,$R2,$M2
  2240. vpaddq $M3,$D3,$D3 # d3 += h1*r2
  2241. vpaddq $M4,$D4,$D4 # d4 += h1*r3
  2242. vpaddq $M0,$D0,$D0 # d0 += h1*s4
  2243. vpaddq $M2,$D2,$D2 # d2 += h0*r2
  2244. vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
  2245. vpmuludq $H3,$R0,$M3
  2246. vpmuludq $H3,$R1,$M4
  2247. vpmuludq $H1,$R0,$M1
  2248. vpmuludq $H1,$R1,$M2
  2249. vpaddq $M3,$D3,$D3 # d3 += h3*r0
  2250. vpaddq $M4,$D4,$D4 # d4 += h3*r1
  2251. vpaddq $M1,$D1,$D1 # d1 += h1*r0
  2252. vpaddq $M2,$D2,$D2 # d2 += h1*r1
  2253. vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
  2254. vpmuludq $H4,$S4,$M3
  2255. vpmuludq $H4,$R0,$M4
  2256. vpmuludq $H3,$S2,$M0
  2257. vpmuludq $H3,$S3,$M1
  2258. vpmuludq $H3,$S4,$M2
  2259. vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
  2260. vpaddq $M4,$D4,$D4 # d4 += h4*r0
  2261. vpaddq $M0,$D0,$D0 # d0 += h3*s2
  2262. vpaddq $M1,$D1,$D1 # d1 += h3*s3
  2263. vpaddq $M2,$D2,$D2 # d2 += h3*s4
  2264. vpmuludq $H4,$S1,$M0
  2265. vpmuludq $H4,$S2,$M1
  2266. vpmuludq $H4,$S3,$M2
  2267. vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2268. vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2269. vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
  2270. ################################################################
  2271. # horizontal addition
  2272. mov \$1,%eax
  2273. vpermq \$0xb1,$H3,$D3
  2274. vpermq \$0xb1,$D4,$H4
  2275. vpermq \$0xb1,$H0,$D0
  2276. vpermq \$0xb1,$H1,$D1
  2277. vpermq \$0xb1,$H2,$D2
  2278. vpaddq $D3,$H3,$H3
  2279. vpaddq $D4,$H4,$H4
  2280. vpaddq $D0,$H0,$H0
  2281. vpaddq $D1,$H1,$H1
  2282. vpaddq $D2,$H2,$H2
  2283. kmovw %eax,%k3
  2284. vpermq \$0x2,$H3,$D3
  2285. vpermq \$0x2,$H4,$D4
  2286. vpermq \$0x2,$H0,$D0
  2287. vpermq \$0x2,$H1,$D1
  2288. vpermq \$0x2,$H2,$D2
  2289. vpaddq $D3,$H3,$H3
  2290. vpaddq $D4,$H4,$H4
  2291. vpaddq $D0,$H0,$H0
  2292. vpaddq $D1,$H1,$H1
  2293. vpaddq $D2,$H2,$H2
  2294. vextracti64x4 \$0x1,$H3,%y#$D3
  2295. vextracti64x4 \$0x1,$H4,%y#$D4
  2296. vextracti64x4 \$0x1,$H0,%y#$D0
  2297. vextracti64x4 \$0x1,$H1,%y#$D1
  2298. vextracti64x4 \$0x1,$H2,%y#$D2
  2299. vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
  2300. vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
  2301. vpaddq $D0,$H0,${H0}{%k3}{z}
  2302. vpaddq $D1,$H1,${H1}{%k3}{z}
  2303. vpaddq $D2,$H2,${H2}{%k3}{z}
  2304. ___
  2305. map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
  2306. map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
  2307. $code.=<<___;
  2308. ################################################################
  2309. # lazy reduction (interleaved with input splat)
  2310. vpsrlq \$26,$H3,$D3
  2311. vpand $MASK,$H3,$H3
  2312. vpsrldq \$6,$T0,$T2 # splat input
  2313. vpsrldq \$6,$T1,$T3
  2314. vpunpckhqdq $T1,$T0,$T4 # 4
  2315. vpaddq $D3,$H4,$H4 # h3 -> h4
  2316. vpsrlq \$26,$H0,$D0
  2317. vpand $MASK,$H0,$H0
  2318. vpunpcklqdq $T3,$T2,$T2 # 2:3
  2319. vpunpcklqdq $T1,$T0,$T0 # 0:1
  2320. vpaddq $D0,$H1,$H1 # h0 -> h1
  2321. vpsrlq \$26,$H4,$D4
  2322. vpand $MASK,$H4,$H4
  2323. vpsrlq \$26,$H1,$D1
  2324. vpand $MASK,$H1,$H1
  2325. vpsrlq \$30,$T2,$T3
  2326. vpsrlq \$4,$T2,$T2
  2327. vpaddq $D1,$H2,$H2 # h1 -> h2
  2328. vpaddq $D4,$H0,$H0
  2329. vpsllq \$2,$D4,$D4
  2330. vpsrlq \$26,$T0,$T1
  2331. vpsrlq \$40,$T4,$T4 # 4
  2332. vpaddq $D4,$H0,$H0 # h4 -> h0
  2333. vpsrlq \$26,$H2,$D2
  2334. vpand $MASK,$H2,$H2
  2335. vpand $MASK,$T2,$T2 # 2
  2336. vpand $MASK,$T0,$T0 # 0
  2337. vpaddq $D2,$H3,$H3 # h2 -> h3
  2338. vpsrlq \$26,$H0,$D0
  2339. vpand $MASK,$H0,$H0
  2340. vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
  2341. vpand $MASK,$T1,$T1 # 1
  2342. vpaddq $D0,$H1,$H1 # h0 -> h1
  2343. vpsrlq \$26,$H3,$D3
  2344. vpand $MASK,$H3,$H3
  2345. vpand $MASK,$T3,$T3 # 3
  2346. vpor 32(%rcx),$T4,$T4 # padbit, yes, always
  2347. vpaddq $D3,$H4,$H4 # h3 -> h4
  2348. lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
  2349. add \$64,$len
  2350. jnz .Ltail_avx2
  2351. vpsubq $T2,$H2,$H2 # undo input accumulation
  2352. vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
  2353. vmovd %x#$H1,`4*1-48-64`($ctx)
  2354. vmovd %x#$H2,`4*2-48-64`($ctx)
  2355. vmovd %x#$H3,`4*3-48-64`($ctx)
  2356. vmovd %x#$H4,`4*4-48-64`($ctx)
  2357. vzeroall
  2358. ___
  2359. $code.=<<___ if ($win64);
  2360. movdqa 0x50(%r11),%xmm6
  2361. movdqa 0x60(%r11),%xmm7
  2362. movdqa 0x70(%r11),%xmm8
  2363. movdqa 0x80(%r11),%xmm9
  2364. movdqa 0x90(%r11),%xmm10
  2365. movdqa 0xa0(%r11),%xmm11
  2366. movdqa 0xb0(%r11),%xmm12
  2367. movdqa 0xc0(%r11),%xmm13
  2368. movdqa 0xd0(%r11),%xmm14
  2369. movdqa 0xe0(%r11),%xmm15
  2370. lea 0xf8(%r11),%rsp
  2371. .Ldo_avx512_epilogue:
  2372. ___
  2373. $code.=<<___ if (!$win64);
  2374. lea 8(%r11),%rsp
  2375. .cfi_def_cfa %rsp,8
  2376. ___
  2377. $code.=<<___;
  2378. ret
  2379. .cfi_endproc
  2380. .size poly1305_blocks_avx512,.-poly1305_blocks_avx512
  2381. ___
  2382. if ($avx>3) {
  2383. ########################################################################
  2384. # VPMADD52 version using 2^44 radix.
  2385. #
2386. # One can argue that base 2^52 would be more natural. Well, even though
2387. # some operations would be more natural, one has to recognize a couple of
2388. # things. Base 2^52 doesn't provide an advantage over base 2^44 if you look
2389. # at the amount of multiply-and-accumulate operations. Secondly, it makes it
2390. # impossible to pre-compute multiples of 5 [referred to as s[]/sN in
2391. # reference implementations], which means that more such operations
2392. # would have to be performed in the inner loop, which in turn makes the
2393. # critical path longer. In other words, even though base 2^44 reduction
2394. # might look less elegant, the overall critical path is actually shorter...
  2395. ########################################################################
2396. # The layout of the opaque area is as follows.
  2397. #
  2398. # unsigned __int64 h[3]; # current hash value base 2^44
  2399. # unsigned __int64 s[2]; # key value*20 base 2^44
  2400. # unsigned __int64 r[3]; # key value base 2^44
  2401. # struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
  2402. # # r^n positions reflect
  2403. # # placement in register, not
  2404. # # memory, R[3] is R[1]*20
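# A rough sketch of how one 16-byte block maps onto the 2^44 radix used
# here (illustration only; m0/m1 are the two little-endian 64-bit halves
# of the message block, padbit is the 0/1 pad bit):
#
#	t0 =  m0                 & ((1ULL<<44)-1);	# bits   0..43
#	t1 = (m0>>44 | m1<<20)   & ((1ULL<<44)-1);	# bits  44..87
#	t2 = (m1>>24) | (uint64_t)padbit<<40;		# bits 88..129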
  2405. $code.=<<___;
  2406. .type poly1305_init_base2_44,\@function,3
  2407. .align 32
  2408. poly1305_init_base2_44:
  2409. .cfi_startproc
  2410. xor %rax,%rax
  2411. mov %rax,0($ctx) # initialize hash value
  2412. mov %rax,8($ctx)
  2413. mov %rax,16($ctx)
  2414. .Linit_base2_44:
  2415. lea poly1305_blocks_vpmadd52(%rip),%r10
  2416. lea poly1305_emit_base2_44(%rip),%r11
  2417. mov \$0x0ffffffc0fffffff,%rax
  2418. mov \$0x0ffffffc0ffffffc,%rcx
  2419. and 0($inp),%rax
  2420. mov \$0x00000fffffffffff,%r8
  2421. and 8($inp),%rcx
  2422. mov \$0x00000fffffffffff,%r9
  2423. and %rax,%r8
  2424. shrd \$44,%rcx,%rax
  2425. mov %r8,40($ctx) # r0
  2426. and %r9,%rax
  2427. shr \$24,%rcx
  2428. mov %rax,48($ctx) # r1
  2429. lea (%rax,%rax,4),%rax # *5
  2430. mov %rcx,56($ctx) # r2
  2431. shl \$2,%rax # magic <<2
  2432. lea (%rcx,%rcx,4),%rcx # *5
  2433. shl \$2,%rcx # magic <<2
  2434. mov %rax,24($ctx) # s1
  2435. mov %rcx,32($ctx) # s2
  2436. movq \$-1,64($ctx) # write impossible value
  2437. ___
  2438. $code.=<<___ if ($flavour !~ /elf32/);
  2439. mov %r10,0(%rdx)
  2440. mov %r11,8(%rdx)
  2441. ___
  2442. $code.=<<___ if ($flavour =~ /elf32/);
  2443. mov %r10d,0(%rdx)
  2444. mov %r11d,4(%rdx)
  2445. ___
  2446. $code.=<<___;
  2447. mov \$1,%eax
  2448. ret
  2449. .cfi_endproc
  2450. .size poly1305_init_base2_44,.-poly1305_init_base2_44
  2451. ___
  2452. {
  2453. my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
  2454. my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
  2455. my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
  2456. $code.=<<___;
  2457. .type poly1305_blocks_vpmadd52,\@function,4
  2458. .align 32
  2459. poly1305_blocks_vpmadd52:
  2460. .cfi_startproc
  2461. shr \$4,$len
  2462. jz .Lno_data_vpmadd52 # too short
  2463. shl \$40,$padbit
  2464. mov 64($ctx),%r8 # peek on power of the key
2465. # if the powers of the key have not been calculated yet, process up to 3
2466. # blocks with this single-block subroutine, otherwise ensure that the
2467. # length is divisible by 2 blocks and pass the rest down to the next
2468. # subroutine...
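# (For example: with no key powers yet and $len=3 blocks, all 3 are done
#  here; with $len>=4, or once the powers exist, only $len%2 blocks (0 or 1)
#  are done here, so what remains is even and goes to the 4x path below.)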
  2469. mov \$3,%rax
  2470. mov \$1,%r10
  2471. cmp \$4,$len # is input long
  2472. cmovae %r10,%rax
  2473. test %r8,%r8 # is power value impossible?
  2474. cmovns %r10,%rax
  2475. and $len,%rax # is input of favourable length?
  2476. jz .Lblocks_vpmadd52_4x
  2477. sub %rax,$len
  2478. mov \$7,%r10d
  2479. mov \$1,%r11d
  2480. kmovw %r10d,%k7
  2481. lea .L2_44_inp_permd(%rip),%r10
  2482. kmovw %r11d,%k1
  2483. vmovq $padbit,%x#$PAD
  2484. vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
  2485. vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
  2486. vpermq \$0xcf,$PAD,$PAD
  2487. vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
  2488. vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
  2489. vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
  2490. vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
  2491. vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
  2492. vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
  2493. vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
  2494. jmp .Loop_vpmadd52
  2495. .align 32
  2496. .Loop_vpmadd52:
  2497. vmovdqu32 0($inp),%x#$T0 # load input as ----3210
  2498. lea 16($inp),$inp
  2499. vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
  2500. vpsrlvq $inp_shift,$T0,$T0
  2501. vpandq $reduc_mask,$T0,$T0
  2502. vporq $PAD,$T0,$T0
  2503. vpaddq $T0,$Dlo,$Dlo # accumulate input
  2504. vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
  2505. vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
  2506. vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
  2507. vpxord $Dlo,$Dlo,$Dlo
  2508. vpxord $Dhi,$Dhi,$Dhi
  2509. vpmadd52luq $r2r1r0,$H0,$Dlo
  2510. vpmadd52huq $r2r1r0,$H0,$Dhi
  2511. vpmadd52luq $r1r0s2,$H1,$Dlo
  2512. vpmadd52huq $r1r0s2,$H1,$Dhi
  2513. vpmadd52luq $r0s2s1,$H2,$Dlo
  2514. vpmadd52huq $r0s2s1,$H2,$Dhi
  2515. vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
  2516. vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
  2517. vpandq $reduc_mask,$Dlo,$Dlo
  2518. vpaddq $T0,$Dhi,$Dhi
  2519. vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
  2520. vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
  2521. vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
  2522. vpandq $reduc_mask,$Dlo,$Dlo
  2523. vpermq \$0b10010011,$T0,$T0
  2524. vpaddq $T0,$Dlo,$Dlo
  2525. vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
  2526. vpaddq $T0,$Dlo,$Dlo
  2527. vpsllq \$2,$T0,$T0
  2528. vpaddq $T0,$Dlo,$Dlo
  2529. dec %rax # len-=16
  2530. jnz .Loop_vpmadd52
  2531. vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
  2532. test $len,$len
  2533. jnz .Lblocks_vpmadd52_4x
  2534. .Lno_data_vpmadd52:
  2535. ret
  2536. .cfi_endproc
  2537. .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
  2538. ___
  2539. }
  2540. {
  2541. ########################################################################
2542. # As implied by its name, the 4x subroutine processes 4 blocks in parallel
2543. # (but also handles lengths of 4*n+2 blocks). It takes up to the 4th key
2544. # power and is handled in 256-bit %ymm registers.
  2545. my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
  2546. my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
  2547. my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
  2548. $code.=<<___;
  2549. .type poly1305_blocks_vpmadd52_4x,\@function,4
  2550. .align 32
  2551. poly1305_blocks_vpmadd52_4x:
  2552. .cfi_startproc
  2553. shr \$4,$len
  2554. jz .Lno_data_vpmadd52_4x # too short
  2555. shl \$40,$padbit
  2556. mov 64($ctx),%r8 # peek on power of the key
  2557. .Lblocks_vpmadd52_4x:
  2558. vpbroadcastq $padbit,$PAD
  2559. vmovdqa64 .Lx_mask44(%rip),$mask44
  2560. mov \$5,%eax
  2561. vmovdqa64 .Lx_mask42(%rip),$mask42
  2562. kmovw %eax,%k1 # used in 2x path
  2563. test %r8,%r8 # is power value impossible?
  2564. js .Linit_vpmadd52 # if it is, then init R[4]
  2565. vmovq 0($ctx),%x#$H0 # load current hash value
  2566. vmovq 8($ctx),%x#$H1
  2567. vmovq 16($ctx),%x#$H2
  2568. test \$3,$len # is length 4*n+2?
  2569. jnz .Lblocks_vpmadd52_2x_do
  2570. .Lblocks_vpmadd52_4x_do:
  2571. vpbroadcastq 64($ctx),$R0 # load 4th power of the key
  2572. vpbroadcastq 96($ctx),$R1
  2573. vpbroadcastq 128($ctx),$R2
  2574. vpbroadcastq 160($ctx),$S1
  2575. .Lblocks_vpmadd52_4x_key_loaded:
  2576. vpsllq \$2,$R2,$S2 # S2 = R2*5*4
  2577. vpaddq $R2,$S2,$S2
  2578. vpsllq \$2,$S2,$S2
  2579. test \$7,$len # is len 8*n?
  2580. jz .Lblocks_vpmadd52_8x
  2581. vmovdqu64 16*0($inp),$T2 # load data
  2582. vmovdqu64 16*2($inp),$T3
  2583. lea 16*4($inp),$inp
  2584. vpunpcklqdq $T3,$T2,$T1 # transpose data
  2585. vpunpckhqdq $T3,$T2,$T3
  2586. # at this point 64-bit lanes are ordered as 3-1-2-0
  2587. vpsrlq \$24,$T3,$T2 # splat the data
  2588. vporq $PAD,$T2,$T2
  2589. vpaddq $T2,$H2,$H2 # accumulate input
  2590. vpandq $mask44,$T1,$T0
  2591. vpsrlq \$44,$T1,$T1
  2592. vpsllq \$20,$T3,$T3
  2593. vporq $T3,$T1,$T1
  2594. vpandq $mask44,$T1,$T1
  2595. sub \$4,$len
  2596. jz .Ltail_vpmadd52_4x
  2597. jmp .Loop_vpmadd52_4x
  2598. ud2
  2599. .align 32
  2600. .Linit_vpmadd52:
  2601. vmovq 24($ctx),%x#$S1 # load key
  2602. vmovq 56($ctx),%x#$H2
  2603. vmovq 32($ctx),%x#$S2
  2604. vmovq 40($ctx),%x#$R0
  2605. vmovq 48($ctx),%x#$R1
  2606. vmovdqa $R0,$H0
  2607. vmovdqa $R1,$H1
  2608. vmovdqa $H2,$R2
  2609. mov \$2,%eax
  2610. .Lmul_init_vpmadd52:
  2611. vpxorq $D0lo,$D0lo,$D0lo
  2612. vpmadd52luq $H2,$S1,$D0lo
  2613. vpxorq $D0hi,$D0hi,$D0hi
  2614. vpmadd52huq $H2,$S1,$D0hi
  2615. vpxorq $D1lo,$D1lo,$D1lo
  2616. vpmadd52luq $H2,$S2,$D1lo
  2617. vpxorq $D1hi,$D1hi,$D1hi
  2618. vpmadd52huq $H2,$S2,$D1hi
  2619. vpxorq $D2lo,$D2lo,$D2lo
  2620. vpmadd52luq $H2,$R0,$D2lo
  2621. vpxorq $D2hi,$D2hi,$D2hi
  2622. vpmadd52huq $H2,$R0,$D2hi
  2623. vpmadd52luq $H0,$R0,$D0lo
  2624. vpmadd52huq $H0,$R0,$D0hi
  2625. vpmadd52luq $H0,$R1,$D1lo
  2626. vpmadd52huq $H0,$R1,$D1hi
  2627. vpmadd52luq $H0,$R2,$D2lo
  2628. vpmadd52huq $H0,$R2,$D2hi
  2629. vpmadd52luq $H1,$S2,$D0lo
  2630. vpmadd52huq $H1,$S2,$D0hi
  2631. vpmadd52luq $H1,$R0,$D1lo
  2632. vpmadd52huq $H1,$R0,$D1hi
  2633. vpmadd52luq $H1,$R1,$D2lo
  2634. vpmadd52huq $H1,$R1,$D2hi
  2635. ################################################################
  2636. # partial reduction
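# (A C sketch of this 44/44/42 reduction, illustration only; the lo/hi
#  halves produced by vpmadd52 split each product at bit 52, so hi is
#  realigned by <<8 (or <<10 for the 42-bit limb) before carrying:)
#	c  = (d0hi << 8)  + (d0lo >> 44); h0 = d0lo & mask44; d1lo += c;
#	c  = (d1hi << 8)  + (d1lo >> 44); h1 = d1lo & mask44; d2lo += c;
#	c  = (d2hi << 10) + (d2lo >> 42); h2 = d2lo & mask42;
#	h0 += c + (c << 2);			/* 2^130 = 5 mod p */
#	c  = h0 >> 44; h0 &= mask44; h1 += c;	/* additional step */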
  2637. vpsrlq \$44,$D0lo,$tmp
  2638. vpsllq \$8,$D0hi,$D0hi
  2639. vpandq $mask44,$D0lo,$H0
  2640. vpaddq $tmp,$D0hi,$D0hi
  2641. vpaddq $D0hi,$D1lo,$D1lo
  2642. vpsrlq \$44,$D1lo,$tmp
  2643. vpsllq \$8,$D1hi,$D1hi
  2644. vpandq $mask44,$D1lo,$H1
  2645. vpaddq $tmp,$D1hi,$D1hi
  2646. vpaddq $D1hi,$D2lo,$D2lo
  2647. vpsrlq \$42,$D2lo,$tmp
  2648. vpsllq \$10,$D2hi,$D2hi
  2649. vpandq $mask42,$D2lo,$H2
  2650. vpaddq $tmp,$D2hi,$D2hi
  2651. vpaddq $D2hi,$H0,$H0
  2652. vpsllq \$2,$D2hi,$D2hi
  2653. vpaddq $D2hi,$H0,$H0
  2654. vpsrlq \$44,$H0,$tmp # additional step
  2655. vpandq $mask44,$H0,$H0
  2656. vpaddq $tmp,$H1,$H1
  2657. dec %eax
  2658. jz .Ldone_init_vpmadd52
  2659. vpunpcklqdq $R1,$H1,$R1 # 1,2
  2660. vpbroadcastq %x#$H1,%x#$H1 # 2,2
  2661. vpunpcklqdq $R2,$H2,$R2
  2662. vpbroadcastq %x#$H2,%x#$H2
  2663. vpunpcklqdq $R0,$H0,$R0
  2664. vpbroadcastq %x#$H0,%x#$H0
  2665. vpsllq \$2,$R1,$S1 # S1 = R1*5*4
  2666. vpsllq \$2,$R2,$S2 # S2 = R2*5*4
  2667. vpaddq $R1,$S1,$S1
  2668. vpaddq $R2,$S2,$S2
  2669. vpsllq \$2,$S1,$S1
  2670. vpsllq \$2,$S2,$S2
  2671. jmp .Lmul_init_vpmadd52
  2672. ud2
  2673. .align 32
  2674. .Ldone_init_vpmadd52:
  2675. vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
  2676. vinserti128 \$1,%x#$R2,$H2,$R2
  2677. vinserti128 \$1,%x#$R0,$H0,$R0
  2678. vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
  2679. vpermq \$0b11011000,$R2,$R2
  2680. vpermq \$0b11011000,$R0,$R0
  2681. vpsllq \$2,$R1,$S1 # S1 = R1*5*4
  2682. vpaddq $R1,$S1,$S1
  2683. vpsllq \$2,$S1,$S1
  2684. vmovq 0($ctx),%x#$H0 # load current hash value
  2685. vmovq 8($ctx),%x#$H1
  2686. vmovq 16($ctx),%x#$H2
  2687. test \$3,$len # is length 4*n+2?
  2688. jnz .Ldone_init_vpmadd52_2x
  2689. vmovdqu64 $R0,64($ctx) # save key powers
  2690. vpbroadcastq %x#$R0,$R0 # broadcast 4th power
  2691. vmovdqu64 $R1,96($ctx)
  2692. vpbroadcastq %x#$R1,$R1
  2693. vmovdqu64 $R2,128($ctx)
  2694. vpbroadcastq %x#$R2,$R2
  2695. vmovdqu64 $S1,160($ctx)
  2696. vpbroadcastq %x#$S1,$S1
  2697. jmp .Lblocks_vpmadd52_4x_key_loaded
  2698. ud2
  2699. .align 32
  2700. .Ldone_init_vpmadd52_2x:
  2701. vmovdqu64 $R0,64($ctx) # save key powers
  2702. vpsrldq \$8,$R0,$R0 # 0-1-0-2
  2703. vmovdqu64 $R1,96($ctx)
  2704. vpsrldq \$8,$R1,$R1
  2705. vmovdqu64 $R2,128($ctx)
  2706. vpsrldq \$8,$R2,$R2
  2707. vmovdqu64 $S1,160($ctx)
  2708. vpsrldq \$8,$S1,$S1
  2709. jmp .Lblocks_vpmadd52_2x_key_loaded
  2710. ud2
  2711. .align 32
  2712. .Lblocks_vpmadd52_2x_do:
  2713. vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
  2714. vmovdqu64 160+8($ctx),${S1}{%k1}{z}
  2715. vmovdqu64 64+8($ctx),${R0}{%k1}{z}
  2716. vmovdqu64 96+8($ctx),${R1}{%k1}{z}
  2717. .Lblocks_vpmadd52_2x_key_loaded:
  2718. vmovdqu64 16*0($inp),$T2 # load data
  2719. vpxorq $T3,$T3,$T3
  2720. lea 16*2($inp),$inp
  2721. vpunpcklqdq $T3,$T2,$T1 # transpose data
  2722. vpunpckhqdq $T3,$T2,$T3
  2723. # at this point 64-bit lanes are ordered as x-1-x-0
  2724. vpsrlq \$24,$T3,$T2 # splat the data
  2725. vporq $PAD,$T2,$T2
  2726. vpaddq $T2,$H2,$H2 # accumulate input
  2727. vpandq $mask44,$T1,$T0
  2728. vpsrlq \$44,$T1,$T1
  2729. vpsllq \$20,$T3,$T3
  2730. vporq $T3,$T1,$T1
  2731. vpandq $mask44,$T1,$T1
  2732. jmp .Ltail_vpmadd52_2x
  2733. ud2
  2734. .align 32
  2735. .Loop_vpmadd52_4x:
  2736. #vpaddq $T2,$H2,$H2 # accumulate input
  2737. vpaddq $T0,$H0,$H0
  2738. vpaddq $T1,$H1,$H1
  2739. vpxorq $D0lo,$D0lo,$D0lo
  2740. vpmadd52luq $H2,$S1,$D0lo
  2741. vpxorq $D0hi,$D0hi,$D0hi
  2742. vpmadd52huq $H2,$S1,$D0hi
  2743. vpxorq $D1lo,$D1lo,$D1lo
  2744. vpmadd52luq $H2,$S2,$D1lo
  2745. vpxorq $D1hi,$D1hi,$D1hi
  2746. vpmadd52huq $H2,$S2,$D1hi
  2747. vpxorq $D2lo,$D2lo,$D2lo
  2748. vpmadd52luq $H2,$R0,$D2lo
  2749. vpxorq $D2hi,$D2hi,$D2hi
  2750. vpmadd52huq $H2,$R0,$D2hi
  2751. vmovdqu64 16*0($inp),$T2 # load data
  2752. vmovdqu64 16*2($inp),$T3
  2753. lea 16*4($inp),$inp
  2754. vpmadd52luq $H0,$R0,$D0lo
  2755. vpmadd52huq $H0,$R0,$D0hi
  2756. vpmadd52luq $H0,$R1,$D1lo
  2757. vpmadd52huq $H0,$R1,$D1hi
  2758. vpmadd52luq $H0,$R2,$D2lo
  2759. vpmadd52huq $H0,$R2,$D2hi
  2760. vpunpcklqdq $T3,$T2,$T1 # transpose data
  2761. vpunpckhqdq $T3,$T2,$T3
  2762. vpmadd52luq $H1,$S2,$D0lo
  2763. vpmadd52huq $H1,$S2,$D0hi
  2764. vpmadd52luq $H1,$R0,$D1lo
  2765. vpmadd52huq $H1,$R0,$D1hi
  2766. vpmadd52luq $H1,$R1,$D2lo
  2767. vpmadd52huq $H1,$R1,$D2hi
  2768. ################################################################
  2769. # partial reduction (interleaved with data splat)
  2770. vpsrlq \$44,$D0lo,$tmp
  2771. vpsllq \$8,$D0hi,$D0hi
  2772. vpandq $mask44,$D0lo,$H0
  2773. vpaddq $tmp,$D0hi,$D0hi
  2774. vpsrlq \$24,$T3,$T2
  2775. vporq $PAD,$T2,$T2
  2776. vpaddq $D0hi,$D1lo,$D1lo
  2777. vpsrlq \$44,$D1lo,$tmp
  2778. vpsllq \$8,$D1hi,$D1hi
  2779. vpandq $mask44,$D1lo,$H1
  2780. vpaddq $tmp,$D1hi,$D1hi
  2781. vpandq $mask44,$T1,$T0
  2782. vpsrlq \$44,$T1,$T1
  2783. vpsllq \$20,$T3,$T3
  2784. vpaddq $D1hi,$D2lo,$D2lo
  2785. vpsrlq \$42,$D2lo,$tmp
  2786. vpsllq \$10,$D2hi,$D2hi
  2787. vpandq $mask42,$D2lo,$H2
  2788. vpaddq $tmp,$D2hi,$D2hi
  2789. vpaddq $T2,$H2,$H2 # accumulate input
  2790. vpaddq $D2hi,$H0,$H0
  2791. vpsllq \$2,$D2hi,$D2hi
  2792. vpaddq $D2hi,$H0,$H0
  2793. vporq $T3,$T1,$T1
  2794. vpandq $mask44,$T1,$T1
  2795. vpsrlq \$44,$H0,$tmp # additional step
  2796. vpandq $mask44,$H0,$H0
  2797. vpaddq $tmp,$H1,$H1
  2798. sub \$4,$len # len-=64
  2799. jnz .Loop_vpmadd52_4x
  2800. .Ltail_vpmadd52_4x:
  2801. vmovdqu64 128($ctx),$R2 # load all key powers
  2802. vmovdqu64 160($ctx),$S1
  2803. vmovdqu64 64($ctx),$R0
  2804. vmovdqu64 96($ctx),$R1
  2805. .Ltail_vpmadd52_2x:
  2806. vpsllq \$2,$R2,$S2 # S2 = R2*5*4
  2807. vpaddq $R2,$S2,$S2
  2808. vpsllq \$2,$S2,$S2
  2809. #vpaddq $T2,$H2,$H2 # accumulate input
  2810. vpaddq $T0,$H0,$H0
  2811. vpaddq $T1,$H1,$H1
  2812. vpxorq $D0lo,$D0lo,$D0lo
  2813. vpmadd52luq $H2,$S1,$D0lo
  2814. vpxorq $D0hi,$D0hi,$D0hi
  2815. vpmadd52huq $H2,$S1,$D0hi
  2816. vpxorq $D1lo,$D1lo,$D1lo
  2817. vpmadd52luq $H2,$S2,$D1lo
  2818. vpxorq $D1hi,$D1hi,$D1hi
  2819. vpmadd52huq $H2,$S2,$D1hi
  2820. vpxorq $D2lo,$D2lo,$D2lo
  2821. vpmadd52luq $H2,$R0,$D2lo
  2822. vpxorq $D2hi,$D2hi,$D2hi
  2823. vpmadd52huq $H2,$R0,$D2hi
  2824. vpmadd52luq $H0,$R0,$D0lo
  2825. vpmadd52huq $H0,$R0,$D0hi
  2826. vpmadd52luq $H0,$R1,$D1lo
  2827. vpmadd52huq $H0,$R1,$D1hi
  2828. vpmadd52luq $H0,$R2,$D2lo
  2829. vpmadd52huq $H0,$R2,$D2hi
  2830. vpmadd52luq $H1,$S2,$D0lo
  2831. vpmadd52huq $H1,$S2,$D0hi
  2832. vpmadd52luq $H1,$R0,$D1lo
  2833. vpmadd52huq $H1,$R0,$D1hi
  2834. vpmadd52luq $H1,$R1,$D2lo
  2835. vpmadd52huq $H1,$R1,$D2hi
  2836. ################################################################
  2837. # horizontal addition
  2838. mov \$1,%eax
  2839. kmovw %eax,%k1
  2840. vpsrldq \$8,$D0lo,$T0
  2841. vpsrldq \$8,$D0hi,$H0
  2842. vpsrldq \$8,$D1lo,$T1
  2843. vpsrldq \$8,$D1hi,$H1
  2844. vpaddq $T0,$D0lo,$D0lo
  2845. vpaddq $H0,$D0hi,$D0hi
  2846. vpsrldq \$8,$D2lo,$T2
  2847. vpsrldq \$8,$D2hi,$H2
  2848. vpaddq $T1,$D1lo,$D1lo
  2849. vpaddq $H1,$D1hi,$D1hi
  2850. vpermq \$0x2,$D0lo,$T0
  2851. vpermq \$0x2,$D0hi,$H0
  2852. vpaddq $T2,$D2lo,$D2lo
  2853. vpaddq $H2,$D2hi,$D2hi
  2854. vpermq \$0x2,$D1lo,$T1
  2855. vpermq \$0x2,$D1hi,$H1
  2856. vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
  2857. vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
  2858. vpermq \$0x2,$D2lo,$T2
  2859. vpermq \$0x2,$D2hi,$H2
  2860. vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
  2861. vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
  2862. vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
  2863. vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
  2864. ################################################################
  2865. # partial reduction
  2866. vpsrlq \$44,$D0lo,$tmp
  2867. vpsllq \$8,$D0hi,$D0hi
  2868. vpandq $mask44,$D0lo,$H0
  2869. vpaddq $tmp,$D0hi,$D0hi
  2870. vpaddq $D0hi,$D1lo,$D1lo
  2871. vpsrlq \$44,$D1lo,$tmp
  2872. vpsllq \$8,$D1hi,$D1hi
  2873. vpandq $mask44,$D1lo,$H1
  2874. vpaddq $tmp,$D1hi,$D1hi
  2875. vpaddq $D1hi,$D2lo,$D2lo
  2876. vpsrlq \$42,$D2lo,$tmp
  2877. vpsllq \$10,$D2hi,$D2hi
  2878. vpandq $mask42,$D2lo,$H2
  2879. vpaddq $tmp,$D2hi,$D2hi
  2880. vpaddq $D2hi,$H0,$H0
  2881. vpsllq \$2,$D2hi,$D2hi
  2882. vpaddq $D2hi,$H0,$H0
  2883. vpsrlq \$44,$H0,$tmp # additional step
  2884. vpandq $mask44,$H0,$H0
  2885. vpaddq $tmp,$H1,$H1
  2886. # at this point $len is
  2887. # either 4*n+2 or 0...
	sub \$2,$len # len-=32
	ja .Lblocks_vpmadd52_4x_do
	vmovq %x#$H0,0($ctx)
	vmovq %x#$H1,8($ctx)
	vmovq %x#$H2,16($ctx)
	vzeroall
.Lno_data_vpmadd52_4x:
	ret
.cfi_endproc
.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
___
}
{
########################################################################
# As implied by its name, the 8x subroutine processes 8 blocks in parallel...
# This is an intermediate version: it's used only in cases when the input
# length is either 8*n, 8*n+1 or 8*n+2...
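# Before entering the loop the cached key powers are extended: the 4th
# power is broadcast and multiplied by the stored 1st-4th powers to give
# the 5th-8th powers, which are then interleaved with the originals so
# that each of the eight lanes gets the power matching its message
# block.  The per-iteration arithmetic itself is the same 44/44/42-bit
# multiplication and lazy reduction as in the 4x code, only on twice as
# many lanes.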
my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
$code.=<<___;
.type poly1305_blocks_vpmadd52_8x,\@function,4
.align 32
poly1305_blocks_vpmadd52_8x:
.cfi_startproc
	shr \$4,$len
	jz .Lno_data_vpmadd52_8x # too short
	shl \$40,$padbit
	mov 64($ctx),%r8 # peek on power of the key
	vmovdqa64 .Lx_mask44(%rip),$mask44
	vmovdqa64 .Lx_mask42(%rip),$mask42
	test %r8,%r8 # is power value impossible?
	js .Linit_vpmadd52 # if it is, then init R[4]
	vmovq 0($ctx),%x#$H0 # load current hash value
	vmovq 8($ctx),%x#$H1
	vmovq 16($ctx),%x#$H2
.Lblocks_vpmadd52_8x:
################################################################
# first we calculate more key powers
	vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
	vmovdqu64 160($ctx),$S1
	vmovdqu64 64($ctx),$R0
	vmovdqu64 96($ctx),$R1
	vpsllq \$2,$R2,$S2 # S2 = R2*5*4
	vpaddq $R2,$S2,$S2
	vpsllq \$2,$S2,$S2
	vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
	vpbroadcastq %x#$R0,$RR0
	vpbroadcastq %x#$R1,$RR1
	vpxorq $D0lo,$D0lo,$D0lo
	vpmadd52luq $RR2,$S1,$D0lo
	vpxorq $D0hi,$D0hi,$D0hi
	vpmadd52huq $RR2,$S1,$D0hi
	vpxorq $D1lo,$D1lo,$D1lo
	vpmadd52luq $RR2,$S2,$D1lo
	vpxorq $D1hi,$D1hi,$D1hi
	vpmadd52huq $RR2,$S2,$D1hi
	vpxorq $D2lo,$D2lo,$D2lo
	vpmadd52luq $RR2,$R0,$D2lo
	vpxorq $D2hi,$D2hi,$D2hi
	vpmadd52huq $RR2,$R0,$D2hi
	vpmadd52luq $RR0,$R0,$D0lo
	vpmadd52huq $RR0,$R0,$D0hi
	vpmadd52luq $RR0,$R1,$D1lo
	vpmadd52huq $RR0,$R1,$D1hi
	vpmadd52luq $RR0,$R2,$D2lo
	vpmadd52huq $RR0,$R2,$D2hi
	vpmadd52luq $RR1,$S2,$D0lo
	vpmadd52huq $RR1,$S2,$D0hi
	vpmadd52luq $RR1,$R0,$D1lo
	vpmadd52huq $RR1,$R0,$D1hi
	vpmadd52luq $RR1,$R1,$D2lo
	vpmadd52huq $RR1,$R1,$D2hi
################################################################
# partial reduction
	vpsrlq \$44,$D0lo,$tmp
	vpsllq \$8,$D0hi,$D0hi
	vpandq $mask44,$D0lo,$RR0
	vpaddq $tmp,$D0hi,$D0hi
	vpaddq $D0hi,$D1lo,$D1lo
	vpsrlq \$44,$D1lo,$tmp
	vpsllq \$8,$D1hi,$D1hi
	vpandq $mask44,$D1lo,$RR1
	vpaddq $tmp,$D1hi,$D1hi
	vpaddq $D1hi,$D2lo,$D2lo
	vpsrlq \$42,$D2lo,$tmp
	vpsllq \$10,$D2hi,$D2hi
	vpandq $mask42,$D2lo,$RR2
	vpaddq $tmp,$D2hi,$D2hi
	vpaddq $D2hi,$RR0,$RR0
	vpsllq \$2,$D2hi,$D2hi
	vpaddq $D2hi,$RR0,$RR0
	vpsrlq \$44,$RR0,$tmp # additional step
	vpandq $mask44,$RR0,$RR0
	vpaddq $tmp,$RR1,$RR1
################################################################
# At this point Rx holds 1324 powers, RRx - 5768, and the goal
# is 15263748, which reflects how data is loaded...
	vpunpcklqdq $R2,$RR2,$T2 # 3748
	vpunpckhqdq $R2,$RR2,$R2 # 1526
	vpunpcklqdq $R0,$RR0,$T0
	vpunpckhqdq $R0,$RR0,$R0
	vpunpcklqdq $R1,$RR1,$T1
	vpunpckhqdq $R1,$RR1,$R1
___
######## switch to %zmm
map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
$code.=<<___;
	vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
	vshufi64x2 \$0x44,$R0,$T0,$RR0
	vshufi64x2 \$0x44,$R1,$T1,$RR1
	vmovdqu64 16*0($inp),$T2 # load data
	vmovdqu64 16*4($inp),$T3
	lea 16*8($inp),$inp
	vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
	vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
	vpaddq $RR2,$SS2,$SS2
	vpaddq $RR1,$SS1,$SS1
	vpsllq \$2,$SS2,$SS2
	vpsllq \$2,$SS1,$SS1
	vpbroadcastq $padbit,$PAD
	vpbroadcastq %x#$mask44,$mask44
	vpbroadcastq %x#$mask42,$mask42
	vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
	vpbroadcastq %x#$SS2,$S2
	vpbroadcastq %x#$RR0,$R0
	vpbroadcastq %x#$RR1,$R1
	vpbroadcastq %x#$RR2,$R2
	vpunpcklqdq $T3,$T2,$T1 # transpose data
	vpunpckhqdq $T3,$T2,$T3
# at this point 64-bit lanes are ordered as 73625140
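# T1 now holds the low and T3 the high 64-bit word of each 16-byte
# block.  The 128+1-bit value is split into base 2^44 limbs: limb 0 is
# the low 44 bits, limb 1 is bits 44-87 assembled as (low>>44)|(high<<20),
# and limb 2 is high>>24 with the padbit OR-ed in at bit 40 (padbit was
# pre-shifted left by 40 above).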
	vpsrlq \$24,$T3,$T2 # splat the data
	vporq $PAD,$T2,$T2
	vpaddq $T2,$H2,$H2 # accumulate input
	vpandq $mask44,$T1,$T0
	vpsrlq \$44,$T1,$T1
	vpsllq \$20,$T3,$T3
	vporq $T3,$T1,$T1
	vpandq $mask44,$T1,$T1
	sub \$8,$len
	jz .Ltail_vpmadd52_8x
	jmp .Loop_vpmadd52_8x
.align 32
.Loop_vpmadd52_8x:
	#vpaddq $T2,$H2,$H2 # accumulate input
	vpaddq $T0,$H0,$H0
	vpaddq $T1,$H1,$H1
	vpxorq $D0lo,$D0lo,$D0lo
	vpmadd52luq $H2,$S1,$D0lo
	vpxorq $D0hi,$D0hi,$D0hi
	vpmadd52huq $H2,$S1,$D0hi
	vpxorq $D1lo,$D1lo,$D1lo
	vpmadd52luq $H2,$S2,$D1lo
	vpxorq $D1hi,$D1hi,$D1hi
	vpmadd52huq $H2,$S2,$D1hi
	vpxorq $D2lo,$D2lo,$D2lo
	vpmadd52luq $H2,$R0,$D2lo
	vpxorq $D2hi,$D2hi,$D2hi
	vpmadd52huq $H2,$R0,$D2hi
	vmovdqu64 16*0($inp),$T2 # load data
	vmovdqu64 16*4($inp),$T3
	lea 16*8($inp),$inp
	vpmadd52luq $H0,$R0,$D0lo
	vpmadd52huq $H0,$R0,$D0hi
	vpmadd52luq $H0,$R1,$D1lo
	vpmadd52huq $H0,$R1,$D1hi
	vpmadd52luq $H0,$R2,$D2lo
	vpmadd52huq $H0,$R2,$D2hi
	vpunpcklqdq $T3,$T2,$T1 # transpose data
	vpunpckhqdq $T3,$T2,$T3
	vpmadd52luq $H1,$S2,$D0lo
	vpmadd52huq $H1,$S2,$D0hi
	vpmadd52luq $H1,$R0,$D1lo
	vpmadd52huq $H1,$R0,$D1hi
	vpmadd52luq $H1,$R1,$D2lo
	vpmadd52huq $H1,$R1,$D2hi
################################################################
# partial reduction (interleaved with data splat)
	vpsrlq \$44,$D0lo,$tmp
	vpsllq \$8,$D0hi,$D0hi
	vpandq $mask44,$D0lo,$H0
	vpaddq $tmp,$D0hi,$D0hi
	vpsrlq \$24,$T3,$T2
	vporq $PAD,$T2,$T2
	vpaddq $D0hi,$D1lo,$D1lo
	vpsrlq \$44,$D1lo,$tmp
	vpsllq \$8,$D1hi,$D1hi
	vpandq $mask44,$D1lo,$H1
	vpaddq $tmp,$D1hi,$D1hi
	vpandq $mask44,$T1,$T0
	vpsrlq \$44,$T1,$T1
	vpsllq \$20,$T3,$T3
	vpaddq $D1hi,$D2lo,$D2lo
	vpsrlq \$42,$D2lo,$tmp
	vpsllq \$10,$D2hi,$D2hi
	vpandq $mask42,$D2lo,$H2
	vpaddq $tmp,$D2hi,$D2hi
	vpaddq $T2,$H2,$H2 # accumulate input
	vpaddq $D2hi,$H0,$H0
	vpsllq \$2,$D2hi,$D2hi
	vpaddq $D2hi,$H0,$H0
	vporq $T3,$T1,$T1
	vpandq $mask44,$T1,$T1
	vpsrlq \$44,$H0,$tmp # additional step
	vpandq $mask44,$H0,$H0
	vpaddq $tmp,$H1,$H1
	sub \$8,$len # len-=128
	jnz .Loop_vpmadd52_8x
.Ltail_vpmadd52_8x:
	#vpaddq $T2,$H2,$H2 # accumulate input
	vpaddq $T0,$H0,$H0
	vpaddq $T1,$H1,$H1
	vpxorq $D0lo,$D0lo,$D0lo
	vpmadd52luq $H2,$SS1,$D0lo
	vpxorq $D0hi,$D0hi,$D0hi
	vpmadd52huq $H2,$SS1,$D0hi
	vpxorq $D1lo,$D1lo,$D1lo
	vpmadd52luq $H2,$SS2,$D1lo
	vpxorq $D1hi,$D1hi,$D1hi
	vpmadd52huq $H2,$SS2,$D1hi
	vpxorq $D2lo,$D2lo,$D2lo
	vpmadd52luq $H2,$RR0,$D2lo
	vpxorq $D2hi,$D2hi,$D2hi
	vpmadd52huq $H2,$RR0,$D2hi
	vpmadd52luq $H0,$RR0,$D0lo
	vpmadd52huq $H0,$RR0,$D0hi
	vpmadd52luq $H0,$RR1,$D1lo
	vpmadd52huq $H0,$RR1,$D1hi
	vpmadd52luq $H0,$RR2,$D2lo
	vpmadd52huq $H0,$RR2,$D2hi
	vpmadd52luq $H1,$SS2,$D0lo
	vpmadd52huq $H1,$SS2,$D0hi
	vpmadd52luq $H1,$RR0,$D1lo
	vpmadd52huq $H1,$RR0,$D1hi
	vpmadd52luq $H1,$RR1,$D2lo
	vpmadd52huq $H1,$RR1,$D2hi
################################################################
# horizontal addition
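# Eight lanes are folded down to one in three steps: odd into even qword
# within each 128-bit lane (vpsrldq+vpaddq), element 2 of each 256-bit
# lane down to element 0 (vpermq), and finally the upper 256-bit half
# into the lower one (vextracti64x4 followed, after the switch back to
# ymm names, by the k1=1 zero-masked vpaddq).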
	mov \$1,%eax
	kmovw %eax,%k1
	vpsrldq \$8,$D0lo,$T0
	vpsrldq \$8,$D0hi,$H0
	vpsrldq \$8,$D1lo,$T1
	vpsrldq \$8,$D1hi,$H1
	vpaddq $T0,$D0lo,$D0lo
	vpaddq $H0,$D0hi,$D0hi
	vpsrldq \$8,$D2lo,$T2
	vpsrldq \$8,$D2hi,$H2
	vpaddq $T1,$D1lo,$D1lo
	vpaddq $H1,$D1hi,$D1hi
	vpermq \$0x2,$D0lo,$T0
	vpermq \$0x2,$D0hi,$H0
	vpaddq $T2,$D2lo,$D2lo
	vpaddq $H2,$D2hi,$D2hi
	vpermq \$0x2,$D1lo,$T1
	vpermq \$0x2,$D1hi,$H1
	vpaddq $T0,$D0lo,$D0lo
	vpaddq $H0,$D0hi,$D0hi
	vpermq \$0x2,$D2lo,$T2
	vpermq \$0x2,$D2hi,$H2
	vpaddq $T1,$D1lo,$D1lo
	vpaddq $H1,$D1hi,$D1hi
	vextracti64x4 \$1,$D0lo,%y#$T0
	vextracti64x4 \$1,$D0hi,%y#$H0
	vpaddq $T2,$D2lo,$D2lo
	vpaddq $H2,$D2hi,$D2hi
	vextracti64x4 \$1,$D1lo,%y#$T1
	vextracti64x4 \$1,$D1hi,%y#$H1
	vextracti64x4 \$1,$D2lo,%y#$T2
	vextracti64x4 \$1,$D2hi,%y#$H2
___
######## switch back to %ymm
map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
$code.=<<___;
	vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
	vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
	vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
	vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
	vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
	vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
################################################################
# partial reduction
	vpsrlq \$44,$D0lo,$tmp
	vpsllq \$8,$D0hi,$D0hi
	vpandq $mask44,$D0lo,$H0
	vpaddq $tmp,$D0hi,$D0hi
	vpaddq $D0hi,$D1lo,$D1lo
	vpsrlq \$44,$D1lo,$tmp
	vpsllq \$8,$D1hi,$D1hi
	vpandq $mask44,$D1lo,$H1
	vpaddq $tmp,$D1hi,$D1hi
	vpaddq $D1hi,$D2lo,$D2lo
	vpsrlq \$42,$D2lo,$tmp
	vpsllq \$10,$D2hi,$D2hi
	vpandq $mask42,$D2lo,$H2
	vpaddq $tmp,$D2hi,$D2hi
	vpaddq $D2hi,$H0,$H0
	vpsllq \$2,$D2hi,$D2hi
	vpaddq $D2hi,$H0,$H0
	vpsrlq \$44,$H0,$tmp # additional step
	vpandq $mask44,$H0,$H0
	vpaddq $tmp,$H1,$H1
################################################################
	vmovq %x#$H0,0($ctx)
	vmovq %x#$H1,8($ctx)
	vmovq %x#$H2,16($ctx)
	vzeroall
.Lno_data_vpmadd52_8x:
	ret
.cfi_endproc
.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
___
}
$code.=<<___;
.type poly1305_emit_base2_44,\@function,3
.align 32
poly1305_emit_base2_44:
.cfi_startproc
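# The hash is kept as three base 2^44 limbs (44+44+42 bits).  The shifts
# below repack it into a 130-bit value spread over r8, r9 and a 2-bit
# spill in r10: r8 += limb1<<44, r9 = limb1>>20 + limb2<<24 + carry,
# r10 = limb2>>40 + carry.  Adding 5 and inspecting bit 130 (shr 2 of
# r10) tells whether the value is >= 2^130-5; if so the incremented,
# i.e. reduced, copy is selected before the nonce is added and the
# 16-byte tag is written out.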
	mov 0($ctx),%r8 # load hash value
	mov 8($ctx),%r9
	mov 16($ctx),%r10
	mov %r9,%rax
	shr \$20,%r9
	shl \$44,%rax
	mov %r10,%rcx
	shr \$40,%r10
	shl \$24,%rcx
	add %rax,%r8
	adc %rcx,%r9
	adc \$0,%r10
	mov %r8,%rax
	add \$5,%r8 # compare to modulus
	mov %r9,%rcx
	adc \$0,%r9
	adc \$0,%r10
	shr \$2,%r10 # did 130-bit value overflow?
	cmovnz %r8,%rax
	cmovnz %r9,%rcx
	add 0($nonce),%rax # accumulate nonce
	adc 8($nonce),%rcx
	mov %rax,0($mac) # write result
	mov %rcx,8($mac)
	ret
.cfi_endproc
.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
___
} } }
$code.=<<___;
.align 64
.Lconst:
.Lmask24:
.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long 2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
.L2_44_inp_permd:
.long 0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad 0,12,24,64
.L2_44_mask:
.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad 44,44,42,64
.L2_44_shift_lft:
.quad 8,8,10,64
.align 64
.Lx_mask44:
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
$code.=<<___;
.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___
{ # chacha20-poly1305 helpers
my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
                                  ("%rdi","%rsi","%rdx","%rcx"); # Unix order
$code.=<<___;
.globl xor128_encrypt_n_pad
.type xor128_encrypt_n_pad,\@abi-omnipotent
.align 16
xor128_encrypt_n_pad:
.cfi_startproc
	sub $otp,$inp
	sub $otp,$out
	mov $len,%r10 # put len aside
	shr \$4,$len # len / 16
	jz .Ltail_enc
	nop
.Loop_enc_xmm:
	movdqu ($inp,$otp),%xmm0
	pxor ($otp),%xmm0
	movdqu %xmm0,($out,$otp)
	movdqa %xmm0,($otp)
	lea 16($otp),$otp
	dec $len
	jnz .Loop_enc_xmm
	and \$15,%r10 # len % 16
	jz .Ldone_enc
.Ltail_enc:
	mov \$16,$len
	sub %r10,$len
	xor %eax,%eax
.Loop_enc_byte:
	mov ($inp,$otp),%al
	xor ($otp),%al
	mov %al,($out,$otp)
	mov %al,($otp)
	lea 1($otp),$otp
	dec %r10
	jnz .Loop_enc_byte
	xor %eax,%eax
.Loop_enc_pad:
	mov %al,($otp)
	lea 1($otp),$otp
	dec $len
	jnz .Loop_enc_pad
.Ldone_enc:
	mov $otp,%rax
	ret
.cfi_endproc
.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
.globl xor128_decrypt_n_pad
.type xor128_decrypt_n_pad,\@abi-omnipotent
.align 16
xor128_decrypt_n_pad:
.cfi_startproc
	sub $otp,$inp
	sub $otp,$out
	mov $len,%r10 # put len aside
	shr \$4,$len # len / 16
	jz .Ltail_dec
	nop
.Loop_dec_xmm:
	movdqu ($inp,$otp),%xmm0
	movdqa ($otp),%xmm1
	pxor %xmm0,%xmm1
	movdqu %xmm1,($out,$otp)
	movdqa %xmm0,($otp)
	lea 16($otp),$otp
	dec $len
	jnz .Loop_dec_xmm
	pxor %xmm1,%xmm1
	and \$15,%r10 # len % 16
	jz .Ldone_dec
.Ltail_dec:
	mov \$16,$len
	sub %r10,$len
	xor %eax,%eax
	xor %r11,%r11
.Loop_dec_byte:
	mov ($inp,$otp),%r11b
	mov ($otp),%al
	xor %r11b,%al
	mov %al,($out,$otp)
	mov %r11b,($otp)
	lea 1($otp),$otp
	dec %r10
	jnz .Loop_dec_byte
	xor %eax,%eax
.Loop_dec_pad:
	mov %al,($otp)
	lea 1($otp),$otp
	dec $len
	jnz .Loop_dec_pad
.Ldone_dec:
	mov $otp,%rax
	ret
.cfi_endproc
.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
___
}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
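# Both handlers follow the usual perlasm pattern: if the faulting RIP
# lies between the prologue and epilogue labels supplied in
# HandlerData[], the non-volatile state saved by the function's prologue
# is copied back into the CONTEXT record (general-purpose registers for
# se_handler, the stacked xmm6-xmm15 and the caller's rsp for
# avx_handler); the common tail then hands the adjusted context to
# RtlVirtualUnwind to continue unwinding.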
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp
	mov 120($context),%rax # pull context->Rax
	mov 248($context),%rbx # pull context->Rip
	mov 8($disp),%rsi # disp->ImageBase
	mov 56($disp),%r11 # disp->HandlerData
	mov 0(%r11),%r10d # HandlerData[0]
	lea (%rsi,%r10),%r10 # prologue label
	cmp %r10,%rbx # context->Rip<.Lprologue
	jb .Lcommon_seh_tail
	mov 152($context),%rax # pull context->Rsp
	mov 4(%r11),%r10d # HandlerData[1]
	lea (%rsi,%r10),%r10 # epilogue label
	cmp %r10,%rbx # context->Rip>=.Lepilogue
	jae .Lcommon_seh_tail
	lea 48(%rax),%rax
	mov -8(%rax),%rbx
	mov -16(%rax),%rbp
	mov -24(%rax),%r12
	mov -32(%rax),%r13
	mov -40(%rax),%r14
	mov -48(%rax),%r15
	mov %rbx,144($context) # restore context->Rbx
	mov %rbp,160($context) # restore context->Rbp
	mov %r12,216($context) # restore context->R12
	mov %r13,224($context) # restore context->R13
	mov %r14,232($context) # restore context->R14
	mov %r15,240($context) # restore context->R15
	jmp .Lcommon_seh_tail
.size se_handler,.-se_handler
.type avx_handler,\@abi-omnipotent
.align 16
avx_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp
	mov 120($context),%rax # pull context->Rax
	mov 248($context),%rbx # pull context->Rip
	mov 8($disp),%rsi # disp->ImageBase
	mov 56($disp),%r11 # disp->HandlerData
	mov 0(%r11),%r10d # HandlerData[0]
	lea (%rsi,%r10),%r10 # prologue label
	cmp %r10,%rbx # context->Rip<prologue label
	jb .Lcommon_seh_tail
	mov 152($context),%rax # pull context->Rsp
	mov 4(%r11),%r10d # HandlerData[1]
	lea (%rsi,%r10),%r10 # epilogue label
	cmp %r10,%rbx # context->Rip>=epilogue label
	jae .Lcommon_seh_tail
	mov 208($context),%rax # pull context->R11
	lea 0x50(%rax),%rsi
	lea 0xf8(%rax),%rax
	lea 512($context),%rdi # &context.Xmm6
	mov \$20,%ecx
	.long 0xa548f3fc # cld; rep movsq
.Lcommon_seh_tail:
	mov 8(%rax),%rdi
	mov 16(%rax),%rsi
	mov %rax,152($context) # restore context->Rsp
	mov %rsi,168($context) # restore context->Rsi
	mov %rdi,176($context) # restore context->Rdi
	mov 40($disp),%rdi # disp->ContextRecord
	mov $context,%rsi # context
	mov \$154,%ecx # sizeof(CONTEXT)
	.long 0xa548f3fc # cld; rep movsq
	mov $disp,%rsi
	xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
	mov 8(%rsi),%rdx # arg2, disp->ImageBase
	mov 0(%rsi),%r8 # arg3, disp->ControlPc
	mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
	mov 40(%rsi),%r10 # disp->ContextRecord
	lea 56(%rsi),%r11 # &disp->HandlerData
	lea 24(%rsi),%r12 # &disp->EstablisherFrame
	mov %r10,32(%rsp) # arg5
	mov %r11,40(%rsp) # arg6
	mov %r12,48(%rsp) # arg7
	mov %rcx,56(%rsp) # arg8, (NULL)
	call *__imp_RtlVirtualUnwind(%rip)
	mov \$1,%eax # ExceptionContinueSearch
	add \$64,%rsp
	popfq
	pop %r15
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	pop %rdi
	pop %rsi
	ret
.size avx_handler,.-avx_handler
.section .pdata
.align 4
.rva .LSEH_begin_poly1305_init
.rva .LSEH_end_poly1305_init
.rva .LSEH_info_poly1305_init
.rva .LSEH_begin_poly1305_blocks
.rva .LSEH_end_poly1305_blocks
.rva .LSEH_info_poly1305_blocks
.rva .LSEH_begin_poly1305_emit
.rva .LSEH_end_poly1305_emit
.rva .LSEH_info_poly1305_emit
___
$code.=<<___ if ($avx);
.rva .LSEH_begin_poly1305_blocks_avx
.rva .Lbase2_64_avx
.rva .LSEH_info_poly1305_blocks_avx_1
.rva .Lbase2_64_avx
.rva .Leven_avx
.rva .LSEH_info_poly1305_blocks_avx_2
.rva .Leven_avx
.rva .LSEH_end_poly1305_blocks_avx
.rva .LSEH_info_poly1305_blocks_avx_3
.rva .LSEH_begin_poly1305_emit_avx
.rva .LSEH_end_poly1305_emit_avx
.rva .LSEH_info_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_poly1305_blocks_avx2
.rva .Lbase2_64_avx2
.rva .LSEH_info_poly1305_blocks_avx2_1
.rva .Lbase2_64_avx2
.rva .Leven_avx2
.rva .LSEH_info_poly1305_blocks_avx2_2
.rva .Leven_avx2
.rva .LSEH_end_poly1305_blocks_avx2
.rva .LSEH_info_poly1305_blocks_avx2_3
___
$code.=<<___ if ($avx>2);
.rva .LSEH_begin_poly1305_blocks_avx512
.rva .LSEH_end_poly1305_blocks_avx512
.rva .LSEH_info_poly1305_blocks_avx512
___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_poly1305_init:
.byte 9,0,0,0
.rva se_handler
.rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
.LSEH_info_poly1305_blocks:
.byte 9,0,0,0
.rva se_handler
.rva .Lblocks_body,.Lblocks_epilogue
.LSEH_info_poly1305_emit:
.byte 9,0,0,0
.rva se_handler
.rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
___
$code.=<<___ if ($avx);
.LSEH_info_poly1305_blocks_avx_1:
.byte 9,0,0,0
.rva se_handler
.rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx_2:
.byte 9,0,0,0
.rva se_handler
.rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx_3:
.byte 9,0,0,0
.rva avx_handler
.rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
.LSEH_info_poly1305_emit_avx:
.byte 9,0,0,0
.rva se_handler
.rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.LSEH_info_poly1305_blocks_avx2_1:
.byte 9,0,0,0
.rva se_handler
.rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx2_2:
.byte 9,0,0,0
.rva se_handler
.rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx2_3:
.byte 9,0,0,0
.rva avx_handler
.rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
___
$code.=<<___ if ($avx>2);
.LSEH_info_poly1305_blocks_avx512:
.byte 9,0,0,0
.rva avx_handler
.rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
___
}
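# Final pass over the generated code: evaluate back-quoted Perl
# expressions (such as the 1<<24 constants), rewrite the "#d" suffix
# into 32-bit register names (%rax#d becomes %eax, %r10#d becomes %r10d),
# and resolve the %x#/%y#/%z# prefixes that select the xmm/ymm/zmm form
# of a register variable.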
foreach (split('\n',$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	s/%r([a-z]+)#d/%e$1/g;
	s/%r([0-9]+)#d/%r$1d/g;
	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";