fwd_dct32x32_impl_sse2.h

/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "vpx_dsp/fwd_txfm.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

// TODO(jingning) The high bit-depth version needs re-work for performance.
// The current SSE2 implementation also references static functions in the C
// implementation file.
#if DCT_HIGH_BIT_DEPTH
#define ADD_EPI16 _mm_adds_epi16
#define SUB_EPI16 _mm_subs_epi16
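// Note: the saturating variants are used so that an overflowing 16-bit lane
// pins at INT16_MAX / INT16_MIN instead of wrapping around; the
// check_epi16_overflow_* helpers used throughout this file look for those
// pinned values and trigger the fallback to the C implementation.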
#if FDCT32x32_HIGH_PRECISION
static void vpx_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
  int i, j;
  for (i = 0; i < 32; ++i) {
    tran_high_t temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
    vpx_fdct32(temp_in, temp_out, 0);
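    // The rounding below computes (x + 1 + (x < 0)) >> 2: a sign-symmetric
    // round-to-nearest divide by 4. For example, x = 6 gives (6 + 1) >> 2 = 1
    // while x = -6 gives (-6 + 1 + 1) >> 2 = -1, so inputs of equal magnitude
    // round to equal magnitudes, unlike a plain arithmetic shift.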
    for (j = 0; j < 32; ++j)
      out[j + i * 32] =
          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
  }
}
#define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_c
#define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rows_c
#else
static void vpx_fdct32x32_rd_rows_c(const int16_t *intermediate,
                                    tran_low_t *out) {
  int i, j;
  for (i = 0; i < 32; ++i) {
    tran_high_t temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
    vpx_fdct32(temp_in, temp_out, 1);
    for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
  }
}
#define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_rd_c
#define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rd_rows_c
#endif  // FDCT32x32_HIGH_PRECISION
#else
#define ADD_EPI16 _mm_add_epi16
#define SUB_EPI16 _mm_sub_epi16
#endif  // DCT_HIGH_BIT_DEPTH

void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
  // Calculate pre-multiplied strides
  const int str1 = stride;
  const int str2 = 2 * stride;
  const int str3 = 2 * stride + str1;
  // We need an intermediate buffer between passes.
  DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
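  //    For example, _mm_madd_epi16(_mm_unpacklo_epi16(va, vb),
  //    pair_set_epi16(c0, c1)) produces va * c0 + vb * c1 in each 32-bit
  //    lane: one half of a DCT butterfly. This is how every k__cospi_*
  //    constant below is consumed.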
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
  const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
  const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
  const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
  const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
  const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
  const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
  const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
  const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
  const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
  const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
  const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
  const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
  const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
  const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i kOne = _mm_set1_epi16(1);
  // Do the two transform/transpose passes
  int pass;
#if DCT_HIGH_BIT_DEPTH
  int overflow;
#endif
  for (pass = 0; pass < 2; ++pass) {
    // We process eight columns (transposed rows in second pass) at a time.
    int column_start;
    for (column_start = 0; column_start < 32; column_start += 8) {
      __m128i step1[32];
      __m128i step2[32];
      __m128i step3[32];
      __m128i out[32];
      // Stage 1
      // Note: even though all the loads below are aligned, using the aligned
      // intrinsic makes the code slightly slower.
      if (0 == pass) {
        const int16_t *in = &input[column_start];
        // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
        // Note: the next four blocks could be in a loop. That would help the
        // instruction cache but is actually slower.
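        // Note: step1b points at step1[31], so step1b[-k] writes
        // step1[31 - k]; each block below produces four sum rows
        // (step1a[0..3]) and their four mirrored difference rows
        // (step1b[-3..0]), pre-scaled by << 2.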
        {
          const int16_t *ina = in + 0 * str1;
          const int16_t *inb = in + 31 * str1;
          __m128i *step1a = &step1[0];
          __m128i *step1b = &step1[31];
          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
          step1a[0] = _mm_add_epi16(ina0, inb0);
          step1a[1] = _mm_add_epi16(ina1, inb1);
          step1a[2] = _mm_add_epi16(ina2, inb2);
          step1a[3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[0] = _mm_slli_epi16(step1a[0], 2);
          step1a[1] = _mm_slli_epi16(step1a[1], 2);
          step1a[2] = _mm_slli_epi16(step1a[2], 2);
          step1a[3] = _mm_slli_epi16(step1a[3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
        {
          const int16_t *ina = in + 4 * str1;
          const int16_t *inb = in + 27 * str1;
          __m128i *step1a = &step1[4];
          __m128i *step1b = &step1[27];
          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
          step1a[0] = _mm_add_epi16(ina0, inb0);
          step1a[1] = _mm_add_epi16(ina1, inb1);
          step1a[2] = _mm_add_epi16(ina2, inb2);
          step1a[3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[0] = _mm_slli_epi16(step1a[0], 2);
          step1a[1] = _mm_slli_epi16(step1a[1], 2);
          step1a[2] = _mm_slli_epi16(step1a[2], 2);
          step1a[3] = _mm_slli_epi16(step1a[3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
        {
          const int16_t *ina = in + 8 * str1;
          const int16_t *inb = in + 23 * str1;
          __m128i *step1a = &step1[8];
          __m128i *step1b = &step1[23];
          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
          step1a[0] = _mm_add_epi16(ina0, inb0);
          step1a[1] = _mm_add_epi16(ina1, inb1);
          step1a[2] = _mm_add_epi16(ina2, inb2);
          step1a[3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[0] = _mm_slli_epi16(step1a[0], 2);
          step1a[1] = _mm_slli_epi16(step1a[1], 2);
          step1a[2] = _mm_slli_epi16(step1a[2], 2);
          step1a[3] = _mm_slli_epi16(step1a[3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
        {
          const int16_t *ina = in + 12 * str1;
          const int16_t *inb = in + 19 * str1;
          __m128i *step1a = &step1[12];
          __m128i *step1b = &step1[19];
          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
          step1a[0] = _mm_add_epi16(ina0, inb0);
          step1a[1] = _mm_add_epi16(ina1, inb1);
          step1a[2] = _mm_add_epi16(ina2, inb2);
          step1a[3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[0] = _mm_slli_epi16(step1a[0], 2);
          step1a[1] = _mm_slli_epi16(step1a[1], 2);
          step1a[2] = _mm_slli_epi16(step1a[2], 2);
          step1a[3] = _mm_slli_epi16(step1a[3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
      } else {
        int16_t *in = &intermediate[column_start];
        // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
        // Note: using the same approach as above to have common offset is
        // counter-productive as all offsets can be calculated at compile
        // time.
        // Note: the next four blocks could be in a loop. That would help the
        // instruction cache but is actually slower.
        {
          __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
          __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
          __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
          __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
          __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
          __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
          __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
          __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
          step1[0] = ADD_EPI16(in00, in31);
          step1[1] = ADD_EPI16(in01, in30);
          step1[2] = ADD_EPI16(in02, in29);
          step1[3] = ADD_EPI16(in03, in28);
          step1[28] = SUB_EPI16(in03, in28);
          step1[29] = SUB_EPI16(in02, in29);
          step1[30] = SUB_EPI16(in01, in30);
          step1[31] = SUB_EPI16(in00, in31);
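          // Note: in high bit-depth builds the ADD/SUB above saturate rather
          // than wrap, so a pinned lane means the result no longer fits in
          // 16 bits; the guard below then redoes the row pass in C and
          // returns early. The same guard pattern repeats after every stage.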
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
                                             &step1[3], &step1[28], &step1[29],
                                             &step1[30], &step1[31]);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
        {
          __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
          __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
          __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
          __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
          __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
          __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
          __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
          __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
          step1[4] = ADD_EPI16(in04, in27);
          step1[5] = ADD_EPI16(in05, in26);
          step1[6] = ADD_EPI16(in06, in25);
          step1[7] = ADD_EPI16(in07, in24);
          step1[24] = SUB_EPI16(in07, in24);
          step1[25] = SUB_EPI16(in06, in25);
          step1[26] = SUB_EPI16(in05, in26);
          step1[27] = SUB_EPI16(in04, in27);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
                                             &step1[7], &step1[24], &step1[25],
                                             &step1[26], &step1[27]);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
        {
          __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
          __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
          __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
          __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
          __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
          __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
          __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
          __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
          step1[8] = ADD_EPI16(in08, in23);
          step1[9] = ADD_EPI16(in09, in22);
          step1[10] = ADD_EPI16(in10, in21);
          step1[11] = ADD_EPI16(in11, in20);
          step1[20] = SUB_EPI16(in11, in20);
          step1[21] = SUB_EPI16(in10, in21);
          step1[22] = SUB_EPI16(in09, in22);
          step1[23] = SUB_EPI16(in08, in23);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
                                             &step1[11], &step1[20], &step1[21],
                                             &step1[22], &step1[23]);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
        {
          __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
          __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
          __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
          __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
          __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
          __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
          __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
          __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
          step1[12] = ADD_EPI16(in12, in19);
          step1[13] = ADD_EPI16(in13, in18);
          step1[14] = ADD_EPI16(in14, in17);
          step1[15] = ADD_EPI16(in15, in16);
          step1[16] = SUB_EPI16(in15, in16);
          step1[17] = SUB_EPI16(in14, in17);
          step1[18] = SUB_EPI16(in13, in18);
          step1[19] = SUB_EPI16(in12, in19);
#if DCT_HIGH_BIT_DEPTH
          overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
                                             &step1[15], &step1[16], &step1[17],
                                             &step1[18], &step1[19]);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
      }
      // Stage 2
      {
        step2[0] = ADD_EPI16(step1[0], step1[15]);
        step2[1] = ADD_EPI16(step1[1], step1[14]);
        step2[2] = ADD_EPI16(step1[2], step1[13]);
        step2[3] = ADD_EPI16(step1[3], step1[12]);
        step2[4] = ADD_EPI16(step1[4], step1[11]);
        step2[5] = ADD_EPI16(step1[5], step1[10]);
        step2[6] = ADD_EPI16(step1[6], step1[9]);
        step2[7] = ADD_EPI16(step1[7], step1[8]);
        step2[8] = SUB_EPI16(step1[7], step1[8]);
        step2[9] = SUB_EPI16(step1[6], step1[9]);
        step2[10] = SUB_EPI16(step1[5], step1[10]);
        step2[11] = SUB_EPI16(step1[4], step1[11]);
        step2[12] = SUB_EPI16(step1[3], step1[12]);
        step2[13] = SUB_EPI16(step1[2], step1[13]);
        step2[14] = SUB_EPI16(step1[1], step1[14]);
        step2[15] = SUB_EPI16(step1[0], step1[15]);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x16(
            &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
            &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
            &step2[12], &step2[13], &step2[14], &step2[15]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
      {
        const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
        const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
        const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
        const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
        const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
        const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
        const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
        const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
        const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
        const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
        const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
        const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
        const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
        const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
        const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
        const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
        const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
        const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
        const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
        const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
        const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
        const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
        const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
        const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
        // dct_const_round_shift
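        // Note: dct_const_round_shift(x) is (x + DCT_CONST_ROUNDING) >>
        // DCT_CONST_BITS, with DCT_CONST_ROUNDING == 1 << (DCT_CONST_BITS - 1):
        // round to nearest, then drop the DCT_CONST_BITS (14 in libvpx)
        // fraction bits carried by the cospi_*_64 constants.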
        const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
        const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
        const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
        const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
        const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
        const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
        const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
        const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
        const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
        const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
        const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
        const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
        const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
        const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
        const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
        const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
        // Combine
        step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
        step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
        step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
        step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
        step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
        step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
        step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
        step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
                                           &step2[23], &step2[24], &step2[25],
                                           &step2[26], &step2[27]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
#if !FDCT32x32_HIGH_PRECISION
      // Scale the magnitude down (right-shift by 2 with symmetric rounding)
      // so that the intermediate values stay within the range of 16 bits.
      if (1 == pass) {
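        // Note: _mm_cmplt_epi16(x, kZero) is 0xFFFF (-1) in negative lanes
        // and 0 elsewhere, so SUB_EPI16(x, mask) adds 1 only to the negative
        // lanes; combined with the kOne add and the arithmetic shift further
        // down, this evaluates (x + 1 + (x < 0)) >> 2 per lane.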
        __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero);
        __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero);
        __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero);
        __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero);
        __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero);
        __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero);
        __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero);
        __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero);
        __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero);
        __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero);
        __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
        __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
        __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
        __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
        __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
        __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
        __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
        __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
        __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
        __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
        __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
        __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
        __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
        __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
        __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
        __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
        __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
        __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
        __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
        __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
        __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
        __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
        step2[0] = SUB_EPI16(step2[0], s3_00_0);
        step2[1] = SUB_EPI16(step2[1], s3_01_0);
        step2[2] = SUB_EPI16(step2[2], s3_02_0);
        step2[3] = SUB_EPI16(step2[3], s3_03_0);
        step2[4] = SUB_EPI16(step2[4], s3_04_0);
        step2[5] = SUB_EPI16(step2[5], s3_05_0);
        step2[6] = SUB_EPI16(step2[6], s3_06_0);
        step2[7] = SUB_EPI16(step2[7], s3_07_0);
        step2[8] = SUB_EPI16(step2[8], s2_08_0);
        step2[9] = SUB_EPI16(step2[9], s2_09_0);
        step2[10] = SUB_EPI16(step2[10], s3_10_0);
        step2[11] = SUB_EPI16(step2[11], s3_11_0);
        step2[12] = SUB_EPI16(step2[12], s3_12_0);
        step2[13] = SUB_EPI16(step2[13], s3_13_0);
        step2[14] = SUB_EPI16(step2[14], s2_14_0);
        step2[15] = SUB_EPI16(step2[15], s2_15_0);
        step1[16] = SUB_EPI16(step1[16], s3_16_0);
        step1[17] = SUB_EPI16(step1[17], s3_17_0);
        step1[18] = SUB_EPI16(step1[18], s3_18_0);
        step1[19] = SUB_EPI16(step1[19], s3_19_0);
        step2[20] = SUB_EPI16(step2[20], s3_20_0);
        step2[21] = SUB_EPI16(step2[21], s3_21_0);
        step2[22] = SUB_EPI16(step2[22], s3_22_0);
        step2[23] = SUB_EPI16(step2[23], s3_23_0);
        step2[24] = SUB_EPI16(step2[24], s3_24_0);
        step2[25] = SUB_EPI16(step2[25], s3_25_0);
        step2[26] = SUB_EPI16(step2[26], s3_26_0);
        step2[27] = SUB_EPI16(step2[27], s3_27_0);
        step1[28] = SUB_EPI16(step1[28], s3_28_0);
        step1[29] = SUB_EPI16(step1[29], s3_29_0);
        step1[30] = SUB_EPI16(step1[30], s3_30_0);
        step1[31] = SUB_EPI16(step1[31], s3_31_0);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x32(
            &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
            &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
            &step2[12], &step2[13], &step2[14], &step2[15], &step1[16],
            &step1[17], &step1[18], &step1[19], &step2[20], &step2[21],
            &step2[22], &step2[23], &step2[24], &step2[25], &step2[26],
            &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]);
        if (overflow) {
          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
        step2[0] = _mm_add_epi16(step2[0], kOne);
        step2[1] = _mm_add_epi16(step2[1], kOne);
        step2[2] = _mm_add_epi16(step2[2], kOne);
        step2[3] = _mm_add_epi16(step2[3], kOne);
        step2[4] = _mm_add_epi16(step2[4], kOne);
        step2[5] = _mm_add_epi16(step2[5], kOne);
        step2[6] = _mm_add_epi16(step2[6], kOne);
        step2[7] = _mm_add_epi16(step2[7], kOne);
        step2[8] = _mm_add_epi16(step2[8], kOne);
        step2[9] = _mm_add_epi16(step2[9], kOne);
        step2[10] = _mm_add_epi16(step2[10], kOne);
        step2[11] = _mm_add_epi16(step2[11], kOne);
        step2[12] = _mm_add_epi16(step2[12], kOne);
        step2[13] = _mm_add_epi16(step2[13], kOne);
        step2[14] = _mm_add_epi16(step2[14], kOne);
        step2[15] = _mm_add_epi16(step2[15], kOne);
        step1[16] = _mm_add_epi16(step1[16], kOne);
        step1[17] = _mm_add_epi16(step1[17], kOne);
        step1[18] = _mm_add_epi16(step1[18], kOne);
        step1[19] = _mm_add_epi16(step1[19], kOne);
        step2[20] = _mm_add_epi16(step2[20], kOne);
        step2[21] = _mm_add_epi16(step2[21], kOne);
        step2[22] = _mm_add_epi16(step2[22], kOne);
        step2[23] = _mm_add_epi16(step2[23], kOne);
        step2[24] = _mm_add_epi16(step2[24], kOne);
        step2[25] = _mm_add_epi16(step2[25], kOne);
        step2[26] = _mm_add_epi16(step2[26], kOne);
        step2[27] = _mm_add_epi16(step2[27], kOne);
        step1[28] = _mm_add_epi16(step1[28], kOne);
        step1[29] = _mm_add_epi16(step1[29], kOne);
        step1[30] = _mm_add_epi16(step1[30], kOne);
        step1[31] = _mm_add_epi16(step1[31], kOne);
        step2[0] = _mm_srai_epi16(step2[0], 2);
        step2[1] = _mm_srai_epi16(step2[1], 2);
        step2[2] = _mm_srai_epi16(step2[2], 2);
        step2[3] = _mm_srai_epi16(step2[3], 2);
        step2[4] = _mm_srai_epi16(step2[4], 2);
        step2[5] = _mm_srai_epi16(step2[5], 2);
        step2[6] = _mm_srai_epi16(step2[6], 2);
        step2[7] = _mm_srai_epi16(step2[7], 2);
        step2[8] = _mm_srai_epi16(step2[8], 2);
        step2[9] = _mm_srai_epi16(step2[9], 2);
        step2[10] = _mm_srai_epi16(step2[10], 2);
        step2[11] = _mm_srai_epi16(step2[11], 2);
        step2[12] = _mm_srai_epi16(step2[12], 2);
        step2[13] = _mm_srai_epi16(step2[13], 2);
        step2[14] = _mm_srai_epi16(step2[14], 2);
        step2[15] = _mm_srai_epi16(step2[15], 2);
        step1[16] = _mm_srai_epi16(step1[16], 2);
        step1[17] = _mm_srai_epi16(step1[17], 2);
        step1[18] = _mm_srai_epi16(step1[18], 2);
        step1[19] = _mm_srai_epi16(step1[19], 2);
        step2[20] = _mm_srai_epi16(step2[20], 2);
        step2[21] = _mm_srai_epi16(step2[21], 2);
        step2[22] = _mm_srai_epi16(step2[22], 2);
        step2[23] = _mm_srai_epi16(step2[23], 2);
        step2[24] = _mm_srai_epi16(step2[24], 2);
        step2[25] = _mm_srai_epi16(step2[25], 2);
        step2[26] = _mm_srai_epi16(step2[26], 2);
        step2[27] = _mm_srai_epi16(step2[27], 2);
        step1[28] = _mm_srai_epi16(step1[28], 2);
        step1[29] = _mm_srai_epi16(step1[29], 2);
        step1[30] = _mm_srai_epi16(step1[30], 2);
        step1[31] = _mm_srai_epi16(step1[31], 2);
      }
#endif  // !FDCT32x32_HIGH_PRECISION
#if FDCT32x32_HIGH_PRECISION
      if (pass == 0) {
#endif
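        // Note: in the high-precision build the 16-bit stages below run only
        // for the first (column) pass; the second pass takes a separate,
        // wider-precision path whose matching else appears later in the full
        // file.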
  621. // Stage 3
  622. {
  623. step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
  624. step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
  625. step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
  626. step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
  627. step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
  628. step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
  629. step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
  630. step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
  631. #if DCT_HIGH_BIT_DEPTH
  632. overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
  633. &step3[3], &step3[4], &step3[5],
  634. &step3[6], &step3[7]);
  635. if (overflow) {
  636. if (pass == 0)
  637. HIGH_FDCT32x32_2D_C(input, output_org, stride);
  638. else
  639. HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
  640. return;
  641. }
  642. #endif // DCT_HIGH_BIT_DEPTH
  643. }
  644. {
  645. const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
  646. const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
  647. const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
  648. const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
  649. const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
  650. const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
  651. const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
  652. const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
  653. const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
  654. const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
  655. const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
  656. const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
  657. // dct_const_round_shift
  658. const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
  659. const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
  660. const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
  661. const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
  662. const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
  663. const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
  664. const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
  665. const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
  666. const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
  667. const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
  668. const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
  669. const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
  670. const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
  671. const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
  672. const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
  673. const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
  674. // Combine
  675. step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
  676. step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
  677. step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
  678. step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
  679. #if DCT_HIGH_BIT_DEPTH
  680. overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12],
  681. &step3[13]);
  682. if (overflow) {
  683. if (pass == 0)
  684. HIGH_FDCT32x32_2D_C(input, output_org, stride);
  685. else
  686. HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
  687. return;
  688. }
  689. #endif // DCT_HIGH_BIT_DEPTH
  690. }
  691. {
  692. step3[16] = ADD_EPI16(step2[23], step1[16]);
  693. step3[17] = ADD_EPI16(step2[22], step1[17]);
  694. step3[18] = ADD_EPI16(step2[21], step1[18]);
  695. step3[19] = ADD_EPI16(step2[20], step1[19]);
  696. step3[20] = SUB_EPI16(step1[19], step2[20]);
  697. step3[21] = SUB_EPI16(step1[18], step2[21]);
  698. step3[22] = SUB_EPI16(step1[17], step2[22]);
  699. step3[23] = SUB_EPI16(step1[16], step2[23]);
  700. step3[24] = SUB_EPI16(step1[31], step2[24]);
  701. step3[25] = SUB_EPI16(step1[30], step2[25]);
  702. step3[26] = SUB_EPI16(step1[29], step2[26]);
  703. step3[27] = SUB_EPI16(step1[28], step2[27]);
  704. step3[28] = ADD_EPI16(step2[27], step1[28]);
  705. step3[29] = ADD_EPI16(step2[26], step1[29]);
  706. step3[30] = ADD_EPI16(step2[25], step1[30]);
  707. step3[31] = ADD_EPI16(step2[24], step1[31]);
  708. #if DCT_HIGH_BIT_DEPTH
  709. overflow = check_epi16_overflow_x16(
  710. &step3[16], &step3[17], &step3[18], &step3[19], &step3[20],
  711. &step3[21], &step3[22], &step3[23], &step3[24], &step3[25],
  712. &step3[26], &step3[27], &step3[28], &step3[29], &step3[30],
  713. &step3[31]);
  714. if (overflow) {
  715. if (pass == 0)
  716. HIGH_FDCT32x32_2D_C(input, output_org, stride);
  717. else
  718. HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
  719. return;
  720. }
  721. #endif // DCT_HIGH_BIT_DEPTH
  722. }
  723. // Stage 4
  724. {
  725. step1[0] = ADD_EPI16(step3[3], step3[0]);
  726. step1[1] = ADD_EPI16(step3[2], step3[1]);
  727. step1[2] = SUB_EPI16(step3[1], step3[2]);
  728. step1[3] = SUB_EPI16(step3[0], step3[3]);
  729. step1[8] = ADD_EPI16(step3[11], step2[8]);
  730. step1[9] = ADD_EPI16(step3[10], step2[9]);
  731. step1[10] = SUB_EPI16(step2[9], step3[10]);
  732. step1[11] = SUB_EPI16(step2[8], step3[11]);
  733. step1[12] = SUB_EPI16(step2[15], step3[12]);
  734. step1[13] = SUB_EPI16(step2[14], step3[13]);
  735. step1[14] = ADD_EPI16(step3[13], step2[14]);
  736. step1[15] = ADD_EPI16(step3[12], step2[15]);
  737. #if DCT_HIGH_BIT_DEPTH
  738. overflow = check_epi16_overflow_x16(
  739. &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5],
  740. &step1[6], &step1[7], &step1[8], &step1[9], &step1[10],
  741. &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]);
  742. if (overflow) {
  743. if (pass == 0)
  744. HIGH_FDCT32x32_2D_C(input, output_org, stride);
  745. else
  746. HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
  747. return;
  748. }
  749. #endif // DCT_HIGH_BIT_DEPTH
  750. }
  751. {
  752. const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
  753. const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
  754. const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
  755. const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
  756. const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
  757. const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
  758. // dct_const_round_shift
  759. const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
  760. const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
  761. const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
  762. const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
  763. const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
  764. const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
  765. const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
  766. const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
  767. // Combine
  768. step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
  769. step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
  770. #if DCT_HIGH_BIT_DEPTH
  771. overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
  772. if (overflow) {
  773. if (pass == 0)
  774. HIGH_FDCT32x32_2D_C(input, output_org, stride);
  775. else
  776. HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
  777. return;
  778. }
  779. #endif // DCT_HIGH_BIT_DEPTH
  780. }
  781. {
  782. const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
  783. const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
  784. const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
  785. const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
  786. const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
  787. const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
  788. const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
  789. const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
  790. const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
  791. const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
  792. const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
  793. const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
  794. const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
  795. const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
  796. const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
  797. const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
  798. const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
  799. const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
  800. const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
  801. const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
  802. const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
  803. const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
  804. const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
  805. const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
  806. // dct_const_round_shift
  807. const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
  808. const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
  809. const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
  810. const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
  811. const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
  812. const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
  813. const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
  814. const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
  815. const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
  816. const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
  817. const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
  818. const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
  819. const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
  820. const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
  821. const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
  822. const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
  823. const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
  824. const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
  825. const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
  826. const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
  827. const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
  828. const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
  829. const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
  830. const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
  831. const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
  832. const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
  833. const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
  834. const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
  835. const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
  836. const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
  837. const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
  838. const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
  839. // Combine
  840. step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
  841. step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
  842. step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
  843. step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
  844. step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
  845. step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
  846. step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
  847. step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
  848. #if DCT_HIGH_BIT_DEPTH
  849. overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
  850. &step1[21], &step1[26], &step1[27],
  851. &step1[28], &step1[29]);
  852. if (overflow) {
  853. if (pass == 0)
  854. HIGH_FDCT32x32_2D_C(input, output_org, stride);
  855. else
  856. HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
  857. return;
  858. }
  859. #endif // DCT_HIGH_BIT_DEPTH
  860. }
      // Stage 5
      {
        step2[4] = ADD_EPI16(step1[5], step3[4]);
        step2[5] = SUB_EPI16(step3[4], step1[5]);
        step2[6] = SUB_EPI16(step3[7], step1[6]);
        step2[7] = ADD_EPI16(step1[6], step3[7]);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6],
                                           &step2[7]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
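      // Note on the DCT_HIGH_BIT_DEPTH guards: whenever a 16-bit intermediate
      // overflows, the SIMD result is abandoned and the transform is redone
      // in C -- the full 2-D transform if we are still in the first (column)
      // pass, or just the rows pass otherwise.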
      {
        const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
        const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
        const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
        const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
        const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
        const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
        const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
        const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
        const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
        const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
        const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
        const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
        // dct_const_round_shift
        const __m128i out_00_4 =
            _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
        const __m128i out_00_5 =
            _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
        const __m128i out_16_4 =
            _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
        const __m128i out_16_5 =
            _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
        const __m128i out_08_4 =
            _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
        const __m128i out_08_5 =
            _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
        const __m128i out_24_4 =
            _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
        const __m128i out_24_5 =
            _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
        const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
        const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
        const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
        const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
        const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
        const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
        const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
        const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
        // Combine
        out[0] = _mm_packs_epi32(out_00_6, out_00_7);
        out[16] = _mm_packs_epi32(out_16_6, out_16_7);
        out[8] = _mm_packs_epi32(out_08_6, out_08_7);
        out[24] = _mm_packs_epi32(out_24_6, out_24_7);
#if DCT_HIGH_BIT_DEPTH
        overflow =
            check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
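      // Coefficients 0, 8, 16 and 24 are already final here: the even half of
      // the 32-point transform needs fewer butterfly stages, so the outputs
      // whose indices are multiples of 8 retire at stage 5, those congruent
      // to 4 mod 8 at stage 6, and so on down to the odd indices last.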
      {
        const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
        const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
        const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
        const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
        const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
        const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
        const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
        const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
        const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
        const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
        const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
        const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
        // dct_const_round_shift
        const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
        const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
        const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
        const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
        const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
        const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
        const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
        const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
        // Combine
        step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
        step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
        step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
        step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13],
                                           &step2[14]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
      {
        step2[16] = ADD_EPI16(step1[19], step3[16]);
        step2[17] = ADD_EPI16(step1[18], step3[17]);
        step2[18] = SUB_EPI16(step3[17], step1[18]);
        step2[19] = SUB_EPI16(step3[16], step1[19]);
        step2[20] = SUB_EPI16(step3[23], step1[20]);
        step2[21] = SUB_EPI16(step3[22], step1[21]);
        step2[22] = ADD_EPI16(step1[21], step3[22]);
        step2[23] = ADD_EPI16(step1[20], step3[23]);
        step2[24] = ADD_EPI16(step1[27], step3[24]);
        step2[25] = ADD_EPI16(step1[26], step3[25]);
        step2[26] = SUB_EPI16(step3[25], step1[26]);
        step2[27] = SUB_EPI16(step3[24], step1[27]);
        step2[28] = SUB_EPI16(step3[31], step1[28]);
        step2[29] = SUB_EPI16(step3[30], step1[29]);
        step2[30] = ADD_EPI16(step1[29], step3[30]);
        step2[31] = ADD_EPI16(step1[28], step3[31]);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x16(
            &step2[16], &step2[17], &step2[18], &step2[19], &step2[20],
            &step2[21], &step2[22], &step2[23], &step2[24], &step2[25],
            &step2[26], &step2[27], &step2[28], &step2[29], &step2[30],
            &step2[31]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
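      // ADD_EPI16/SUB_EPI16 are macros rather than raw intrinsics; in the
      // high-bit-depth build they are expected to resolve to the saturating
      // adds/subtracts so that the check_epi16_overflow_* helpers can detect
      // out-of-range values, while the regular build uses plain wrapping
      // arithmetic.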
      // Stage 6
      {
        const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
        const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
        const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
        const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
        const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
        const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
        const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
        const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
        const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
        const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
        const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
        const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
        const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
        const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
        const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
        const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
        // dct_const_round_shift
        const __m128i out_04_4 =
            _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
        const __m128i out_04_5 =
            _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
        const __m128i out_20_4 =
            _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
        const __m128i out_20_5 =
            _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
        const __m128i out_12_4 =
            _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
        const __m128i out_12_5 =
            _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
        const __m128i out_28_4 =
            _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
        const __m128i out_28_5 =
            _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
        const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
        const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
        const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
        const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
        const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
        const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
        const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
        const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
        // Combine
        out[4] = _mm_packs_epi32(out_04_6, out_04_7);
        out[20] = _mm_packs_epi32(out_20_6, out_20_7);
        out[12] = _mm_packs_epi32(out_12_6, out_12_7);
        out[28] = _mm_packs_epi32(out_28_6, out_28_7);
#if DCT_HIGH_BIT_DEPTH
        overflow =
            check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
      {
        step3[8] = ADD_EPI16(step2[9], step1[8]);
        step3[9] = SUB_EPI16(step1[8], step2[9]);
        step3[10] = SUB_EPI16(step1[11], step2[10]);
        step3[11] = ADD_EPI16(step2[10], step1[11]);
        step3[12] = ADD_EPI16(step2[13], step1[12]);
        step3[13] = SUB_EPI16(step1[12], step2[13]);
        step3[14] = SUB_EPI16(step1[15], step2[14]);
        step3[15] = ADD_EPI16(step2[14], step1[15]);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
                                           &step3[11], &step3[12], &step3[13],
                                           &step3[14], &step3[15]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
      {
        const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
        const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
        const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
        const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
        const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
        const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
        const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
        const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
        const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
        const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
        const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
        const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
        const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
        const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
        const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
        const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
        const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
        const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
        const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
        const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
        const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
        const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
        const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
        const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
        // dct_const_round_shift
        const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
        const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
        const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
        const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
        const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
        const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
        const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
        const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
        const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
        const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
        const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
        const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
        const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
        const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
        const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
        const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
        // Combine
        step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
        step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
        step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
        step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
        // Combine
        step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
        step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
        step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
        step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
                                           &step3[22], &step3[25], &step3[26],
                                           &step3[29], &step3[30]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
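      // Each unpacked input pair above is used twice (e.g. s3_17_0/s3_17_1
      // feeds both step3[17] and step3[30]): a single interleave provides
      // both outputs of a butterfly rotation, with the companion cosine
      // constants supplying the transposed coefficients.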
      // Stage 7
      {
        const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
        const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
        const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
        const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
        const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
        const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
        const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
        const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
        const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
        const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
        const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
        const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
        const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
        const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
        const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
        const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
        const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
        const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
        const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
        const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
        const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
        const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
        const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
        const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
        // dct_const_round_shift
        const __m128i out_02_4 =
            _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
        const __m128i out_02_5 =
            _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
        const __m128i out_18_4 =
            _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
        const __m128i out_18_5 =
            _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
        const __m128i out_10_4 =
            _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
        const __m128i out_10_5 =
            _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
        const __m128i out_26_4 =
            _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
        const __m128i out_26_5 =
            _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
        const __m128i out_06_4 =
            _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
        const __m128i out_06_5 =
            _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
        const __m128i out_22_4 =
            _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
        const __m128i out_22_5 =
            _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
        const __m128i out_14_4 =
            _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
        const __m128i out_14_5 =
            _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
        const __m128i out_30_4 =
            _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
        const __m128i out_30_5 =
            _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
        const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
        const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
        const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
        const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
        const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
        const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
        const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
        const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
        const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
        const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
        const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
        const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
        const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
        const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
        const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
        const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
        // Combine
        out[2] = _mm_packs_epi32(out_02_6, out_02_7);
        out[18] = _mm_packs_epi32(out_18_6, out_18_7);
        out[10] = _mm_packs_epi32(out_10_6, out_10_7);
        out[26] = _mm_packs_epi32(out_26_6, out_26_7);
        out[6] = _mm_packs_epi32(out_06_6, out_06_7);
        out[22] = _mm_packs_epi32(out_22_6, out_22_7);
        out[14] = _mm_packs_epi32(out_14_6, out_14_7);
        out[30] = _mm_packs_epi32(out_30_6, out_30_7);
#if DCT_HIGH_BIT_DEPTH
        overflow =
            check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
                                    &out[6], &out[22], &out[14], &out[30]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
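      // Stage 7 finishes the coefficients congruent to 2 mod 4 (2, 6, 10,
      // ..., 30); only the odd-indexed outputs remain for the final stage.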
      {
        step1[16] = ADD_EPI16(step3[17], step2[16]);
        step1[17] = SUB_EPI16(step2[16], step3[17]);
        step1[18] = SUB_EPI16(step2[19], step3[18]);
        step1[19] = ADD_EPI16(step3[18], step2[19]);
        step1[20] = ADD_EPI16(step3[21], step2[20]);
        step1[21] = SUB_EPI16(step2[20], step3[21]);
        step1[22] = SUB_EPI16(step2[23], step3[22]);
        step1[23] = ADD_EPI16(step3[22], step2[23]);
        step1[24] = ADD_EPI16(step3[25], step2[24]);
        step1[25] = SUB_EPI16(step2[24], step3[25]);
        step1[26] = SUB_EPI16(step2[27], step3[26]);
        step1[27] = ADD_EPI16(step3[26], step2[27]);
        step1[28] = ADD_EPI16(step3[29], step2[28]);
        step1[29] = SUB_EPI16(step2[28], step3[29]);
        step1[30] = SUB_EPI16(step2[31], step3[30]);
        step1[31] = ADD_EPI16(step3[30], step2[31]);
#if DCT_HIGH_BIT_DEPTH
        overflow = check_epi16_overflow_x16(
            &step1[16], &step1[17], &step1[18], &step1[19], &step1[20],
            &step1[21], &step1[22], &step1[23], &step1[24], &step1[25],
            &step1[26], &step1[27], &step1[28], &step1[29], &step1[30],
            &step1[31]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
      // Final stage --- output indices are bit-reversed.
      {
        const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
        const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
        const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
        const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
        const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
        const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
        const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
        const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
        const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
        const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
        const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
        const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
        const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
        const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
        const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
        const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
        const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
        const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
        const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
        const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
        const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
        const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
        const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
        const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
        // dct_const_round_shift
        const __m128i out_01_4 =
            _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
        const __m128i out_01_5 =
            _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
        const __m128i out_17_4 =
            _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
        const __m128i out_17_5 =
            _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
        const __m128i out_09_4 =
            _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
        const __m128i out_09_5 =
            _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
        const __m128i out_25_4 =
            _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
        const __m128i out_25_5 =
            _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
        const __m128i out_07_4 =
            _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
        const __m128i out_07_5 =
            _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
        const __m128i out_23_4 =
            _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
        const __m128i out_23_5 =
            _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
        const __m128i out_15_4 =
            _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
        const __m128i out_15_5 =
            _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
        const __m128i out_31_4 =
            _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
        const __m128i out_31_5 =
            _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
        const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
        const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
        const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
        const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
        const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
        const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
        const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
        const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
        const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
        const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
        const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
        const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
        const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
        const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
        const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
        const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
        // Combine
        out[1] = _mm_packs_epi32(out_01_6, out_01_7);
        out[17] = _mm_packs_epi32(out_17_6, out_17_7);
        out[9] = _mm_packs_epi32(out_09_6, out_09_7);
        out[25] = _mm_packs_epi32(out_25_6, out_25_7);
        out[7] = _mm_packs_epi32(out_07_6, out_07_7);
        out[23] = _mm_packs_epi32(out_23_6, out_23_7);
        out[15] = _mm_packs_epi32(out_15_6, out_15_7);
        out[31] = _mm_packs_epi32(out_31_6, out_31_7);
#if DCT_HIGH_BIT_DEPTH
        overflow =
            check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
                                    &out[7], &out[23], &out[15], &out[31]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
      {
        const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
        const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
        const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
        const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
        const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
        const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
        const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
        const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
        const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
        const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
        const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
        const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
        const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
        const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
        const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
        const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
        const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
        const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
        const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
        const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
        const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
        const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
        const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
        const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
        // dct_const_round_shift
        const __m128i out_05_4 =
            _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
        const __m128i out_05_5 =
            _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
        const __m128i out_21_4 =
            _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
        const __m128i out_21_5 =
            _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
        const __m128i out_13_4 =
            _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
        const __m128i out_13_5 =
            _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
        const __m128i out_29_4 =
            _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
        const __m128i out_29_5 =
            _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
        const __m128i out_03_4 =
            _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
        const __m128i out_03_5 =
            _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
        const __m128i out_19_4 =
            _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
        const __m128i out_19_5 =
            _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
        const __m128i out_11_4 =
            _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
        const __m128i out_11_5 =
            _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
        const __m128i out_27_4 =
            _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
        const __m128i out_27_5 =
            _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
        const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
        const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
        const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
        const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
        const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
        const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
        const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
        const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
        const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
        const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
        const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
        const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
        const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
        const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
        const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
        const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
        // Combine
        out[5] = _mm_packs_epi32(out_05_6, out_05_7);
        out[21] = _mm_packs_epi32(out_21_6, out_21_7);
        out[13] = _mm_packs_epi32(out_13_6, out_13_7);
        out[29] = _mm_packs_epi32(out_29_6, out_29_7);
        out[3] = _mm_packs_epi32(out_03_6, out_03_7);
        out[19] = _mm_packs_epi32(out_19_6, out_19_7);
        out[11] = _mm_packs_epi32(out_11_6, out_11_7);
        out[27] = _mm_packs_epi32(out_27_6, out_27_7);
#if DCT_HIGH_BIT_DEPTH
        overflow =
            check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
                                    &out[3], &out[19], &out[11], &out[27]);
        if (overflow) {
          if (pass == 0)
            HIGH_FDCT32x32_2D_C(input, output_org, stride);
          else
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
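      // Everything above ran in 16-bit lanes. When FDCT32x32_HIGH_PRECISION
      // is enabled, only the first (column) pass uses that path; the branch
      // below redoes stages 3 through 7 of the rows pass in 32-bit lanes,
      // which is why its overflow fallbacks call only the _ROWS_C variant.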
#if FDCT32x32_HIGH_PRECISION
    } else {
      __m128i lstep1[64], lstep2[64], lstep3[64];
      __m128i u[32], v[32], sign[16];
      const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
      const __m128i k__pOne_mOne = pair_set_epi16(1, -1);
      // start using 32-bit operations
      // stage 3
      {
        // expanding to 32-bit length while adding and subtracting
        lstep2[0] = _mm_unpacklo_epi16(step2[0], step2[7]);
        lstep2[1] = _mm_unpackhi_epi16(step2[0], step2[7]);
        lstep2[2] = _mm_unpacklo_epi16(step2[1], step2[6]);
        lstep2[3] = _mm_unpackhi_epi16(step2[1], step2[6]);
        lstep2[4] = _mm_unpacklo_epi16(step2[2], step2[5]);
        lstep2[5] = _mm_unpackhi_epi16(step2[2], step2[5]);
        lstep2[6] = _mm_unpacklo_epi16(step2[3], step2[4]);
        lstep2[7] = _mm_unpackhi_epi16(step2[3], step2[4]);
        lstep3[0] = _mm_madd_epi16(lstep2[0], kOne);
        lstep3[1] = _mm_madd_epi16(lstep2[1], kOne);
        lstep3[2] = _mm_madd_epi16(lstep2[2], kOne);
        lstep3[3] = _mm_madd_epi16(lstep2[3], kOne);
        lstep3[4] = _mm_madd_epi16(lstep2[4], kOne);
        lstep3[5] = _mm_madd_epi16(lstep2[5], kOne);
        lstep3[6] = _mm_madd_epi16(lstep2[6], kOne);
        lstep3[7] = _mm_madd_epi16(lstep2[7], kOne);
        lstep3[8] = _mm_madd_epi16(lstep2[6], k__pOne_mOne);
        lstep3[9] = _mm_madd_epi16(lstep2[7], k__pOne_mOne);
        lstep3[10] = _mm_madd_epi16(lstep2[4], k__pOne_mOne);
        lstep3[11] = _mm_madd_epi16(lstep2[5], k__pOne_mOne);
        lstep3[12] = _mm_madd_epi16(lstep2[2], k__pOne_mOne);
        lstep3[13] = _mm_madd_epi16(lstep2[3], k__pOne_mOne);
        lstep3[14] = _mm_madd_epi16(lstep2[0], k__pOne_mOne);
        lstep3[15] = _mm_madd_epi16(lstep2[1], k__pOne_mOne);
      }
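      // The _mm_madd_epi16 trick above fuses widening with the butterfly:
      // after interleaving a and b, multiplying the 16-bit pairs by (1, 1)
      // yields a + b in 32-bit lanes and by (1, -1) yields a - b, so the
      // stage-3 adds/subtracts come out already sign-extended to 32 bits.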
      {
        const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
        const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
        const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
        const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
        const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
        const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
        const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
        const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
        const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
        const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
        const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
        const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
        // dct_const_round_shift
        const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
        const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
        const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
        lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
        lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
        lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
        lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
        lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
        lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
        lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
        lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
      }
      {
        lstep1[32] = _mm_unpacklo_epi16(step1[16], step2[23]);
        lstep1[33] = _mm_unpackhi_epi16(step1[16], step2[23]);
        lstep1[34] = _mm_unpacklo_epi16(step1[17], step2[22]);
        lstep1[35] = _mm_unpackhi_epi16(step1[17], step2[22]);
        lstep1[36] = _mm_unpacklo_epi16(step1[18], step2[21]);
        lstep1[37] = _mm_unpackhi_epi16(step1[18], step2[21]);
        lstep1[38] = _mm_unpacklo_epi16(step1[19], step2[20]);
        lstep1[39] = _mm_unpackhi_epi16(step1[19], step2[20]);
        lstep1[56] = _mm_unpacklo_epi16(step1[28], step2[27]);
        lstep1[57] = _mm_unpackhi_epi16(step1[28], step2[27]);
        lstep1[58] = _mm_unpacklo_epi16(step1[29], step2[26]);
        lstep1[59] = _mm_unpackhi_epi16(step1[29], step2[26]);
        lstep1[60] = _mm_unpacklo_epi16(step1[30], step2[25]);
        lstep1[61] = _mm_unpackhi_epi16(step1[30], step2[25]);
        lstep1[62] = _mm_unpacklo_epi16(step1[31], step2[24]);
        lstep1[63] = _mm_unpackhi_epi16(step1[31], step2[24]);
        lstep3[32] = _mm_madd_epi16(lstep1[32], kOne);
        lstep3[33] = _mm_madd_epi16(lstep1[33], kOne);
        lstep3[34] = _mm_madd_epi16(lstep1[34], kOne);
        lstep3[35] = _mm_madd_epi16(lstep1[35], kOne);
        lstep3[36] = _mm_madd_epi16(lstep1[36], kOne);
        lstep3[37] = _mm_madd_epi16(lstep1[37], kOne);
        lstep3[38] = _mm_madd_epi16(lstep1[38], kOne);
        lstep3[39] = _mm_madd_epi16(lstep1[39], kOne);
        lstep3[40] = _mm_madd_epi16(lstep1[38], k__pOne_mOne);
        lstep3[41] = _mm_madd_epi16(lstep1[39], k__pOne_mOne);
        lstep3[42] = _mm_madd_epi16(lstep1[36], k__pOne_mOne);
        lstep3[43] = _mm_madd_epi16(lstep1[37], k__pOne_mOne);
        lstep3[44] = _mm_madd_epi16(lstep1[34], k__pOne_mOne);
        lstep3[45] = _mm_madd_epi16(lstep1[35], k__pOne_mOne);
        lstep3[46] = _mm_madd_epi16(lstep1[32], k__pOne_mOne);
        lstep3[47] = _mm_madd_epi16(lstep1[33], k__pOne_mOne);
        lstep3[48] = _mm_madd_epi16(lstep1[62], k__pOne_mOne);
        lstep3[49] = _mm_madd_epi16(lstep1[63], k__pOne_mOne);
        lstep3[50] = _mm_madd_epi16(lstep1[60], k__pOne_mOne);
        lstep3[51] = _mm_madd_epi16(lstep1[61], k__pOne_mOne);
        lstep3[52] = _mm_madd_epi16(lstep1[58], k__pOne_mOne);
        lstep3[53] = _mm_madd_epi16(lstep1[59], k__pOne_mOne);
        lstep3[54] = _mm_madd_epi16(lstep1[56], k__pOne_mOne);
        lstep3[55] = _mm_madd_epi16(lstep1[57], k__pOne_mOne);
        lstep3[56] = _mm_madd_epi16(lstep1[56], kOne);
        lstep3[57] = _mm_madd_epi16(lstep1[57], kOne);
        lstep3[58] = _mm_madd_epi16(lstep1[58], kOne);
        lstep3[59] = _mm_madd_epi16(lstep1[59], kOne);
        lstep3[60] = _mm_madd_epi16(lstep1[60], kOne);
        lstep3[61] = _mm_madd_epi16(lstep1[61], kOne);
        lstep3[62] = _mm_madd_epi16(lstep1[62], kOne);
        lstep3[63] = _mm_madd_epi16(lstep1[63], kOne);
      }
      // stage 4
      {
        // expanding to 32-bit length prior to addition operations
        sign[0] = _mm_cmpgt_epi16(kZero, step2[8]);
        sign[1] = _mm_cmpgt_epi16(kZero, step2[9]);
        sign[2] = _mm_cmpgt_epi16(kZero, step2[14]);
        sign[3] = _mm_cmpgt_epi16(kZero, step2[15]);
        lstep2[16] = _mm_unpacklo_epi16(step2[8], sign[0]);
        lstep2[17] = _mm_unpackhi_epi16(step2[8], sign[0]);
        lstep2[18] = _mm_unpacklo_epi16(step2[9], sign[1]);
        lstep2[19] = _mm_unpackhi_epi16(step2[9], sign[1]);
        lstep2[28] = _mm_unpacklo_epi16(step2[14], sign[2]);
        lstep2[29] = _mm_unpackhi_epi16(step2[14], sign[2]);
        lstep2[30] = _mm_unpacklo_epi16(step2[15], sign[3]);
        lstep2[31] = _mm_unpackhi_epi16(step2[15], sign[3]);
        lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]);
        lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]);
        lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]);
        lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]);
        lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]);
        lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]);
        lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]);
        lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]);
        lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
        lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
        lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
        lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
        lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
        lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
        lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
        lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
        lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
        lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
        lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
        lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
        lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
        lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
        lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
        lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
      }
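      // Sign extension without SSE4.1: _mm_cmpgt_epi16(kZero, x) builds a
      // mask that is 0xFFFF exactly for negative lanes, so interleaving x
      // with its mask produces correctly sign-extended 32-bit values.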
      {
        // to be continued...
        //
        const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
        const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
        u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
        u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
        u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
        u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
        // TODO(jingning): manually inline k_madd_epi32_ to further hide
        // instruction latency.
        v[0] = k_madd_epi32(u[0], k32_p16_m16);
        v[1] = k_madd_epi32(u[1], k32_p16_m16);
        v[2] = k_madd_epi32(u[2], k32_p16_m16);
        v[3] = k_madd_epi32(u[3], k32_p16_m16);
        v[4] = k_madd_epi32(u[0], k32_p16_p16);
        v[5] = k_madd_epi32(u[1], k32_p16_p16);
        v[6] = k_madd_epi32(u[2], k32_p16_p16);
        v[7] = k_madd_epi32(u[3], k32_p16_p16);
#if DCT_HIGH_BIT_DEPTH
        overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4],
                                            &v[5], &v[6], &v[7], &kZero);
        if (overflow) {
          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
        u[0] = k_packs_epi64(v[0], v[1]);
        u[1] = k_packs_epi64(v[2], v[3]);
        u[2] = k_packs_epi64(v[4], v[5]);
        u[3] = k_packs_epi64(v[6], v[7]);
        v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
        v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
        v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
        v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
        lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
        lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
        lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
        lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
      }
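      // k_madd_epi32()/k_packs_epi64() play the role that _mm_madd_epi16 and
      // _mm_packs_epi32 play in the 16-bit path: judging by their use here,
      // k_madd_epi32 multiplies interleaved 32-bit pairs and sums adjacent
      // products into 64-bit lanes, and k_packs_epi64 narrows those back to
      // 32 bits before the usual rounding shift.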
      {
        const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
        const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
        const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
        u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
        u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
        u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
        u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
        u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
        u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
        u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
        u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
        u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
        u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
        u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
        u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
        u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
        u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
        u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
        u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
        v[0] = k_madd_epi32(u[0], k32_m08_p24);
        v[1] = k_madd_epi32(u[1], k32_m08_p24);
        v[2] = k_madd_epi32(u[2], k32_m08_p24);
        v[3] = k_madd_epi32(u[3], k32_m08_p24);
        v[4] = k_madd_epi32(u[4], k32_m08_p24);
        v[5] = k_madd_epi32(u[5], k32_m08_p24);
        v[6] = k_madd_epi32(u[6], k32_m08_p24);
        v[7] = k_madd_epi32(u[7], k32_m08_p24);
        v[8] = k_madd_epi32(u[8], k32_m24_m08);
        v[9] = k_madd_epi32(u[9], k32_m24_m08);
        v[10] = k_madd_epi32(u[10], k32_m24_m08);
        v[11] = k_madd_epi32(u[11], k32_m24_m08);
        v[12] = k_madd_epi32(u[12], k32_m24_m08);
        v[13] = k_madd_epi32(u[13], k32_m24_m08);
        v[14] = k_madd_epi32(u[14], k32_m24_m08);
        v[15] = k_madd_epi32(u[15], k32_m24_m08);
        v[16] = k_madd_epi32(u[12], k32_m08_p24);
        v[17] = k_madd_epi32(u[13], k32_m08_p24);
        v[18] = k_madd_epi32(u[14], k32_m08_p24);
        v[19] = k_madd_epi32(u[15], k32_m08_p24);
        v[20] = k_madd_epi32(u[8], k32_m08_p24);
        v[21] = k_madd_epi32(u[9], k32_m08_p24);
        v[22] = k_madd_epi32(u[10], k32_m08_p24);
        v[23] = k_madd_epi32(u[11], k32_m08_p24);
        v[24] = k_madd_epi32(u[4], k32_p24_p08);
        v[25] = k_madd_epi32(u[5], k32_p24_p08);
        v[26] = k_madd_epi32(u[6], k32_p24_p08);
        v[27] = k_madd_epi32(u[7], k32_p24_p08);
        v[28] = k_madd_epi32(u[0], k32_p24_p08);
        v[29] = k_madd_epi32(u[1], k32_p24_p08);
        v[30] = k_madd_epi32(u[2], k32_p24_p08);
        v[31] = k_madd_epi32(u[3], k32_p24_p08);
#if DCT_HIGH_BIT_DEPTH
        overflow = k_check_epi32_overflow_32(
            &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
            &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
            &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
            &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
        if (overflow) {
          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
        u[0] = k_packs_epi64(v[0], v[1]);
        u[1] = k_packs_epi64(v[2], v[3]);
        u[2] = k_packs_epi64(v[4], v[5]);
        u[3] = k_packs_epi64(v[6], v[7]);
        u[4] = k_packs_epi64(v[8], v[9]);
        u[5] = k_packs_epi64(v[10], v[11]);
        u[6] = k_packs_epi64(v[12], v[13]);
        u[7] = k_packs_epi64(v[14], v[15]);
        u[8] = k_packs_epi64(v[16], v[17]);
        u[9] = k_packs_epi64(v[18], v[19]);
        u[10] = k_packs_epi64(v[20], v[21]);
        u[11] = k_packs_epi64(v[22], v[23]);
        u[12] = k_packs_epi64(v[24], v[25]);
        u[13] = k_packs_epi64(v[26], v[27]);
        u[14] = k_packs_epi64(v[28], v[29]);
        u[15] = k_packs_epi64(v[30], v[31]);
        v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
        v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
        v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
        v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
        v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
        v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
        v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
        v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
        v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
        v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
        v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
        v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
        v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
        v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
        v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
        v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
        lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
        lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
        lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
        lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
        lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
        lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
        lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
        lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
        lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
        lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
        lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
        lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
        lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
        lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
        lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
        lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
      }
      // stage 5
      {
        lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]);
        lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]);
        lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]);
        lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]);
        lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
        lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
        lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
        lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
      }
      {
        const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
        const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
        const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
        const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
        u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
        u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
        u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
        u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
        u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
        u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
        u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
        u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
        // TODO(jingning): manually inline k_madd_epi32_ to further hide
        // instruction latency.
        v[0] = k_madd_epi32(u[0], k32_p16_p16);
        v[1] = k_madd_epi32(u[1], k32_p16_p16);
        v[2] = k_madd_epi32(u[2], k32_p16_p16);
        v[3] = k_madd_epi32(u[3], k32_p16_p16);
        v[4] = k_madd_epi32(u[0], k32_p16_m16);
        v[5] = k_madd_epi32(u[1], k32_p16_m16);
        v[6] = k_madd_epi32(u[2], k32_p16_m16);
        v[7] = k_madd_epi32(u[3], k32_p16_m16);
        v[8] = k_madd_epi32(u[4], k32_p24_p08);
        v[9] = k_madd_epi32(u[5], k32_p24_p08);
        v[10] = k_madd_epi32(u[6], k32_p24_p08);
        v[11] = k_madd_epi32(u[7], k32_p24_p08);
        v[12] = k_madd_epi32(u[4], k32_m08_p24);
        v[13] = k_madd_epi32(u[5], k32_m08_p24);
        v[14] = k_madd_epi32(u[6], k32_m08_p24);
        v[15] = k_madd_epi32(u[7], k32_m08_p24);
#if DCT_HIGH_BIT_DEPTH
        overflow = k_check_epi32_overflow_16(
            &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
            &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
        if (overflow) {
          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
        u[0] = k_packs_epi64(v[0], v[1]);
        u[1] = k_packs_epi64(v[2], v[3]);
        u[2] = k_packs_epi64(v[4], v[5]);
        u[3] = k_packs_epi64(v[6], v[7]);
        u[4] = k_packs_epi64(v[8], v[9]);
        u[5] = k_packs_epi64(v[10], v[11]);
        u[6] = k_packs_epi64(v[12], v[13]);
        u[7] = k_packs_epi64(v[14], v[15]);
        v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
        v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
        v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
        v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
        v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
        v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
        v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
        v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
        u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
        u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
        u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
        u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
        u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
        u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
        u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
        u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
        sign[0] = _mm_cmplt_epi32(u[0], kZero);
        sign[1] = _mm_cmplt_epi32(u[1], kZero);
        sign[2] = _mm_cmplt_epi32(u[2], kZero);
        sign[3] = _mm_cmplt_epi32(u[3], kZero);
        sign[4] = _mm_cmplt_epi32(u[4], kZero);
        sign[5] = _mm_cmplt_epi32(u[5], kZero);
        sign[6] = _mm_cmplt_epi32(u[6], kZero);
        sign[7] = _mm_cmplt_epi32(u[7], kZero);
        u[0] = _mm_sub_epi32(u[0], sign[0]);
        u[1] = _mm_sub_epi32(u[1], sign[1]);
        u[2] = _mm_sub_epi32(u[2], sign[2]);
        u[3] = _mm_sub_epi32(u[3], sign[3]);
        u[4] = _mm_sub_epi32(u[4], sign[4]);
        u[5] = _mm_sub_epi32(u[5], sign[5]);
        u[6] = _mm_sub_epi32(u[6], sign[6]);
        u[7] = _mm_sub_epi32(u[7], sign[7]);
        u[0] = _mm_add_epi32(u[0], K32One);
        u[1] = _mm_add_epi32(u[1], K32One);
        u[2] = _mm_add_epi32(u[2], K32One);
        u[3] = _mm_add_epi32(u[3], K32One);
        u[4] = _mm_add_epi32(u[4], K32One);
        u[5] = _mm_add_epi32(u[5], K32One);
        u[6] = _mm_add_epi32(u[6], K32One);
        u[7] = _mm_add_epi32(u[7], K32One);
        u[0] = _mm_srai_epi32(u[0], 2);
        u[1] = _mm_srai_epi32(u[1], 2);
        u[2] = _mm_srai_epi32(u[2], 2);
        u[3] = _mm_srai_epi32(u[3], 2);
        u[4] = _mm_srai_epi32(u[4], 2);
        u[5] = _mm_srai_epi32(u[5], 2);
        u[6] = _mm_srai_epi32(u[6], 2);
        u[7] = _mm_srai_epi32(u[7], 2);
        // Combine
        out[0] = _mm_packs_epi32(u[0], u[1]);
        out[16] = _mm_packs_epi32(u[2], u[3]);
        out[8] = _mm_packs_epi32(u[4], u[5]);
        out[24] = _mm_packs_epi32(u[6], u[7]);
#if DCT_HIGH_BIT_DEPTH
        overflow =
            check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
        if (overflow) {
          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
          return;
        }
#endif  // DCT_HIGH_BIT_DEPTH
      }
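      // The sign/K32One sequence above computes (u + 1 + (u < 0)) >> 2:
      // _mm_cmplt_epi32 yields -1 for negative lanes, so subtracting the
      // mask adds one only to those lanes before the +1 rounding and the
      // final shift by 2. This mirrors the rounding the C version's second
      // pass applies to the finished 32x32 coefficients.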
        {
          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
          const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
          u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
          u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
          u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
          u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
          u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
          u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
          u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
          u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
          v[0] = k_madd_epi32(u[0], k32_m08_p24);
          v[1] = k_madd_epi32(u[1], k32_m08_p24);
          v[2] = k_madd_epi32(u[2], k32_m08_p24);
          v[3] = k_madd_epi32(u[3], k32_m08_p24);
          v[4] = k_madd_epi32(u[4], k32_m24_m08);
          v[5] = k_madd_epi32(u[5], k32_m24_m08);
          v[6] = k_madd_epi32(u[6], k32_m24_m08);
          v[7] = k_madd_epi32(u[7], k32_m24_m08);
          v[8] = k_madd_epi32(u[4], k32_m08_p24);
          v[9] = k_madd_epi32(u[5], k32_m08_p24);
          v[10] = k_madd_epi32(u[6], k32_m08_p24);
          v[11] = k_madd_epi32(u[7], k32_m08_p24);
          v[12] = k_madd_epi32(u[0], k32_p24_p08);
          v[13] = k_madd_epi32(u[1], k32_p24_p08);
          v[14] = k_madd_epi32(u[2], k32_p24_p08);
          v[15] = k_madd_epi32(u[3], k32_p24_p08);
#if DCT_HIGH_BIT_DEPTH
          overflow = k_check_epi32_overflow_16(
              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
          u[0] = k_packs_epi64(v[0], v[1]);
          u[1] = k_packs_epi64(v[2], v[3]);
          u[2] = k_packs_epi64(v[4], v[5]);
          u[3] = k_packs_epi64(v[6], v[7]);
          u[4] = k_packs_epi64(v[8], v[9]);
          u[5] = k_packs_epi64(v[10], v[11]);
          u[6] = k_packs_epi64(v[12], v[13]);
          u[7] = k_packs_epi64(v[14], v[15]);
          u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
          u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
          u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
          u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
          u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
          u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
          u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
          u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
          lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
          lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
          lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
          lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
          lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
          lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
          lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
          lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
        }
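        // Pure add/sub butterflies forming lstep2[32..63]; no multiply is
        // involved, so no DCT_CONST rounding pass is needed here.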
        {
          lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
          lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
          lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
          lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
          lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
          lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
          lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
          lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
          lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
          lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
          lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
          lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
          lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
          lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
          lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
          lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
          lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
          lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
          lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
          lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
          lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
          lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
          lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
          lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
          lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
          lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
          lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
          lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
          lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
          lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
          lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
          lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
        }
        // stage 6
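        // Stage 6 finishes output rows 4, 12, 20 and 28 (written below as
        // out[4], out[20], out[12], out[28]) and sets up the lstep3 terms
        // consumed by stage 7.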
        {
          const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
          const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
          const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
          const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
          u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
          u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
          u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
          u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
          u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
          u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
          u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
          u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
          u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
          u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
          u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
          u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
          u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
          u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
          u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
          u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
          v[0] = k_madd_epi32(u[0], k32_p28_p04);
          v[1] = k_madd_epi32(u[1], k32_p28_p04);
          v[2] = k_madd_epi32(u[2], k32_p28_p04);
          v[3] = k_madd_epi32(u[3], k32_p28_p04);
          v[4] = k_madd_epi32(u[4], k32_p12_p20);
          v[5] = k_madd_epi32(u[5], k32_p12_p20);
          v[6] = k_madd_epi32(u[6], k32_p12_p20);
          v[7] = k_madd_epi32(u[7], k32_p12_p20);
          v[8] = k_madd_epi32(u[8], k32_m20_p12);
          v[9] = k_madd_epi32(u[9], k32_m20_p12);
          v[10] = k_madd_epi32(u[10], k32_m20_p12);
          v[11] = k_madd_epi32(u[11], k32_m20_p12);
          v[12] = k_madd_epi32(u[12], k32_m04_p28);
          v[13] = k_madd_epi32(u[13], k32_m04_p28);
          v[14] = k_madd_epi32(u[14], k32_m04_p28);
          v[15] = k_madd_epi32(u[15], k32_m04_p28);
#if DCT_HIGH_BIT_DEPTH
          overflow = k_check_epi32_overflow_16(
              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
          u[0] = k_packs_epi64(v[0], v[1]);
          u[1] = k_packs_epi64(v[2], v[3]);
          u[2] = k_packs_epi64(v[4], v[5]);
          u[3] = k_packs_epi64(v[6], v[7]);
          u[4] = k_packs_epi64(v[8], v[9]);
          u[5] = k_packs_epi64(v[10], v[11]);
          u[6] = k_packs_epi64(v[12], v[13]);
          u[7] = k_packs_epi64(v[14], v[15]);
          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
          sign[0] = _mm_cmplt_epi32(u[0], kZero);
          sign[1] = _mm_cmplt_epi32(u[1], kZero);
          sign[2] = _mm_cmplt_epi32(u[2], kZero);
          sign[3] = _mm_cmplt_epi32(u[3], kZero);
          sign[4] = _mm_cmplt_epi32(u[4], kZero);
          sign[5] = _mm_cmplt_epi32(u[5], kZero);
          sign[6] = _mm_cmplt_epi32(u[6], kZero);
          sign[7] = _mm_cmplt_epi32(u[7], kZero);
          u[0] = _mm_sub_epi32(u[0], sign[0]);
          u[1] = _mm_sub_epi32(u[1], sign[1]);
          u[2] = _mm_sub_epi32(u[2], sign[2]);
          u[3] = _mm_sub_epi32(u[3], sign[3]);
          u[4] = _mm_sub_epi32(u[4], sign[4]);
          u[5] = _mm_sub_epi32(u[5], sign[5]);
          u[6] = _mm_sub_epi32(u[6], sign[6]);
          u[7] = _mm_sub_epi32(u[7], sign[7]);
          u[0] = _mm_add_epi32(u[0], K32One);
          u[1] = _mm_add_epi32(u[1], K32One);
          u[2] = _mm_add_epi32(u[2], K32One);
          u[3] = _mm_add_epi32(u[3], K32One);
          u[4] = _mm_add_epi32(u[4], K32One);
          u[5] = _mm_add_epi32(u[5], K32One);
          u[6] = _mm_add_epi32(u[6], K32One);
          u[7] = _mm_add_epi32(u[7], K32One);
          u[0] = _mm_srai_epi32(u[0], 2);
          u[1] = _mm_srai_epi32(u[1], 2);
          u[2] = _mm_srai_epi32(u[2], 2);
          u[3] = _mm_srai_epi32(u[3], 2);
          u[4] = _mm_srai_epi32(u[4], 2);
          u[5] = _mm_srai_epi32(u[5], 2);
          u[6] = _mm_srai_epi32(u[6], 2);
          u[7] = _mm_srai_epi32(u[7], 2);
          out[4] = _mm_packs_epi32(u[0], u[1]);
          out[20] = _mm_packs_epi32(u[2], u[3]);
          out[12] = _mm_packs_epi32(u[4], u[5]);
          out[28] = _mm_packs_epi32(u[6], u[7]);
#if DCT_HIGH_BIT_DEPTH
          overflow =
              check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
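        // Add/sub butterflies on lstep1[16..31] / lstep2[18..29], feeding
        // the stage 7 rotations below.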
        {
          lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
          lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
          lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
          lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
          lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
          lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
          lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
          lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
          lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
          lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
          lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
          lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
          lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
          lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
          lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
          lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
        }
        {
          const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
          const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
          const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
          const __m128i k32_m12_m20 =
              pair_set_epi32(-cospi_12_64, -cospi_20_64);
          const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
          const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
          u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
          u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
          u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
          u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
          u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
          u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
          u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
          u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
          u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
          u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
          u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
          u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
          u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
          u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
          u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
          u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
          v[0] = k_madd_epi32(u[0], k32_m04_p28);
          v[1] = k_madd_epi32(u[1], k32_m04_p28);
          v[2] = k_madd_epi32(u[2], k32_m04_p28);
          v[3] = k_madd_epi32(u[3], k32_m04_p28);
          v[4] = k_madd_epi32(u[4], k32_m28_m04);
          v[5] = k_madd_epi32(u[5], k32_m28_m04);
          v[6] = k_madd_epi32(u[6], k32_m28_m04);
          v[7] = k_madd_epi32(u[7], k32_m28_m04);
          v[8] = k_madd_epi32(u[8], k32_m20_p12);
          v[9] = k_madd_epi32(u[9], k32_m20_p12);
          v[10] = k_madd_epi32(u[10], k32_m20_p12);
          v[11] = k_madd_epi32(u[11], k32_m20_p12);
          v[12] = k_madd_epi32(u[12], k32_m12_m20);
          v[13] = k_madd_epi32(u[13], k32_m12_m20);
          v[14] = k_madd_epi32(u[14], k32_m12_m20);
          v[15] = k_madd_epi32(u[15], k32_m12_m20);
          v[16] = k_madd_epi32(u[12], k32_m20_p12);
          v[17] = k_madd_epi32(u[13], k32_m20_p12);
          v[18] = k_madd_epi32(u[14], k32_m20_p12);
          v[19] = k_madd_epi32(u[15], k32_m20_p12);
          v[20] = k_madd_epi32(u[8], k32_p12_p20);
          v[21] = k_madd_epi32(u[9], k32_p12_p20);
          v[22] = k_madd_epi32(u[10], k32_p12_p20);
          v[23] = k_madd_epi32(u[11], k32_p12_p20);
          v[24] = k_madd_epi32(u[4], k32_m04_p28);
          v[25] = k_madd_epi32(u[5], k32_m04_p28);
          v[26] = k_madd_epi32(u[6], k32_m04_p28);
          v[27] = k_madd_epi32(u[7], k32_m04_p28);
          v[28] = k_madd_epi32(u[0], k32_p28_p04);
          v[29] = k_madd_epi32(u[1], k32_p28_p04);
          v[30] = k_madd_epi32(u[2], k32_p28_p04);
          v[31] = k_madd_epi32(u[3], k32_p28_p04);
#if DCT_HIGH_BIT_DEPTH
          overflow = k_check_epi32_overflow_32(
              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
          u[0] = k_packs_epi64(v[0], v[1]);
          u[1] = k_packs_epi64(v[2], v[3]);
          u[2] = k_packs_epi64(v[4], v[5]);
          u[3] = k_packs_epi64(v[6], v[7]);
          u[4] = k_packs_epi64(v[8], v[9]);
          u[5] = k_packs_epi64(v[10], v[11]);
          u[6] = k_packs_epi64(v[12], v[13]);
          u[7] = k_packs_epi64(v[14], v[15]);
          u[8] = k_packs_epi64(v[16], v[17]);
          u[9] = k_packs_epi64(v[18], v[19]);
          u[10] = k_packs_epi64(v[20], v[21]);
          u[11] = k_packs_epi64(v[22], v[23]);
          u[12] = k_packs_epi64(v[24], v[25]);
          u[13] = k_packs_epi64(v[26], v[27]);
          u[14] = k_packs_epi64(v[28], v[29]);
          u[15] = k_packs_epi64(v[30], v[31]);
          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
          lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
          lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
          lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
          lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
          lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
          lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
          lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
          lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
          lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
          lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
          lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
          lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
          lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
          lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
          lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
          lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
        }
        // stage 7
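        // Stage 7 produces output rows 2, 6, 10, 14, 18, 22, 26 and 30
        // from the lstep3 terms, using the cospi_2_64 ... cospi_30_64
        // constant pairs.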
        {
          const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
          const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
          const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
          const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
          const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
          const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
          const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
          const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
          u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
          u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
          u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
          u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
          u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
          u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
          u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
          u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
          u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
          u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
          u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
          u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
          u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
          u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
          u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
          u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
          v[0] = k_madd_epi32(u[0], k32_p30_p02);
          v[1] = k_madd_epi32(u[1], k32_p30_p02);
          v[2] = k_madd_epi32(u[2], k32_p30_p02);
          v[3] = k_madd_epi32(u[3], k32_p30_p02);
          v[4] = k_madd_epi32(u[4], k32_p14_p18);
          v[5] = k_madd_epi32(u[5], k32_p14_p18);
          v[6] = k_madd_epi32(u[6], k32_p14_p18);
          v[7] = k_madd_epi32(u[7], k32_p14_p18);
          v[8] = k_madd_epi32(u[8], k32_p22_p10);
          v[9] = k_madd_epi32(u[9], k32_p22_p10);
          v[10] = k_madd_epi32(u[10], k32_p22_p10);
          v[11] = k_madd_epi32(u[11], k32_p22_p10);
          v[12] = k_madd_epi32(u[12], k32_p06_p26);
          v[13] = k_madd_epi32(u[13], k32_p06_p26);
          v[14] = k_madd_epi32(u[14], k32_p06_p26);
          v[15] = k_madd_epi32(u[15], k32_p06_p26);
          v[16] = k_madd_epi32(u[12], k32_m26_p06);
          v[17] = k_madd_epi32(u[13], k32_m26_p06);
          v[18] = k_madd_epi32(u[14], k32_m26_p06);
          v[19] = k_madd_epi32(u[15], k32_m26_p06);
          v[20] = k_madd_epi32(u[8], k32_m10_p22);
          v[21] = k_madd_epi32(u[9], k32_m10_p22);
          v[22] = k_madd_epi32(u[10], k32_m10_p22);
          v[23] = k_madd_epi32(u[11], k32_m10_p22);
          v[24] = k_madd_epi32(u[4], k32_m18_p14);
          v[25] = k_madd_epi32(u[5], k32_m18_p14);
          v[26] = k_madd_epi32(u[6], k32_m18_p14);
          v[27] = k_madd_epi32(u[7], k32_m18_p14);
          v[28] = k_madd_epi32(u[0], k32_m02_p30);
          v[29] = k_madd_epi32(u[1], k32_m02_p30);
          v[30] = k_madd_epi32(u[2], k32_m02_p30);
          v[31] = k_madd_epi32(u[3], k32_m02_p30);
#if DCT_HIGH_BIT_DEPTH
          overflow = k_check_epi32_overflow_32(
              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
          u[0] = k_packs_epi64(v[0], v[1]);
          u[1] = k_packs_epi64(v[2], v[3]);
          u[2] = k_packs_epi64(v[4], v[5]);
          u[3] = k_packs_epi64(v[6], v[7]);
          u[4] = k_packs_epi64(v[8], v[9]);
          u[5] = k_packs_epi64(v[10], v[11]);
          u[6] = k_packs_epi64(v[12], v[13]);
          u[7] = k_packs_epi64(v[14], v[15]);
          u[8] = k_packs_epi64(v[16], v[17]);
          u[9] = k_packs_epi64(v[18], v[19]);
          u[10] = k_packs_epi64(v[20], v[21]);
          u[11] = k_packs_epi64(v[22], v[23]);
          u[12] = k_packs_epi64(v[24], v[25]);
          u[13] = k_packs_epi64(v[26], v[27]);
          u[14] = k_packs_epi64(v[28], v[29]);
          u[15] = k_packs_epi64(v[30], v[31]);
          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
          u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
          u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
          v[0] = _mm_cmplt_epi32(u[0], kZero);
          v[1] = _mm_cmplt_epi32(u[1], kZero);
          v[2] = _mm_cmplt_epi32(u[2], kZero);
          v[3] = _mm_cmplt_epi32(u[3], kZero);
          v[4] = _mm_cmplt_epi32(u[4], kZero);
          v[5] = _mm_cmplt_epi32(u[5], kZero);
          v[6] = _mm_cmplt_epi32(u[6], kZero);
          v[7] = _mm_cmplt_epi32(u[7], kZero);
          v[8] = _mm_cmplt_epi32(u[8], kZero);
          v[9] = _mm_cmplt_epi32(u[9], kZero);
          v[10] = _mm_cmplt_epi32(u[10], kZero);
          v[11] = _mm_cmplt_epi32(u[11], kZero);
          v[12] = _mm_cmplt_epi32(u[12], kZero);
          v[13] = _mm_cmplt_epi32(u[13], kZero);
          v[14] = _mm_cmplt_epi32(u[14], kZero);
          v[15] = _mm_cmplt_epi32(u[15], kZero);
          u[0] = _mm_sub_epi32(u[0], v[0]);
          u[1] = _mm_sub_epi32(u[1], v[1]);
          u[2] = _mm_sub_epi32(u[2], v[2]);
          u[3] = _mm_sub_epi32(u[3], v[3]);
          u[4] = _mm_sub_epi32(u[4], v[4]);
          u[5] = _mm_sub_epi32(u[5], v[5]);
          u[6] = _mm_sub_epi32(u[6], v[6]);
          u[7] = _mm_sub_epi32(u[7], v[7]);
          u[8] = _mm_sub_epi32(u[8], v[8]);
          u[9] = _mm_sub_epi32(u[9], v[9]);
          u[10] = _mm_sub_epi32(u[10], v[10]);
          u[11] = _mm_sub_epi32(u[11], v[11]);
          u[12] = _mm_sub_epi32(u[12], v[12]);
          u[13] = _mm_sub_epi32(u[13], v[13]);
          u[14] = _mm_sub_epi32(u[14], v[14]);
          u[15] = _mm_sub_epi32(u[15], v[15]);
          v[0] = _mm_add_epi32(u[0], K32One);
          v[1] = _mm_add_epi32(u[1], K32One);
          v[2] = _mm_add_epi32(u[2], K32One);
          v[3] = _mm_add_epi32(u[3], K32One);
          v[4] = _mm_add_epi32(u[4], K32One);
          v[5] = _mm_add_epi32(u[5], K32One);
          v[6] = _mm_add_epi32(u[6], K32One);
          v[7] = _mm_add_epi32(u[7], K32One);
          v[8] = _mm_add_epi32(u[8], K32One);
          v[9] = _mm_add_epi32(u[9], K32One);
          v[10] = _mm_add_epi32(u[10], K32One);
          v[11] = _mm_add_epi32(u[11], K32One);
          v[12] = _mm_add_epi32(u[12], K32One);
          v[13] = _mm_add_epi32(u[13], K32One);
          v[14] = _mm_add_epi32(u[14], K32One);
          v[15] = _mm_add_epi32(u[15], K32One);
          u[0] = _mm_srai_epi32(v[0], 2);
          u[1] = _mm_srai_epi32(v[1], 2);
          u[2] = _mm_srai_epi32(v[2], 2);
          u[3] = _mm_srai_epi32(v[3], 2);
          u[4] = _mm_srai_epi32(v[4], 2);
          u[5] = _mm_srai_epi32(v[5], 2);
          u[6] = _mm_srai_epi32(v[6], 2);
          u[7] = _mm_srai_epi32(v[7], 2);
          u[8] = _mm_srai_epi32(v[8], 2);
          u[9] = _mm_srai_epi32(v[9], 2);
          u[10] = _mm_srai_epi32(v[10], 2);
          u[11] = _mm_srai_epi32(v[11], 2);
          u[12] = _mm_srai_epi32(v[12], 2);
          u[13] = _mm_srai_epi32(v[13], 2);
          u[14] = _mm_srai_epi32(v[14], 2);
          u[15] = _mm_srai_epi32(v[15], 2);
          out[2] = _mm_packs_epi32(u[0], u[1]);
          out[18] = _mm_packs_epi32(u[2], u[3]);
          out[10] = _mm_packs_epi32(u[4], u[5]);
          out[26] = _mm_packs_epi32(u[6], u[7]);
          out[6] = _mm_packs_epi32(u[8], u[9]);
          out[22] = _mm_packs_epi32(u[10], u[11]);
          out[14] = _mm_packs_epi32(u[12], u[13]);
          out[30] = _mm_packs_epi32(u[14], u[15]);
#if DCT_HIGH_BIT_DEPTH
          overflow =
              check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
                                      &out[6], &out[22], &out[14], &out[30]);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
        {
          lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
          lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
          lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
          lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
          lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
          lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
          lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
          lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
          lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
          lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
          lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
          lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
          lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
          lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
          lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
          lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
          lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
          lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
          lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
          lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
          lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
          lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
          lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
          lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
          lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
          lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
          lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
          lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
          lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
          lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
          lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
          lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
        }
        // stage 8
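        // Stage 8 produces the remaining odd output rows. This first
        // block covers rows 1, 7, 9, 15, 17, 23, 25 and 31 via the
        // cospi_1_64 ... cospi_31_64 pairs.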
        {
          const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
          const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
          const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
          const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
          const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
          const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
          const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
          const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
          u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
          u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
          u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
          u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
          u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
          u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
          u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
          u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
          u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
          u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
          u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
          u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
          u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
          u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
          u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
          u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
          v[0] = k_madd_epi32(u[0], k32_p31_p01);
          v[1] = k_madd_epi32(u[1], k32_p31_p01);
          v[2] = k_madd_epi32(u[2], k32_p31_p01);
          v[3] = k_madd_epi32(u[3], k32_p31_p01);
          v[4] = k_madd_epi32(u[4], k32_p15_p17);
          v[5] = k_madd_epi32(u[5], k32_p15_p17);
          v[6] = k_madd_epi32(u[6], k32_p15_p17);
          v[7] = k_madd_epi32(u[7], k32_p15_p17);
          v[8] = k_madd_epi32(u[8], k32_p23_p09);
          v[9] = k_madd_epi32(u[9], k32_p23_p09);
          v[10] = k_madd_epi32(u[10], k32_p23_p09);
          v[11] = k_madd_epi32(u[11], k32_p23_p09);
          v[12] = k_madd_epi32(u[12], k32_p07_p25);
          v[13] = k_madd_epi32(u[13], k32_p07_p25);
          v[14] = k_madd_epi32(u[14], k32_p07_p25);
          v[15] = k_madd_epi32(u[15], k32_p07_p25);
          v[16] = k_madd_epi32(u[12], k32_m25_p07);
          v[17] = k_madd_epi32(u[13], k32_m25_p07);
          v[18] = k_madd_epi32(u[14], k32_m25_p07);
          v[19] = k_madd_epi32(u[15], k32_m25_p07);
          v[20] = k_madd_epi32(u[8], k32_m09_p23);
          v[21] = k_madd_epi32(u[9], k32_m09_p23);
          v[22] = k_madd_epi32(u[10], k32_m09_p23);
          v[23] = k_madd_epi32(u[11], k32_m09_p23);
          v[24] = k_madd_epi32(u[4], k32_m17_p15);
          v[25] = k_madd_epi32(u[5], k32_m17_p15);
          v[26] = k_madd_epi32(u[6], k32_m17_p15);
          v[27] = k_madd_epi32(u[7], k32_m17_p15);
          v[28] = k_madd_epi32(u[0], k32_m01_p31);
          v[29] = k_madd_epi32(u[1], k32_m01_p31);
          v[30] = k_madd_epi32(u[2], k32_m01_p31);
          v[31] = k_madd_epi32(u[3], k32_m01_p31);
#if DCT_HIGH_BIT_DEPTH
          overflow = k_check_epi32_overflow_32(
              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
          u[0] = k_packs_epi64(v[0], v[1]);
          u[1] = k_packs_epi64(v[2], v[3]);
          u[2] = k_packs_epi64(v[4], v[5]);
          u[3] = k_packs_epi64(v[6], v[7]);
          u[4] = k_packs_epi64(v[8], v[9]);
          u[5] = k_packs_epi64(v[10], v[11]);
          u[6] = k_packs_epi64(v[12], v[13]);
          u[7] = k_packs_epi64(v[14], v[15]);
          u[8] = k_packs_epi64(v[16], v[17]);
          u[9] = k_packs_epi64(v[18], v[19]);
          u[10] = k_packs_epi64(v[20], v[21]);
          u[11] = k_packs_epi64(v[22], v[23]);
          u[12] = k_packs_epi64(v[24], v[25]);
          u[13] = k_packs_epi64(v[26], v[27]);
          u[14] = k_packs_epi64(v[28], v[29]);
          u[15] = k_packs_epi64(v[30], v[31]);
          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
          u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
          u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
          v[0] = _mm_cmplt_epi32(u[0], kZero);
          v[1] = _mm_cmplt_epi32(u[1], kZero);
          v[2] = _mm_cmplt_epi32(u[2], kZero);
          v[3] = _mm_cmplt_epi32(u[3], kZero);
          v[4] = _mm_cmplt_epi32(u[4], kZero);
          v[5] = _mm_cmplt_epi32(u[5], kZero);
          v[6] = _mm_cmplt_epi32(u[6], kZero);
          v[7] = _mm_cmplt_epi32(u[7], kZero);
          v[8] = _mm_cmplt_epi32(u[8], kZero);
          v[9] = _mm_cmplt_epi32(u[9], kZero);
          v[10] = _mm_cmplt_epi32(u[10], kZero);
          v[11] = _mm_cmplt_epi32(u[11], kZero);
          v[12] = _mm_cmplt_epi32(u[12], kZero);
          v[13] = _mm_cmplt_epi32(u[13], kZero);
          v[14] = _mm_cmplt_epi32(u[14], kZero);
          v[15] = _mm_cmplt_epi32(u[15], kZero);
          u[0] = _mm_sub_epi32(u[0], v[0]);
          u[1] = _mm_sub_epi32(u[1], v[1]);
          u[2] = _mm_sub_epi32(u[2], v[2]);
          u[3] = _mm_sub_epi32(u[3], v[3]);
          u[4] = _mm_sub_epi32(u[4], v[4]);
          u[5] = _mm_sub_epi32(u[5], v[5]);
          u[6] = _mm_sub_epi32(u[6], v[6]);
          u[7] = _mm_sub_epi32(u[7], v[7]);
          u[8] = _mm_sub_epi32(u[8], v[8]);
          u[9] = _mm_sub_epi32(u[9], v[9]);
          u[10] = _mm_sub_epi32(u[10], v[10]);
          u[11] = _mm_sub_epi32(u[11], v[11]);
          u[12] = _mm_sub_epi32(u[12], v[12]);
          u[13] = _mm_sub_epi32(u[13], v[13]);
          u[14] = _mm_sub_epi32(u[14], v[14]);
          u[15] = _mm_sub_epi32(u[15], v[15]);
          v[0] = _mm_add_epi32(u[0], K32One);
          v[1] = _mm_add_epi32(u[1], K32One);
          v[2] = _mm_add_epi32(u[2], K32One);
          v[3] = _mm_add_epi32(u[3], K32One);
          v[4] = _mm_add_epi32(u[4], K32One);
          v[5] = _mm_add_epi32(u[5], K32One);
          v[6] = _mm_add_epi32(u[6], K32One);
          v[7] = _mm_add_epi32(u[7], K32One);
          v[8] = _mm_add_epi32(u[8], K32One);
          v[9] = _mm_add_epi32(u[9], K32One);
          v[10] = _mm_add_epi32(u[10], K32One);
          v[11] = _mm_add_epi32(u[11], K32One);
          v[12] = _mm_add_epi32(u[12], K32One);
          v[13] = _mm_add_epi32(u[13], K32One);
          v[14] = _mm_add_epi32(u[14], K32One);
          v[15] = _mm_add_epi32(u[15], K32One);
          u[0] = _mm_srai_epi32(v[0], 2);
          u[1] = _mm_srai_epi32(v[1], 2);
          u[2] = _mm_srai_epi32(v[2], 2);
          u[3] = _mm_srai_epi32(v[3], 2);
          u[4] = _mm_srai_epi32(v[4], 2);
          u[5] = _mm_srai_epi32(v[5], 2);
          u[6] = _mm_srai_epi32(v[6], 2);
          u[7] = _mm_srai_epi32(v[7], 2);
          u[8] = _mm_srai_epi32(v[8], 2);
          u[9] = _mm_srai_epi32(v[9], 2);
          u[10] = _mm_srai_epi32(v[10], 2);
          u[11] = _mm_srai_epi32(v[11], 2);
          u[12] = _mm_srai_epi32(v[12], 2);
          u[13] = _mm_srai_epi32(v[13], 2);
          u[14] = _mm_srai_epi32(v[14], 2);
          u[15] = _mm_srai_epi32(v[15], 2);
          out[1] = _mm_packs_epi32(u[0], u[1]);
          out[17] = _mm_packs_epi32(u[2], u[3]);
          out[9] = _mm_packs_epi32(u[4], u[5]);
          out[25] = _mm_packs_epi32(u[6], u[7]);
          out[7] = _mm_packs_epi32(u[8], u[9]);
          out[23] = _mm_packs_epi32(u[10], u[11]);
          out[15] = _mm_packs_epi32(u[12], u[13]);
          out[31] = _mm_packs_epi32(u[14], u[15]);
#if DCT_HIGH_BIT_DEPTH
          overflow =
              check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
                                      &out[7], &out[23], &out[15], &out[31]);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
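        // Second half of stage 8: rows 3, 5, 11, 13, 19, 21, 27 and 29
        // via the cospi_3_64 ... cospi_29_64 pairs.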
        {
          const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
          const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
          const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
          const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
          const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
          const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
          const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
          const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
          u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
          u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
          u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
          u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
          u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
          u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
          u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
          u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
          u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
          u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
          u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
          u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
          u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
          u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
          u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
          u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
          v[0] = k_madd_epi32(u[0], k32_p27_p05);
          v[1] = k_madd_epi32(u[1], k32_p27_p05);
          v[2] = k_madd_epi32(u[2], k32_p27_p05);
          v[3] = k_madd_epi32(u[3], k32_p27_p05);
          v[4] = k_madd_epi32(u[4], k32_p11_p21);
          v[5] = k_madd_epi32(u[5], k32_p11_p21);
          v[6] = k_madd_epi32(u[6], k32_p11_p21);
          v[7] = k_madd_epi32(u[7], k32_p11_p21);
          v[8] = k_madd_epi32(u[8], k32_p19_p13);
          v[9] = k_madd_epi32(u[9], k32_p19_p13);
          v[10] = k_madd_epi32(u[10], k32_p19_p13);
          v[11] = k_madd_epi32(u[11], k32_p19_p13);
          v[12] = k_madd_epi32(u[12], k32_p03_p29);
          v[13] = k_madd_epi32(u[13], k32_p03_p29);
          v[14] = k_madd_epi32(u[14], k32_p03_p29);
          v[15] = k_madd_epi32(u[15], k32_p03_p29);
          v[16] = k_madd_epi32(u[12], k32_m29_p03);
          v[17] = k_madd_epi32(u[13], k32_m29_p03);
          v[18] = k_madd_epi32(u[14], k32_m29_p03);
          v[19] = k_madd_epi32(u[15], k32_m29_p03);
          v[20] = k_madd_epi32(u[8], k32_m13_p19);
          v[21] = k_madd_epi32(u[9], k32_m13_p19);
          v[22] = k_madd_epi32(u[10], k32_m13_p19);
          v[23] = k_madd_epi32(u[11], k32_m13_p19);
          v[24] = k_madd_epi32(u[4], k32_m21_p11);
          v[25] = k_madd_epi32(u[5], k32_m21_p11);
          v[26] = k_madd_epi32(u[6], k32_m21_p11);
          v[27] = k_madd_epi32(u[7], k32_m21_p11);
          v[28] = k_madd_epi32(u[0], k32_m05_p27);
          v[29] = k_madd_epi32(u[1], k32_m05_p27);
          v[30] = k_madd_epi32(u[2], k32_m05_p27);
          v[31] = k_madd_epi32(u[3], k32_m05_p27);
#if DCT_HIGH_BIT_DEPTH
          overflow = k_check_epi32_overflow_32(
              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
          u[0] = k_packs_epi64(v[0], v[1]);
          u[1] = k_packs_epi64(v[2], v[3]);
          u[2] = k_packs_epi64(v[4], v[5]);
          u[3] = k_packs_epi64(v[6], v[7]);
          u[4] = k_packs_epi64(v[8], v[9]);
          u[5] = k_packs_epi64(v[10], v[11]);
          u[6] = k_packs_epi64(v[12], v[13]);
          u[7] = k_packs_epi64(v[14], v[15]);
          u[8] = k_packs_epi64(v[16], v[17]);
          u[9] = k_packs_epi64(v[18], v[19]);
          u[10] = k_packs_epi64(v[20], v[21]);
          u[11] = k_packs_epi64(v[22], v[23]);
          u[12] = k_packs_epi64(v[24], v[25]);
          u[13] = k_packs_epi64(v[26], v[27]);
          u[14] = k_packs_epi64(v[28], v[29]);
          u[15] = k_packs_epi64(v[30], v[31]);
          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
          u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
          u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
          v[0] = _mm_cmplt_epi32(u[0], kZero);
          v[1] = _mm_cmplt_epi32(u[1], kZero);
          v[2] = _mm_cmplt_epi32(u[2], kZero);
          v[3] = _mm_cmplt_epi32(u[3], kZero);
          v[4] = _mm_cmplt_epi32(u[4], kZero);
          v[5] = _mm_cmplt_epi32(u[5], kZero);
          v[6] = _mm_cmplt_epi32(u[6], kZero);
          v[7] = _mm_cmplt_epi32(u[7], kZero);
          v[8] = _mm_cmplt_epi32(u[8], kZero);
          v[9] = _mm_cmplt_epi32(u[9], kZero);
          v[10] = _mm_cmplt_epi32(u[10], kZero);
          v[11] = _mm_cmplt_epi32(u[11], kZero);
          v[12] = _mm_cmplt_epi32(u[12], kZero);
          v[13] = _mm_cmplt_epi32(u[13], kZero);
          v[14] = _mm_cmplt_epi32(u[14], kZero);
          v[15] = _mm_cmplt_epi32(u[15], kZero);
          u[0] = _mm_sub_epi32(u[0], v[0]);
          u[1] = _mm_sub_epi32(u[1], v[1]);
          u[2] = _mm_sub_epi32(u[2], v[2]);
          u[3] = _mm_sub_epi32(u[3], v[3]);
          u[4] = _mm_sub_epi32(u[4], v[4]);
          u[5] = _mm_sub_epi32(u[5], v[5]);
          u[6] = _mm_sub_epi32(u[6], v[6]);
          u[7] = _mm_sub_epi32(u[7], v[7]);
          u[8] = _mm_sub_epi32(u[8], v[8]);
          u[9] = _mm_sub_epi32(u[9], v[9]);
          u[10] = _mm_sub_epi32(u[10], v[10]);
          u[11] = _mm_sub_epi32(u[11], v[11]);
          u[12] = _mm_sub_epi32(u[12], v[12]);
          u[13] = _mm_sub_epi32(u[13], v[13]);
          u[14] = _mm_sub_epi32(u[14], v[14]);
          u[15] = _mm_sub_epi32(u[15], v[15]);
          v[0] = _mm_add_epi32(u[0], K32One);
          v[1] = _mm_add_epi32(u[1], K32One);
          v[2] = _mm_add_epi32(u[2], K32One);
          v[3] = _mm_add_epi32(u[3], K32One);
          v[4] = _mm_add_epi32(u[4], K32One);
          v[5] = _mm_add_epi32(u[5], K32One);
          v[6] = _mm_add_epi32(u[6], K32One);
          v[7] = _mm_add_epi32(u[7], K32One);
          v[8] = _mm_add_epi32(u[8], K32One);
          v[9] = _mm_add_epi32(u[9], K32One);
          v[10] = _mm_add_epi32(u[10], K32One);
          v[11] = _mm_add_epi32(u[11], K32One);
          v[12] = _mm_add_epi32(u[12], K32One);
          v[13] = _mm_add_epi32(u[13], K32One);
          v[14] = _mm_add_epi32(u[14], K32One);
          v[15] = _mm_add_epi32(u[15], K32One);
          u[0] = _mm_srai_epi32(v[0], 2);
          u[1] = _mm_srai_epi32(v[1], 2);
          u[2] = _mm_srai_epi32(v[2], 2);
          u[3] = _mm_srai_epi32(v[3], 2);
          u[4] = _mm_srai_epi32(v[4], 2);
          u[5] = _mm_srai_epi32(v[5], 2);
          u[6] = _mm_srai_epi32(v[6], 2);
          u[7] = _mm_srai_epi32(v[7], 2);
          u[8] = _mm_srai_epi32(v[8], 2);
          u[9] = _mm_srai_epi32(v[9], 2);
          u[10] = _mm_srai_epi32(v[10], 2);
          u[11] = _mm_srai_epi32(v[11], 2);
          u[12] = _mm_srai_epi32(v[12], 2);
          u[13] = _mm_srai_epi32(v[13], 2);
          u[14] = _mm_srai_epi32(v[14], 2);
          u[15] = _mm_srai_epi32(v[15], 2);
          out[5] = _mm_packs_epi32(u[0], u[1]);
          out[21] = _mm_packs_epi32(u[2], u[3]);
          out[13] = _mm_packs_epi32(u[4], u[5]);
          out[29] = _mm_packs_epi32(u[6], u[7]);
          out[3] = _mm_packs_epi32(u[8], u[9]);
          out[19] = _mm_packs_epi32(u[10], u[11]);
          out[11] = _mm_packs_epi32(u[12], u[13]);
          out[27] = _mm_packs_epi32(u[14], u[15]);
#if DCT_HIGH_BIT_DEPTH
          overflow =
              check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
                                      &out[3], &out[19], &out[11], &out[27]);
          if (overflow) {
            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
            return;
          }
#endif  // DCT_HIGH_BIT_DEPTH
        }
      }
#endif  // FDCT32x32_HIGH_PRECISION
      // Transpose the results, do it as four 8x8 transposes.
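      // Each 8x8 transpose below is the classic three-level SSE2
      // interleave: unpack 16-bit lanes, then 32-bit lanes, then 64-bit
      // lanes, as traced by the lane diagrams in the loop body.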
      {
        int transpose_block;
        int16_t *output0 = &intermediate[column_start * 32];
        tran_low_t *output1 = &output_org[column_start * 32];
        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
          __m128i *this_out = &out[8 * transpose_block];
          // 00 01 02 03 04 05 06 07
          // 10 11 12 13 14 15 16 17
          // 20 21 22 23 24 25 26 27
          // 30 31 32 33 34 35 36 37
          // 40 41 42 43 44 45 46 47
          // 50 51 52 53 54 55 56 57
          // 60 61 62 63 64 65 66 67
          // 70 71 72 73 74 75 76 77
          const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
          const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
          const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
          const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
          const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
          const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
          const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
          const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
          // 00 10 01 11 02 12 03 13
          // 20 30 21 31 22 32 23 33
          // 04 14 05 15 06 16 07 17
          // 24 34 25 35 26 36 27 37
          // 40 50 41 51 42 52 43 53
          // 60 70 61 71 62 72 63 73
          // 44 54 45 55 46 56 47 57
          // 64 74 65 75 66 76 67 77
          const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
          const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
          const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
          const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
          const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
          const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
          const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
          const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
          // 00 10 20 30 01 11 21 31
          // 40 50 60 70 41 51 61 71
          // 02 12 22 32 03 13 23 33
          // 42 52 62 72 43 53 63 73
          // 04 14 24 34 05 15 25 35
          // 44 54 64 74 45 55 65 75
          // 06 16 26 36 07 17 27 37
          // 46 56 66 76 47 57 67 77
          __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
          __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
          __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
          __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
          __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
          __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
          __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
          __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
          // 00 10 20 30 40 50 60 70
          // 01 11 21 31 41 51 61 71
          // 02 12 22 32 42 52 62 72
          // 03 13 23 33 43 53 63 73
          // 04 14 24 34 44 54 64 74
          // 05 15 25 35 45 55 65 75
          // 06 16 26 36 46 56 66 76
          // 07 17 27 37 47 57 67 77
          if (0 == pass) {
            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
            // TODO(cd): see quality impact of only doing
            //           output[j] = (output[j] + 1) >> 2;
            //           which would remove the code between here ...
            __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
            __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
            __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
            __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
            __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
            __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
            __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
            __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
            tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
            tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
            tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
            tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
            tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
            tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
            tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
            tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
            // ... and here.
            // PS: also change code in vp9/encoder/vp9_dct.c
            tr2_0 = _mm_add_epi16(tr2_0, kOne);
            tr2_1 = _mm_add_epi16(tr2_1, kOne);
            tr2_2 = _mm_add_epi16(tr2_2, kOne);
            tr2_3 = _mm_add_epi16(tr2_3, kOne);
            tr2_4 = _mm_add_epi16(tr2_4, kOne);
            tr2_5 = _mm_add_epi16(tr2_5, kOne);
            tr2_6 = _mm_add_epi16(tr2_6, kOne);
            tr2_7 = _mm_add_epi16(tr2_7, kOne);
            tr2_0 = _mm_srai_epi16(tr2_0, 2);
            tr2_1 = _mm_srai_epi16(tr2_1, 2);
            tr2_2 = _mm_srai_epi16(tr2_2, 2);
            tr2_3 = _mm_srai_epi16(tr2_3, 2);
            tr2_4 = _mm_srai_epi16(tr2_4, 2);
            tr2_5 = _mm_srai_epi16(tr2_5, 2);
            tr2_6 = _mm_srai_epi16(tr2_6, 2);
            tr2_7 = _mm_srai_epi16(tr2_7, 2);
          }
          // Note: even though all these stores are aligned, using the
          // aligned intrinsic makes the code slightly slower.
          if (pass == 0) {
            _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
            _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
            _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
            _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
            _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
            _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
            _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
            _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
            // Process next 8x8
            output0 += 8;
          } else {
            storeu_output(&tr2_0, (output1 + 0 * 32));
            storeu_output(&tr2_1, (output1 + 1 * 32));
            storeu_output(&tr2_2, (output1 + 2 * 32));
            storeu_output(&tr2_3, (output1 + 3 * 32));
            storeu_output(&tr2_4, (output1 + 4 * 32));
            storeu_output(&tr2_5, (output1 + 5 * 32));
            storeu_output(&tr2_6, (output1 + 6 * 32));
            storeu_output(&tr2_7, (output1 + 7 * 32));
            // Process next 8x8
            output1 += 8;
          }
        }
      }
    }
  }
}  // NOLINT
#undef ADD_EPI16
#undef SUB_EPI16
#undef HIGH_FDCT32x32_2D_C
#undef HIGH_FDCT32x32_2D_ROWS_C