scale_gcc.cc

/*
 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
                              8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
                              10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
// Rounding constant for the 3/4 box filters (added before the >>2 shift).
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
                               128, 128, 128, 128, 128, 128, 128, 128};
static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
                               6, 8, 11, 14, 128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
                              128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
                               6, 7, 12, 13, 128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                                  65536 / 9, 65536 / 6, 0, 0};

// Arrange first value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
                               11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
                               12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
                               13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                                 65536 / 3, 65536 / 2, 0, 0};
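
// Note on the kScale* tables above: each entry is a reciprocal in the form
// 65536 / N, consumed later by pmulhuw, which keeps the high 16 bits of an
// unsigned 16x16 multiply, i.e. (a * b) >> 16. A rough scalar illustration
// (not libyuv code; the values are made up):
//
//   uint16_t sum = 9 * 200;                                // a 3x3 box sum
//   uint16_t avg = (uint16_t)((sum * (65536 / 9)) >> 16);  // 199, ~sum / 9
//
// so the divide by 9 (or 6, 3, 2) happens without an integer division, at
// the cost of a small truncation error.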

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "psrlw $0x8,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1");
}
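
// Roughly what the loop above computes, as scalar C (an illustrative sketch,
// not libyuv's own C fallback):
//
//   for (int i = 0; i < dst_width; ++i) {
//     dst_ptr[i] = src_ptr[i * 2 + 1];  // psrlw $8 keeps the odd byte of each pair
//   }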

void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  (void)src_stride;
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrlw $0xf,%%xmm4 \n"
      "packuswb %%xmm4,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm1 \n"
      "pavgw %%xmm5,%%xmm0 \n"
      "pavgw %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
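
// In the linear version above, xmm4 holds 16 bytes of 0x01, so pmaddubsw
// yields the sum of each horizontal byte pair, and pavgw against zero turns
// that sum into a rounded half. Scalar sketch (illustrative only):
//
//   dst_ptr[i] = (uint8_t)((src_ptr[2 * i] + src_ptr[2 * i + 1] + 1) >> 1);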

void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrlw $0xf,%%xmm4 \n"
      "packuswb %%xmm4,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x00(%0,%3,1),%%xmm2 \n"
      "movdqu 0x10(%0,%3,1),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm1 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm3 \n"
      "paddw %%xmm2,%%xmm0 \n"
      "paddw %%xmm3,%%xmm1 \n"
      "psrlw $0x1,%%xmm0 \n"
      "psrlw $0x1,%%xmm1 \n"
      "pavgw %%xmm5,%%xmm0 \n"
      "pavgw %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
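
// The box version above averages a 2x2 block with rounding: the psrlw $1
// plus pavgw pair is equivalent to (sum + 2) >> 2. Scalar sketch
// (illustrative; assumes the second row lives at src_ptr + src_stride):
//
//   const uint8_t* s = src_ptr;
//   const uint8_t* t = src_ptr + src_stride;
//   for (int i = 0; i < dst_width; ++i) {
//     dst_ptr[i] = (uint8_t)((s[2 * i] + s[2 * i + 1] +
//                             t[2 * i] + t[2 * i + 1] + 2) >> 2);
//   }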

#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1");
}

void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
      "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
      "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}

void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  asm volatile(
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
      "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
      "lea 0x40(%0),%0 \n"
      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
      "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
      "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SCALEROWDOWN2_AVX2

void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrld $0x18,%%xmm5 \n"
      "pslld $0x10,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pand %%xmm5,%%xmm0 \n"
      "pand %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "psrlw $0x8,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}
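
// The mask built above is 0x00FF0000 per dword, so only byte 2 of every group
// of 4 survives; after the pack/shift sequence this point-samples
// dst_ptr[i] = src_ptr[4 * i + 2].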

void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  intptr_t stridex3;
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrlw $0xf,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "packuswb %%xmm4,%%xmm4 \n"
      "psllw $0x3,%%xmm5 \n"
      "lea 0x00(%4,%4,2),%3 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x00(%0,%4,1),%%xmm2 \n"
      "movdqu 0x10(%0,%4,1),%%xmm3 \n"
      "pmaddubsw %%xmm4,%%xmm0 \n"
      "pmaddubsw %%xmm4,%%xmm1 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm3 \n"
      "paddw %%xmm2,%%xmm0 \n"
      "paddw %%xmm3,%%xmm1 \n"
      "movdqu 0x00(%0,%4,2),%%xmm2 \n"
      "movdqu 0x10(%0,%4,2),%%xmm3 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm3 \n"
      "paddw %%xmm2,%%xmm0 \n"
      "paddw %%xmm3,%%xmm1 \n"
      "movdqu 0x00(%0,%3,1),%%xmm2 \n"
      "movdqu 0x10(%0,%3,1),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "pmaddubsw %%xmm4,%%xmm2 \n"
      "pmaddubsw %%xmm4,%%xmm3 \n"
      "paddw %%xmm2,%%xmm0 \n"
      "paddw %%xmm3,%%xmm1 \n"
      "phaddw %%xmm1,%%xmm0 \n"
      "paddw %%xmm5,%%xmm0 \n"
      "psrlw $0x4,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width),             // %2
        "=&r"(stridex3)              // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
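
// In the 4x4 box above, pmaddubsw pairs pixels horizontally, the paddw chain
// accumulates 4 rows, and phaddw widens each sum to 4 source columns, so every
// output word holds the sum of 16 pixels; xmm5 (8 per lane) then rounds the
// final shift, i.e. dst = (sum + 8) >> 4.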

#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrld $0x18,%%ymm5,%%ymm5 \n"
      "vpslld $0x10,%%ymm5,%%ymm5 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
      "vpand %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}

void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  asm volatile(
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
      "vpsllw $0x3,%%ymm4,%%ymm5 \n"
      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
      "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
      "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
      "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
      "lea 0x40(%0),%0 \n"
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),                   // %0
        "+r"(dst_ptr),                   // %1
        "+r"(dst_width)                  // %2
      : "r"((intptr_t)(src_stride)),     // %3
        "r"((intptr_t)(src_stride * 3))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEROWDOWN4_AVX2

void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa %0,%%xmm3 \n"
      "movdqa %1,%%xmm4 \n"
      "movdqa %2,%%xmm5 \n"
      :
      : "m"(kShuf0),  // %0
        "m"(kShuf1),  // %1
        "m"(kShuf2)   // %2
  );
  asm volatile(
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm2 \n"
      "lea 0x20(%0),%0 \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "palignr $0x8,%%xmm0,%%xmm1 \n"
      "pshufb %%xmm3,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "pshufb %%xmm5,%%xmm2 \n"
      "movq %%xmm0,(%1) \n"
      "movq %%xmm1,0x8(%1) \n"
      "movq %%xmm2,0x10(%1) \n"
      "lea 0x18(%1),%1 \n"
      "sub $0x18,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa %0,%%xmm2 \n"  // kShuf01
      "movdqa %1,%%xmm3 \n"  // kShuf11
      "movdqa %2,%%xmm4 \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
  );
  asm volatile(
      "movdqa %0,%%xmm5 \n"  // kMadd01
      "movdqa %1,%%xmm0 \n"  // kMadd11
      "movdqa %2,%%xmm1 \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
  );
  asm volatile(
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm6 \n"
      "movdqu 0x00(%0,%3,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"
      "pshufb %%xmm2,%%xmm6 \n"
      "pmaddubsw %%xmm5,%%xmm6 \n"
      "paddsw %%xmm1,%%xmm6 \n"
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,(%1) \n"
      "movdqu 0x8(%0),%%xmm6 \n"
      "movdqu 0x8(%0,%3,1),%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"
      "pshufb %%xmm3,%%xmm6 \n"
      "pmaddubsw %%xmm0,%%xmm6 \n"
      "paddsw %%xmm1,%%xmm6 \n"
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,0x8(%1) \n"
      "movdqu 0x10(%0),%%xmm6 \n"
      "movdqu 0x10(%0,%3,1),%%xmm7 \n"
      "lea 0x20(%0),%0 \n"
      "pavgb %%xmm7,%%xmm6 \n"
      "pshufb %%xmm4,%%xmm6 \n"
      "pmaddubsw %4,%%xmm6 \n"
      "paddsw %%xmm1,%%xmm6 \n"
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,0x10(%1) \n"
      "lea 0x18(%1),%1 \n"
      "sub $0x18,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "m"(kMadd21)                  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa %0,%%xmm2 \n"  // kShuf01
      "movdqa %1,%%xmm3 \n"  // kShuf11
      "movdqa %2,%%xmm4 \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
  );
  asm volatile(
      "movdqa %0,%%xmm5 \n"  // kMadd01
      "movdqa %1,%%xmm0 \n"  // kMadd11
      "movdqa %2,%%xmm1 \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
  );
  asm volatile(
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm6 \n"
      "movdqu 0x00(%0,%3,1),%%xmm7 \n"
      "pavgb %%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"
      "pshufb %%xmm2,%%xmm6 \n"
      "pmaddubsw %%xmm5,%%xmm6 \n"
      "paddsw %%xmm1,%%xmm6 \n"
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,(%1) \n"
      "movdqu 0x8(%0),%%xmm6 \n"
      "movdqu 0x8(%0,%3,1),%%xmm7 \n"
      "pavgb %%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"
      "pshufb %%xmm3,%%xmm6 \n"
      "pmaddubsw %%xmm0,%%xmm6 \n"
      "paddsw %%xmm1,%%xmm6 \n"
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,0x8(%1) \n"
      "movdqu 0x10(%0),%%xmm6 \n"
      "movdqu 0x10(%0,%3,1),%%xmm7 \n"
      "lea 0x20(%0),%0 \n"
      "pavgb %%xmm6,%%xmm7 \n"
      "pavgb %%xmm7,%%xmm6 \n"
      "pshufb %%xmm4,%%xmm6 \n"
      "pmaddubsw %4,%%xmm6 \n"
      "paddsw %%xmm1,%%xmm6 \n"
      "psrlw $0x2,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movq %%xmm6,0x10(%1) \n"
      "lea 0x18(%1),%1 \n"
      "sub $0x18,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "m"(kMadd21)                  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
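
// Both 3/4 box variants above share the horizontal filter: kShuf01/11/21
// gather the source pixels and kMadd01/11/21 weight them 3:1, 2:2 and 1:3,
// with kRound34 providing rounding before the >>2. They differ vertically:
// the _1_ version blends the two rows equally with a single pavgb, while the
// _0_ version applies pavgb twice, which works out to roughly
// (3 * row0 + row1) / 4, where row0 is the row at src_ptr.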

void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "pshufb %%xmm5,%%xmm1 \n"
      "paddusb %%xmm1,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "movhlps %%xmm0,%%xmm1 \n"
      "movd %%xmm1,0x8(%1) \n"
      "lea 0xc(%1),%1 \n"
      "sub $0xc,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "m"(kShuf38a),   // %3
        "m"(kShuf38b)    // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}

void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa %0,%%xmm2 \n"
      "movdqa %1,%%xmm3 \n"
      "movdqa %2,%%xmm4 \n"
      "movdqa %3,%%xmm5 \n"
      :
      : "m"(kShufAb0),  // %0
        "m"(kShufAb1),  // %1
        "m"(kShufAb2),  // %2
        "m"(kScaleAb2)  // %3
  );
  asm volatile(
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%3,1),%%xmm1 \n"
      "lea 0x10(%0),%0 \n"
      "pavgb %%xmm1,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pshufb %%xmm2,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm6 \n"
      "pshufb %%xmm3,%%xmm6 \n"
      "paddusw %%xmm6,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "paddusw %%xmm0,%%xmm1 \n"
      "pmulhuw %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm1 \n"
      "movd %%xmm1,(%1) \n"
      "psrlq $0x10,%%xmm1 \n"
      "movd %%xmm1,0x2(%1) \n"
      "lea 0x6(%1),%1 \n"
      "sub $0x6,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa %0,%%xmm2 \n"
      "movdqa %1,%%xmm3 \n"
      "movdqa %2,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"
      :
      : "m"(kShufAc),    // %0
        "m"(kShufAc3),   // %1
        "m"(kScaleAc33)  // %2
  );
  asm volatile(
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%3,1),%%xmm6 \n"
      "movhlps %%xmm0,%%xmm1 \n"
      "movhlps %%xmm6,%%xmm7 \n"
      "punpcklbw %%xmm5,%%xmm0 \n"
      "punpcklbw %%xmm5,%%xmm1 \n"
      "punpcklbw %%xmm5,%%xmm6 \n"
      "punpcklbw %%xmm5,%%xmm7 \n"
      "paddusw %%xmm6,%%xmm0 \n"
      "paddusw %%xmm7,%%xmm1 \n"
      "movdqu 0x00(%0,%3,2),%%xmm6 \n"
      "lea 0x10(%0),%0 \n"
      "movhlps %%xmm6,%%xmm7 \n"
      "punpcklbw %%xmm5,%%xmm6 \n"
      "punpcklbw %%xmm5,%%xmm7 \n"
      "paddusw %%xmm6,%%xmm0 \n"
      "paddusw %%xmm7,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm6 \n"
      "psrldq $0x2,%%xmm0 \n"
      "paddusw %%xmm0,%%xmm6 \n"
      "psrldq $0x2,%%xmm0 \n"
      "paddusw %%xmm0,%%xmm6 \n"
      "pshufb %%xmm2,%%xmm6 \n"
      "movdqa %%xmm1,%%xmm7 \n"
      "psrldq $0x2,%%xmm1 \n"
      "paddusw %%xmm1,%%xmm7 \n"
      "psrldq $0x2,%%xmm1 \n"
      "paddusw %%xmm1,%%xmm7 \n"
      "pshufb %%xmm3,%%xmm7 \n"
      "paddusw %%xmm7,%%xmm6 \n"
      "pmulhuw %%xmm4,%%xmm6 \n"
      "packuswb %%xmm6,%%xmm6 \n"
      "movd %%xmm6,(%1) \n"
      "psrlq $0x10,%%xmm6 \n"
      "movd %%xmm6,0x2(%1) \n"
      "lea 0x6(%1),%1 \n"
      "sub $0x6,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
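
// The 3/8 kernels above produce 3 output pixels per 8 source pixels: the
// _3_Box version sums a 3x3 neighborhood and the _2_Box version works on a
// 3x2 one, then pmulhuw with kScaleAc33 / kScaleAb2 replaces the divide,
// e.g. sum * (65536 / 9) >> 16 instead of sum / 9.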

// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile(
      "pxor %%xmm5,%%xmm5 \n"
      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm3 \n"
      "lea 0x10(%0),%0 \n"  // src_ptr += 16
      "movdqu (%1),%%xmm0 \n"
      "movdqu 0x10(%1),%%xmm1 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "punpcklbw %%xmm5,%%xmm2 \n"
      "punpckhbw %%xmm5,%%xmm3 \n"
      "paddusw %%xmm2,%%xmm0 \n"
      "paddusw %%xmm3,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
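
// ScaleAddRow accumulates one row of bytes into a row of 16-bit sums, and is
// used by the general box scaler. Scalar sketch (illustrative; note the SIMD
// version uses saturating adds, while plain C addition would wrap):
//
//   for (int i = 0; i < src_width; ++i) {
//     dst_ptr[i] = (uint16_t)(dst_ptr[i] + src_ptr[i]);
//   }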

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile(
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm3 \n"
      "lea 0x20(%0),%0 \n"  // src_ptr += 32
      "vpermq $0xd8,%%ymm3,%%ymm3 \n"
      "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
      "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
      "vpaddusw (%1),%%ymm2,%%ymm0 \n"
      "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                               0x4040, 0x4040, 0x4040, 0x4040};
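
// How the pair above is used in ScaleFilterCols_SSSE3 below: psubb with
// kFsub80 biases pixels to signed so pmaddubsw cannot saturate. The two
// filter weights always sum to 128, so the product comes out
// 128 * 128 = 0x4000 too low, and kFadd40 = 0x4000 + 0x40 both restores the
// bias and adds the 0.5 rounding term before the final >>7.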

// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
                           const uint8_t* src_ptr,
                           int dst_width,
                           int x,
                           int dx) {
  intptr_t x0, x1, temp_pixel;
  asm volatile(
      "movd %6,%%xmm2 \n"
      "movd %7,%%xmm3 \n"
      "movl $0x04040000,%k2 \n"
      "movd %k2,%%xmm5 \n"
      "pcmpeqb %%xmm6,%%xmm6 \n"
      "psrlw $0x9,%%xmm6 \n"  // 0x007f007f
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psrlw $15,%%xmm7 \n"  // 0x00010001
      "pextrw $0x1,%%xmm2,%k3 \n"
      "subl $0x2,%5 \n"
      "jl 29f \n"
      "movdqa %%xmm2,%%xmm0 \n"
      "paddd %%xmm3,%%xmm0 \n"
      "punpckldq %%xmm0,%%xmm2 \n"
      "punpckldq %%xmm3,%%xmm3 \n"
      "paddd %%xmm3,%%xmm3 \n"
      "pextrw $0x3,%%xmm2,%k4 \n"
      LABELALIGN
      "2: \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "paddd %%xmm3,%%xmm2 \n"
      "movzwl 0x00(%1,%3,1),%k2 \n"
      "movd %k2,%%xmm0 \n"
      "psrlw $0x9,%%xmm1 \n"
      "movzwl 0x00(%1,%4,1),%k2 \n"
      "movd %k2,%%xmm4 \n"
      "pshufb %%xmm5,%%xmm1 \n"
      "punpcklwd %%xmm4,%%xmm0 \n"
      "psubb %8,%%xmm0 \n"  // make pixels signed.
  834. "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
  835. // 1
  836. "paddusb %%xmm7,%%xmm1 \n"
  837. "pmaddubsw %%xmm0,%%xmm1 \n"
  838. "pextrw $0x1,%%xmm2,%k3 \n"
  839. "pextrw $0x3,%%xmm2,%k4 \n"
  840. "paddw %9,%%xmm1 \n" // make pixels unsigned.
  841. "psrlw $0x7,%%xmm1 \n"
  842. "packuswb %%xmm1,%%xmm1 \n"
  843. "movd %%xmm1,%k2 \n"
  844. "mov %w2,(%0) \n"
  845. "lea 0x2(%0),%0 \n"
  846. "subl $0x2,%5 \n"
  847. "jge 2b \n"
  848. LABELALIGN
  849. "29: \n"
  850. "addl $0x1,%5 \n"
  851. "jl 99f \n"
  852. "movzwl 0x00(%1,%3,1),%k2 \n"
  853. "movd %k2,%%xmm0 \n"
  854. "psrlw $0x9,%%xmm2 \n"
  855. "pshufb %%xmm5,%%xmm2 \n"
  856. "psubb %8,%%xmm0 \n" // make pixels signed.
  857. "pxor %%xmm6,%%xmm2 \n"
  858. "paddusb %%xmm7,%%xmm2 \n"
  859. "pmaddubsw %%xmm0,%%xmm2 \n"
  860. "paddw %9,%%xmm2 \n" // make pixels unsigned.
  861. "psrlw $0x7,%%xmm2 \n"
  862. "packuswb %%xmm2,%%xmm2 \n"
  863. "movd %%xmm2,%k2 \n"
  864. "mov %b2,(%0) \n"
  865. "99: \n"
  866. : "+r"(dst_ptr), // %0
  867. "+r"(src_ptr), // %1
  868. "=&a"(temp_pixel), // %2
  869. "=&r"(x0), // %3
  870. "=&r"(x1), // %4
  871. #if defined(__x86_64__)
  872. "+rm"(dst_width) // %5
  873. #else
  874. "+m"(dst_width) // %5
  875. #endif
  876. : "rm"(x), // %6
  877. "rm"(dx), // %7
  878. #if defined(__x86_64__)
  879. "x"(kFsub80), // %8
  880. "x"(kFadd40) // %9
  881. #else
  882. "m"(kFsub80), // %8
  883. "m"(kFadd40) // %9
  884. #endif
  885. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  886. "xmm7");
  887. }
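
// A scalar model of ScaleFilterCols_SSSE3 (illustrative; x and dx are 16.16
// fixed point and the asm reduces the fraction to 7 bits):
//
//   for (int i = 0; i < dst_width; ++i) {
//     int xi = x >> 16;
//     int f = (x >> 9) & 0x7f;  // 7-bit fraction
//     dst_ptr[i] = (uint8_t)((src_ptr[xi] * (128 - f) +
//                             src_ptr[xi + 1] * f + 64) >> 7);
//     x += dx;
//   }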

// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
                       const uint8_t* src_ptr,
                       int dst_width,
                       int x,
                       int dx) {
  (void)x;
  (void)dx;
  asm volatile(
      LABELALIGN
      "1: \n"
      "movdqu (%1),%%xmm0 \n"
      "lea 0x10(%1),%1 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"
      "punpckhbw %%xmm1,%%xmm1 \n"
      "movdqu %%xmm0,(%0) \n"
      "movdqu %%xmm1,0x10(%0) \n"
      "lea 0x20(%0),%0 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      : "+r"(dst_ptr),   // %0
        "+r"(src_ptr),   // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1");
}

void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
                            ptrdiff_t src_stride,
                            uint8_t* dst_argb,
                            int dst_width) {
  (void)src_stride;
  asm volatile(
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "shufps $0xdd,%%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1");
}

void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile(
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"
      "shufps $0xdd,%%xmm1,%%xmm2 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1");
}

void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               uint8_t* dst_argb,
                               int dst_width) {
  asm volatile(
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x00(%0,%3,1),%%xmm2 \n"
      "movdqu 0x10(%0,%3,1),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "pavgb %%xmm3,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"
      "shufps $0xdd,%%xmm1,%%xmm2 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),              // %0
        "+r"(dst_argb),              // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
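
// In the ARGB down2 kernels above, shufps $0xdd gathers the odd-indexed ARGB
// pixels from a register pair (the point-sample case), while the Linear and
// Box variants also gather the even pixels with $0x88 and pavgb the two sets,
// averaging horizontally adjacent pixels per channel; the Box variant
// additionally averages the two source rows with pavgb before that step.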

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8_t* dst_argb,
                               int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  (void)src_stride;
  asm volatile(
      "lea 0x00(,%1,4),%1 \n"
      "lea 0x00(%1,%1,2),%4 \n"
      LABELALIGN
      "1: \n"
      "movd (%0),%%xmm0 \n"
      "movd 0x00(%0,%1,1),%%xmm1 \n"
      "punpckldq %%xmm1,%%xmm0 \n"
      "movd 0x00(%0,%1,2),%%xmm2 \n"
      "movd 0x00(%0,%4,1),%%xmm3 \n"
      "lea 0x00(%0,%1,4),%0 \n"
      "punpckldq %%xmm3,%%xmm2 \n"
      "punpcklqdq %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb),       // %0
        "+r"(src_stepx_x4),   // %1
        "+r"(dst_argb),       // %2
        "+r"(dst_width),      // %3
        "=&r"(src_stepx_x12)  // %4
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile(
      "lea 0x00(,%1,4),%1 \n"
      "lea 0x00(%1,%1,2),%4 \n"
      "lea 0x00(%0,%5,1),%5 \n"
      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"
      "movhps 0x00(%0,%1,1),%%xmm0 \n"
      "movq 0x00(%0,%1,2),%%xmm1 \n"
      "movhps 0x00(%0,%4,1),%%xmm1 \n"
      "lea 0x00(%0,%1,4),%0 \n"
      "movq (%5),%%xmm2 \n"
      "movhps 0x00(%5,%1,1),%%xmm2 \n"
      "movq 0x00(%5,%1,2),%%xmm3 \n"
      "movhps 0x00(%5,%4,1),%%xmm3 \n"
      "lea 0x00(%5,%1,4),%5 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "pavgb %%xmm3,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "shufps $0x88,%%xmm1,%%xmm0 \n"
      "shufps $0xdd,%%xmm1,%%xmm2 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jg 1b \n"
      : "+r"(src_argb),        // %0
        "+r"(src_stepx_x4),    // %1
        "+r"(dst_argb),        // %2
        "+rm"(dst_width),      // %3
        "=&r"(src_stepx_x12),  // %4
        "+r"(row1)             // %5
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}

void ScaleARGBCols_SSE2(uint8_t* dst_argb,
                        const uint8_t* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  intptr_t x0, x1;
  asm volatile(
      "movd %5,%%xmm2 \n"
      "movd %6,%%xmm3 \n"
      "pshufd $0x0,%%xmm2,%%xmm2 \n"
      "pshufd $0x11,%%xmm3,%%xmm0 \n"
      "paddd %%xmm0,%%xmm2 \n"
      "paddd %%xmm3,%%xmm3 \n"
      "pshufd $0x5,%%xmm3,%%xmm0 \n"
      "paddd %%xmm0,%%xmm2 \n"
      "paddd %%xmm3,%%xmm3 \n"
      "pshufd $0x0,%%xmm3,%%xmm3 \n"
      "pextrw $0x1,%%xmm2,%k0 \n"
      "pextrw $0x3,%%xmm2,%k1 \n"
      "cmp $0x0,%4 \n"
      "jl 99f \n"
      "sub $0x4,%4 \n"
      "jl 49f \n"
      LABELALIGN
      "40: \n"
      "movd 0x00(%3,%0,4),%%xmm0 \n"
      "movd 0x00(%3,%1,4),%%xmm1 \n"
      "pextrw $0x5,%%xmm2,%k0 \n"
      "pextrw $0x7,%%xmm2,%k1 \n"
      "paddd %%xmm3,%%xmm2 \n"
      "punpckldq %%xmm1,%%xmm0 \n"
      "movd 0x00(%3,%0,4),%%xmm1 \n"
      "movd 0x00(%3,%1,4),%%xmm4 \n"
      "pextrw $0x1,%%xmm2,%k0 \n"
      "pextrw $0x3,%%xmm2,%k1 \n"
      "punpckldq %%xmm4,%%xmm1 \n"
      "punpcklqdq %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%4 \n"
      "jge 40b \n"
      "49: \n"
      "test $0x2,%4 \n"
      "je 29f \n"
      "movd 0x00(%3,%0,4),%%xmm0 \n"
      "movd 0x00(%3,%1,4),%%xmm1 \n"
      "pextrw $0x5,%%xmm2,%k0 \n"
      "punpckldq %%xmm1,%%xmm0 \n"
      "movq %%xmm0,(%2) \n"
      "lea 0x8(%2),%2 \n"
      "29: \n"
      "test $0x1,%4 \n"
      "je 99f \n"
      "movd 0x00(%3,%0,4),%%xmm0 \n"
      "movd %%xmm0,(%2) \n"
      "99: \n"
      : "=&a"(x0),       // %0
        "=&d"(x1),       // %1
        "+r"(dst_argb),  // %2
        "+r"(src_argb),  // %3
        "+r"(dst_width)  // %4
      : "rm"(x),   // %5
        "rm"(dx)   // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
                           const uint8_t* src_argb,
                           int dst_width,
                           int x,
                           int dx) {
  (void)x;
  (void)dx;
  asm volatile(
      LABELALIGN
      "1: \n"
      "movdqu (%1),%%xmm0 \n"
      "lea 0x10(%1),%1 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpckldq %%xmm0,%%xmm0 \n"
      "punpckhdq %%xmm1,%%xmm1 \n"
      "movdqu %%xmm0,(%0) \n"
      "movdqu %%xmm1,0x10(%0) \n"
      "lea 0x20(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(dst_argb),  // %0
        "+r"(src_argb),  // %1
        "+r"(dst_width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1");
}

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static const uvec8 kShuffleColARGB = {
    0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,      // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static const uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
                               const uint8_t* src_argb,
                               int dst_width,
                               int x,
                               int dx) {
  intptr_t x0, x1;
  asm volatile(
      "movdqa %0,%%xmm4 \n"
      "movdqa %1,%%xmm5 \n"
      :
      : "m"(kShuffleColARGB),   // %0
        "m"(kShuffleFractions)  // %1
  );
  asm volatile(
      "movd %5,%%xmm2 \n"
      "movd %6,%%xmm3 \n"
      "pcmpeqb %%xmm6,%%xmm6 \n"
      "psrlw $0x9,%%xmm6 \n"
      "pextrw $0x1,%%xmm2,%k3 \n"
      "sub $0x2,%2 \n"
      "jl 29f \n"
      "movdqa %%xmm2,%%xmm0 \n"
      "paddd %%xmm3,%%xmm0 \n"
      "punpckldq %%xmm0,%%xmm2 \n"
      "punpckldq %%xmm3,%%xmm3 \n"
      "paddd %%xmm3,%%xmm3 \n"
      "pextrw $0x3,%%xmm2,%k4 \n"
      LABELALIGN
      "2: \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "paddd %%xmm3,%%xmm2 \n"
      "movq 0x00(%1,%3,4),%%xmm0 \n"
      "psrlw $0x9,%%xmm1 \n"
      "movhps 0x00(%1,%4,4),%%xmm0 \n"
      "pshufb %%xmm5,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "pxor %%xmm6,%%xmm1 \n"
      "pmaddubsw %%xmm1,%%xmm0 \n"
      "psrlw $0x7,%%xmm0 \n"
      "pextrw $0x1,%%xmm2,%k3 \n"
      "pextrw $0x3,%%xmm2,%k4 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%0) \n"
      "lea 0x8(%0),%0 \n"
      "sub $0x2,%2 \n"
      "jge 2b \n"
      LABELALIGN
      "29: \n"
      "add $0x1,%2 \n"
      "jl 99f \n"
      "psrlw $0x9,%%xmm2 \n"
      "movq 0x00(%1,%3,4),%%xmm0 \n"
      "pshufb %%xmm5,%%xmm2 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "pxor %%xmm6,%%xmm2 \n"
      "pmaddubsw %%xmm2,%%xmm0 \n"
      "psrlw $0x7,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movd %%xmm0,(%0) \n"
      LABELALIGN "99: \n"  // clang-format error.
      : "+r"(dst_argb),   // %0
        "+r"(src_argb),   // %1
        "+rm"(dst_width), // %2
        "=&r"(x0),        // %3
        "=&r"(x1)         // %4
      : "rm"(x),   // %5
        "rm"(dx)   // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
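
// ScaleARGBFilterCols blends whole 4-byte pixels at once: kShuffleColARGB
// interleaves the two source pixels channel by channel and kShuffleFractions
// broadcasts the 7-bit fraction, so a single pmaddubsw computes roughly
// (a * (127 - f) + b * f) >> 7 for all four channels of each output pixel.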

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile(
      "cdq \n"
      "shld $0x10,%%eax,%%edx \n"
      "shl $0x10,%%eax \n"
      "idiv %1 \n"
      "mov %0, %%eax \n"
      : "+a"(num)  // %0
      : "c"(div)   // %1
      : "memory", "cc", "edx");
  return num;
}

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile(
      "cdq \n"
      "shld $0x10,%%eax,%%edx \n"
      "shl $0x10,%%eax \n"
      "sub $0x10001,%%eax \n"
      "sbb $0x0,%%edx \n"
      "sub $0x1,%1 \n"
      "idiv %1 \n"
      "mov %0, %%eax \n"
      : "+a"(num)  // %0
      : "c"(div)   // %1
      : "memory", "cc", "edx");
  return num;
}
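
// Portable equivalents of the two helpers above (an illustrative sketch, not
// necessarily the exact C fallback libyuv builds elsewhere; needs <stdint.h>):
//
//   int FixedDiv_C(int num, int div) {
//     return (int)(((int64_t)num << 16) / div);
//   }
//   int FixedDiv1_C(int num, int div) {
//     return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
//   }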

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif