row_win.cc 202 KB


  1. /*
  2. * Copyright 2011 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "libyuv/row.h"
  11. // This module is for Visual C 32/64 bit and clangcl 32 bit
  12. #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
  13. (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
  14. #if defined(_M_X64)
  15. #include <emmintrin.h>
  16. #include <tmmintrin.h> // For _mm_maddubs_epi16
  17. #endif
  18. #ifdef __cplusplus
  19. namespace libyuv {
  20. extern "C" {
  21. #endif
  22. // 64 bit
  23. #if defined(_M_X64)
  24. // Read 4 UV from 422, upsample to 8 UV.
  // Statement macro; it expects these names in the expanding scope:
  //   __m128i xmm0, xmm1, xmm4; byte pointers y_buf and u_buf; and `offset`,
  //   which is added to u_buf to reach the second chroma plane (the callers in
  //   this file set it to v_buf - u_buf).
  // On exit:
  //   xmm0 = the 4 loaded (first plane, second plane) byte pairs, each pair
  //          repeated twice, i.e. 8 pairs in 16 bytes.
  //   xmm4 = 8 Y bytes, each duplicated into both bytes of a 16 bit lane.
  //   u_buf has advanced by 4 and y_buf by 8.
  // Notes are kept above the #define: a // comment placed before a
  // line-continuation backslash would swallow the next macro line.
  25. #define READYUV422 \
  26. xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
  27. xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
  28. xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
  29. xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
  30. u_buf += 4; \
  31. xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
  32. xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
  33. y_buf += 8;
  34. // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
  // Same loads and outputs as READYUV422 (xmm0 = 8 chroma byte pairs,
  // xmm4 = 8 doubled Y bytes, u_buf += 4, y_buf += 8), and additionally:
  //   xmm5 = 8 bytes read from a_buf in its low half; a_buf advances by 8.
  // Expects xmm0, xmm1, xmm4, xmm5, y_buf, u_buf, a_buf and `offset` to be
  // declared in the expanding scope; xmm5 must be writable here.
  35. #define READYUVA422 \
  36. xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
  37. xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
  38. xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
  39. xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
  40. u_buf += 4; \
  41. xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
  42. xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
  43. y_buf += 8; \
  44. xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
  45. a_buf += 8;
  46. // Convert 8 pixels: 8 UV and 8 Y.
  // Inputs (as left by READYUV422 / READYUVA422): xmm0 = 8 chroma byte pairs,
  // xmm4 = 8 doubled Y bytes. `yuvconstants` must point at a structure whose
  // kUVToB/G/R, kUVBiasB/G/R and kYToRgb members can each be read as one
  // 16 byte vector.
  // Steps, done once per colour channel:
  //   1. multiply the chroma bytes by the signed per-channel coefficients and
  //      add each adjacent pair (_mm_maddubs_epi16) -> 8 x 16 bit sums;
  //   2. subtract that sum from the channel bias;
  //   3. add the scaled luma (high 16 bits of Y * kYToRgb) with signed
  //      saturation;
  //   4. arithmetic shift right by 6 and pack to unsigned bytes with
  //      saturation.
  // Outputs: the low 8 bytes of xmm0, xmm1 and xmm2 hold the results computed
  // with the B, G and R constant sets respectively. xmm4 is overwritten.
  // The _mm_loadu_si128(&xmm0) calls are simply register copies of xmm0.
  47. #define YUVTORGB(yuvconstants) \
  48. xmm1 = _mm_loadu_si128(&xmm0); \
  49. xmm2 = _mm_loadu_si128(&xmm0); \
  50. xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
  51. xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
  52. xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
  53. xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
  54. xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
  55. xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
  56. xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
  57. xmm0 = _mm_adds_epi16(xmm0, xmm4); \
  58. xmm1 = _mm_adds_epi16(xmm1, xmm4); \
  59. xmm2 = _mm_adds_epi16(xmm2, xmm4); \
  60. xmm0 = _mm_srai_epi16(xmm0, 6); \
  61. xmm1 = _mm_srai_epi16(xmm1, 6); \
  62. xmm2 = _mm_srai_epi16(xmm2, 6); \
  63. xmm0 = _mm_packus_epi16(xmm0, xmm0); \
  64. xmm1 = _mm_packus_epi16(xmm1, xmm1); \
  65. xmm2 = _mm_packus_epi16(xmm2, xmm2);
  66. // Store 8 ARGB values.
  // Inputs: low 8 bytes of xmm0, xmm1, xmm2 (the B, G, R outputs of YUVTORGB)
  // and of xmm5 (alpha). xmm5 is only read.
  // Interleaves them bytewise so each pixel is written as the 4 bytes
  // xmm0[i], xmm1[i], xmm2[i], xmm5[i], then stores 32 bytes with unaligned
  // stores and advances dst_argb by 32. xmm0, xmm1 and xmm2 are overwritten.
  67. #define STOREARGB \
  68. xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
  69. xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
  70. xmm1 = _mm_loadu_si128(&xmm0); \
  71. xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
  72. xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
  73. _mm_storeu_si128((__m128i*)dst_argb, xmm0); \
  74. _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
  75. dst_argb += 32;
  76. #if defined(HAS_I422TOARGBROW_SSSE3)
  77. void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
  78. const uint8_t* u_buf,
  79. const uint8_t* v_buf,
  80. uint8_t* dst_argb,
  81. const struct YuvConstants* yuvconstants,
  82. int width) {
  83. __m128i xmm0, xmm1, xmm2, xmm4;
  84. const __m128i xmm5 = _mm_set1_epi8(-1);
  85. const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
  86. while (width > 0) {
  87. READYUV422
  88. YUVTORGB(yuvconstants)
  89. STOREARGB
  90. width -= 8;
  91. }
  92. }
  93. #endif
  94. #if defined(HAS_I422ALPHATOARGBROW_SSSE3)
  95. void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
  96. const uint8_t* u_buf,
  97. const uint8_t* v_buf,
  98. const uint8_t* a_buf,
  99. uint8_t* dst_argb,
  100. const struct YuvConstants* yuvconstants,
  101. int width) {
  102. __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
  103. const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
  104. while (width > 0) {
  105. READYUVA422
  106. YUVTORGB(yuvconstants)
  107. STOREARGB
  108. width -= 8;
  109. }
  110. }
  111. #endif
  112. // 32 bit
  113. #else // defined(_M_X64)
  114. #ifdef HAS_ARGBTOYROW_SSSE3
  115. // Constants for ARGB.
  // Each table below repeats one 4 byte pattern 4 times (16 signed bytes), so
  // one 16 byte vector covers 4 pixels; the 4th entry of every pattern is 0.
  // NOTE(review): the entries are presumably per-byte weights in the memory
  // order of an ARGB pixel; the row functions that consume these tables are
  // not in this part of the file, so confirm there.
  116. static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
  117. 13, 65, 33, 0, 13, 65, 33, 0};
  118. // JPEG full range.
  119. static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
  120. 15, 75, 38, 0, 15, 75, 38, 0};
  121. static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
  122. 112, -74, -38, 0, 112, -74, -38, 0};
  123. static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
  124. 127, -84, -43, 0, 127, -84, -43, 0};
  125. static const vec8 kARGBToV = {
  126. -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
  127. };
  128. static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
  129. -20, -107, 127, 0, -20, -107, 127, 0};
  130. // vpshufb for vphaddw + vpackuswb packed to shorts.
  // 32 byte table: the same 16 byte index pattern appears twice. Within each
  // 16 bytes it interleaves the 16 bit words of the low 8 bytes with those of
  // the high 8 bytes (word 0, word 4, word 1, word 5, ...).
  131. static const lvec8 kShufARGBToUV_AVX = {
  132. 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  133. 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
  134. // Constants for BGRA.
  // The BGRA, ABGR and RGBA tables below hold the same three non-zero weights
  // per Y/U/V table as the ARGB tables (13/65/33, 112/-74/-38, -18/-94/112),
  // placed at different byte positions within each 4 byte pattern; the
  // remaining byte of each pattern is 0.
  // NOTE(review): the byte positions presumably follow each format's channel
  // order in memory; the consumers are not visible here.
  135. static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
  136. 0, 33, 65, 13, 0, 33, 65, 13};
  137. static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
  138. 0, -38, -74, 112, 0, -38, -74, 112};
  139. static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
  140. 0, 112, -94, -18, 0, 112, -94, -18};
  141. // Constants for ABGR.
  142. static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
  143. 33, 65, 13, 0, 33, 65, 13, 0};
  144. static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
  145. -38, -74, 112, 0, -38, -74, 112, 0};
  146. static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
  147. 112, -94, -18, 0, 112, -94, -18, 0};
  148. // Constants for RGBA.
  149. static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
  150. 0, 13, 65, 33, 0, 13, 65, 33};
  151. static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
  152. 0, 112, -74, -38, 0, 112, -74, -38};
  153. static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
  154. 0, -18, -94, 112, 0, -18, -94, 112};
  // Additive constants, one value broadcast across a whole vector.
  // NOTE(review): presumably the offsets applied after the weighted sums
  // (16 for Y, 128 for U/V); their users are not in this part of the file.
  155. static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
  156. 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
  157. // 7 bit fixed point 0.5.
  // 64 == 0.5 * (1 << 7); eight 16 bit lanes.
  158. static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
  159. static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  160. 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
  // 0x8080 in each of eight 16 bit lanes.
  161. static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
  162. 0x8080u, 0x8080u, 0x8080u, 0x8080u};
  // pshufb control tables. Entry i names the source byte copied to output
  // byte i; an entry with the high bit set (128u) makes pshufb write 0.
  163. // Shuffle table for converting RGB24 to ARGB.
  // Expands 12 source bytes (4 pixels of 3 bytes) to 16. The 4th byte of each
  // output pixel picks up source bytes 12..15 as filler; RGB24ToARGBRow_SSSE3
  // ORs 0xff000000 over each pixel afterwards.
  164. static const uvec8 kShuffleMaskRGB24ToARGB = {
  165. 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
  166. // Shuffle table for converting RAW to ARGB.
  // Same expansion, with the 3 colour bytes of each pixel reversed.
  167. static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
  168. 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
  169. // Shuffle table for converting RAW to RGB24. First 8.
  // The three RAWToRGB24 tables each produce 8 output bytes (upper 8 bytes
  // zeroed). RAWToRGB24Row_SSSE3 applies them to loads at source offsets 0, 4
  // and 8, so the indices are relative to those offsets.
  170. static const uvec8 kShuffleMaskRAWToRGB24_0 = {
  171. 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
  172. 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
  173. // Shuffle table for converting RAW to RGB24. Middle 8.
  174. static const uvec8 kShuffleMaskRAWToRGB24_1 = {
  175. 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
  176. 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
  177. // Shuffle table for converting RAW to RGB24. Last 8.
  178. static const uvec8 kShuffleMaskRAWToRGB24_2 = {
  179. 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
  180. 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
  181. // Shuffle table for converting ARGB to RGB24.
  // Drops every 4th byte: 16 source bytes -> 12 bytes, top 4 bytes zeroed.
  182. static const uvec8 kShuffleMaskARGBToRGB24 = {
  183. 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
  184. // Shuffle table for converting ARGB to RAW.
  // As above, with the 3 kept bytes of each pixel reversed.
  185. static const uvec8 kShuffleMaskARGBToRAW = {
  186. 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
  187. // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
  // Places the first 8 packed bytes in output bytes 0..7, zeroes bytes 8..11
  // and puts the remaining 4 packed bytes in output bytes 12..15.
  188. static const uvec8 kShuffleMaskARGBToRGB24_0 = {
  189. 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
  // The tables below are 32 bytes each: one 16 byte index pattern written
  // twice.
  190. // YUY2 shuf 16 Y to 32 Y.
  // Selects the even source bytes and writes each one twice.
  191. static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
  192. 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
  193. 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
  194. // YUY2 shuf 8 UV to 16 UV.
  // Selects the odd source bytes in pairs (1,3), (5,7), ... and writes each
  // pair twice.
  195. static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
  196. 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
  197. 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
  198. // UYVY shuf 16 Y to 32 Y.
  // Selects the odd source bytes and writes each one twice.
  199. static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
  200. 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
  201. 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
  202. // UYVY shuf 8 UV to 16 UV.
  // Selects the even source bytes in pairs (0,2), (4,6), ... and writes each
  // pair twice.
  203. static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
  204. 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
  205. 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
  206. // NV21 shuf 8 VU to 16 UV.
  // Swaps the two bytes of each source pair and writes the swapped pair twice.
  207. static const lvec8 kShuffleNV21 = {
  208. 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
  209. 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
  210. };
  211. // Duplicates gray value 3 times and fills in alpha opaque.
  // Each source byte Y becomes the 4 output bytes Y, Y, Y, 0xff.
  // One loop pass reads 8 bytes from src_y, writes 32 bytes to dst_argb and
  // subtracts 8 from width. The test follows the body, so one pass always
  // runs and a width that is not a multiple of 8 is rounded up.
  // Naked function: there is no compiler prologue, so the arguments are read
  // directly from the stack relative to esp.
  212. __declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y,
  213. uint8_t* dst_argb,
  214. int width) {
  215. __asm {
  216. mov eax, [esp + 4] // src_y
  217. mov edx, [esp + 8] // dst_argb
  218. mov ecx, [esp + 12] // width
  219. pcmpeqb xmm5, xmm5 // generate mask 0xff000000
  220. pslld xmm5, 24
  221. convertloop:
  222. movq xmm0, qword ptr [eax] // 8 gray bytes
  223. lea eax, [eax + 8]
  224. punpcklbw xmm0, xmm0 // double every byte: 8 words of YY
  225. movdqa xmm1, xmm0 // copy for pixels 4..7
  226. punpcklwd xmm0, xmm0 // YYYY for pixels 0..3
  227. punpckhwd xmm1, xmm1 // YYYY for pixels 4..7
  228. por xmm0, xmm5 // set the top byte of each pixel to 0xff
  229. por xmm1, xmm5
  230. movdqu [edx], xmm0
  231. movdqu [edx + 16], xmm1
  232. lea edx, [edx + 32]
  233. sub ecx, 8
  234. jg convertloop
  235. ret
  236. }
  237. }
  238. #ifdef HAS_J400TOARGBROW_AVX2
  239. // Duplicates gray value 3 times and fills in alpha opaque.
  // AVX2 variant: one loop pass reads 16 bytes from src_y, writes 64 bytes to
  // dst_argb and subtracts 16 from width. The test follows the body, so one
  // pass always runs and width is effectively rounded up to a multiple of 16.
  // The unpack instructions work within each 128 bit lane; each vpermq 0xd8
  // (qword order 0, 2, 1, 3) pre-arranges the data so that the following
  // in-lane unpack produces pixels in source order.
  240. __declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y,
  241. uint8_t* dst_argb,
  242. int width) {
  243. __asm {
  244. mov eax, [esp + 4] // src_y
  245. mov edx, [esp + 8] // dst_argb
  246. mov ecx, [esp + 12] // width
  247. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
  248. vpslld ymm5, ymm5, 24
  249. convertloop:
  250. vmovdqu xmm0, [eax] // 16 gray bytes; upper half of ymm0 is zeroed
  251. lea eax, [eax + 16]
  252. vpermq ymm0, ymm0, 0xd8 // bytes 0..7 to lane 0, bytes 8..15 to lane 1
  253. vpunpcklbw ymm0, ymm0, ymm0 // double every byte
  254. vpermq ymm0, ymm0, 0xd8
  255. vpunpckhwd ymm1, ymm0, ymm0 // YYYY for pixels 8..15
  256. vpunpcklwd ymm0, ymm0, ymm0 // YYYY for pixels 0..7
  257. vpor ymm0, ymm0, ymm5 // set the top byte of each pixel to 0xff
  258. vpor ymm1, ymm1, ymm5
  259. vmovdqu [edx], ymm0
  260. vmovdqu [edx + 32], ymm1
  261. lea edx, [edx + 64]
  262. sub ecx, 16
  263. jg convertloop
  264. vzeroupper
  265. ret
  266. }
  267. }
  268. #endif // HAS_J400TOARGBROW_AVX2
  // Expands 3 byte pixels to 4 byte pixels: the 3 source bytes are copied in
  // order and a 4th byte of 0xff is appended.
  // One loop pass reads 48 bytes (16 pixels) from src_rgb24, writes 64 bytes
  // to dst_argb and subtracts 16 from width. The test follows the body, so one
  // pass always runs and width is effectively rounded up to a multiple of 16.
  // palignr is used to line up each 12 byte group (4 pixels) at byte 0 of a
  // register before pshufb spreads it out; only bytes 0..11 of each aligned
  // register matter, because the shuffle's filler bytes are overwritten by
  // the alpha OR.
  // NOTE(review): movdqa on the shuffle table requires it to be 16 byte
  // aligned; presumably the uvec8 type guarantees that - confirm.
  269. __declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
  270. uint8_t* dst_argb,
  271. int width) {
  272. __asm {
  273. mov eax, [esp + 4] // src_rgb24
  274. mov edx, [esp + 8] // dst_argb
  275. mov ecx, [esp + 12] // width
  276. pcmpeqb xmm5, xmm5 // generate mask 0xff000000
  277. pslld xmm5, 24
  278. movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
  279. convertloop:
  280. movdqu xmm0, [eax]
  281. movdqu xmm1, [eax + 16]
  282. movdqu xmm3, [eax + 32]
  283. lea eax, [eax + 48]
  284. movdqa xmm2, xmm3
  285. palignr xmm2, xmm1, 8 // xmm2 = { xmm1[8:15] xmm3[0:7] }, pixels 8..11 first
  286. pshufb xmm2, xmm4
  287. por xmm2, xmm5
  288. palignr xmm1, xmm0, 12 // xmm1 = { xmm0[12:15] xmm1[0:11] }, pixels 4..7 first
  289. pshufb xmm0, xmm4 // pixels 0..3
  290. movdqu [edx + 32], xmm2
  291. por xmm0, xmm5
  292. pshufb xmm1, xmm4
  293. movdqu [edx], xmm0
  294. por xmm1, xmm5
  295. palignr xmm3, xmm3, 4 // rotate: xmm3 = { xmm3[4:15] xmm3[0:3] }, pixels 12..15 first
  296. pshufb xmm3, xmm4
  297. movdqu [edx + 16], xmm1
  298. por xmm3, xmm5
  299. movdqu [edx + 48], xmm3
  300. lea edx, [edx + 64]
  301. sub ecx, 16
  302. jg convertloop
  303. ret
  304. }
  305. }
  // Expands 3 byte pixels to 4 byte pixels: the 3 source bytes of each pixel
  // are written in reverse order (per kShuffleMaskRAWToARGB) and a 4th byte of
  // 0xff is appended.
  // One loop pass reads 48 bytes (16 pixels) from src_raw, writes 64 bytes to
  // dst_argb and subtracts 16 from width. The test follows the body, so one
  // pass always runs and width is effectively rounded up to a multiple of 16.
  // Instruction sequence is identical to RGB24ToARGBRow_SSSE3 apart from the
  // shuffle table.
  306. __declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw,
  307. uint8_t* dst_argb,
  308. int width) {
  309. __asm {
  310. mov eax, [esp + 4] // src_raw
  311. mov edx, [esp + 8] // dst_argb
  312. mov ecx, [esp + 12] // width
  313. pcmpeqb xmm5, xmm5 // generate mask 0xff000000
  314. pslld xmm5, 24
  315. movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB
  316. convertloop:
  317. movdqu xmm0, [eax]
  318. movdqu xmm1, [eax + 16]
  319. movdqu xmm3, [eax + 32]
  320. lea eax, [eax + 48]
  321. movdqa xmm2, xmm3
  322. palignr xmm2, xmm1, 8 // xmm2 = { xmm1[8:15] xmm3[0:7] }, pixels 8..11 first
  323. pshufb xmm2, xmm4
  324. por xmm2, xmm5
  325. palignr xmm1, xmm0, 12 // xmm1 = { xmm0[12:15] xmm1[0:11] }, pixels 4..7 first
  326. pshufb xmm0, xmm4 // pixels 0..3
  327. movdqu [edx + 32], xmm2
  328. por xmm0, xmm5
  329. pshufb xmm1, xmm4
  330. movdqu [edx], xmm0
  331. por xmm1, xmm5
  332. palignr xmm3, xmm3, 4 // rotate: xmm3 = { xmm3[4:15] xmm3[0:3] }, pixels 12..15 first
  333. pshufb xmm3, xmm4
  334. movdqu [edx + 16], xmm1
  335. por xmm3, xmm5
  336. movdqu [edx + 48], xmm3
  337. lea edx, [edx + 64]
  338. sub ecx, 16
  339. jg convertloop
  340. ret
  341. }
  342. }
  // Reverses the 3 bytes of every pixel (3 bytes in, 3 bytes out).
  // One loop pass handles 8 pixels: it reads source bytes 0..23 through three
  // overlapping 16 byte loads (offsets 0, 4 and 8), turns each load into 8
  // output bytes with its own shuffle table, and writes 24 bytes to dst_rgb24.
  // width is reduced by 8 per pass; the test follows the body, so one pass
  // always runs and width is effectively rounded up to a multiple of 8.
  343. __declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
  344. uint8_t* dst_rgb24,
  345. int width) {
  346. __asm {
  347. mov eax, [esp + 4] // src_raw
  348. mov edx, [esp + 8] // dst_rgb24
  349. mov ecx, [esp + 12] // width
  350. movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
  351. movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
  352. movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
  353. convertloop:
  354. movdqu xmm0, [eax] // source bytes 0..15
  355. movdqu xmm1, [eax + 4] // source bytes 4..19
  356. movdqu xmm2, [eax + 8] // source bytes 8..23
  357. lea eax, [eax + 24]
  358. pshufb xmm0, xmm3 // output bytes 0..7
  359. pshufb xmm1, xmm4 // output bytes 8..15
  360. pshufb xmm2, xmm5 // output bytes 16..23
  361. movq qword ptr [edx], xmm0
  362. movq qword ptr [edx + 8], xmm1
  363. movq qword ptr [edx + 16], xmm2
  364. lea edx, [edx + 24]
  365. sub ecx, 8
  366. jg convertloop
  367. ret
  368. }
  369. }
  370. // pmul method to replicate bits.
  371. // Math to replicate bits:
  372. // (v << 8) | (v << 3)
  373. // v * 256 + v * 8
  374. // v * (256 + 8)
  375. // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
  376. // 20 instructions.
  // Expands 16 bit pixels (5 bits in 15..11, 6 bits in 10..5, 5 bits in 4..0)
  // to 4 byte pixels with the top byte set to 0xff. Each field is widened to
  // 8 bits by repeating its high bits in the low bits, done with pmulhuw.
  // One loop pass reads 16 bytes (8 pixels), writes 32 bytes and subtracts 8
  // from width; the test follows the body, so one pass always runs.
  // Addressing trick: edx is set to dst_argb - 2 * src, so [eax * 2 + edx]
  // tracks the destination while only eax is advanced.
  377. __declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
  378. uint8_t* dst_argb,
  379. int width) {
  380. __asm {
  381. mov eax, 0x01080108 // generate multiplier to repeat 5 bits
  382. movd xmm5, eax
  383. pshufd xmm5, xmm5, 0
  384. mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
  385. movd xmm6, eax
  386. pshufd xmm6, xmm6, 0
  387. pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
  388. psllw xmm3, 11
  389. pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
  390. psllw xmm4, 10
  391. psrlw xmm4, 5
  392. pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
  393. psllw xmm7, 8
  394. mov eax, [esp + 4] // src_rgb565
  395. mov edx, [esp + 8] // dst_argb
  396. mov ecx, [esp + 12] // width
  397. sub edx, eax // edx = dst_argb - 2 * src_rgb565
  398. sub edx, eax
  399. convertloop:
  400. movdqu xmm0, [eax] // fetch 8 pixels of bgr565
  401. movdqa xmm1, xmm0
  402. movdqa xmm2, xmm0
  403. pand xmm1, xmm3 // R in upper 5 bits
  404. psllw xmm2, 11 // B in upper 5 bits
  405. pmulhuw xmm1, xmm5 // * (256 + 8)
  406. pmulhuw xmm2, xmm5 // * (256 + 8)
  407. psllw xmm1, 8
  408. por xmm1, xmm2 // RB
  409. pand xmm0, xmm4 // G in middle 6 bits
  410. pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
  411. por xmm0, xmm7 // AG
  412. movdqa xmm2, xmm1
  413. punpcklbw xmm1, xmm0 // interleave to B, G, R, A bytes
  414. punpckhbw xmm2, xmm0
  415. movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
  416. movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
  417. lea eax, [eax + 16]
  418. sub ecx, 8
  419. jg convertloop
  420. ret
  421. }
  422. }
  423. #ifdef HAS_RGB565TOARGBROW_AVX2
  424. // pmul method to replicate bits.
  425. // Math to replicate bits:
  426. // (v << 8) | (v << 3)
  427. // v * 256 + v * 8
  428. // v * (256 + 8)
  429. // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
  // AVX2 variant of RGB565ToARGBRow_SSE2: one loop pass reads 32 bytes
  // (16 pixels), writes 64 bytes and subtracts 16 from width; the test follows
  // the body, so one pass always runs.
  // edx is set to dst_argb - 2 * src, so [eax * 2 + edx] tracks the
  // destination while only eax is advanced.
  // vpermq 0xd8 (qword order 0, 2, 1, 3) is applied before the in-lane byte
  // unpacks so that the two stores come out in pixel order.
  430. __declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
  431. uint8_t* dst_argb,
  432. int width) {
  433. __asm {
  434. mov eax, 0x01080108 // generate multiplier to repeat 5 bits
  435. vmovd xmm5, eax
  436. vbroadcastss ymm5, xmm5
  437. mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
  438. vmovd xmm6, eax
  439. vbroadcastss ymm6, xmm6
  440. vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
  441. vpsllw ymm3, ymm3, 11
  442. vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
  443. vpsllw ymm4, ymm4, 10
  444. vpsrlw ymm4, ymm4, 5
  445. vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
  446. vpsllw ymm7, ymm7, 8
  447. mov eax, [esp + 4] // src_rgb565
  448. mov edx, [esp + 8] // dst_argb
  449. mov ecx, [esp + 12] // width
  450. sub edx, eax // edx = dst_argb - 2 * src_rgb565
  451. sub edx, eax
  452. convertloop:
  453. vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
  454. vpand ymm1, ymm0, ymm3 // R in upper 5 bits
  455. vpsllw ymm2, ymm0, 11 // B in upper 5 bits
  456. vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
  457. vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
  458. vpsllw ymm1, ymm1, 8
  459. vpor ymm1, ymm1, ymm2 // RB
  460. vpand ymm0, ymm0, ymm4 // G in middle 6 bits
  461. vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
  462. vpor ymm0, ymm0, ymm7 // AG
  463. vpermq ymm0, ymm0, 0xd8 // mutate for unpack
  464. vpermq ymm1, ymm1, 0xd8
  465. vpunpckhbw ymm2, ymm1, ymm0
  466. vpunpcklbw ymm1, ymm1, ymm0
  467. vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
  468. vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
  469. lea eax, [eax + 32]
  470. sub ecx, 16
  471. jg convertloop
  472. vzeroupper
  473. ret
  474. }
  475. }
  476. #endif // HAS_RGB565TOARGBROW_AVX2
  477. #ifdef HAS_ARGB1555TOARGBROW_AVX2
  // Expands 16 bit pixels (1 bit in 15, then three 5 bit fields in 14..10,
  // 9..5 and 4..0) to 4 byte pixels. Each 5 bit field is widened to 8 bits
  // with pmulhuw; bit 15 is spread over the whole top byte by an arithmetic
  // shift, giving 0x00 or 0xff.
  // One loop pass reads 32 bytes (16 pixels), writes 64 bytes and subtracts 16
  // from width; the test follows the body, so one pass always runs.
  // edx is set to dst_argb - 2 * src, so [eax * 2 + edx] tracks the
  // destination while only eax is advanced.
  478. __declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
  479. uint8_t* dst_argb,
  480. int width) {
  481. __asm {
  482. mov eax, 0x01080108 // generate multiplier to repeat 5 bits
  483. vmovd xmm5, eax
  484. vbroadcastss ymm5, xmm5
  485. mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
  486. vmovd xmm6, eax
  487. vbroadcastss ymm6, xmm6
  488. vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
  489. vpsllw ymm3, ymm3, 11
  490. vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
  491. vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
  492. vpsllw ymm7, ymm7, 8
  493. mov eax, [esp + 4] // src_argb1555
  494. mov edx, [esp + 8] // dst_argb
  495. mov ecx, [esp + 12] // width
  496. sub edx, eax // edx = dst_argb - 2 * src_argb1555
  497. sub edx, eax
  498. convertloop:
  499. vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
  500. vpsllw ymm1, ymm0, 1 // R in upper 5 bits
  501. vpsllw ymm2, ymm0, 11 // B in upper 5 bits
  502. vpand ymm1, ymm1, ymm3
  503. vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
  504. vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
  505. vpsllw ymm1, ymm1, 8
  506. vpor ymm1, ymm1, ymm2 // RB
  507. vpsraw ymm2, ymm0, 8 // A: bit 15 replicated through the upper byte
  508. vpand ymm0, ymm0, ymm4 // G in middle 5 bits
  509. vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
  510. vpand ymm2, ymm2, ymm7
  511. vpor ymm0, ymm0, ymm2 // AG
  512. vpermq ymm0, ymm0, 0xd8 // mutate for unpack
  513. vpermq ymm1, ymm1, 0xd8
  514. vpunpckhbw ymm2, ymm1, ymm0
  515. vpunpcklbw ymm1, ymm1, ymm0
  516. vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
  517. vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
  518. lea eax, [eax + 32]
  519. sub ecx, 16
  520. jg convertloop
  521. vzeroupper
  522. ret
  523. }
  524. }
  525. #endif // HAS_ARGB1555TOARGBROW_AVX2
  526. #ifdef HAS_ARGB4444TOARGBROW_AVX2
  // Expands 16 bit pixels made of four 4 bit fields to 4 byte pixels. Every
  // nibble n becomes the byte (n << 4) | n.
  // One loop pass reads 32 bytes (16 pixels), writes 64 bytes and subtracts 16
  // from width; the test follows the body, so one pass always runs.
  // edx is set to dst_argb - 2 * src, so [eax * 2 + edx] tracks the
  // destination while only eax is advanced.
  527. __declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
  528. uint8_t* dst_argb,
  529. int width) {
  530. __asm {
  531. mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
  532. vmovd xmm4, eax
  533. vbroadcastss ymm4, xmm4
  534. vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
  535. mov eax, [esp + 4] // src_argb4444
  536. mov edx, [esp + 8] // dst_argb
  537. mov ecx, [esp + 12] // width
  538. sub edx, eax // edx = dst_argb - 2 * src_argb4444
  539. sub edx, eax
  540. convertloop:
  541. vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
  542. vpand ymm2, ymm0, ymm5 // mask high nibbles
  543. vpand ymm0, ymm0, ymm4 // mask low nibbles
  544. vpsrlw ymm3, ymm2, 4 // high nibbles moved down
  545. vpsllw ymm1, ymm0, 4 // low nibbles moved up
  546. vpor ymm2, ymm2, ymm3 // high nibbles doubled into full bytes
  547. vpor ymm0, ymm0, ymm1 // low nibbles doubled into full bytes
  548. vpermq ymm0, ymm0, 0xd8 // mutate for unpack
  549. vpermq ymm2, ymm2, 0xd8
  550. vpunpckhbw ymm1, ymm0, ymm2
  551. vpunpcklbw ymm0, ymm0, ymm2
  552. vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
  553. vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
  554. lea eax, [eax + 32]
  555. sub ecx, 16
  556. jg convertloop
  557. vzeroupper
  558. ret
  559. }
  560. }
  561. #endif // HAS_ARGB4444TOARGBROW_AVX2
  562. // 24 instructions
  // Expands 16 bit pixels (1 bit in 15, then three 5 bit fields in 14..10,
  // 9..5 and 4..0) to 4 byte pixels. Each 5 bit field is widened to 8 bits
  // with pmulhuw; bit 15 is spread over the whole top byte by an arithmetic
  // shift, giving 0x00 or 0xff.
  // One loop pass reads 16 bytes (8 pixels), writes 32 bytes and subtracts 8
  // from width; the test follows the body, so one pass always runs.
  // edx is set to dst_argb - 2 * src, so [eax * 2 + edx] tracks the
  // destination while only eax is advanced.
  563. __declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555,
  564. uint8_t* dst_argb,
  565. int width) {
  566. __asm {
  567. mov eax, 0x01080108 // generate multiplier to repeat 5 bits
  568. movd xmm5, eax
  569. pshufd xmm5, xmm5, 0
  570. mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
  571. movd xmm6, eax
  572. pshufd xmm6, xmm6, 0
  573. pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
  574. psllw xmm3, 11
  575. movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
  576. psrlw xmm4, 6
  577. pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
  578. psllw xmm7, 8
  579. mov eax, [esp + 4] // src_argb1555
  580. mov edx, [esp + 8] // dst_argb
  581. mov ecx, [esp + 12] // width
  582. sub edx, eax // edx = dst_argb - 2 * src_argb1555
  583. sub edx, eax
  584. convertloop:
  585. movdqu xmm0, [eax] // fetch 8 pixels of 1555
  586. movdqa xmm1, xmm0
  587. movdqa xmm2, xmm0
  588. psllw xmm1, 1 // R in upper 5 bits
  589. psllw xmm2, 11 // B in upper 5 bits
  590. pand xmm1, xmm3
  591. pmulhuw xmm2, xmm5 // * (256 + 8)
  592. pmulhuw xmm1, xmm5 // * (256 + 8)
  593. psllw xmm1, 8
  594. por xmm1, xmm2 // RB
  595. movdqa xmm2, xmm0
  596. pand xmm0, xmm4 // G in middle 5 bits
  597. psraw xmm2, 8 // A: bit 15 replicated through the upper byte
  598. pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
  599. pand xmm2, xmm7
  600. por xmm0, xmm2 // AG
  601. movdqa xmm2, xmm1
  602. punpcklbw xmm1, xmm0
  603. punpckhbw xmm2, xmm0
  604. movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
  605. movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
  606. lea eax, [eax + 16]
  607. sub ecx, 8
  608. jg convertloop
  609. ret
  610. }
  611. }
  612. // 18 instructions.
  // Expands 16 bit pixels made of four 4 bit fields to 4 byte pixels. Every
  // nibble n becomes the byte (n << 4) | n.
  // One loop pass reads 16 bytes (8 pixels), writes 32 bytes and subtracts 8
  // from width; the test follows the body, so one pass always runs.
  // edx is set to dst_argb - 2 * src, so [eax * 2 + edx] tracks the
  // destination while only eax is advanced.
  613. __declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
  614. uint8_t* dst_argb,
  615. int width) {
  616. __asm {
  617. mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
  618. movd xmm4, eax
  619. pshufd xmm4, xmm4, 0
  620. movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
  621. pslld xmm5, 4
  622. mov eax, [esp + 4] // src_argb4444
  623. mov edx, [esp + 8] // dst_argb
  624. mov ecx, [esp + 12] // width
  625. sub edx, eax // edx = dst_argb - 2 * src_argb4444
  626. sub edx, eax
  627. convertloop:
  628. movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
  629. movdqa xmm2, xmm0
  630. pand xmm0, xmm4 // mask low nibbles
  631. pand xmm2, xmm5 // mask high nibbles
  632. movdqa xmm1, xmm0
  633. movdqa xmm3, xmm2
  634. psllw xmm1, 4 // low nibbles moved up
  635. psrlw xmm3, 4 // high nibbles moved down
  636. por xmm0, xmm1 // low nibbles doubled into full bytes
  637. por xmm2, xmm3 // high nibbles doubled into full bytes
  638. movdqa xmm1, xmm0
  639. punpcklbw xmm0, xmm2
  640. punpckhbw xmm1, xmm2
  641. movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
  642. movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
  643. lea eax, [eax + 16]
  644. sub ecx, 8
  645. jg convertloop
  646. ret
  647. }
  648. }
  // Packs 4 byte pixels to 3 byte pixels by dropping the 4th byte of each
  // pixel (per kShuffleMaskARGBToRGB24).
  // One loop pass reads 64 bytes (16 pixels) from src_argb, writes 48 bytes to
  // dst_rgb and subtracts 16 from width; the test follows the body, so one
  // pass always runs and width is effectively rounded up to a multiple of 16.
  // After pshufb each register holds 12 packed bytes with its top 4 bytes
  // zero; byte shifts and ORs then splice the four 12 byte groups into three
  // full 16 byte stores.
  649. __declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,
  650. uint8_t* dst_rgb,
  651. int width) {
  652. __asm {
  653. mov eax, [esp + 4] // src_argb
  654. mov edx, [esp + 8] // dst_rgb
  655. mov ecx, [esp + 12] // width
  656. movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
  657. convertloop:
  658. movdqu xmm0, [eax] // fetch 16 pixels of argb
  659. movdqu xmm1, [eax + 16]
  660. movdqu xmm2, [eax + 32]
  661. movdqu xmm3, [eax + 48]
  662. lea eax, [eax + 64]
  663. pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
  664. pshufb xmm1, xmm6
  665. pshufb xmm2, xmm6
  666. pshufb xmm3, xmm6
  667. movdqa xmm4, xmm1 // 4 bytes from 1 for 0
  668. psrldq xmm1, 4 // 8 bytes from 1
  669. pslldq xmm4, 12 // 4 bytes from 1 for 0
  670. movdqa xmm5, xmm2 // 8 bytes from 2 for 1
  671. por xmm0, xmm4 // 4 bytes from 1 for 0
  672. pslldq xmm5, 8 // 8 bytes from 2 for 1
  673. movdqu [edx], xmm0 // store 0
  674. por xmm1, xmm5 // 8 bytes from 2 for 1
  675. psrldq xmm2, 8 // 4 bytes from 2
  676. pslldq xmm3, 4 // 12 bytes from 3 for 2
  677. por xmm2, xmm3 // 12 bytes from 3 for 2
  678. movdqu [edx + 16], xmm1 // store 1
  679. movdqu [edx + 32], xmm2 // store 2
  680. lea edx, [edx + 48]
  681. sub ecx, 16
  682. jg convertloop
  683. ret
  684. }
  685. }
  // Packs 4 byte pixels to 3 byte pixels: the 4th byte of each pixel is
  // dropped and the remaining 3 bytes are written in reverse order (per
  // kShuffleMaskARGBToRAW).
  // One loop pass reads 64 bytes (16 pixels) from src_argb, writes 48 bytes to
  // dst_rgb and subtracts 16 from width; the test follows the body, so one
  // pass always runs and width is effectively rounded up to a multiple of 16.
  // Instruction sequence is identical to ARGBToRGB24Row_SSSE3 apart from the
  // shuffle table.
  686. __declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
  687. uint8_t* dst_rgb,
  688. int width) {
  689. __asm {
  690. mov eax, [esp + 4] // src_argb
  691. mov edx, [esp + 8] // dst_rgb
  692. mov ecx, [esp + 12] // width
  693. movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW
  694. convertloop:
  695. movdqu xmm0, [eax] // fetch 16 pixels of argb
  696. movdqu xmm1, [eax + 16]
  697. movdqu xmm2, [eax + 32]
  698. movdqu xmm3, [eax + 48]
  699. lea eax, [eax + 64]
  700. pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
  701. pshufb xmm1, xmm6
  702. pshufb xmm2, xmm6
  703. pshufb xmm3, xmm6
  704. movdqa xmm4, xmm1 // 4 bytes from 1 for 0
  705. psrldq xmm1, 4 // 8 bytes from 1
  706. pslldq xmm4, 12 // 4 bytes from 1 for 0
  707. movdqa xmm5, xmm2 // 8 bytes from 2 for 1
  708. por xmm0, xmm4 // 4 bytes from 1 for 0
  709. pslldq xmm5, 8 // 8 bytes from 2 for 1
  710. movdqu [edx], xmm0 // store 0
  711. por xmm1, xmm5 // 8 bytes from 2 for 1
  712. psrldq xmm2, 8 // 4 bytes from 2
  713. pslldq xmm3, 4 // 12 bytes from 3 for 2
  714. por xmm2, xmm3 // 12 bytes from 3 for 2
  715. movdqu [edx + 16], xmm1 // store 1
  716. movdqu [edx + 32], xmm2 // store 2
  717. lea edx, [edx + 48]
  718. sub ecx, 16
  719. jg convertloop
  720. ret
  721. }
  722. }
  // Packs 4 byte pixels to 16 bit pixels, keeping the top 5, 6 and 5 bits of
  // the first three bytes (byte 0 -> bits 4..0, byte 1 -> bits 10..5,
  // byte 2 -> bits 15..11); the 4th byte is discarded.
  // One loop pass reads 16 bytes (4 pixels), writes 8 bytes and subtracts 4
  // from width; the test follows the body, so one pass always runs.
  // The R path uses pslld 8 / psrad 16 so that each dword ends up as a
  // sign-extended 16 bit value; that lets packssdw (signed saturation) narrow
  // it without changing the low 16 bits.
  723. __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
  724. uint8_t* dst_rgb,
  725. int width) {
  726. __asm {
  727. mov eax, [esp + 4] // src_argb
  728. mov edx, [esp + 8] // dst_rgb
  729. mov ecx, [esp + 12] // width
  730. pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
  731. psrld xmm3, 27
  732. pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
  733. psrld xmm4, 26
  734. pslld xmm4, 5
  735. pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
  736. pslld xmm5, 11
  737. convertloop:
  738. movdqu xmm0, [eax] // fetch 4 pixels of argb
  739. movdqa xmm1, xmm0 // B
  740. movdqa xmm2, xmm0 // G
  741. pslld xmm0, 8 // R
  742. psrld xmm1, 3 // B
  743. psrld xmm2, 5 // G
  744. psrad xmm0, 16 // R
  745. pand xmm1, xmm3 // B
  746. pand xmm2, xmm4 // G
  747. pand xmm0, xmm5 // R
  748. por xmm1, xmm2 // BG
  749. por xmm0, xmm1 // BGR
  750. packssdw xmm0, xmm0
  751. lea eax, [eax + 16]
  752. movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
  753. lea edx, [edx + 8]
  754. sub ecx, 4
  755. jg convertloop
  756. ret
  757. }
  758. }
  // Same packing as ARGBToRGB565Row_SSE2, but first adds a dither value to
  // every byte of each pixel with unsigned saturation (paddusb).
  // dither4 supplies 4 dither bytes; byte k of dither4 is applied to pixel k
  // of each group of 4 pixels.
  // One loop pass reads 16 bytes (4 pixels), writes 8 bytes and subtracts 4
  // from width; the test follows the body, so one pass always runs.
  759. __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
  760. uint8_t* dst_rgb,
  761. const uint32_t dither4,
  762. int width) {
  763. __asm {
  764. mov eax, [esp + 4] // src_argb
  765. mov edx, [esp + 8] // dst_rgb
  766. movd xmm6, [esp + 12] // dither4
  767. mov ecx, [esp + 16] // width
  768. punpcklbw xmm6, xmm6 // make dither 16 bytes
  769. movdqa xmm7, xmm6 // NOTE(review): xmm7 is never read in the loop below
  770. punpcklwd xmm6, xmm6 // each dither byte now repeated 4 times
  771. punpckhwd xmm7, xmm7
  772. pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
  773. psrld xmm3, 27
  774. pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
  775. psrld xmm4, 26
  776. pslld xmm4, 5
  777. pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
  778. pslld xmm5, 11
  779. convertloop:
  780. movdqu xmm0, [eax] // fetch 4 pixels of argb
  781. paddusb xmm0, xmm6 // add dither
  782. movdqa xmm1, xmm0 // B
  783. movdqa xmm2, xmm0 // G
  784. pslld xmm0, 8 // R
  785. psrld xmm1, 3 // B
  786. psrld xmm2, 5 // G
  787. psrad xmm0, 16 // R
  788. pand xmm1, xmm3 // B
  789. pand xmm2, xmm4 // G
  790. pand xmm0, xmm5 // R
  791. por xmm1, xmm2 // BG
  792. por xmm0, xmm1 // BGR
  793. packssdw xmm0, xmm0
  794. lea eax, [eax + 16]
  795. movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
  796. lea edx, [edx + 8]
  797. sub ecx, 4
  798. jg convertloop
  799. ret
  800. }
  801. }
  802. #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
  // AVX2 variant of ARGBToRGB565DitherRow_SSE2: adds a per-pixel dither byte
  // to every byte of each pixel with unsigned saturation, then packs to 16 bit
  // pixels (byte 0 -> bits 4..0, byte 1 -> bits 10..5, byte 2 -> bits 15..11).
  // Byte k of dither4 is applied to pixels k and k + 4 of each group of 8.
  // One loop pass reads 32 bytes (8 pixels), writes 16 bytes and subtracts 8
  // from width; the test follows the body, so one pass always runs.
  // Unlike the SSE2 version, R is isolated with a logical shift and a
  // 0x0000f800 mask, and the narrowing uses vpackusdw (unsigned saturation).
  803. __declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
  804. uint8_t* dst_rgb,
  805. const uint32_t dither4,
  806. int width) {
  807. __asm {
  808. mov eax, [esp + 4] // src_argb
  809. mov edx, [esp + 8] // dst_rgb
  810. vbroadcastss xmm6, [esp + 12] // dither4
  811. mov ecx, [esp + 16] // width
  812. vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
  813. vpermq ymm6, ymm6, 0xd8 // one 8 byte half into each 128 bit lane
  814. vpunpcklwd ymm6, ymm6, ymm6 // each dither byte now repeated 4 times
  815. vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
  816. vpsrld ymm3, ymm3, 27
  817. vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
  818. vpsrld ymm4, ymm4, 26
  819. vpslld ymm4, ymm4, 5
  820. vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
  821. convertloop:
  822. vmovdqu ymm0, [eax] // fetch 8 pixels of argb
  823. vpaddusb ymm0, ymm0, ymm6 // add dither
  824. vpsrld ymm2, ymm0, 5 // G
  825. vpsrld ymm1, ymm0, 3 // B
  826. vpsrld ymm0, ymm0, 8 // R
  827. vpand ymm2, ymm2, ymm4 // G
  828. vpand ymm1, ymm1, ymm3 // B
  829. vpand ymm0, ymm0, ymm5 // R
  830. vpor ymm1, ymm1, ymm2 // BG
  831. vpor ymm0, ymm0, ymm1 // BGR
  832. vpackusdw ymm0, ymm0, ymm0
  833. vpermq ymm0, ymm0, 0xd8 // gather both lanes' results into the low 128 bits
  834. lea eax, [eax + 32]
  835. vmovdqu [edx], xmm0 // store 8 pixels of RGB565
  836. lea edx, [edx + 16]
  837. sub ecx, 8
  838. jg convertloop
  839. vzeroupper
  840. ret
  841. }
  842. }
  843. #endif // HAS_ARGBTORGB565DITHERROW_AVX2
  844. // TODO(fbarchard): Improve sign extension/packing.
  // Packs 4 byte pixels to 16 bit pixels: the top 5 bits of bytes 0, 1 and 2
  // go to bits 4..0, 9..5 and 14..10, and the top bit of byte 3 goes to
  // bit 15.
  // One loop pass reads 16 bytes (4 pixels), writes 8 bytes and subtracts 4
  // from width; the test follows the body, so one pass always runs.
  // The A path uses psrad with the 0xffff8000 mask so each dword is a
  // sign-extended 16 bit value, which packssdw (signed saturation) narrows
  // without changing the low 16 bits.
  845. __declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
  846. uint8_t* dst_rgb,
  847. int width) {
  848. __asm {
  849. mov eax, [esp + 4] // src_argb
  850. mov edx, [esp + 8] // dst_rgb
  851. mov ecx, [esp + 12] // width
  852. pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
  853. psrld xmm4, 27
  854. movdqa xmm5, xmm4 // generate mask 0x000003e0
  855. pslld xmm5, 5
  856. movdqa xmm6, xmm4 // generate mask 0x00007c00
  857. pslld xmm6, 10
  858. pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
  859. pslld xmm7, 15
  860. convertloop:
  861. movdqu xmm0, [eax] // fetch 4 pixels of argb
  862. movdqa xmm1, xmm0 // B
  863. movdqa xmm2, xmm0 // G
  864. movdqa xmm3, xmm0 // R
  865. psrad xmm0, 16 // A
  866. psrld xmm1, 3 // B
  867. psrld xmm2, 6 // G
  868. psrld xmm3, 9 // R
  869. pand xmm0, xmm7 // A
  870. pand xmm1, xmm4 // B
  871. pand xmm2, xmm5 // G
  872. pand xmm3, xmm6 // R
  873. por xmm0, xmm1 // BA
  874. por xmm2, xmm3 // GR
  875. por xmm0, xmm2 // BGRA
  876. packssdw xmm0, xmm0
  877. lea eax, [eax + 16]
  878. movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
  879. lea edx, [edx + 8]
  880. sub ecx, 4
  881. jg convertloop
  882. ret
  883. }
  884. }
  // Convert 4 ARGB pixels (16 bytes) per pass to 4 ARGB4444 pixels (8 bytes).
  // The high nibble of every source byte is kept; pairs of nibbles are merged
  // into one output byte. The loop body runs once before width is tested and
  // consumes 4 pixels per pass, so width is expected to be a positive multiple
  // of 4 (NOTE(review): presumably guaranteed by the callers - confirm).
  885. __declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
  886. uint8_t* dst_rgb,
  887. int width) {
  888. __asm {
  889. mov eax, [esp + 4] // src_argb
  890. mov edx, [esp + 8] // dst_rgb
  891. mov ecx, [esp + 12] // width
  892. pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
  893. psllw xmm4, 12
  894. movdqa xmm3, xmm4 // generate mask 0x00f000f0
  895. psrlw xmm3, 8
  896. convertloop:
  897. movdqu xmm0, [eax] // fetch 4 pixels of argb
  898. movdqa xmm1, xmm0
  899. pand xmm0, xmm3 // bytes 0 and 2: become the low nibble of each output byte
  900. pand xmm1, xmm4 // bytes 1 and 3: become the high nibble of each output byte
  901. psrld xmm0, 4
  902. psrld xmm1, 8
  903. por xmm0, xmm1 // every 16-bit word now holds one output byte (<= 0xff)
  904. packuswb xmm0, xmm0 // words to bytes; values fit, so no saturation
  905. lea eax, [eax + 16] // advance source by 4 pixels
  906. movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
  907. lea edx, [edx + 8] // advance destination by 4 pixels
  908. sub ecx, 4
  909. jg convertloop // repeat while pixels remain
  910. ret
  911. }
  912. }
  913. #ifdef HAS_ARGBTORGB565ROW_AVX2
  // Convert 8 ARGB pixels (32 bytes) per pass to 8 RGB565 pixels (16 bytes).
  // Keeps the top 5 bits of B, top 6 bits of G and top 5 bits of R; alpha is
  // dropped. The loop body runs once before width is tested and consumes 8
  // pixels per pass, so width is expected to be a positive multiple of 8
  // (NOTE(review): presumably guaranteed by the callers - confirm).
  914. __declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
  915. uint8_t* dst_rgb,
  916. int width) {
  917. __asm {
  918. mov eax, [esp + 4] // src_argb
  919. mov edx, [esp + 8] // dst_rgb
  920. mov ecx, [esp + 12] // width
  921. vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
  922. vpsrld ymm3, ymm3, 27
  923. vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
  924. vpsrld ymm4, ymm4, 26
  925. vpslld ymm4, ymm4, 5
  926. vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
  927. convertloop:
  928. vmovdqu ymm0, [eax] // fetch 8 pixels of argb
  929. vpsrld ymm2, ymm0, 5 // G
  930. vpsrld ymm1, ymm0, 3 // B
  931. vpsrld ymm0, ymm0, 8 // R
  932. vpand ymm2, ymm2, ymm4 // G
  933. vpand ymm1, ymm1, ymm3 // B
  934. vpand ymm0, ymm0, ymm5 // R
  935. vpor ymm1, ymm1, ymm2 // BG
  936. vpor ymm0, ymm0, ymm1 // BGR
  937. vpackusdw ymm0, ymm0, ymm0 // dwords to words, separately in each 128-bit lane
  938. vpermq ymm0, ymm0, 0xd8 // gather both lanes' packed words into the low 128 bits
  939. lea eax, [eax + 32] // advance source by 8 pixels
  940. vmovdqu [edx], xmm0 // store 8 pixels of RGB565
  941. lea edx, [edx + 16] // advance destination by 8 pixels
  942. sub ecx, 8
  943. jg convertloop // repeat while pixels remain
  944. vzeroupper
  945. ret
  946. }
  947. }
  948. #endif // HAS_ARGBTORGB565ROW_AVX2
  949. #ifdef HAS_ARGBTOARGB1555ROW_AVX2
  // Convert 8 ARGB pixels (32 bytes) per pass to 8 ARGB1555 pixels (16 bytes).
  // Keeps the top 5 bits of B, G and R plus the top alpha bit. The loop body
  // runs once before width is tested and consumes 8 pixels per pass, so width
  // is expected to be a positive multiple of 8
  // (NOTE(review): presumably guaranteed by the callers - confirm).
  950. __declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
  951. uint8_t* dst_rgb,
  952. int width) {
  953. __asm {
  954. mov eax, [esp + 4] // src_argb
  955. mov edx, [esp + 8] // dst_rgb
  956. mov ecx, [esp + 12] // width
  957. vpcmpeqb ymm4, ymm4, ymm4
  958. vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
  959. vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
  960. vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
  961. vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
  962. vpslld ymm7, ymm7, 15
  963. convertloop:
  964. vmovdqu ymm0, [eax] // fetch 8 pixels of argb
  965. vpsrld ymm3, ymm0, 9 // R
  966. vpsrld ymm2, ymm0, 6 // G
  967. vpsrld ymm1, ymm0, 3 // B
  968. vpsrad ymm0, ymm0, 16 // A: arithmetic shift copies the alpha MSB into bits 15..31
  969. vpand ymm3, ymm3, ymm6 // R
  970. vpand ymm2, ymm2, ymm5 // G
  971. vpand ymm1, ymm1, ymm4 // B
  972. vpand ymm0, ymm0, ymm7 // A
  973. vpor ymm0, ymm0, ymm1 // BA
  974. vpor ymm2, ymm2, ymm3 // GR
  975. vpor ymm0, ymm0, ymm2 // BGRA
  976. vpackssdw ymm0, ymm0, ymm0 // signed pack per 128-bit lane; values fit in 16 bits
  977. vpermq ymm0, ymm0, 0xd8 // gather both lanes' packed words into the low 128 bits
  978. lea eax, [eax + 32] // advance source by 8 pixels
  979. vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
  980. lea edx, [edx + 16] // advance destination by 8 pixels
  981. sub ecx, 8
  982. jg convertloop // repeat while pixels remain
  983. vzeroupper
  984. ret
  985. }
  986. }
  987. #endif // HAS_ARGBTOARGB1555ROW_AVX2
  988. #ifdef HAS_ARGBTOARGB4444ROW_AVX2
  // Convert 8 ARGB pixels (32 bytes) per pass to 8 ARGB4444 pixels (16 bytes).
  // The high nibble of every source byte is kept; pairs of nibbles are merged
  // into one output byte. The loop body runs once before width is tested and
  // consumes 8 pixels per pass, so width is expected to be a positive multiple
  // of 8 (NOTE(review): presumably guaranteed by the callers - confirm).
  989. __declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
  990. uint8_t* dst_rgb,
  991. int width) {
  992. __asm {
  993. mov eax, [esp + 4] // src_argb
  994. mov edx, [esp + 8] // dst_rgb
  995. mov ecx, [esp + 12] // width
  996. vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
  997. vpsllw ymm4, ymm4, 12
  998. vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
  999. convertloop:
  1000. vmovdqu ymm0, [eax] // fetch 8 pixels of argb
  1001. vpand ymm1, ymm0, ymm4 // bytes 1 and 3: become the high nibble of each output byte
  1002. vpand ymm0, ymm0, ymm3 // bytes 0 and 2: become the low nibble of each output byte
  1003. vpsrld ymm1, ymm1, 8
  1004. vpsrld ymm0, ymm0, 4
  1005. vpor ymm0, ymm0, ymm1 // every 16-bit word now holds one output byte (<= 0xff)
  1006. vpackuswb ymm0, ymm0, ymm0 // words to bytes, separately in each 128-bit lane
  1007. vpermq ymm0, ymm0, 0xd8 // gather both lanes' packed bytes into the low 128 bits
  1008. lea eax, [eax + 32] // advance source by 8 pixels
  1009. vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
  1010. lea edx, [edx + 16] // advance destination by 8 pixels
  1011. sub ecx, 8
  1012. jg convertloop // repeat while pixels remain
  1013. vzeroupper
  1014. ret
  1015. }
  1016. }
  1017. #endif // HAS_ARGBTOARGB4444ROW_AVX2
  1018. // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
  // Each pass multiplies the pixel bytes by the kARGBToY coefficients, sums the
  // products per pixel, shifts right by 7 and adds kAddY16 (both tables are
  // defined outside this function). The loop body runs once before width is
  // tested and consumes 16 pixels per pass, so width is expected to be a
  // positive multiple of 16
  // (NOTE(review): presumably guaranteed by the callers - confirm).
  1019. __declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
  1020. uint8_t* dst_y,
  1021. int width) {
  1022. __asm {
  1023. mov eax, [esp + 4] /* src_argb */
  1024. mov edx, [esp + 8] /* dst_y */
  1025. mov ecx, [esp + 12] /* width */
  1026. movdqa xmm4, xmmword ptr kARGBToY // aligned load of the coefficient table
  1027. movdqa xmm5, xmmword ptr kAddY16 // aligned load of the bias added after packing
  1028. convertloop:
  1029. movdqu xmm0, [eax] // pixels 0..3
  1030. movdqu xmm1, [eax + 16] // pixels 4..7
  1031. movdqu xmm2, [eax + 32] // pixels 8..11
  1032. movdqu xmm3, [eax + 48] // pixels 12..15
  1033. pmaddubsw xmm0, xmm4 // byte * coefficient, adjacent pairs added: 2 words per pixel
  1034. pmaddubsw xmm1, xmm4
  1035. pmaddubsw xmm2, xmm4
  1036. pmaddubsw xmm3, xmm4
  1037. lea eax, [eax + 64] // advance source by 16 pixels
  1038. phaddw xmm0, xmm1 // add the 2 words of each pixel: sums for pixels 0..7
  1039. phaddw xmm2, xmm3 // sums for pixels 8..15
  1040. psrlw xmm0, 7 // scale the weighted sums down by 128
  1041. psrlw xmm2, 7
  1042. packuswb xmm0, xmm2 // 16 words to 16 bytes
  1043. paddb xmm0, xmm5 // add the kAddY16 bias
  1044. movdqu [edx], xmm0 // store 16 Y values
  1045. lea edx, [edx + 16]
  1046. sub ecx, 16
  1047. jg convertloop // repeat while pixels remain
  1048. ret
  1049. }
  1050. }
  1051. // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
  1052. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
  // Uses the kARGBToYJ coefficients and adds kAddYJ64 to the 16-bit sums before
  // the shift right by 7 (both tables are defined outside this function).
  // The loop body runs once before width is tested and consumes 16 pixels per
  // pass, so width is expected to be a positive multiple of 16
  // (NOTE(review): presumably guaranteed by the callers - confirm).
  1053. __declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
  1054. uint8_t* dst_y,
  1055. int width) {
  1056. __asm {
  1057. mov eax, [esp + 4] /* src_argb */
  1058. mov edx, [esp + 8] /* dst_y */
  1059. mov ecx, [esp + 12] /* width */
  1060. movdqa xmm4, xmmword ptr kARGBToYJ // aligned load of the coefficient table
  1061. movdqa xmm5, xmmword ptr kAddYJ64 // aligned load of the rounding constant
  1062. convertloop:
  1063. movdqu xmm0, [eax] // pixels 0..3
  1064. movdqu xmm1, [eax + 16] // pixels 4..7
  1065. movdqu xmm2, [eax + 32] // pixels 8..11
  1066. movdqu xmm3, [eax + 48] // pixels 12..15
  1067. pmaddubsw xmm0, xmm4 // byte * coefficient, adjacent pairs added: 2 words per pixel
  1068. pmaddubsw xmm1, xmm4
  1069. pmaddubsw xmm2, xmm4
  1070. pmaddubsw xmm3, xmm4
  1071. lea eax, [eax + 64] // advance source by 16 pixels
  1072. phaddw xmm0, xmm1 // add the 2 words of each pixel: sums for pixels 0..7
  1073. phaddw xmm2, xmm3 // sums for pixels 8..15
  1074. paddw xmm0, xmm5 // Add .5 for rounding.
  1075. paddw xmm2, xmm5
  1076. psrlw xmm0, 7 // scale the weighted sums down by 128
  1077. psrlw xmm2, 7
  1078. packuswb xmm0, xmm2 // 16 words to 16 bytes
  1079. movdqu [edx], xmm0 // store 16 YJ values
  1080. lea edx, [edx + 16]
  1081. sub ecx, 16
  1082. jg convertloop // repeat while pixels remain
  1083. ret
  1084. }
  1085. }
  1086. #ifdef HAS_ARGBTOYROW_AVX2
  1087. // vpermd for vphaddw + vpackuswb vpermd.
  // vphaddw and vpackuswb work on each 128-bit lane separately, so the packed
  // result holds the 4-pixel groups in the order 0,2,4,6,1,3,5,7. This dword
  // index table is used with vpermd to restore the order 0..7.
  1088. static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
  1089. // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
  // Same arithmetic as the SSSE3 version (kARGBToY coefficients, shift right by
  // 7, add kAddY16), on 32 pixels per pass. The loop body runs once before
  // width is tested, so width is expected to be a positive multiple of 32
  // (NOTE(review): presumably guaranteed by the callers - confirm).
  1090. __declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
  1091. uint8_t* dst_y,
  1092. int width) {
  1093. __asm {
  1094. mov eax, [esp + 4] /* src_argb */
  1095. mov edx, [esp + 8] /* dst_y */
  1096. mov ecx, [esp + 12] /* width */
  1097. vbroadcastf128 ymm4, xmmword ptr kARGBToY // same 16-byte table in both lanes
  1098. vbroadcastf128 ymm5, xmmword ptr kAddY16
  1099. vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
  1100. convertloop:
  1101. vmovdqu ymm0, [eax] // pixels 0..7
  1102. vmovdqu ymm1, [eax + 32] // pixels 8..15
  1103. vmovdqu ymm2, [eax + 64] // pixels 16..23
  1104. vmovdqu ymm3, [eax + 96] // pixels 24..31
  1105. vpmaddubsw ymm0, ymm0, ymm4 // byte * coefficient, adjacent pairs added: 2 words per pixel
  1106. vpmaddubsw ymm1, ymm1, ymm4
  1107. vpmaddubsw ymm2, ymm2, ymm4
  1108. vpmaddubsw ymm3, ymm3, ymm4
  1109. lea eax, [eax + 128] // advance source by 32 pixels
  1110. vphaddw ymm0, ymm0, ymm1 // mutates.
  1111. vphaddw ymm2, ymm2, ymm3
  1112. vpsrlw ymm0, ymm0, 7 // scale the weighted sums down by 128
  1113. vpsrlw ymm2, ymm2, 7
  1114. vpackuswb ymm0, ymm0, ymm2 // mutates.
  1115. vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
  1116. vpaddb ymm0, ymm0, ymm5 // add 16 for Y
  1117. vmovdqu [edx], ymm0 // store 32 Y values
  1118. lea edx, [edx + 32]
  1119. sub ecx, 32
  1120. jg convertloop // repeat while pixels remain
  1121. vzeroupper
  1122. ret
  1123. }
  1124. }
  1125. #endif // HAS_ARGBTOYROW_AVX2
  1126. #ifdef HAS_ARGBTOYJROW_AVX2
  1127. // Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
  // Uses the kARGBToYJ coefficients and adds kAddYJ64 to the 16-bit sums before
  // the shift right by 7; no bias is added after packing. The loop body runs
  // once before width is tested, so width is expected to be a positive
  // multiple of 32 (NOTE(review): presumably guaranteed by the callers -
  // confirm).
  // NOTE(review): kPermdARGBToY_AVX is declared under HAS_ARGBTOYROW_AVX2;
  // presumably both macros are always defined together - confirm.
  1128. __declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
  1129. uint8_t* dst_y,
  1130. int width) {
  1131. __asm {
  1132. mov eax, [esp + 4] /* src_argb */
  1133. mov edx, [esp + 8] /* dst_y */
  1134. mov ecx, [esp + 12] /* width */
  1135. vbroadcastf128 ymm4, xmmword ptr kARGBToYJ // same 16-byte table in both lanes
  1136. vbroadcastf128 ymm5, xmmword ptr kAddYJ64
  1137. vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
  1138. convertloop:
  1139. vmovdqu ymm0, [eax] // pixels 0..7
  1140. vmovdqu ymm1, [eax + 32] // pixels 8..15
  1141. vmovdqu ymm2, [eax + 64] // pixels 16..23
  1142. vmovdqu ymm3, [eax + 96] // pixels 24..31
  1143. vpmaddubsw ymm0, ymm0, ymm4 // byte * coefficient, adjacent pairs added: 2 words per pixel
  1144. vpmaddubsw ymm1, ymm1, ymm4
  1145. vpmaddubsw ymm2, ymm2, ymm4
  1146. vpmaddubsw ymm3, ymm3, ymm4
  1147. lea eax, [eax + 128] // advance source by 32 pixels
  1148. vphaddw ymm0, ymm0, ymm1 // mutates.
  1149. vphaddw ymm2, ymm2, ymm3
  1150. vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
  1151. vpaddw ymm2, ymm2, ymm5
  1152. vpsrlw ymm0, ymm0, 7 // scale the weighted sums down by 128
  1153. vpsrlw ymm2, ymm2, 7
  1154. vpackuswb ymm0, ymm0, ymm2 // mutates.
  1155. vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
  1156. vmovdqu [edx], ymm0 // store 32 YJ values
  1157. lea edx, [edx + 32]
  1158. sub ecx, 32
  1159. jg convertloop // repeat while pixels remain
  1160. vzeroupper
  1161. ret
  1162. }
  1163. }
  1164. #endif // HAS_ARGBTOYJROW_AVX2
  // Convert 16 BGRA pixels (64 bytes) to 16 Y values.
  // Identical instruction sequence to ARGBToYRow_SSSE3 except that the
  // coefficient table is kBGRAToY (defined outside this function). The loop
  // body runs once before width is tested and consumes 16 pixels per pass, so
  // width is expected to be a positive multiple of 16
  // (NOTE(review): presumably guaranteed by the callers - confirm).
  1165. __declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
  1166. uint8_t* dst_y,
  1167. int width) {
  1168. __asm {
  1169. mov eax, [esp + 4] /* src_argb */
  1170. mov edx, [esp + 8] /* dst_y */
  1171. mov ecx, [esp + 12] /* width */
  1172. movdqa xmm4, xmmword ptr kBGRAToY // aligned load of the coefficient table
  1173. movdqa xmm5, xmmword ptr kAddY16 // aligned load of the bias added after packing
  1174. convertloop:
  1175. movdqu xmm0, [eax] // pixels 0..3
  1176. movdqu xmm1, [eax + 16] // pixels 4..7
  1177. movdqu xmm2, [eax + 32] // pixels 8..11
  1178. movdqu xmm3, [eax + 48] // pixels 12..15
  1179. pmaddubsw xmm0, xmm4 // byte * coefficient, adjacent pairs added: 2 words per pixel
  1180. pmaddubsw xmm1, xmm4
  1181. pmaddubsw xmm2, xmm4
  1182. pmaddubsw xmm3, xmm4
  1183. lea eax, [eax + 64] // advance source by 16 pixels
  1184. phaddw xmm0, xmm1 // add the 2 words of each pixel: sums for pixels 0..7
  1185. phaddw xmm2, xmm3 // sums for pixels 8..15
  1186. psrlw xmm0, 7 // scale the weighted sums down by 128
  1187. psrlw xmm2, 7
  1188. packuswb xmm0, xmm2 // 16 words to 16 bytes
  1189. paddb xmm0, xmm5 // add the kAddY16 bias
  1190. movdqu [edx], xmm0 // store 16 Y values
  1191. lea edx, [edx + 16]
  1192. sub ecx, 16
  1193. jg convertloop // repeat while pixels remain
  1194. ret
  1195. }
  1196. }
  // Convert 16 ABGR pixels (64 bytes) to 16 Y values.
  // Identical instruction sequence to ARGBToYRow_SSSE3 except that the
  // coefficient table is kABGRToY (defined outside this function). The loop
  // body runs once before width is tested and consumes 16 pixels per pass, so
  // width is expected to be a positive multiple of 16
  // (NOTE(review): presumably guaranteed by the callers - confirm).
  1197. __declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
  1198. uint8_t* dst_y,
  1199. int width) {
  1200. __asm {
  1201. mov eax, [esp + 4] /* src_argb */
  1202. mov edx, [esp + 8] /* dst_y */
  1203. mov ecx, [esp + 12] /* width */
  1204. movdqa xmm4, xmmword ptr kABGRToY // aligned load of the coefficient table
  1205. movdqa xmm5, xmmword ptr kAddY16 // aligned load of the bias added after packing
  1206. convertloop:
  1207. movdqu xmm0, [eax] // pixels 0..3
  1208. movdqu xmm1, [eax + 16] // pixels 4..7
  1209. movdqu xmm2, [eax + 32] // pixels 8..11
  1210. movdqu xmm3, [eax + 48] // pixels 12..15
  1211. pmaddubsw xmm0, xmm4 // byte * coefficient, adjacent pairs added: 2 words per pixel
  1212. pmaddubsw xmm1, xmm4
  1213. pmaddubsw xmm2, xmm4
  1214. pmaddubsw xmm3, xmm4
  1215. lea eax, [eax + 64] // advance source by 16 pixels
  1216. phaddw xmm0, xmm1 // add the 2 words of each pixel: sums for pixels 0..7
  1217. phaddw xmm2, xmm3 // sums for pixels 8..15
  1218. psrlw xmm0, 7 // scale the weighted sums down by 128
  1219. psrlw xmm2, 7
  1220. packuswb xmm0, xmm2 // 16 words to 16 bytes
  1221. paddb xmm0, xmm5 // add the kAddY16 bias
  1222. movdqu [edx], xmm0 // store 16 Y values
  1223. lea edx, [edx + 16]
  1224. sub ecx, 16
  1225. jg convertloop // repeat while pixels remain
  1226. ret
  1227. }
  1228. }
  // Convert 16 RGBA pixels (64 bytes) to 16 Y values.
  // Identical instruction sequence to ARGBToYRow_SSSE3 except that the
  // coefficient table is kRGBAToY (defined outside this function). The loop
  // body runs once before width is tested and consumes 16 pixels per pass, so
  // width is expected to be a positive multiple of 16
  // (NOTE(review): presumably guaranteed by the callers - confirm).
  1229. __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
  1230. uint8_t* dst_y,
  1231. int width) {
  1232. __asm {
  1233. mov eax, [esp + 4] /* src_argb */
  1234. mov edx, [esp + 8] /* dst_y */
  1235. mov ecx, [esp + 12] /* width */
  1236. movdqa xmm4, xmmword ptr kRGBAToY // aligned load of the coefficient table
  1237. movdqa xmm5, xmmword ptr kAddY16 // aligned load of the bias added after packing
  1238. convertloop:
  1239. movdqu xmm0, [eax] // pixels 0..3
  1240. movdqu xmm1, [eax + 16] // pixels 4..7
  1241. movdqu xmm2, [eax + 32] // pixels 8..11
  1242. movdqu xmm3, [eax + 48] // pixels 12..15
  1243. pmaddubsw xmm0, xmm4 // byte * coefficient, adjacent pairs added: 2 words per pixel
  1244. pmaddubsw xmm1, xmm4
  1245. pmaddubsw xmm2, xmm4
  1246. pmaddubsw xmm3, xmm4
  1247. lea eax, [eax + 64] // advance source by 16 pixels
  1248. phaddw xmm0, xmm1 // add the 2 words of each pixel: sums for pixels 0..7
  1249. phaddw xmm2, xmm3 // sums for pixels 8..15
  1250. psrlw xmm0, 7 // scale the weighted sums down by 128
  1251. psrlw xmm2, 7
  1252. packuswb xmm0, xmm2 // 16 words to 16 bytes
  1253. paddb xmm0, xmm5 // add the kAddY16 bias
  1254. movdqu [edx], xmm0 // store 16 Y values
  1255. lea edx, [edx + 16]
  1256. sub ecx, 16
  1257. jg convertloop // repeat while pixels remain
  1258. ret
  1259. }
  1260. }
  // Convert 16x2 ARGB pixels per pass to 8 U and 8 V values (2x2 subsampled).
  // The row at src_argb0 and the row src_stride_argb bytes after it are
  // averaged, then horizontally adjacent pixels are averaged, and the 8
  // resulting pixels are multiplied by kARGBToU / kARGBToV (tables defined
  // outside this function). width counts source pixels; the loop body runs once
  // before width is tested and consumes 16 pixels per pass
  // (NOTE(review): presumably callers pass a positive multiple of 16 - confirm).
  1261. __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
  1262. int src_stride_argb,
  1263. uint8_t* dst_u,
  1264. uint8_t* dst_v,
  1265. int width) {
  1266. __asm {
  1267. push esi // esi/edi are restored before ret; stack args shift by 8
  1268. push edi
  1269. mov eax, [esp + 8 + 4] // src_argb
  1270. mov esi, [esp + 8 + 8] // src_stride_argb
  1271. mov edx, [esp + 8 + 12] // dst_u
  1272. mov edi, [esp + 8 + 16] // dst_v
  1273. mov ecx, [esp + 8 + 20] // width
  1274. movdqa xmm5, xmmword ptr kAddUV128
  1275. movdqa xmm6, xmmword ptr kARGBToV
  1276. movdqa xmm7, xmmword ptr kARGBToU
  1277. sub edi, edx // stride from u to v
  1278. convertloop:
  1279. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1280. movdqu xmm0, [eax]
  1281. movdqu xmm4, [eax + esi]
  1282. pavgb xmm0, xmm4 // vertical average of the two rows
  1283. movdqu xmm1, [eax + 16]
  1284. movdqu xmm4, [eax + esi + 16]
  1285. pavgb xmm1, xmm4
  1286. movdqu xmm2, [eax + 32]
  1287. movdqu xmm4, [eax + esi + 32]
  1288. pavgb xmm2, xmm4
  1289. movdqu xmm3, [eax + 48]
  1290. movdqu xmm4, [eax + esi + 48]
  1291. pavgb xmm3, xmm4
  1292. lea eax, [eax + 64] // advance source by 16 pixels
  1293. movdqa xmm4, xmm0
  1294. shufps xmm0, xmm1, 0x88 // even pixels 0,2,4,6
  1295. shufps xmm4, xmm1, 0xdd // odd pixels 1,3,5,7
  1296. pavgb xmm0, xmm4 // horizontal average: 4 subsampled pixels
  1297. movdqa xmm4, xmm2
  1298. shufps xmm2, xmm3, 0x88 // even pixels 8,10,12,14
  1299. shufps xmm4, xmm3, 0xdd // odd pixels 9,11,13,15
  1300. pavgb xmm2, xmm4 // horizontal average: 4 more subsampled pixels
  1301. // step 2 - convert to U and V
  1302. // from here down is very similar to Y code except
  1303. // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1304. movdqa xmm1, xmm0
  1305. movdqa xmm3, xmm2
  1306. pmaddubsw xmm0, xmm7 // U
  1307. pmaddubsw xmm2, xmm7
  1308. pmaddubsw xmm1, xmm6 // V
  1309. pmaddubsw xmm3, xmm6
  1310. phaddw xmm0, xmm2 // 8 signed U sums
  1311. phaddw xmm1, xmm3 // 8 signed V sums
  1312. psraw xmm0, 8 // arithmetic shift keeps the sign
  1313. psraw xmm1, 8
  1314. packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
  1315. paddb xmm0, xmm5 // -> unsigned
  1316. // step 3 - store 8 U and 8 V values
  1317. movlps qword ptr [edx], xmm0 // U
  1318. movhps qword ptr [edx + edi], xmm0 // V
  1319. lea edx, [edx + 8] // advances both planes: V is addressed as edx + edi
  1320. sub ecx, 16
  1321. jg convertloop // repeat while pixels remain
  1322. pop edi
  1323. pop esi
  1324. ret
  1325. }
  1326. }
  // Convert 16x2 ARGB pixels per pass to 8 UJ and 8 VJ values (2x2 subsampled).
  // Same subsampling as ARGBToUVRow_SSSE3, but with the kARGBToUJ / kARGBToVJ
  // coefficients, and kAddUVJ128 is added to the 16-bit sums before the shift
  // instead of adding a byte bias after packing (tables defined outside this
  // function). width counts source pixels; the loop body runs once before width
  // is tested and consumes 16 pixels per pass
  // (NOTE(review): presumably callers pass a positive multiple of 16 - confirm).
  1327. __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
  1328. int src_stride_argb,
  1329. uint8_t* dst_u,
  1330. uint8_t* dst_v,
  1331. int width) {
  1332. __asm {
  1333. push esi // esi/edi are restored before ret; stack args shift by 8
  1334. push edi
  1335. mov eax, [esp + 8 + 4] // src_argb
  1336. mov esi, [esp + 8 + 8] // src_stride_argb
  1337. mov edx, [esp + 8 + 12] // dst_u
  1338. mov edi, [esp + 8 + 16] // dst_v
  1339. mov ecx, [esp + 8 + 20] // width
  1340. movdqa xmm5, xmmword ptr kAddUVJ128
  1341. movdqa xmm6, xmmword ptr kARGBToVJ
  1342. movdqa xmm7, xmmword ptr kARGBToUJ
  1343. sub edi, edx // stride from u to v
  1344. convertloop:
  1345. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1346. movdqu xmm0, [eax]
  1347. movdqu xmm4, [eax + esi]
  1348. pavgb xmm0, xmm4 // vertical average of the two rows
  1349. movdqu xmm1, [eax + 16]
  1350. movdqu xmm4, [eax + esi + 16]
  1351. pavgb xmm1, xmm4
  1352. movdqu xmm2, [eax + 32]
  1353. movdqu xmm4, [eax + esi + 32]
  1354. pavgb xmm2, xmm4
  1355. movdqu xmm3, [eax + 48]
  1356. movdqu xmm4, [eax + esi + 48]
  1357. pavgb xmm3, xmm4
  1358. lea eax, [eax + 64] // advance source by 16 pixels
  1359. movdqa xmm4, xmm0
  1360. shufps xmm0, xmm1, 0x88 // even pixels 0,2,4,6
  1361. shufps xmm4, xmm1, 0xdd // odd pixels 1,3,5,7
  1362. pavgb xmm0, xmm4 // horizontal average: 4 subsampled pixels
  1363. movdqa xmm4, xmm2
  1364. shufps xmm2, xmm3, 0x88 // even pixels 8,10,12,14
  1365. shufps xmm4, xmm3, 0xdd // odd pixels 9,11,13,15
  1366. pavgb xmm2, xmm4 // horizontal average: 4 more subsampled pixels
  1367. // step 2 - convert to U and V
  1368. // from here down is very similar to Y code except
  1369. // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1370. movdqa xmm1, xmm0
  1371. movdqa xmm3, xmm2
  1372. pmaddubsw xmm0, xmm7 // U
  1373. pmaddubsw xmm2, xmm7
  1374. pmaddubsw xmm1, xmm6 // V
  1375. pmaddubsw xmm3, xmm6
  1376. phaddw xmm0, xmm2 // 8 signed U sums
  1377. phaddw xmm1, xmm3 // 8 signed V sums
  1378. paddw xmm0, xmm5 // +.5 rounding -> unsigned
  1379. paddw xmm1, xmm5
  1380. psraw xmm0, 8
  1381. psraw xmm1, 8
  1382. packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
  1383. // step 3 - store 8 U and 8 V values
  1384. movlps qword ptr [edx], xmm0 // U
  1385. movhps qword ptr [edx + edi], xmm0 // V
  1386. lea edx, [edx + 8] // advances both planes: V is addressed as edx + edi
  1387. sub ecx, 16
  1388. jg convertloop // repeat while pixels remain
  1389. pop edi
  1390. pop esi
  1391. ret
  1392. }
  1393. }
  1394. #ifdef HAS_ARGBTOUVROW_AVX2
  // Convert 32x2 ARGB pixels per pass to 16 U and 16 V values (2x2 subsampled).
  // The row at src_argb0 and the row src_stride_argb bytes after it are
  // averaged, then horizontally adjacent pixels are averaged, and the 16
  // resulting pixels are multiplied by kARGBToU / kARGBToV. Because the AVX2
  // shuffle/add/pack instructions work per 128-bit lane, the result is put back
  // in order with vpermq and the kShufARGBToUV_AVX byte shuffle (all tables are
  // defined outside this function). width counts source pixels; the loop body
  // runs once before width is tested and consumes 32 pixels per pass
  // (NOTE(review): presumably callers pass a positive multiple of 32 - confirm).
  1395. __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
  1396. int src_stride_argb,
  1397. uint8_t* dst_u,
  1398. uint8_t* dst_v,
  1399. int width) {
  1400. __asm {
  1401. push esi // esi/edi are restored before ret; stack args shift by 8
  1402. push edi
  1403. mov eax, [esp + 8 + 4] // src_argb
  1404. mov esi, [esp + 8 + 8] // src_stride_argb
  1405. mov edx, [esp + 8 + 12] // dst_u
  1406. mov edi, [esp + 8 + 16] // dst_v
  1407. mov ecx, [esp + 8 + 20] // width
  1408. vbroadcastf128 ymm5, xmmword ptr kAddUV128
  1409. vbroadcastf128 ymm6, xmmword ptr kARGBToV
  1410. vbroadcastf128 ymm7, xmmword ptr kARGBToU
  1411. sub edi, edx // stride from u to v
  1412. convertloop:
  1413. /* step 1 - subsample 32x2 argb pixels to 16x1 */
  1414. vmovdqu ymm0, [eax]
  1415. vmovdqu ymm1, [eax + 32]
  1416. vmovdqu ymm2, [eax + 64]
  1417. vmovdqu ymm3, [eax + 96]
  1418. vpavgb ymm0, ymm0, [eax + esi] // vertical average with the second row
  1419. vpavgb ymm1, ymm1, [eax + esi + 32]
  1420. vpavgb ymm2, ymm2, [eax + esi + 64]
  1421. vpavgb ymm3, ymm3, [eax + esi + 96]
  1422. lea eax, [eax + 128] // advance source by 32 pixels
  1423. vshufps ymm4, ymm0, ymm1, 0x88 // even pixels, per lane
  1424. vshufps ymm0, ymm0, ymm1, 0xdd // odd pixels, per lane
  1425. vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
  1426. vshufps ymm4, ymm2, ymm3, 0x88
  1427. vshufps ymm2, ymm2, ymm3, 0xdd
  1428. vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
  1429. // step 2 - convert to U and V
  1430. // from here down is very similar to Y code except
  1431. // instead of 32 different pixels, it's 16 pixels of U and 16 of V
  1432. vpmaddubsw ymm1, ymm0, ymm7 // U
  1433. vpmaddubsw ymm3, ymm2, ymm7
  1434. vpmaddubsw ymm0, ymm0, ymm6 // V
  1435. vpmaddubsw ymm2, ymm2, ymm6
  1436. vphaddw ymm1, ymm1, ymm3 // mutates
  1437. vphaddw ymm0, ymm0, ymm2
  1438. vpsraw ymm1, ymm1, 8 // arithmetic shift keeps the sign
  1439. vpsraw ymm0, ymm0, 8
  1440. vpacksswb ymm0, ymm1, ymm0 // mutates
  1441. vpermq ymm0, ymm0, 0xd8 // For vpacksswb
  1442. vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
  1443. vpaddb ymm0, ymm0, ymm5 // -> unsigned
  1444. // step 3 - store 16 U and 16 V values
  1445. vextractf128 [edx], ymm0, 0 // U
  1446. vextractf128 [edx + edi], ymm0, 1 // V
  1447. lea edx, [edx + 16] // advances both planes: V is addressed as edx + edi
  1448. sub ecx, 32
  1449. jg convertloop // repeat while pixels remain
  1450. pop edi
  1451. pop esi
  1452. vzeroupper
  1453. ret
  1454. }
  1455. }
  1456. #endif // HAS_ARGBTOUVROW_AVX2
  1457. #ifdef HAS_ARGBTOUVJROW_AVX2
  // Convert 32x2 ARGB pixels per pass to 16 UJ and 16 VJ values (2x2
  // subsampled). Same structure as ARGBToUVRow_AVX2, but with the kARGBToUJ /
  // kARGBToVJ coefficients, and kAddUVJ128 is added to the 16-bit sums before
  // the shift instead of adding a byte bias after packing (tables defined
  // outside this function). width counts source pixels; the loop body runs once
  // before width is tested and consumes 32 pixels per pass
  // (NOTE(review): presumably callers pass a positive multiple of 32 - confirm).
  1458. __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
  1459. int src_stride_argb,
  1460. uint8_t* dst_u,
  1461. uint8_t* dst_v,
  1462. int width) {
  1463. __asm {
  1464. push esi // esi/edi are restored before ret; stack args shift by 8
  1465. push edi
  1466. mov eax, [esp + 8 + 4] // src_argb
  1467. mov esi, [esp + 8 + 8] // src_stride_argb
  1468. mov edx, [esp + 8 + 12] // dst_u
  1469. mov edi, [esp + 8 + 16] // dst_v
  1470. mov ecx, [esp + 8 + 20] // width
  1471. vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
  1472. vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
  1473. vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
  1474. sub edi, edx // stride from u to v
  1475. convertloop:
  1476. /* step 1 - subsample 32x2 argb pixels to 16x1 */
  1477. vmovdqu ymm0, [eax]
  1478. vmovdqu ymm1, [eax + 32]
  1479. vmovdqu ymm2, [eax + 64]
  1480. vmovdqu ymm3, [eax + 96]
  1481. vpavgb ymm0, ymm0, [eax + esi] // vertical average with the second row
  1482. vpavgb ymm1, ymm1, [eax + esi + 32]
  1483. vpavgb ymm2, ymm2, [eax + esi + 64]
  1484. vpavgb ymm3, ymm3, [eax + esi + 96]
  1485. lea eax, [eax + 128] // advance source by 32 pixels
  1486. vshufps ymm4, ymm0, ymm1, 0x88 // even pixels, per lane
  1487. vshufps ymm0, ymm0, ymm1, 0xdd // odd pixels, per lane
  1488. vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
  1489. vshufps ymm4, ymm2, ymm3, 0x88
  1490. vshufps ymm2, ymm2, ymm3, 0xdd
  1491. vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
  1492. // step 2 - convert to U and V
  1493. // from here down is very similar to Y code except
  1494. // instead of 32 different pixels, it's 16 pixels of U and 16 of V
  1495. vpmaddubsw ymm1, ymm0, ymm7 // U
  1496. vpmaddubsw ymm3, ymm2, ymm7
  1497. vpmaddubsw ymm0, ymm0, ymm6 // V
  1498. vpmaddubsw ymm2, ymm2, ymm6
  1499. vphaddw ymm1, ymm1, ymm3 // mutates
  1500. vphaddw ymm0, ymm0, ymm2
  1501. vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
  1502. vpaddw ymm0, ymm0, ymm5
  1503. vpsraw ymm1, ymm1, 8
  1504. vpsraw ymm0, ymm0, 8
  1505. vpacksswb ymm0, ymm1, ymm0 // mutates
  1506. vpermq ymm0, ymm0, 0xd8 // For vpacksswb
  1507. vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
  1508. // step 3 - store 16 U and 16 V values
  1509. vextractf128 [edx], ymm0, 0 // U
  1510. vextractf128 [edx + edi], ymm0, 1 // V
  1511. lea edx, [edx + 16] // advances both planes: V is addressed as edx + edi
  1512. sub ecx, 32
  1513. jg convertloop // repeat while pixels remain
  1514. pop edi
  1515. pop esi
  1516. vzeroupper
  1517. ret
  1518. }
  1519. }
  1520. #endif // HAS_ARGBTOUVJROW_AVX2
  // Convert 16 ARGB pixels (64 bytes) per pass to 16 U and 16 V values, with no
  // subsampling. The same 64 source bytes are loaded twice: once multiplied by
  // kARGBToU and once by kARGBToV (tables defined outside this function).
  // The loop body runs once before width is tested and consumes 16 pixels per
  // pass (NOTE(review): presumably callers pass a positive multiple of 16 -
  // confirm).
  1521. __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
  1522. uint8_t* dst_u,
  1523. uint8_t* dst_v,
  1524. int width) {
  1525. __asm {
  1526. push edi // edi is restored before ret; stack args shift by 4
  1527. mov eax, [esp + 4 + 4] // src_argb
  1528. mov edx, [esp + 4 + 8] // dst_u
  1529. mov edi, [esp + 4 + 12] // dst_v
  1530. mov ecx, [esp + 4 + 16] // width
  1531. movdqa xmm5, xmmword ptr kAddUV128
  1532. movdqa xmm6, xmmword ptr kARGBToV
  1533. movdqa xmm7, xmmword ptr kARGBToU
  1534. sub edi, edx // stride from u to v
  1535. convertloop:
  1536. /* convert to U and V */
  1537. movdqu xmm0, [eax] // U
  1538. movdqu xmm1, [eax + 16]
  1539. movdqu xmm2, [eax + 32]
  1540. movdqu xmm3, [eax + 48]
  1541. pmaddubsw xmm0, xmm7
  1542. pmaddubsw xmm1, xmm7
  1543. pmaddubsw xmm2, xmm7
  1544. pmaddubsw xmm3, xmm7
  1545. phaddw xmm0, xmm1 // signed U sums for pixels 0..7
  1546. phaddw xmm2, xmm3 // signed U sums for pixels 8..15
  1547. psraw xmm0, 8 // arithmetic shift keeps the sign
  1548. psraw xmm2, 8
  1549. packsswb xmm0, xmm2 // 16 signed bytes
  1550. paddb xmm0, xmm5 // add kAddUV128 bias
  1551. movdqu [edx], xmm0 // store 16 U values
  1552. movdqu xmm0, [eax] // V
  1553. movdqu xmm1, [eax + 16]
  1554. movdqu xmm2, [eax + 32]
  1555. movdqu xmm3, [eax + 48]
  1556. pmaddubsw xmm0, xmm6
  1557. pmaddubsw xmm1, xmm6
  1558. pmaddubsw xmm2, xmm6
  1559. pmaddubsw xmm3, xmm6
  1560. phaddw xmm0, xmm1 // signed V sums for pixels 0..7
  1561. phaddw xmm2, xmm3 // signed V sums for pixels 8..15
  1562. psraw xmm0, 8
  1563. psraw xmm2, 8
  1564. packsswb xmm0, xmm2
  1565. paddb xmm0, xmm5 // add kAddUV128 bias
  1566. lea eax, [eax + 64] // advance source by 16 pixels
  1567. movdqu [edx + edi], xmm0 // store 16 V values
  1568. lea edx, [edx + 16] // advances both planes: V is addressed as edx + edi
  1569. sub ecx, 16
  1570. jg convertloop // repeat while pixels remain
  1571. pop edi
  1572. ret
  1573. }
  1574. }
  // Convert 16x2 BGRA pixels per pass to 8 U and 8 V values (2x2 subsampled).
  // Identical instruction sequence to ARGBToUVRow_SSSE3 except that the
  // coefficient tables are kBGRAToU / kBGRAToV (defined outside this function).
  // width counts source pixels; the loop body runs once before width is tested
  // and consumes 16 pixels per pass
  // (NOTE(review): presumably callers pass a positive multiple of 16 - confirm).
  1575. __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
  1576. int src_stride_argb,
  1577. uint8_t* dst_u,
  1578. uint8_t* dst_v,
  1579. int width) {
  1580. __asm {
  1581. push esi // esi/edi are restored before ret; stack args shift by 8
  1582. push edi
  1583. mov eax, [esp + 8 + 4] // src_argb
  1584. mov esi, [esp + 8 + 8] // src_stride_argb
  1585. mov edx, [esp + 8 + 12] // dst_u
  1586. mov edi, [esp + 8 + 16] // dst_v
  1587. mov ecx, [esp + 8 + 20] // width
  1588. movdqa xmm5, xmmword ptr kAddUV128
  1589. movdqa xmm6, xmmword ptr kBGRAToV
  1590. movdqa xmm7, xmmword ptr kBGRAToU
  1591. sub edi, edx // stride from u to v
  1592. convertloop:
  1593. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1594. movdqu xmm0, [eax]
  1595. movdqu xmm4, [eax + esi]
  1596. pavgb xmm0, xmm4 // vertical average of the two rows
  1597. movdqu xmm1, [eax + 16]
  1598. movdqu xmm4, [eax + esi + 16]
  1599. pavgb xmm1, xmm4
  1600. movdqu xmm2, [eax + 32]
  1601. movdqu xmm4, [eax + esi + 32]
  1602. pavgb xmm2, xmm4
  1603. movdqu xmm3, [eax + 48]
  1604. movdqu xmm4, [eax + esi + 48]
  1605. pavgb xmm3, xmm4
  1606. lea eax, [eax + 64] // advance source by 16 pixels
  1607. movdqa xmm4, xmm0
  1608. shufps xmm0, xmm1, 0x88 // even pixels 0,2,4,6
  1609. shufps xmm4, xmm1, 0xdd // odd pixels 1,3,5,7
  1610. pavgb xmm0, xmm4 // horizontal average: 4 subsampled pixels
  1611. movdqa xmm4, xmm2
  1612. shufps xmm2, xmm3, 0x88 // even pixels 8,10,12,14
  1613. shufps xmm4, xmm3, 0xdd // odd pixels 9,11,13,15
  1614. pavgb xmm2, xmm4 // horizontal average: 4 more subsampled pixels
  1615. // step 2 - convert to U and V
  1616. // from here down is very similar to Y code except
  1617. // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1618. movdqa xmm1, xmm0
  1619. movdqa xmm3, xmm2
  1620. pmaddubsw xmm0, xmm7 // U
  1621. pmaddubsw xmm2, xmm7
  1622. pmaddubsw xmm1, xmm6 // V
  1623. pmaddubsw xmm3, xmm6
  1624. phaddw xmm0, xmm2 // 8 signed U sums
  1625. phaddw xmm1, xmm3 // 8 signed V sums
  1626. psraw xmm0, 8 // arithmetic shift keeps the sign
  1627. psraw xmm1, 8
  1628. packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
  1629. paddb xmm0, xmm5 // -> unsigned
  1630. // step 3 - store 8 U and 8 V values
  1631. movlps qword ptr [edx], xmm0 // U
  1632. movhps qword ptr [edx + edi], xmm0 // V
  1633. lea edx, [edx + 8] // advances both planes: V is addressed as edx + edi
  1634. sub ecx, 16
  1635. jg convertloop // repeat while pixels remain
  1636. pop edi
  1637. pop esi
  1638. ret
  1639. }
  1640. }
  // Convert 16x2 ABGR pixels per pass to 8 U and 8 V values (2x2 subsampled).
  // Identical instruction sequence to ARGBToUVRow_SSSE3 except that the
  // coefficient tables are kABGRToU / kABGRToV (defined outside this function).
  // width counts source pixels; the loop body runs once before width is tested
  // and consumes 16 pixels per pass
  // (NOTE(review): presumably callers pass a positive multiple of 16 - confirm).
  1641. __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
  1642. int src_stride_argb,
  1643. uint8_t* dst_u,
  1644. uint8_t* dst_v,
  1645. int width) {
  1646. __asm {
  1647. push esi // esi/edi are restored before ret; stack args shift by 8
  1648. push edi
  1649. mov eax, [esp + 8 + 4] // src_argb
  1650. mov esi, [esp + 8 + 8] // src_stride_argb
  1651. mov edx, [esp + 8 + 12] // dst_u
  1652. mov edi, [esp + 8 + 16] // dst_v
  1653. mov ecx, [esp + 8 + 20] // width
  1654. movdqa xmm5, xmmword ptr kAddUV128
  1655. movdqa xmm6, xmmword ptr kABGRToV
  1656. movdqa xmm7, xmmword ptr kABGRToU
  1657. sub edi, edx // stride from u to v
  1658. convertloop:
  1659. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1660. movdqu xmm0, [eax]
  1661. movdqu xmm4, [eax + esi]
  1662. pavgb xmm0, xmm4 // vertical average of the two rows
  1663. movdqu xmm1, [eax + 16]
  1664. movdqu xmm4, [eax + esi + 16]
  1665. pavgb xmm1, xmm4
  1666. movdqu xmm2, [eax + 32]
  1667. movdqu xmm4, [eax + esi + 32]
  1668. pavgb xmm2, xmm4
  1669. movdqu xmm3, [eax + 48]
  1670. movdqu xmm4, [eax + esi + 48]
  1671. pavgb xmm3, xmm4
  1672. lea eax, [eax + 64] // advance source by 16 pixels
  1673. movdqa xmm4, xmm0
  1674. shufps xmm0, xmm1, 0x88 // even pixels 0,2,4,6
  1675. shufps xmm4, xmm1, 0xdd // odd pixels 1,3,5,7
  1676. pavgb xmm0, xmm4 // horizontal average: 4 subsampled pixels
  1677. movdqa xmm4, xmm2
  1678. shufps xmm2, xmm3, 0x88 // even pixels 8,10,12,14
  1679. shufps xmm4, xmm3, 0xdd // odd pixels 9,11,13,15
  1680. pavgb xmm2, xmm4 // horizontal average: 4 more subsampled pixels
  1681. // step 2 - convert to U and V
  1682. // from here down is very similar to Y code except
  1683. // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1684. movdqa xmm1, xmm0
  1685. movdqa xmm3, xmm2
  1686. pmaddubsw xmm0, xmm7 // U
  1687. pmaddubsw xmm2, xmm7
  1688. pmaddubsw xmm1, xmm6 // V
  1689. pmaddubsw xmm3, xmm6
  1690. phaddw xmm0, xmm2 // 8 signed U sums
  1691. phaddw xmm1, xmm3 // 8 signed V sums
  1692. psraw xmm0, 8 // arithmetic shift keeps the sign
  1693. psraw xmm1, 8
  1694. packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
  1695. paddb xmm0, xmm5 // -> unsigned
  1696. // step 3 - store 8 U and 8 V values
  1697. movlps qword ptr [edx], xmm0 // U
  1698. movhps qword ptr [edx + edi], xmm0 // V
  1699. lea edx, [edx + 8] // advances both planes: V is addressed as edx + edi
  1700. sub ecx, 16
  1701. jg convertloop // repeat while pixels remain
  1702. pop edi
  1703. pop esi
  1704. ret
  1705. }
  1706. }
  // Convert 16x2 RGBA pixels per pass to 8 U and 8 V values (2x2 subsampled).
  // Identical instruction sequence to ARGBToUVRow_SSSE3 except that the
  // coefficient tables are kRGBAToU / kRGBAToV (defined outside this function).
  // width counts source pixels; the loop body runs once before width is tested
  // and consumes 16 pixels per pass
  // (NOTE(review): presumably callers pass a positive multiple of 16 - confirm).
  1707. __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
  1708. int src_stride_argb,
  1709. uint8_t* dst_u,
  1710. uint8_t* dst_v,
  1711. int width) {
  1712. __asm {
  1713. push esi // esi/edi are restored before ret; stack args shift by 8
  1714. push edi
  1715. mov eax, [esp + 8 + 4] // src_argb
  1716. mov esi, [esp + 8 + 8] // src_stride_argb
  1717. mov edx, [esp + 8 + 12] // dst_u
  1718. mov edi, [esp + 8 + 16] // dst_v
  1719. mov ecx, [esp + 8 + 20] // width
  1720. movdqa xmm5, xmmword ptr kAddUV128
  1721. movdqa xmm6, xmmword ptr kRGBAToV
  1722. movdqa xmm7, xmmword ptr kRGBAToU
  1723. sub edi, edx // stride from u to v
  1724. convertloop:
  1725. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1726. movdqu xmm0, [eax]
  1727. movdqu xmm4, [eax + esi]
  1728. pavgb xmm0, xmm4 // vertical average of the two rows
  1729. movdqu xmm1, [eax + 16]
  1730. movdqu xmm4, [eax + esi + 16]
  1731. pavgb xmm1, xmm4
  1732. movdqu xmm2, [eax + 32]
  1733. movdqu xmm4, [eax + esi + 32]
  1734. pavgb xmm2, xmm4
  1735. movdqu xmm3, [eax + 48]
  1736. movdqu xmm4, [eax + esi + 48]
  1737. pavgb xmm3, xmm4
  1738. lea eax, [eax + 64] // advance source by 16 pixels
  1739. movdqa xmm4, xmm0
  1740. shufps xmm0, xmm1, 0x88 // even pixels 0,2,4,6
  1741. shufps xmm4, xmm1, 0xdd // odd pixels 1,3,5,7
  1742. pavgb xmm0, xmm4 // horizontal average: 4 subsampled pixels
  1743. movdqa xmm4, xmm2
  1744. shufps xmm2, xmm3, 0x88 // even pixels 8,10,12,14
  1745. shufps xmm4, xmm3, 0xdd // odd pixels 9,11,13,15
  1746. pavgb xmm2, xmm4 // horizontal average: 4 more subsampled pixels
  1747. // step 2 - convert to U and V
  1748. // from here down is very similar to Y code except
  1749. // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1750. movdqa xmm1, xmm0
  1751. movdqa xmm3, xmm2
  1752. pmaddubsw xmm0, xmm7 // U
  1753. pmaddubsw xmm2, xmm7
  1754. pmaddubsw xmm1, xmm6 // V
  1755. pmaddubsw xmm3, xmm6
  1756. phaddw xmm0, xmm2 // 8 signed U sums
  1757. phaddw xmm1, xmm3 // 8 signed V sums
  1758. psraw xmm0, 8 // arithmetic shift keeps the sign
  1759. psraw xmm1, 8
  1760. packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
  1761. paddb xmm0, xmm5 // -> unsigned
  1762. // step 3 - store 8 U and 8 V values
  1763. movlps qword ptr [edx], xmm0 // U
  1764. movhps qword ptr [edx + edi], xmm0 // V
  1765. lea edx, [edx + 8] // advances both planes: V is addressed as edx + edi
  1766. sub ecx, 16
  1767. jg convertloop // repeat while pixels remain
  1768. pop edi
  1769. pop esi
  1770. ret
  1771. }
  1772. }
  1773. #endif // HAS_ARGBTOYROW_SSSE3
  1774. // Read 16 UV from 444
  // Reads 16 U bytes at [esi], 16 V bytes at [esi + edi] and 16 Y bytes at
  // [eax], then advances esi and eax by 16.
  // Result: ymm0 = 16 interleaved UV byte pairs, ymm4 = 16 words each holding
  // the Y byte twice. ymm1 is clobbered. vpermq 0xd8 moves the second 8 bytes
  // of each load into the upper lane so the per-lane unpack sees all 16.
  1775. #define READYUV444_AVX2 \
  1776. __asm { \
  1777. __asm vmovdqu xmm0, [esi] /* U */ \
  1778. __asm vmovdqu xmm1, [esi + edi] /* V */ \
  1779. __asm lea esi, [esi + 16] \
  1780. __asm vpermq ymm0, ymm0, 0xd8 \
  1781. __asm vpermq ymm1, ymm1, 0xd8 \
  1782. __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
  1783. __asm vmovdqu xmm4, [eax] /* Y */ \
  1784. __asm vpermq ymm4, ymm4, 0xd8 \
  1785. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1786. __asm lea eax, [eax + 16]}
  1787. // Read 8 UV from 422, upsample to 16 UV.
  // Reads 8 U bytes at [esi], 8 V bytes at [esi + edi] and 16 Y bytes at [eax],
  // then advances esi by 8 and eax by 16.
  // Result: ymm0 = 16 UV byte pairs (each source pair repeated twice),
  // ymm4 = 16 words each holding the Y byte twice. ymm1 is clobbered.
  1788. #define READYUV422_AVX2 \
  1789. __asm { \
  1790. __asm vmovq xmm0, qword ptr [esi] /* U */ \
  1791. __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
  1792. __asm lea esi, [esi + 8] \
  1793. __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
  1794. __asm vpermq ymm0, ymm0, 0xd8 \
  1795. __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
  1796. __asm vmovdqu xmm4, [eax] /* Y */ \
  1797. __asm vpermq ymm4, ymm4, 0xd8 \
  1798. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1799. __asm lea eax, [eax + 16]}
  1800. // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
  // Same as READYUV422_AVX2, and additionally reads 16 alpha bytes at [ebp]
  // into ymm5 (spread across both lanes by vpermq 0xd8) and advances ebp by 16.
  // Result: ymm0 = 16 UV byte pairs, ymm4 = 16 doubled-Y words, ymm5 = alpha.
  // ymm1 is clobbered.
  1801. #define READYUVA422_AVX2 \
  1802. __asm { \
  1803. __asm vmovq xmm0, qword ptr [esi] /* U */ \
  1804. __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
  1805. __asm lea esi, [esi + 8] \
  1806. __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
  1807. __asm vpermq ymm0, ymm0, 0xd8 \
  1808. __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
  1809. __asm vmovdqu xmm4, [eax] /* Y */ \
  1810. __asm vpermq ymm4, ymm4, 0xd8 \
  1811. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1812. __asm lea eax, [eax + 16] \
  1813. __asm vmovdqu xmm5, [ebp] /* A */ \
  1814. __asm vpermq ymm5, ymm5, 0xd8 \
  1815. __asm lea ebp, [ebp + 16]}
  1816. // Read 8 UV from NV12, upsample to 16 UV.
  // Reads 16 bytes of already-interleaved UV at [esi] and 16 Y bytes at [eax],
  // then advances esi and eax by 16.
  // Result: ymm0 = 16 UV byte pairs (each source pair repeated twice),
  // ymm4 = 16 words each holding the Y byte twice.
  1817. #define READNV12_AVX2 \
  1818. __asm { \
  1819. __asm vmovdqu xmm0, [esi] /* UV */ \
  1820. __asm lea esi, [esi + 16] \
  1821. __asm vpermq ymm0, ymm0, 0xd8 \
  1822. __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
  1823. __asm vmovdqu xmm4, [eax] /* Y */ \
  1824. __asm vpermq ymm4, ymm4, 0xd8 \
  1825. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1826. __asm lea eax, [eax + 16]}
  1827. // Read 8 UV from NV21, upsample to 16 UV.
  // Reads 16 bytes of interleaved chroma at [esi] and 16 Y bytes at [eax], then
  // advances esi and eax by 16. The chroma bytes are rearranged by the
  // kShuffleNV21 byte shuffle (table defined outside this macro; presumably it
  // swaps each pair into U,V order and repeats it - confirm).
  // Result: ymm0 = shuffled chroma, ymm4 = 16 words each holding the Y byte
  // twice.
  1828. #define READNV21_AVX2 \
  1829. __asm { \
  1830. __asm vmovdqu xmm0, [esi] /* UV */ \
  1831. __asm lea esi, [esi + 16] \
  1832. __asm vpermq ymm0, ymm0, 0xd8 \
  1833. __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
  1834. __asm vmovdqu xmm4, [eax] /* Y */ \
  1835. __asm vpermq ymm4, ymm4, 0xd8 \
  1836. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1837. __asm lea eax, [eax + 16]}
  1838. // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
  // Loads the same 32 bytes at [eax] twice: once shuffled with kShuffleYUY2Y
  // into ymm4 (Y) and once shuffled with kShuffleYUY2UV into ymm0 (UV), then
  // advances eax by 32. Both shuffle tables are defined outside this macro.
  1839. #define READYUY2_AVX2 \
  1840. __asm { \
  1841. __asm vmovdqu ymm4, [eax] /* YUY2 */ \
  1842. __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
  1843. __asm vmovdqu ymm0, [eax] /* UV */ \
  1844. __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
  1845. __asm lea eax, [eax + 32]}
  1846. // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
  // Loads the same 32 bytes at [eax] twice: once shuffled with kShuffleUYVYY
  // into ymm4 (Y) and once shuffled with kShuffleUYVYUV into ymm0 (UV), then
  // advances eax by 32. Both shuffle tables are defined outside this macro.
  1847. #define READUYVY_AVX2 \
  1848. __asm { \
  1849. __asm vmovdqu ymm4, [eax] /* UYVY */ \
  1850. __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
  1851. __asm vmovdqu ymm0, [eax] /* UV */ \
  1852. __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
  1853. __asm lea eax, [eax + 32]}
  1854. // Convert 16 pixels: 16 UV and 16 Y.
  // YuvConstants is a register holding the address of the constants block
  // (callers in this file pass ebx). KUVTOR/KUVTOG/KUVTOB, KUVBIASR/KUVBIASG/
  // KUVBIASB and KYTORGB are added to that address as offsets; they are
  // defined outside this view.
  // In:  ymm0 = UV pairs, ymm4 = Y words (as produced by the READ*_AVX2 macros).
  // Out: ymm0 = B, ymm1 = G, ymm2 = R as unsigned-saturated bytes; because each
  //      register is packed with itself, every 128-bit lane holds its 8 values
  //      twice. Clobbers ymm3 and ymm4.
  // Per channel: (bias - maddubs(UV, coeff) + mulhi(Y, KYTORGB)) >> 6, using
  // signed-saturating adds and an arithmetic shift.
  1855. #define YUVTORGB_AVX2(YuvConstants) \
  1856. __asm { \
  1857. __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
  1858. __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
  1859. __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
  1860. __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
  1861. __asm vpsubw ymm2, ymm3, ymm2 \
  1862. __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
  1863. __asm vpsubw ymm1, ymm3, ymm1 \
  1864. __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
  1865. __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \
  1866. __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
  1867. __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
  1868. __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
  1869. __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
  1870. __asm vpsraw ymm0, ymm0, 6 \
  1871. __asm vpsraw ymm1, ymm1, 6 \
  1872. __asm vpsraw ymm2, ymm2, 6 \
  1873. __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
  1874. __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
  1875. __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
  1876. }
  1877. // Store 16 ARGB values.
  // In: ymm0 = B, ymm1 = G, ymm2 = R (from YUVTORGB_AVX2), ymm5 = A,
  // edx = destination pointer.
  // Interleaves the channels and writes 64 bytes at [edx] in B,G,R,A byte
  // order per pixel, then advances edx by 64.
  // Clobbers ymm0, ymm1 and ymm2; ymm5 is only read, so a constant alpha can be
  // set up once outside the loop.
  1878. #define STOREARGB_AVX2 \
  1879. __asm { \
  1880. __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
  1881. __asm vpermq ymm0, ymm0, 0xd8 \
  1882. __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
  1883. __asm vpermq ymm2, ymm2, 0xd8 \
  1884. __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
  1885. __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
  1886. __asm vmovdqu 0[edx], ymm1 \
  1887. __asm vmovdqu 32[edx], ymm0 \
  1888. __asm lea edx, [edx + 64]}
  1889. // Store 16 RGBA values.
  // In: ymm0 = B, ymm1 = G, ymm2 = R (from YUVTORGB_AVX2), ymm5 = A,
  // edx = destination pointer.
  // Interleaves the channels and writes 64 bytes at [edx] in A,B,G,R byte
  // order per pixel, then advances edx by 64.
  // Clobbers ymm0, ymm1 and ymm2; ymm5 is only read.
  1890. #define STORERGBA_AVX2 \
  1891. __asm { \
  1892. __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
  1893. __asm vpermq ymm1, ymm1, 0xd8 \
  1894. __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
  1895. __asm vpermq ymm2, ymm2, 0xd8 \
  1896. __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
  1897. __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
  1898. __asm vmovdqu [edx], ymm0 \
  1899. __asm vmovdqu [edx + 32], ymm1 \
  1900. __asm lea edx, [edx + 64]}
  1901. #ifdef HAS_I422TOARGBROW_AVX2
  1902. // 16 pixels
  1903. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  // Converts one row of planar Y/U/V to ARGB with opaque alpha (0xff).
  // Naked function: arguments are read directly from the stack; esi, edi and
  // ebx are saved and restored. edi is turned into (v_buf - u_buf) so that the
  // read macro can address V as [esi + edi].
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 16 pixels (callers presumably pass a
  // positive multiple of 16 - confirm against callers).
  1904. __declspec(naked) void I422ToARGBRow_AVX2(
  1905. const uint8_t* y_buf,
  1906. const uint8_t* u_buf,
  1907. const uint8_t* v_buf,
  1908. uint8_t* dst_argb,
  1909. const struct YuvConstants* yuvconstants,
  1910. int width) {
  1911. __asm {
  1912. push esi
  1913. push edi
  1914. push ebx
  1915. mov eax, [esp + 12 + 4] // Y
  1916. mov esi, [esp + 12 + 8] // U
  1917. mov edi, [esp + 12 + 12] // V
  1918. mov edx, [esp + 12 + 16] // argb
  1919. mov ebx, [esp + 12 + 20] // yuvconstants
  1920. mov ecx, [esp + 12 + 24] // width
  1921. sub edi, esi
  1922. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  1923. convertloop:
  1924. READYUV422_AVX2
  1925. YUVTORGB_AVX2(ebx)
  1926. STOREARGB_AVX2
  1927. sub ecx, 16
  1928. jg convertloop
  1929. pop ebx
  1930. pop edi
  1931. pop esi
  1932. vzeroupper
  1933. ret
  1934. }
  1935. }
  1936. #endif // HAS_I422TOARGBROW_AVX2
  1937. #ifdef HAS_I422ALPHATOARGBROW_AVX2
  1938. // 16 pixels
  1939. // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
  // Converts one row of planar Y/U/V plus a separate alpha plane to ARGB.
  // Naked function: arguments are read directly from the stack; esi, edi, ebx
  // and ebp are saved and restored (ebp is used as the alpha pointer, not as a
  // frame pointer). edi is turned into (v_buf - u_buf).
  // Alpha is reloaded into ymm5 by READYUVA422_AVX2 on every iteration.
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 16 pixels.
  1940. __declspec(naked) void I422AlphaToARGBRow_AVX2(
  1941. const uint8_t* y_buf,
  1942. const uint8_t* u_buf,
  1943. const uint8_t* v_buf,
  1944. const uint8_t* a_buf,
  1945. uint8_t* dst_argb,
  1946. const struct YuvConstants* yuvconstants,
  1947. int width) {
  1948. __asm {
  1949. push esi
  1950. push edi
  1951. push ebx
  1952. push ebp
  1953. mov eax, [esp + 16 + 4] // Y
  1954. mov esi, [esp + 16 + 8] // U
  1955. mov edi, [esp + 16 + 12] // V
  1956. mov ebp, [esp + 16 + 16] // A
  1957. mov edx, [esp + 16 + 20] // argb
  1958. mov ebx, [esp + 16 + 24] // yuvconstants
  1959. mov ecx, [esp + 16 + 28] // width
  1960. sub edi, esi
  1961. convertloop:
  1962. READYUVA422_AVX2
  1963. YUVTORGB_AVX2(ebx)
  1964. STOREARGB_AVX2
  1965. sub ecx, 16
  1966. jg convertloop
  1967. pop ebp
  1968. pop ebx
  1969. pop edi
  1970. pop esi
  1971. vzeroupper
  1972. ret
  1973. }
  1974. }
  1975. #endif // HAS_I422ALPHATOARGBROW_AVX2
  1976. #ifdef HAS_I444TOARGBROW_AVX2
  1977. // 16 pixels
  1978. // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
  // Converts one row of planar Y/U/V (one UV pair per pixel) to ARGB with
  // opaque alpha (0xff). READYUV444_AVX2 is defined outside this view.
  // Naked function: arguments are read directly from the stack; esi, edi and
  // ebx are saved and restored. edi is turned into (v_buf - u_buf).
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 16 pixels.
  1979. __declspec(naked) void I444ToARGBRow_AVX2(
  1980. const uint8_t* y_buf,
  1981. const uint8_t* u_buf,
  1982. const uint8_t* v_buf,
  1983. uint8_t* dst_argb,
  1984. const struct YuvConstants* yuvconstants,
  1985. int width) {
  1986. __asm {
  1987. push esi
  1988. push edi
  1989. push ebx
  1990. mov eax, [esp + 12 + 4] // Y
  1991. mov esi, [esp + 12 + 8] // U
  1992. mov edi, [esp + 12 + 12] // V
  1993. mov edx, [esp + 12 + 16] // argb
  1994. mov ebx, [esp + 12 + 20] // yuvconstants
  1995. mov ecx, [esp + 12 + 24] // width
  1996. sub edi, esi
  1997. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  1998. convertloop:
  1999. READYUV444_AVX2
  2000. YUVTORGB_AVX2(ebx)
  2001. STOREARGB_AVX2
  2002. sub ecx, 16
  2003. jg convertloop
  2004. pop ebx
  2005. pop edi
  2006. pop esi
  2007. vzeroupper
  2008. ret
  2009. }
  2010. }
  2011. #endif // HAS_I444TOARGBROW_AVX2
  2012. #ifdef HAS_NV12TOARGBROW_AVX2
  2013. // 16 pixels.
  2014. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  // Converts one row of NV12 (Y plane plus interleaved UV plane) to ARGB with
  // opaque alpha (0xff).
  // Naked function: arguments are read directly from the stack; esi and ebx
  // are saved and restored.
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 16 pixels.
  2015. __declspec(naked) void NV12ToARGBRow_AVX2(
  2016. const uint8_t* y_buf,
  2017. const uint8_t* uv_buf,
  2018. uint8_t* dst_argb,
  2019. const struct YuvConstants* yuvconstants,
  2020. int width) {
  2021. __asm {
  2022. push esi
  2023. push ebx
  2024. mov eax, [esp + 8 + 4] // Y
  2025. mov esi, [esp + 8 + 8] // UV
  2026. mov edx, [esp + 8 + 12] // argb
  2027. mov ebx, [esp + 8 + 16] // yuvconstants
  2028. mov ecx, [esp + 8 + 20] // width
  2029. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2030. convertloop:
  2031. READNV12_AVX2
  2032. YUVTORGB_AVX2(ebx)
  2033. STOREARGB_AVX2
  2034. sub ecx, 16
  2035. jg convertloop
  2036. pop ebx
  2037. pop esi
  2038. vzeroupper
  2039. ret
  2040. }
  2041. }
  2042. #endif // HAS_NV12TOARGBROW_AVX2
  2043. #ifdef HAS_NV21TOARGBROW_AVX2
  2044. // 16 pixels.
  2045. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  // Converts one row of NV21 (Y plane plus interleaved VU plane) to ARGB with
  // opaque alpha (0xff).
  // Naked function: arguments are read directly from the stack; esi and ebx
  // are saved and restored.
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 16 pixels.
  2046. __declspec(naked) void NV21ToARGBRow_AVX2(
  2047. const uint8_t* y_buf,
  2048. const uint8_t* vu_buf,
  2049. uint8_t* dst_argb,
  2050. const struct YuvConstants* yuvconstants,
  2051. int width) {
  2052. __asm {
  2053. push esi
  2054. push ebx
  2055. mov eax, [esp + 8 + 4] // Y
  2056. mov esi, [esp + 8 + 8] // VU
  2057. mov edx, [esp + 8 + 12] // argb
  2058. mov ebx, [esp + 8 + 16] // yuvconstants
  2059. mov ecx, [esp + 8 + 20] // width
  2060. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2061. convertloop:
  2062. READNV21_AVX2
  2063. YUVTORGB_AVX2(ebx)
  2064. STOREARGB_AVX2
  2065. sub ecx, 16
  2066. jg convertloop
  2067. pop ebx
  2068. pop esi
  2069. vzeroupper
  2070. ret
  2071. }
  2072. }
  2073. #endif // HAS_NV21TOARGBROW_AVX2
  2074. #ifdef HAS_YUY2TOARGBROW_AVX2
  2075. // 16 pixels.
  2076. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  // Converts one row of packed YUY2 to ARGB with opaque alpha (0xff).
  // Naked function: arguments are read directly from the stack; ebx is saved
  // and restored. Each iteration consumes 32 source bytes and writes 64.
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 16 pixels.
  2077. __declspec(naked) void YUY2ToARGBRow_AVX2(
  2078. const uint8_t* src_yuy2,
  2079. uint8_t* dst_argb,
  2080. const struct YuvConstants* yuvconstants,
  2081. int width) {
  2082. __asm {
  2083. push ebx
  2084. mov eax, [esp + 4 + 4] // yuy2
  2085. mov edx, [esp + 4 + 8] // argb
  2086. mov ebx, [esp + 4 + 12] // yuvconstants
  2087. mov ecx, [esp + 4 + 16] // width
  2088. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2089. convertloop:
  2090. READYUY2_AVX2
  2091. YUVTORGB_AVX2(ebx)
  2092. STOREARGB_AVX2
  2093. sub ecx, 16
  2094. jg convertloop
  2095. pop ebx
  2096. vzeroupper
  2097. ret
  2098. }
  2099. }
  2100. #endif // HAS_YUY2TOARGBROW_AVX2
  2101. #ifdef HAS_UYVYTOARGBROW_AVX2
  2102. // 16 pixels.
  2103. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  // Converts one row of packed UYVY to ARGB with opaque alpha (0xff).
  // Naked function: arguments are read directly from the stack; ebx is saved
  // and restored. Each iteration consumes 32 source bytes and writes 64.
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 16 pixels.
  2104. __declspec(naked) void UYVYToARGBRow_AVX2(
  2105. const uint8_t* src_uyvy,
  2106. uint8_t* dst_argb,
  2107. const struct YuvConstants* yuvconstants,
  2108. int width) {
  2109. __asm {
  2110. push ebx
  2111. mov eax, [esp + 4 + 4] // uyvy
  2112. mov edx, [esp + 4 + 8] // argb
  2113. mov ebx, [esp + 4 + 12] // yuvconstants
  2114. mov ecx, [esp + 4 + 16] // width
  2115. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2116. convertloop:
  2117. READUYVY_AVX2
  2118. YUVTORGB_AVX2(ebx)
  2119. STOREARGB_AVX2
  2120. sub ecx, 16
  2121. jg convertloop
  2122. pop ebx
  2123. vzeroupper
  2124. ret
  2125. }
  2126. }
  2127. #endif // HAS_UYVYTOARGBROW_AVX2
  2128. #ifdef HAS_I422TORGBAROW_AVX2
  2129. // 16 pixels
  2130. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
  // Converts one row of planar Y/U/V to RGBA with opaque alpha (0xff). The
  // destination parameter is named dst_argb, but the pixels are written by
  // STORERGBA_AVX2 (A,B,G,R byte order in memory).
  // Naked function: arguments are read directly from the stack; esi, edi and
  // ebx are saved and restored. edi is turned into (v_buf - u_buf).
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 16 pixels.
  2131. __declspec(naked) void I422ToRGBARow_AVX2(
  2132. const uint8_t* y_buf,
  2133. const uint8_t* u_buf,
  2134. const uint8_t* v_buf,
  2135. uint8_t* dst_argb,
  2136. const struct YuvConstants* yuvconstants,
  2137. int width) {
  2138. __asm {
  2139. push esi
  2140. push edi
  2141. push ebx
  2142. mov eax, [esp + 12 + 4] // Y
  2143. mov esi, [esp + 12 + 8] // U
  2144. mov edi, [esp + 12 + 12] // V
  2145. mov edx, [esp + 12 + 16] // rgba destination
  2146. mov ebx, [esp + 12 + 20] // yuvconstants
  2147. mov ecx, [esp + 12 + 24] // width
  2148. sub edi, esi
  2149. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2150. convertloop:
  2151. READYUV422_AVX2
  2152. YUVTORGB_AVX2(ebx)
  2153. STORERGBA_AVX2
  2154. sub ecx, 16
  2155. jg convertloop
  2156. pop ebx
  2157. pop edi
  2158. pop esi
  2159. vzeroupper
  2160. ret
  2161. }
  2162. }
  2163. #endif // HAS_I422TORGBAROW_AVX2
  2164. #if defined(HAS_I422TOARGBROW_SSSE3)
  2165. // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
  2166. // Allows a conversion with half size scaling.
  2167. // Read 8 UV from 444.
  // Register contract (as set up by I444ToARGBRow_SSSE3): eax = Y pointer,
  // esi = U pointer, edi = (V pointer - U pointer).
  // Loads 8 U, 8 V and 8 Y bytes, advancing esi and eax by 8.
  // Results: xmm0 = 8 interleaved UV pairs, xmm4 = each Y byte duplicated into
  // a 16-bit word. xmm1 is clobbered.
  2168. #define READYUV444 \
  2169. __asm { \
  2170. __asm movq xmm0, qword ptr [esi] /* U */ \
  2171. __asm movq xmm1, qword ptr [esi + edi] /* V */ \
  2172. __asm lea esi, [esi + 8] \
  2173. __asm punpcklbw xmm0, xmm1 /* UV */ \
  2174. __asm movq xmm4, qword ptr [eax] \
  2175. __asm punpcklbw xmm4, xmm4 \
  2176. __asm lea eax, [eax + 8]}
  2177. // Read 4 UV from 422, upsample to 8 UV.
  // Register contract (as set up by the I422To*Row_SSSE3 functions):
  // eax = Y pointer, esi = U pointer, edi = (V pointer - U pointer).
  // Loads 4 U, 4 V and 8 Y bytes, advancing esi by 4 and eax by 8.
  // Results: xmm0 = UV pairs with each pair duplicated, xmm4 = each Y byte
  // duplicated into a 16-bit word. xmm1 is clobbered.
  2178. #define READYUV422 \
  2179. __asm { \
  2180. __asm movd xmm0, [esi] /* U */ \
  2181. __asm movd xmm1, [esi + edi] /* V */ \
  2182. __asm lea esi, [esi + 4] \
  2183. __asm punpcklbw xmm0, xmm1 /* UV */ \
  2184. __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
  2185. __asm movq xmm4, qword ptr [eax] \
  2186. __asm punpcklbw xmm4, xmm4 \
  2187. __asm lea eax, [eax + 8]}
  2188. // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
  // Register contract (as set up by I422AlphaToARGBRow_SSSE3): eax = Y pointer,
  // esi = U pointer, edi = (V pointer - U pointer), ebp = A pointer.
  // Loads 4 U, 4 V, 8 Y and 8 A bytes, advancing esi by 4 and eax/ebp by 8.
  // Results: xmm0 = UV pairs with each pair duplicated, xmm4 = each Y byte
  // duplicated into a 16-bit word, xmm5 = 8 alpha bytes. xmm1 is clobbered.
  2189. #define READYUVA422 \
  2190. __asm { \
  2191. __asm movd xmm0, [esi] /* U */ \
  2192. __asm movd xmm1, [esi + edi] /* V */ \
  2193. __asm lea esi, [esi + 4] \
  2194. __asm punpcklbw xmm0, xmm1 /* UV */ \
  2195. __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
  2196. __asm movq xmm4, qword ptr [eax] /* Y */ \
  2197. __asm punpcklbw xmm4, xmm4 \
  2198. __asm lea eax, [eax + 8] \
  2199. __asm movq xmm5, qword ptr [ebp] /* A */ \
  2200. __asm lea ebp, [ebp + 8]}
  2201. // Read 4 UV from NV12, upsample to 8 UV.
  // Register contract (as set up by NV12ToARGBRow_SSSE3): eax = Y pointer,
  // esi = interleaved UV pointer.
  // Loads 8 bytes from [esi] and 8 Y bytes, advancing esi and eax by 8. Only
  // the low 4 UV pairs survive the unpack, each duplicated.
  // Results: xmm0 = duplicated UV pairs, xmm4 = each Y byte duplicated into a
  // 16-bit word.
  2202. #define READNV12 \
  2203. __asm { \
  2204. __asm movq xmm0, qword ptr [esi] /* UV */ \
  2205. __asm lea esi, [esi + 8] \
  2206. __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
  2207. __asm movq xmm4, qword ptr [eax] \
  2208. __asm punpcklbw xmm4, xmm4 \
  2209. __asm lea eax, [eax + 8]}
  2210. // Read 4 VU from NV21, upsample to 8 UV.
  // Register contract (as set up by NV21ToARGBRow_SSSE3): eax = Y pointer,
  // esi = interleaved VU pointer.
  // Loads 8 chroma bytes and 8 Y bytes, advancing esi and eax by 8.
  // The chroma bytes are rearranged by a byte shuffle with kShuffleNV21 (table
  // defined outside this view; presumably it swaps V/U and duplicates each
  // pair - confirm against the table).
  // Results: xmm0 = shuffled chroma, xmm4 = each Y byte duplicated into a word.
  2211. #define READNV21 \
  2212. __asm { \
  2213. __asm movq xmm0, qword ptr [esi] /* UV */ \
  2214. __asm lea esi, [esi + 8] \
  2215. __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
  2216. __asm movq xmm4, qword ptr [eax] \
  2217. __asm punpcklbw xmm4, xmm4 \
  2218. __asm lea eax, [eax + 8]}
  2219. // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
  // Register contract (as set up by YUY2ToARGBRow_SSSE3): eax = packed YUY2
  // source pointer. The same 16 source bytes are loaded twice and byte-shuffled
  // with kShuffleYUY2Y (into xmm4) and kShuffleYUY2UV (into xmm0); both tables
  // are defined outside this view. eax is then advanced by 16.
  2220. #define READYUY2 \
  2221. __asm { \
  2222. __asm movdqu xmm4, [eax] /* YUY2 */ \
  2223. __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
  2224. __asm movdqu xmm0, [eax] /* UV */ \
  2225. __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
  2226. __asm lea eax, [eax + 16]}
  2227. // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
  // Register contract (as set up by UYVYToARGBRow_SSSE3): eax = packed UYVY
  // source pointer. The same 16 source bytes are loaded twice and byte-shuffled
  // with kShuffleUYVYY (into xmm4) and kShuffleUYVYUV (into xmm0); both tables
  // are defined outside this view. eax is then advanced by 16.
  2228. #define READUYVY \
  2229. __asm { \
  2230. __asm movdqu xmm4, [eax] /* UYVY */ \
  2231. __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
  2232. __asm movdqu xmm0, [eax] /* UV */ \
  2233. __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
  2234. __asm lea eax, [eax + 16]}
  2235. // Convert 8 pixels: 8 UV and 8 Y.
  // YuvConstants is a register holding the address of the constants block
  // (callers in this file pass ebx). The constants are read with movdqa, so
  // the block must be 16-byte aligned. KUVTOB/KUVTOG/KUVTOR, KUVBIASB/KUVBIASG/
  // KUVBIASR and KYTORGB are added to that address as offsets; they are
  // defined outside this view.
  // In:  xmm0 = UV pairs, xmm4 = Y words (as produced by the READ* macros).
  // Out: xmm0 = B, xmm1 = G, xmm2 = R as unsigned-saturated bytes; each
  //      register is packed with itself, so its 8 values appear twice.
  //      Clobbers xmm3 and xmm4.
  // Per channel: (bias - maddubs(UV, coeff) + mulhi(Y, KYTORGB)) >> 6, using
  // signed-saturating adds and an arithmetic shift.
  2236. #define YUVTORGB(YuvConstants) \
  2237. __asm { \
  2238. __asm movdqa xmm1, xmm0 \
  2239. __asm movdqa xmm2, xmm0 \
  2240. __asm movdqa xmm3, xmm0 \
  2241. __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
  2242. __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
  2243. __asm psubw xmm0, xmm1 \
  2244. __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
  2245. __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
  2246. __asm psubw xmm1, xmm2 \
  2247. __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
  2248. __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
  2249. __asm psubw xmm2, xmm3 \
  2250. __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
  2251. __asm paddsw xmm0, xmm4 /* B += Y */ \
  2252. __asm paddsw xmm1, xmm4 /* G += Y */ \
  2253. __asm paddsw xmm2, xmm4 /* R += Y */ \
  2254. __asm psraw xmm0, 6 \
  2255. __asm psraw xmm1, 6 \
  2256. __asm psraw xmm2, 6 \
  2257. __asm packuswb xmm0, xmm0 /* B */ \
  2258. __asm packuswb xmm1, xmm1 /* G */ \
  2259. __asm packuswb xmm2, xmm2 /* R */ \
  2260. }
  2261. // Store 8 ARGB values.
  // In: xmm0 = B, xmm1 = G, xmm2 = R (from YUVTORGB), xmm5 = A,
  // edx = destination pointer.
  // Interleaves the channels and writes 32 bytes at [edx] in B,G,R,A byte
  // order per pixel, then advances edx by 32.
  // Clobbers xmm0, xmm1 and xmm2; xmm5 is only read, so a constant alpha can be
  // set up once outside the loop.
  2262. #define STOREARGB \
  2263. __asm { \
  2264. __asm punpcklbw xmm0, xmm1 /* BG */ \
  2265. __asm punpcklbw xmm2, xmm5 /* RA */ \
  2266. __asm movdqa xmm1, xmm0 \
  2267. __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
  2268. __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
  2269. __asm movdqu 0[edx], xmm0 \
  2270. __asm movdqu 16[edx], xmm1 \
  2271. __asm lea edx, [edx + 32]}
  2272. // Store 8 BGRA values.
  // In: xmm0 = B, xmm1 = G, xmm2 = R (from YUVTORGB), edx = destination pointer.
  // Alpha is regenerated as 0xff in xmm5 on every use, because xmm5 is then
  // overwritten by the interleave.
  // Writes 32 bytes at [edx] in A,R,G,B byte order per pixel, then advances
  // edx by 32. Clobbers xmm0, xmm1 and xmm5.
  2273. #define STOREBGRA \
  2274. __asm { \
  2275. __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
  2276. __asm punpcklbw xmm1, xmm0 /* GB */ \
  2277. __asm punpcklbw xmm5, xmm2 /* AR */ \
  2278. __asm movdqa xmm0, xmm5 \
  2279. __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
  2280. __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
  2281. __asm movdqu 0[edx], xmm5 \
  2282. __asm movdqu 16[edx], xmm0 \
  2283. __asm lea edx, [edx + 32]}
  2284. // Store 8 RGBA values.
  // In: xmm0 = B, xmm1 = G, xmm2 = R (from YUVTORGB), edx = destination pointer.
  // Alpha is regenerated as 0xff in xmm5 on every use, because xmm5 is then
  // overwritten by the interleave.
  // Writes 32 bytes at [edx] in A,B,G,R byte order per pixel, then advances
  // edx by 32. Clobbers xmm0, xmm1 and xmm5.
  2285. #define STORERGBA \
  2286. __asm { \
  2287. __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
  2288. __asm punpcklbw xmm1, xmm2 /* GR */ \
  2289. __asm punpcklbw xmm5, xmm0 /* AB */ \
  2290. __asm movdqa xmm0, xmm5 \
  2291. __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
  2292. __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
  2293. __asm movdqu 0[edx], xmm5 \
  2294. __asm movdqu 16[edx], xmm0 \
  2295. __asm lea edx, [edx + 32]}
  2296. // Store 8 RGB24 values.
  // In: xmm0 = B, xmm1 = G, xmm2 = R (from YUVTORGB), edx = destination
  // pointer, xmm5 and xmm6 = pshufb masks preloaded by the caller
  // (I422ToRGB24Row_SSSE3 loads kShuffleMaskARGBToRGB24_0 and
  // kShuffleMaskARGBToRGB24 respectively).
  // First weaves 4-byte B,G,R,R pixels, then shuffles them down to 3 bytes per
  // pixel. Writes 24 bytes (an 8-byte store at [edx] followed by a 16-byte
  // store at [edx + 8]) and advances edx by 24.
  // Clobbers xmm0, xmm1 and xmm2.
  2297. #define STORERGB24 \
  2298. __asm {/* Weave into RRGB */ \
  2299. __asm punpcklbw xmm0, xmm1 /* BG */ \
  2300. __asm punpcklbw xmm2, xmm2 /* RR */ \
  2301. __asm movdqa xmm1, xmm0 \
  2302. __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
  2303. __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
  2304. __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
  2305. __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
  2306. __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
  2307. __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
  2308. __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
  2309. __asm lea edx, [edx + 24]}
  2310. // Store 8 RGB565 values.
  // In: xmm0 = B, xmm1 = G, xmm2 = R (from YUVTORGB), edx = destination
  // pointer, and three per-dword masks preloaded by the caller
  // (I422ToRGB565Row_SSSE3): xmm5 = 0x0000001f, xmm6 = 0x000007e0,
  // xmm7 = 0xfffff800.
  // First weaves 4-byte B,G,R,R pixels, then for each group of 4 pixels shifts
  // and masks B, G and R into one 565 value per dword, and finally packs the
  // two groups into 8 words with signed saturation (psrad sign-extends R so
  // that packssdw does not clamp the value).
  // Writes 16 bytes at [edx] and advances edx by 16.
  // Clobbers xmm0, xmm1, xmm2 and xmm3.
  2311. #define STORERGB565 \
  2312. __asm {/* Weave into RRGB */ \
  2313. __asm punpcklbw xmm0, xmm1 /* BG */ \
  2314. __asm punpcklbw xmm2, xmm2 /* RR */ \
  2315. __asm movdqa xmm1, xmm0 \
  2316. __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
  2317. __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
  2318. __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
  2319. __asm movdqa xmm2, xmm0 /* G */ \
  2320. __asm pslld xmm0, 8 /* R */ \
  2321. __asm psrld xmm3, 3 /* B */ \
  2322. __asm psrld xmm2, 5 /* G */ \
  2323. __asm psrad xmm0, 16 /* R */ \
  2324. __asm pand xmm3, xmm5 /* B */ \
  2325. __asm pand xmm2, xmm6 /* G */ \
  2326. __asm pand xmm0, xmm7 /* R */ \
  2327. __asm por xmm3, xmm2 /* BG */ \
  2328. __asm por xmm0, xmm3 /* BGR */ \
  2329. __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
  2330. __asm movdqa xmm2, xmm1 /* G */ \
  2331. __asm pslld xmm1, 8 /* R */ \
  2332. __asm psrld xmm3, 3 /* B */ \
  2333. __asm psrld xmm2, 5 /* G */ \
  2334. __asm psrad xmm1, 16 /* R */ \
  2335. __asm pand xmm3, xmm5 /* B */ \
  2336. __asm pand xmm2, xmm6 /* G */ \
  2337. __asm pand xmm1, xmm7 /* R */ \
  2338. __asm por xmm3, xmm2 /* BG */ \
  2339. __asm por xmm1, xmm3 /* BGR */ \
  2340. __asm packssdw xmm0, xmm1 \
  2341. __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
  2342. __asm lea edx, [edx + 16]}
  2343. // 8 pixels.
  2344. // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
  // Converts one row of planar Y/U/V (one UV pair per pixel) to ARGB with
  // opaque alpha (0xff).
  // Naked function: arguments are read directly from the stack; esi, edi and
  // ebx are saved and restored. edi is turned into (v_buf - u_buf).
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 8 pixels.
  2345. __declspec(naked) void I444ToARGBRow_SSSE3(
  2346. const uint8_t* y_buf,
  2347. const uint8_t* u_buf,
  2348. const uint8_t* v_buf,
  2349. uint8_t* dst_argb,
  2350. const struct YuvConstants* yuvconstants,
  2351. int width) {
  2352. __asm {
  2353. push esi
  2354. push edi
  2355. push ebx
  2356. mov eax, [esp + 12 + 4] // Y
  2357. mov esi, [esp + 12 + 8] // U
  2358. mov edi, [esp + 12 + 12] // V
  2359. mov edx, [esp + 12 + 16] // argb
  2360. mov ebx, [esp + 12 + 20] // yuvconstants
  2361. mov ecx, [esp + 12 + 24] // width
  2362. sub edi, esi
  2363. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2364. convertloop:
  2365. READYUV444
  2366. YUVTORGB(ebx)
  2367. STOREARGB
  2368. sub ecx, 8
  2369. jg convertloop
  2370. pop ebx
  2371. pop edi
  2372. pop esi
  2373. ret
  2374. }
  2375. }
  2376. // 8 pixels.
  2377. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
  // Converts one row of planar Y/U/V to packed 3-byte RGB24.
  // Naked function: arguments are read directly from the stack; esi, edi and
  // ebx are saved and restored. edi is turned into (v_buf - u_buf).
  // xmm5 and xmm6 hold the two shuffle masks consumed by STORERGB24; they are
  // loaded with movdqa, so the tables must be 16-byte aligned.
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 8 pixels.
  2378. __declspec(naked) void I422ToRGB24Row_SSSE3(
  2379. const uint8_t* y_buf,
  2380. const uint8_t* u_buf,
  2381. const uint8_t* v_buf,
  2382. uint8_t* dst_rgb24,
  2383. const struct YuvConstants* yuvconstants,
  2384. int width) {
  2385. __asm {
  2386. push esi
  2387. push edi
  2388. push ebx
  2389. mov eax, [esp + 12 + 4] // Y
  2390. mov esi, [esp + 12 + 8] // U
  2391. mov edi, [esp + 12 + 12] // V
  2392. mov edx, [esp + 12 + 16] // rgb24
  2393. mov ebx, [esp + 12 + 20] // yuvconstants
  2394. mov ecx, [esp + 12 + 24] // width
  2395. sub edi, esi
  2396. movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
  2397. movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
  2398. convertloop:
  2399. READYUV422
  2400. YUVTORGB(ebx)
  2401. STORERGB24
  2402. sub ecx, 8
  2403. jg convertloop
  2404. pop ebx
  2405. pop edi
  2406. pop esi
  2407. ret
  2408. }
  2409. }
  2410. // 8 pixels
  2411. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
  // Converts one row of planar Y/U/V to 16-bit RGB565.
  // Naked function: arguments are read directly from the stack; esi, edi and
  // ebx are saved and restored. edi is turned into (v_buf - u_buf).
  // xmm5, xmm6 and xmm7 are built once as the per-dword B/G/R field masks
  // consumed by STORERGB565. xmm6 and xmm7 are modified and not restored here.
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 8 pixels.
  2412. __declspec(naked) void I422ToRGB565Row_SSSE3(
  2413. const uint8_t* y_buf,
  2414. const uint8_t* u_buf,
  2415. const uint8_t* v_buf,
  2416. uint8_t* rgb565_buf,
  2417. const struct YuvConstants* yuvconstants,
  2418. int width) {
  2419. __asm {
  2420. push esi
  2421. push edi
  2422. push ebx
  2423. mov eax, [esp + 12 + 4] // Y
  2424. mov esi, [esp + 12 + 8] // U
  2425. mov edi, [esp + 12 + 12] // V
  2426. mov edx, [esp + 12 + 16] // rgb565
  2427. mov ebx, [esp + 12 + 20] // yuvconstants
  2428. mov ecx, [esp + 12 + 24] // width
  2429. sub edi, esi
  2430. pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
  2431. psrld xmm5, 27
  2432. pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
  2433. psrld xmm6, 26
  2434. pslld xmm6, 5
  2435. pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
  2436. pslld xmm7, 11
  2437. convertloop:
  2438. READYUV422
  2439. YUVTORGB(ebx)
  2440. STORERGB565
  2441. sub ecx, 8
  2442. jg convertloop
  2443. pop ebx
  2444. pop edi
  2445. pop esi
  2446. ret
  2447. }
  2448. }
  2449. // 8 pixels.
  2450. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  // Converts one row of planar Y/U/V to ARGB with opaque alpha (0xff).
  // Naked function: arguments are read directly from the stack; esi, edi and
  // ebx are saved and restored. edi is turned into (v_buf - u_buf).
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 8 pixels.
  2451. __declspec(naked) void I422ToARGBRow_SSSE3(
  2452. const uint8_t* y_buf,
  2453. const uint8_t* u_buf,
  2454. const uint8_t* v_buf,
  2455. uint8_t* dst_argb,
  2456. const struct YuvConstants* yuvconstants,
  2457. int width) {
  2458. __asm {
  2459. push esi
  2460. push edi
  2461. push ebx
  2462. mov eax, [esp + 12 + 4] // Y
  2463. mov esi, [esp + 12 + 8] // U
  2464. mov edi, [esp + 12 + 12] // V
  2465. mov edx, [esp + 12 + 16] // argb
  2466. mov ebx, [esp + 12 + 20] // yuvconstants
  2467. mov ecx, [esp + 12 + 24] // width
  2468. sub edi, esi
  2469. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2470. convertloop:
  2471. READYUV422
  2472. YUVTORGB(ebx)
  2473. STOREARGB
  2474. sub ecx, 8
  2475. jg convertloop
  2476. pop ebx
  2477. pop edi
  2478. pop esi
  2479. ret
  2480. }
  2481. }
  2482. // 8 pixels.
  2483. // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
  // Converts one row of planar Y/U/V plus a separate alpha plane to ARGB.
  // Naked function: arguments are read directly from the stack; esi, edi, ebx
  // and ebp are saved and restored (ebp is used as the alpha pointer, not as a
  // frame pointer). edi is turned into (v_buf - u_buf).
  // Alpha is reloaded into xmm5 by READYUVA422 on every iteration.
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 8 pixels.
  2484. __declspec(naked) void I422AlphaToARGBRow_SSSE3(
  2485. const uint8_t* y_buf,
  2486. const uint8_t* u_buf,
  2487. const uint8_t* v_buf,
  2488. const uint8_t* a_buf,
  2489. uint8_t* dst_argb,
  2490. const struct YuvConstants* yuvconstants,
  2491. int width) {
  2492. __asm {
  2493. push esi
  2494. push edi
  2495. push ebx
  2496. push ebp
  2497. mov eax, [esp + 16 + 4] // Y
  2498. mov esi, [esp + 16 + 8] // U
  2499. mov edi, [esp + 16 + 12] // V
  2500. mov ebp, [esp + 16 + 16] // A
  2501. mov edx, [esp + 16 + 20] // argb
  2502. mov ebx, [esp + 16 + 24] // yuvconstants
  2503. mov ecx, [esp + 16 + 28] // width
  2504. sub edi, esi
  2505. convertloop:
  2506. READYUVA422
  2507. YUVTORGB(ebx)
  2508. STOREARGB
  2509. sub ecx, 8
  2510. jg convertloop
  2511. pop ebp
  2512. pop ebx
  2513. pop edi
  2514. pop esi
  2515. ret
  2516. }
  2517. }
  2518. // 8 pixels.
  2519. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  // Converts one row of NV12 (Y plane plus interleaved UV plane) to ARGB with
  // opaque alpha (0xff).
  // Naked function: arguments are read directly from the stack; esi and ebx
  // are saved and restored.
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 8 pixels.
  2520. __declspec(naked) void NV12ToARGBRow_SSSE3(
  2521. const uint8_t* y_buf,
  2522. const uint8_t* uv_buf,
  2523. uint8_t* dst_argb,
  2524. const struct YuvConstants* yuvconstants,
  2525. int width) {
  2526. __asm {
  2527. push esi
  2528. push ebx
  2529. mov eax, [esp + 8 + 4] // Y
  2530. mov esi, [esp + 8 + 8] // UV
  2531. mov edx, [esp + 8 + 12] // argb
  2532. mov ebx, [esp + 8 + 16] // yuvconstants
  2533. mov ecx, [esp + 8 + 20] // width
  2534. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2535. convertloop:
  2536. READNV12
  2537. YUVTORGB(ebx)
  2538. STOREARGB
  2539. sub ecx, 8
  2540. jg convertloop
  2541. pop ebx
  2542. pop esi
  2543. ret
  2544. }
  2545. }
  2546. // 8 pixels.
  2547. // 4 VU values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  // Converts one row of NV21 (Y plane plus interleaved VU plane) to ARGB with
  // opaque alpha (0xff).
  // Naked function: arguments are read directly from the stack; esi and ebx
  // are saved and restored.
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 8 pixels.
  2548. __declspec(naked) void NV21ToARGBRow_SSSE3(
  2549. const uint8_t* y_buf,
  2550. const uint8_t* vu_buf,
  2551. uint8_t* dst_argb,
  2552. const struct YuvConstants* yuvconstants,
  2553. int width) {
  2554. __asm {
  2555. push esi
  2556. push ebx
  2557. mov eax, [esp + 8 + 4] // Y
  2558. mov esi, [esp + 8 + 8] // VU
  2559. mov edx, [esp + 8 + 12] // argb
  2560. mov ebx, [esp + 8 + 16] // yuvconstants
  2561. mov ecx, [esp + 8 + 20] // width
  2562. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2563. convertloop:
  2564. READNV21
  2565. YUVTORGB(ebx)
  2566. STOREARGB
  2567. sub ecx, 8
  2568. jg convertloop
  2569. pop ebx
  2570. pop esi
  2571. ret
  2572. }
  2573. }
  2574. // 8 pixels.
  2575. // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
  // Converts one row of packed YUY2 to ARGB with opaque alpha (0xff).
  // Naked function: arguments are read directly from the stack; ebx is saved
  // and restored. Each iteration consumes 16 source bytes and writes 32.
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 8 pixels.
  2576. __declspec(naked) void YUY2ToARGBRow_SSSE3(
  2577. const uint8_t* src_yuy2,
  2578. uint8_t* dst_argb,
  2579. const struct YuvConstants* yuvconstants,
  2580. int width) {
  2581. __asm {
  2582. push ebx
  2583. mov eax, [esp + 4 + 4] // yuy2
  2584. mov edx, [esp + 4 + 8] // argb
  2585. mov ebx, [esp + 4 + 12] // yuvconstants
  2586. mov ecx, [esp + 4 + 16] // width
  2587. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2588. convertloop:
  2589. READYUY2
  2590. YUVTORGB(ebx)
  2591. STOREARGB
  2592. sub ecx, 8
  2593. jg convertloop
  2594. pop ebx
  2595. ret
  2596. }
  2597. }
  2598. // 8 pixels.
  2599. // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
  // Converts one row of packed UYVY to ARGB with opaque alpha (0xff).
  // Naked function: arguments are read directly from the stack; ebx is saved
  // and restored. Each iteration consumes 16 source bytes and writes 32.
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 8 pixels.
  2600. __declspec(naked) void UYVYToARGBRow_SSSE3(
  2601. const uint8_t* src_uyvy,
  2602. uint8_t* dst_argb,
  2603. const struct YuvConstants* yuvconstants,
  2604. int width) {
  2605. __asm {
  2606. push ebx
  2607. mov eax, [esp + 4 + 4] // uyvy
  2608. mov edx, [esp + 4 + 8] // argb
  2609. mov ebx, [esp + 4 + 12] // yuvconstants
  2610. mov ecx, [esp + 4 + 16] // width
  2611. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2612. convertloop:
  2613. READUYVY
  2614. YUVTORGB(ebx)
  2615. STOREARGB
  2616. sub ecx, 8
  2617. jg convertloop
  2618. pop ebx
  2619. ret
  2620. }
  2621. }
  // 8 pixels.
  // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGBA (32 bytes).
  // Converts one row of planar Y/U/V to RGBA with opaque alpha; STORERGBA
  // generates the 0xff alpha itself, so no alpha register is set up here.
  // Naked function: arguments are read directly from the stack; esi, edi and
  // ebx are saved and restored. edi is turned into (v_buf - u_buf).
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 8 pixels.
  2622. __declspec(naked) void I422ToRGBARow_SSSE3(
  2623. const uint8_t* y_buf,
  2624. const uint8_t* u_buf,
  2625. const uint8_t* v_buf,
  2626. uint8_t* dst_rgba,
  2627. const struct YuvConstants* yuvconstants,
  2628. int width) {
  2629. __asm {
  2630. push esi
  2631. push edi
  2632. push ebx
  2633. mov eax, [esp + 12 + 4] // Y
  2634. mov esi, [esp + 12 + 8] // U
  2635. mov edi, [esp + 12 + 12] // V
  2636. mov edx, [esp + 12 + 16] // rgba
  2637. mov ebx, [esp + 12 + 20] // yuvconstants
  2638. mov ecx, [esp + 12 + 24] // width
  2639. sub edi, esi
  2640. convertloop:
  2641. READYUV422
  2642. YUVTORGB(ebx)
  2643. STORERGBA
  2644. sub ecx, 8
  2645. jg convertloop
  2646. pop ebx
  2647. pop edi
  2648. pop esi
  2649. ret
  2650. }
  2651. }
  2652. #endif // HAS_I422TOARGBROW_SSSE3
  2653. #ifdef HAS_I400TOARGBROW_SSE2
  2654. // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
  // Expands a row of luma to grey ARGB: the scaled Y value is written to the
  // B, G and R bytes and alpha is forced to 0xff.
  // Naked function with no pushes, so arguments sit at [esp + 4/8/12].
  // Constants held for the whole loop: xmm2 = 0x4a35 in every word,
  // xmm3 = 0x0488 in every word, xmm4 = 0xff000000 in every dword.
  // Each Y byte is first duplicated into a word (Y * 0x0101), so the multiplier
  // 18997 equals round(1.164 * 64 * 256 * 256 / 257).
  // NOTE(review): 1160 is about 1.164 * 64 * 16 - 32, so it presumably folds in
  // a +32 rounding term for the >> 6 - confirm.
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 8 pixels.
  2655. __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
  2656. uint8_t* rgb_buf,
  2657. int width) {
  2658. __asm {
  2659. mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
  2660. movd xmm2, eax
  2661. pshufd xmm2, xmm2,0
  2662. mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
  2663. movd xmm3, eax
  2664. pshufd xmm3, xmm3, 0
  2665. pcmpeqb xmm4, xmm4 // generate mask 0xff000000
  2666. pslld xmm4, 24
  2667. mov eax, [esp + 4] // Y
  2668. mov edx, [esp + 8] // rgb
  2669. mov ecx, [esp + 12] // width
  2670. convertloop:
  2671. // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
  2672. movq xmm0, qword ptr [eax]
  2673. lea eax, [eax + 8]
  2674. punpcklbw xmm0, xmm0 // Y.Y
  2675. pmulhuw xmm0, xmm2
  2676. psubusw xmm0, xmm3
  2677. psrlw xmm0, 6
  2678. packuswb xmm0, xmm0 // G
  2679. // Step 2: Weave into ARGB
  2680. punpcklbw xmm0, xmm0 // GG
  2681. movdqa xmm1, xmm0
  2682. punpcklwd xmm0, xmm0 // BGRA first 4 pixels
  2683. punpckhwd xmm1, xmm1 // BGRA next 4 pixels
  2684. por xmm0, xmm4
  2685. por xmm1, xmm4
  2686. movdqu [edx], xmm0
  2687. movdqu [edx + 16], xmm1
  2688. lea edx, [edx + 32]
  2689. sub ecx, 8
  2690. jg convertloop
  2691. ret
  2692. }
  2693. }
  2694. #endif // HAS_I400TOARGBROW_SSE2
  2695. #ifdef HAS_I400TOARGBROW_AVX2
  2696. // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
  2697. // note: vpunpcklbw mutates and vpackuswb unmutates.
  // ("Mutates" refers to element order: the AVX2 unpack/pack instructions work
  // within each 128-bit lane, so vpermq 0xd8 is used to restore linear order.)
  // Expands a row of luma to grey ARGB: the scaled Y value is written to the
  // B, G and R bytes and alpha is forced to 0xff.
  // Naked function with no pushes, so arguments sit at [esp + 4/8/12].
  // Constants held for the whole loop: ymm2 = 0x4a35 in every word,
  // ymm3 = 0x0488 in every word, ymm4 = 0xff000000 in every dword.
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so it always converts whole groups of 16 pixels.
  2698. __declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
  2699. uint8_t* rgb_buf,
  2700. int width) {
  2701. __asm {
  2702. mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
  2703. vmovd xmm2, eax
  2704. vbroadcastss ymm2, xmm2
  2705. mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
  2706. vmovd xmm3, eax
  2707. vbroadcastss ymm3, xmm3
  2708. vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
  2709. vpslld ymm4, ymm4, 24
  2710. mov eax, [esp + 4] // Y
  2711. mov edx, [esp + 8] // rgb
  2712. mov ecx, [esp + 12] // width
  2713. convertloop:
  2714. // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
  2715. vmovdqu xmm0, [eax]
  2716. lea eax, [eax + 16]
  2717. vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
  2718. vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
  2719. vpmulhuw ymm0, ymm0, ymm2
  2720. vpsubusw ymm0, ymm0, ymm3
  2721. vpsrlw ymm0, ymm0, 6
  2722. vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
  2723. // TODO(fbarchard): Weave alpha with unpack.
  2724. // Step 2: Weave into ARGB
  2725. vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
  2726. vpermq ymm1, ymm1, 0xd8
  2727. vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
  2728. vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
  2729. vpor ymm0, ymm0, ymm4
  2730. vpor ymm1, ymm1, ymm4
  2731. vmovdqu [edx], ymm0
  2732. vmovdqu [edx + 32], ymm1
  2733. lea edx, [edx + 64]
  2734. sub ecx, 16
  2735. jg convertloop
  2736. vzeroupper
  2737. ret
  2738. }
  2739. }
  2740. #endif // HAS_I400TOARGBROW_AVX2
  2741. #ifdef HAS_MIRRORROW_SSSE3
  2742. // Shuffle table for reversing the bytes.
  2743. static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
  2744. 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
  2745. // TODO(fbarchard): Replace lea with -16 offset.
  // Copies a row of bytes in reverse order, 16 bytes per iteration.
  // ecx (remaining width) doubles as the source index: each iteration reads
  // src[ecx - 16 .. ecx - 1], reverses it with pshufb and appends it to dst.
  // The loop runs at least once and repeats while the remaining width is > 0;
  // a width that is not a positive multiple of 16 would read before src and
  // write past dst + width, so callers presumably guarantee a multiple of 16 -
  // confirm against callers.
  2746. __declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
  2747. uint8_t* dst,
  2748. int width) {
  2749. __asm {
  2750. mov eax, [esp + 4] // src
  2751. mov edx, [esp + 8] // dst
  2752. mov ecx, [esp + 12] // width
  2753. movdqa xmm5, xmmword ptr kShuffleMirror
  2754. convertloop:
  2755. movdqu xmm0, [eax - 16 + ecx]
  2756. pshufb xmm0, xmm5
  2757. movdqu [edx], xmm0
  2758. lea edx, [edx + 16]
  2759. sub ecx, 16
  2760. jg convertloop
  2761. ret
  2762. }
  2763. }
  2764. #endif // HAS_MIRRORROW_SSSE3
  2765. #ifdef HAS_MIRRORROW_AVX2
  // Copies a row of bytes in reverse order, 32 bytes per iteration.
  // kShuffleMirror (a 16-byte table) is broadcast to both 128-bit lanes.
  // vpshufb reverses the bytes inside each lane and vpermq 0x4e then swaps the
  // two lanes, giving a full 32-byte reversal.
  // ecx (remaining width) doubles as the source index: each iteration reads
  // src[ecx - 32 .. ecx - 1]. The loop runs at least once and repeats while the
  // remaining width is > 0, so callers presumably pass a positive multiple of
  // 32 - confirm against callers.
  2766. __declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
  2767. uint8_t* dst,
  2768. int width) {
  2769. __asm {
  2770. mov eax, [esp + 4] // src
  2771. mov edx, [esp + 8] // dst
  2772. mov ecx, [esp + 12] // width
  2773. vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
  2774. convertloop:
  2775. vmovdqu ymm0, [eax - 32 + ecx]
  2776. vpshufb ymm0, ymm0, ymm5
  2777. vpermq ymm0, ymm0, 0x4e // swap high and low halfs
  2778. vmovdqu [edx], ymm0
  2779. lea edx, [edx + 32]
  2780. sub ecx, 32
  2781. jg convertloop
  2782. vzeroupper
  2783. ret
  2784. }
  2785. }
  2786. #endif // HAS_MIRRORROW_AVX2
  2787. #ifdef HAS_MIRRORUVROW_SSSE3
  2788. // Shuffle table for reversing the bytes of UV channels.
  // The low 8 result bytes are the even source bytes in reverse order and the
  // high 8 are the odd source bytes in reverse order, so one pshufb both
  // de-interleaves and mirrors 8 byte pairs.
  2789. static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
  2790. 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
  // Mirrors a row of interleaved byte pairs and splits it into two planes:
  // even source bytes go to dst_u and odd source bytes to dst_v.
  // width counts pairs: the source is read backwards from
  // src + width * 2 - 16, 16 bytes (8 pairs) per iteration, and 8 bytes are
  // written to each destination. edi is turned into (dst_v - dst_u) so that one
  // advancing pointer (edx) addresses both outputs; edi is saved and restored.
  // The loop runs at least once and repeats while the remaining width is > 0,
  // so callers presumably pass a positive multiple of 8 - confirm.
  2791. __declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
  2792. uint8_t* dst_u,
  2793. uint8_t* dst_v,
  2794. int width) {
  2795. __asm {
  2796. push edi
  2797. mov eax, [esp + 4 + 4] // src
  2798. mov edx, [esp + 4 + 8] // dst_u
  2799. mov edi, [esp + 4 + 12] // dst_v
  2800. mov ecx, [esp + 4 + 16] // width
  2801. movdqa xmm1, xmmword ptr kShuffleMirrorUV
  2802. lea eax, [eax + ecx * 2 - 16]
  2803. sub edi, edx
  2804. convertloop:
  2805. movdqu xmm0, [eax]
  2806. lea eax, [eax - 16]
  2807. pshufb xmm0, xmm1
  2808. movlpd qword ptr [edx], xmm0
  2809. movhpd qword ptr [edx + edi], xmm0
  2810. lea edx, [edx + 8]
  2811. sub ecx, 8
  2812. jg convertloop
  2813. pop edi
  2814. ret
  2815. }
  2816. }
  2817. #endif // HAS_MIRRORUVROW_SSSE3
  2818. #ifdef HAS_ARGBMIRRORROW_SSE2
  // Mirrors a row of 4-byte pixels, 4 pixels (16 bytes) per iteration.
  // The source pointer starts at the last 4 pixels and moves backwards; pshufd
  // with 0x1b reverses the order of the four dwords while leaving the bytes
  // inside each pixel untouched. width is counted in pixels.
  2819. __declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
  2820. uint8_t* dst,
  2821. int width) {
  2822. __asm {
  2823. mov eax, [esp + 4] // src
  2824. mov edx, [esp + 8] // dst
  2825. mov ecx, [esp + 12] // width
  2826. lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
  2827. convertloop:
  2828. movdqu xmm0, [eax]
  2829. lea eax, [eax - 16]
  2830. pshufd xmm0, xmm0, 0x1b // reverse dword (pixel) order
  2831. movdqu [edx], xmm0
  2832. lea edx, [edx + 16]
  2833. sub ecx, 4
  2834. jg convertloop
  2835. ret
  2836. }
  2837. }
  2838. #endif // HAS_ARGBMIRRORROW_SSE2
  2839. #ifdef HAS_ARGBMIRRORROW_AVX2
  2840. // Permute table for reversing the order of 8 dwords (used with vpermd).
  2841. static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
  // Mirrors a row of 4-byte pixels, 8 pixels (32 bytes) per iteration.
  // vpermd reads the 32 bytes that end at src + width * 4 directly from memory
  // and reverses their dword order using kARGBShuffleMirror_AVX2; the result is
  // appended to dst. width is counted in pixels.
  2842. __declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
  2843. uint8_t* dst,
  2844. int width) {
  2845. __asm {
  2846. mov eax, [esp + 4] // src
  2847. mov edx, [esp + 8] // dst
  2848. mov ecx, [esp + 12] // width
  2849. vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2
  2850. convertloop:
  2851. vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
  2852. vmovdqu [edx], ymm0
  2853. lea edx, [edx + 32]
  2854. sub ecx, 8
  2855. jg convertloop
  2856. vzeroupper
  2857. ret
  2858. }
  2859. }
  2860. #endif // HAS_ARGBMIRRORROW_AVX2
  2861. #ifdef HAS_SPLITUVROW_SSE2
  // De-interleaves a row of byte pairs into two planes. Each iteration reads
  // 32 source bytes and writes 16 bytes to dst_u (the even source bytes) and 16
  // bytes to dst_v (the odd source bytes); width drops by 16 per iteration.
  // edi is preserved on the stack and holds dst_v - dst_u inside the loop.
  2862. __declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
  2863. uint8_t* dst_u,
  2864. uint8_t* dst_v,
  2865. int width) {
  2866. __asm {
  2867. push edi
  2868. mov eax, [esp + 4 + 4] // src_uv
  2869. mov edx, [esp + 4 + 8] // dst_u
  2870. mov edi, [esp + 4 + 12] // dst_v
  2871. mov ecx, [esp + 4 + 16] // width
  2872. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  2873. psrlw xmm5, 8
  2874. sub edi, edx // edi = dst_v - dst_u
  2875. convertloop:
  2876. movdqu xmm0, [eax]
  2877. movdqu xmm1, [eax + 16]
  2878. lea eax, [eax + 32]
  2879. movdqa xmm2, xmm0
  2880. movdqa xmm3, xmm1
  2881. pand xmm0, xmm5 // even bytes
  2882. pand xmm1, xmm5
  2883. packuswb xmm0, xmm1
  2884. psrlw xmm2, 8 // odd bytes
  2885. psrlw xmm3, 8
  2886. packuswb xmm2, xmm3
  2887. movdqu [edx], xmm0
  2888. movdqu [edx + edi], xmm2
  2889. lea edx, [edx + 16]
  2890. sub ecx, 16
  2891. jg convertloop
  2892. pop edi
  2893. ret
  2894. }
  2895. }
  2896. #endif // HAS_SPLITUVROW_SSE2
  2897. #ifdef HAS_SPLITUVROW_AVX2
  // AVX2 de-interleave of a row of byte pairs. Each iteration reads 64 source
  // bytes and writes 32 even bytes to dst_u and 32 odd bytes to dst_v; width
  // drops by 32 per iteration.
  // vpackuswb packs within each 128-bit lane, so vpermq 0xd8 is applied to put
  // the quadwords back into linear order before storing.
  2898. __declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
  2899. uint8_t* dst_u,
  2900. uint8_t* dst_v,
  2901. int width) {
  2902. __asm {
  2903. push edi
  2904. mov eax, [esp + 4 + 4] // src_uv
  2905. mov edx, [esp + 4 + 8] // dst_u
  2906. mov edi, [esp + 4 + 12] // dst_v
  2907. mov ecx, [esp + 4 + 16] // width
  2908. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  2909. vpsrlw ymm5, ymm5, 8
  2910. sub edi, edx // edi = dst_v - dst_u
  2911. convertloop:
  2912. vmovdqu ymm0, [eax]
  2913. vmovdqu ymm1, [eax + 32]
  2914. lea eax, [eax + 64]
  2915. vpsrlw ymm2, ymm0, 8 // odd bytes
  2916. vpsrlw ymm3, ymm1, 8
  2917. vpand ymm0, ymm0, ymm5 // even bytes
  2918. vpand ymm1, ymm1, ymm5
  2919. vpackuswb ymm0, ymm0, ymm1
  2920. vpackuswb ymm2, ymm2, ymm3
  2921. vpermq ymm0, ymm0, 0xd8 // undo lane interleave from pack
  2922. vpermq ymm2, ymm2, 0xd8
  2923. vmovdqu [edx], ymm0
  2924. vmovdqu [edx + edi], ymm2
  2925. lea edx, [edx + 32]
  2926. sub ecx, 32
  2927. jg convertloop
  2928. pop edi
  2929. vzeroupper
  2930. ret
  2931. }
  2932. }
  2933. #endif // HAS_SPLITUVROW_AVX2
  2934. #ifdef HAS_MERGEUVROW_SSE2
  // Interleaves two byte planes into one row of byte pairs. Each iteration
  // reads 16 bytes from src_u and 16 from src_v and writes 32 bytes
  // (U0 V0 U1 V1 ...) to dst_uv; width drops by 16 per iteration.
  // edx holds src_v - src_u inside the loop so only eax needs to advance.
  2935. __declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
  2936. const uint8_t* src_v,
  2937. uint8_t* dst_uv,
  2938. int width) {
  2939. __asm {
  2940. push edi
  2941. mov eax, [esp + 4 + 4] // src_u
  2942. mov edx, [esp + 4 + 8] // src_v
  2943. mov edi, [esp + 4 + 12] // dst_uv
  2944. mov ecx, [esp + 4 + 16] // width
  2945. sub edx, eax // edx = src_v - src_u
  2946. convertloop:
  2947. movdqu xmm0, [eax] // read 16 U's
  2948. movdqu xmm1, [eax + edx] // and 16 V's
  2949. lea eax, [eax + 16]
  2950. movdqa xmm2, xmm0
  2951. punpcklbw xmm0, xmm1 // first 8 UV pairs
  2952. punpckhbw xmm2, xmm1 // next 8 UV pairs
  2953. movdqu [edi], xmm0
  2954. movdqu [edi + 16], xmm2
  2955. lea edi, [edi + 32]
  2956. sub ecx, 16
  2957. jg convertloop
  2958. pop edi
  2959. ret
  2960. }
  2961. }
  2962. #endif // HAS_MERGEUVROW_SSE2
  2963. #ifdef HAS_MERGEUVROW_AVX2
  // AVX2 interleave of two byte planes. Each iteration reads 32 bytes from
  // src_u and 32 from src_v and writes 64 interleaved bytes to dst_uv; width
  // drops by 32 per iteration.
  // The unpack instructions work per 128-bit lane, so the four 16-byte pieces
  // are stored individually with vextractf128 in the order that yields a
  // linear output.
  2964. __declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
  2965. const uint8_t* src_v,
  2966. uint8_t* dst_uv,
  2967. int width) {
  2968. __asm {
  2969. push edi
  2970. mov eax, [esp + 4 + 4] // src_u
  2971. mov edx, [esp + 4 + 8] // src_v
  2972. mov edi, [esp + 4 + 12] // dst_uv
  2973. mov ecx, [esp + 4 + 16] // width
  2974. sub edx, eax // edx = src_v - src_u
  2975. convertloop:
  2976. vmovdqu ymm0, [eax] // read 32 U's
  2977. vmovdqu ymm1, [eax + edx] // and 32 V's
  2978. lea eax, [eax + 32]
  2979. vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
  2980. vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
  2981. vextractf128 [edi], ymm2, 0 // bytes 0..15
  2982. vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
  2983. vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
  2984. vextractf128 [edi + 48], ymm0, 1 // bytes 48..63
  2985. lea edi, [edi + 64]
  2986. sub ecx, 32
  2987. jg convertloop
  2988. pop edi
  2989. vzeroupper
  2990. ret
  2991. }
  2992. }
  2993. #endif // HAS_MERGEUVROW_AVX2
  2994. #ifdef HAS_COPYROW_SSE2
  2995. // CopyRow copies 'width' bytes using a 16 byte load/store, 32 bytes at time.
  // If both src and dst are 16-byte aligned the aligned loop (movdqa) is used;
  // if either pointer has any of its low 4 bits set, the unaligned loop
  // (movdqu) is used instead. Both loops move a full 32 bytes per iteration
  // and repeat while the remaining width is > 0.
  2996. __declspec(naked) void CopyRow_SSE2(const uint8_t* src,
  2997. uint8_t* dst,
  2998. int width) {
  2999. __asm {
  3000. mov eax, [esp + 4] // src
  3001. mov edx, [esp + 8] // dst
  3002. mov ecx, [esp + 12] // width
  3003. test eax, 15 // src 16-byte aligned?
  3004. jne convertloopu
  3005. test edx, 15 // dst 16-byte aligned?
  3006. jne convertloopu
  3007. convertloopa:
  3008. movdqa xmm0, [eax]
  3009. movdqa xmm1, [eax + 16]
  3010. lea eax, [eax + 32]
  3011. movdqa [edx], xmm0
  3012. movdqa [edx + 16], xmm1
  3013. lea edx, [edx + 32]
  3014. sub ecx, 32
  3015. jg convertloopa
  3016. ret
  3017. convertloopu:
  3018. movdqu xmm0, [eax]
  3019. movdqu xmm1, [eax + 16]
  3020. lea eax, [eax + 32]
  3021. movdqu [edx], xmm0
  3022. movdqu [edx + 16], xmm1
  3023. lea edx, [edx + 32]
  3024. sub ecx, 32
  3025. jg convertloopu
  3026. ret
  3027. }
  3028. }
  3029. #endif // HAS_COPYROW_SSE2
  3030. #ifdef HAS_COPYROW_AVX
  3031. // CopyRow copies 'width' bytes using a 32 byte load/store, 64 bytes at time.
  // Unaligned loads/stores (vmovdqu) are used throughout. The loop always moves
  // a full 64 bytes and repeats while the remaining width is > 0.
  3032. __declspec(naked) void CopyRow_AVX(const uint8_t* src,
  3033. uint8_t* dst,
  3034. int width) {
  3035. __asm {
  3036. mov eax, [esp + 4] // src
  3037. mov edx, [esp + 8] // dst
  3038. mov ecx, [esp + 12] // width
  3039. convertloop:
  3040. vmovdqu ymm0, [eax]
  3041. vmovdqu ymm1, [eax + 32]
  3042. lea eax, [eax + 64]
  3043. vmovdqu [edx], ymm0
  3044. vmovdqu [edx + 32], ymm1
  3045. lea edx, [edx + 64]
  3046. sub ecx, 64
  3047. jg convertloop
  3048. vzeroupper
  3049. ret
  3050. }
  3051. }
  3052. #endif // HAS_COPYROW_AVX
  3053. // Multiple of 1.
  // Copies exactly 'width' bytes from src to dst with a single rep movsb.
  // esi and edi are required by rep movsb, so their incoming values are parked
  // in eax and edx (instead of the stack) and restored before returning.
  3054. __declspec(naked) void CopyRow_ERMS(const uint8_t* src,
  3055. uint8_t* dst,
  3056. int width) {
  3057. __asm {
  3058. mov eax, esi // save esi
  3059. mov edx, edi // save edi
  3060. mov esi, [esp + 4] // src
  3061. mov edi, [esp + 8] // dst
  3062. mov ecx, [esp + 12] // width
  3063. rep movsb
  3064. mov edi, edx // restore edi
  3065. mov esi, eax // restore esi
  3066. ret
  3067. }
  3068. }
  3069. #ifdef HAS_ARGBCOPYALPHAROW_SSE2
  3070. // width in pixels
  // Copies the top byte of every 32-bit pixel from src into dst while keeping
  // the low three bytes of dst. Each iteration handles 8 pixels (32 bytes):
  // src is masked with 0xff000000, dst with 0x00ffffff, and the two are OR-ed
  // and written back to dst.
  3071. __declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
  3072. uint8_t* dst,
  3073. int width) {
  3074. __asm {
  3075. mov eax, [esp + 4] // src
  3076. mov edx, [esp + 8] // dst
  3077. mov ecx, [esp + 12] // width
  3078. pcmpeqb xmm0, xmm0 // generate mask 0xff000000
  3079. pslld xmm0, 24
  3080. pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
  3081. psrld xmm1, 8
  3082. convertloop:
  3083. movdqu xmm2, [eax]
  3084. movdqu xmm3, [eax + 16]
  3085. lea eax, [eax + 32]
  3086. movdqu xmm4, [edx]
  3087. movdqu xmm5, [edx + 16]
  3088. pand xmm2, xmm0 // keep src top byte
  3089. pand xmm3, xmm0
  3090. pand xmm4, xmm1 // keep dst low three bytes
  3091. pand xmm5, xmm1
  3092. por xmm2, xmm4
  3093. por xmm3, xmm5
  3094. movdqu [edx], xmm2
  3095. movdqu [edx + 16], xmm3
  3096. lea edx, [edx + 32]
  3097. sub ecx, 8
  3098. jg convertloop
  3099. ret
  3100. }
  3101. }
  3102. #endif // HAS_ARGBCOPYALPHAROW_SSE2
  3103. #ifdef HAS_ARGBCOPYALPHAROW_AVX2
  3104. // width in pixels
  // AVX2 version: copies the top byte of every 32-bit pixel from src into dst,
  // 16 pixels (64 bytes) per iteration. vpblendvb takes a byte from dst where
  // the 0x00ffffff mask byte is set (low three bytes) and from src elsewhere
  // (top byte); the blended result is written back to dst.
  3105. __declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
  3106. uint8_t* dst,
  3107. int width) {
  3108. __asm {
  3109. mov eax, [esp + 4] // src
  3110. mov edx, [esp + 8] // dst
  3111. mov ecx, [esp + 12] // width
  3112. vpcmpeqb ymm0, ymm0, ymm0
  3113. vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
  3114. convertloop:
  3115. vmovdqu ymm1, [eax]
  3116. vmovdqu ymm2, [eax + 32]
  3117. lea eax, [eax + 64]
  3118. vpblendvb ymm1, ymm1, [edx], ymm0
  3119. vpblendvb ymm2, ymm2, [edx + 32], ymm0
  3120. vmovdqu [edx], ymm1
  3121. vmovdqu [edx + 32], ymm2
  3122. lea edx, [edx + 64]
  3123. sub ecx, 16
  3124. jg convertloop
  3125. vzeroupper
  3126. ret
  3127. }
  3128. }
  3129. #endif // HAS_ARGBCOPYALPHAROW_AVX2
  3130. #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
  3131. // width in pixels
  // Extracts the top byte of every 32-bit pixel into a byte plane. Each
  // iteration reads 8 pixels (32 bytes), shifts each dword right by 24 so only
  // the top byte remains, packs dwords -> words -> bytes, and writes 8 bytes
  // to dst_a.
  3132. __declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
  3133. uint8_t* dst_a,
  3134. int width) {
  3135. __asm {
  3136. mov eax, [esp + 4] // src_argb
  3137. mov edx, [esp + 8] // dst_a
  3138. mov ecx, [esp + 12] // width
  3139. extractloop:
  3140. movdqu xmm0, [eax]
  3141. movdqu xmm1, [eax + 16]
  3142. lea eax, [eax + 32]
  3143. psrld xmm0, 24 // isolate top byte of each pixel
  3144. psrld xmm1, 24
  3145. packssdw xmm0, xmm1
  3146. packuswb xmm0, xmm0
  3147. movq qword ptr [edx], xmm0
  3148. lea edx, [edx + 8]
  3149. sub ecx, 8
  3150. jg extractloop
  3151. ret
  3152. }
  3153. }
  3154. #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
  3155. #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
  3156. // width in pixels
  // AVX2 version: extracts the top byte of every 32-bit pixel, 32 pixels
  // (128 bytes) in and 32 bytes out per iteration. The pack instructions work
  // per 128-bit lane ("mutates"), so a final vpermd with kPermdARGBToY_AVX
  // restores linear order.
  // NOTE(review): kPermdARGBToY_AVX is defined outside this block and is
  // loaded with vmovdqa, which needs 32-byte alignment - confirm at its
  // definition.
  3157. __declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
  3158. uint8_t* dst_a,
  3159. int width) {
  3160. __asm {
  3161. mov eax, [esp + 4] // src_argb
  3162. mov edx, [esp + 8] // dst_a
  3163. mov ecx, [esp + 12] // width
  3164. vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX
  3165. extractloop:
  3166. vmovdqu ymm0, [eax]
  3167. vmovdqu ymm1, [eax + 32]
  3168. vpsrld ymm0, ymm0, 24
  3169. vpsrld ymm1, ymm1, 24
  3170. vmovdqu ymm2, [eax + 64]
  3171. vmovdqu ymm3, [eax + 96]
  3172. lea eax, [eax + 128]
  3173. vpackssdw ymm0, ymm0, ymm1 // mutates
  3174. vpsrld ymm2, ymm2, 24
  3175. vpsrld ymm3, ymm3, 24
  3176. vpackssdw ymm2, ymm2, ymm3 // mutates
  3177. vpackuswb ymm0, ymm0, ymm2 // mutates
  3178. vpermd ymm0, ymm4, ymm0 // unmutate
  3179. vmovdqu [edx], ymm0
  3180. lea edx, [edx + 32]
  3181. sub ecx, 32
  3182. jg extractloop
  3183. vzeroupper
  3184. ret
  3185. }
  3186. }
  3187. #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
  3188. #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
  3189. // width in pixels
  // Writes one source byte into the top byte of each 32-bit dst pixel while
  // keeping the low three bytes of dst. Each iteration reads 8 source bytes
  // and updates 8 pixels (32 bytes) of dst.
  3190. __declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
  3191. uint8_t* dst,
  3192. int width) {
  3193. __asm {
  3194. mov eax, [esp + 4] // src
  3195. mov edx, [esp + 8] // dst
  3196. mov ecx, [esp + 12] // width
  3197. pcmpeqb xmm0, xmm0 // generate mask 0xff000000
  3198. pslld xmm0, 24
  3199. pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
  3200. psrld xmm1, 8
  3201. convertloop:
  3202. movq xmm2, qword ptr [eax] // 8 Y's
  3203. lea eax, [eax + 8]
  3204. punpcklbw xmm2, xmm2 // each Y duplicated into a word
  // xmm3's previous contents only land in the low word of each dword, which
  // the 0xff000000 mask below discards, so it does not need initializing.
  3205. punpckhwd xmm3, xmm2 // Y4..Y7 in the high word of each dword
  3206. punpcklwd xmm2, xmm2 // Y0..Y3 replicated across each dword
  3207. movdqu xmm4, [edx]
  3208. movdqu xmm5, [edx + 16]
  3209. pand xmm2, xmm0
  3210. pand xmm3, xmm0
  3211. pand xmm4, xmm1
  3212. pand xmm5, xmm1
  3213. por xmm2, xmm4
  3214. por xmm3, xmm5
  3215. movdqu [edx], xmm2
  3216. movdqu [edx + 16], xmm3
  3217. lea edx, [edx + 32]
  3218. sub ecx, 8
  3219. jg convertloop
  3220. ret
  3221. }
  3222. }
  3223. #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
  3224. #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
  3225. // width in pixels
  // AVX2 version: writes one source byte into the top byte of each 32-bit dst
  // pixel, 16 pixels per iteration. vpmovzxbd widens 8 source bytes to 8
  // dwords, vpslld moves them into the top byte, and vpblendvb with the
  // 0x00ffffff mask takes the low three bytes of each pixel from dst.
  3226. __declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
  3227. uint8_t* dst,
  3228. int width) {
  3229. __asm {
  3230. mov eax, [esp + 4] // src
  3231. mov edx, [esp + 8] // dst
  3232. mov ecx, [esp + 12] // width
  3233. vpcmpeqb ymm0, ymm0, ymm0
  3234. vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
  3235. convertloop:
  3236. vpmovzxbd ymm1, qword ptr [eax]
  3237. vpmovzxbd ymm2, qword ptr [eax + 8]
  3238. lea eax, [eax + 16]
  3239. vpslld ymm1, ymm1, 24
  3240. vpslld ymm2, ymm2, 24
  3241. vpblendvb ymm1, ymm1, [edx], ymm0
  3242. vpblendvb ymm2, ymm2, [edx + 32], ymm0
  3243. vmovdqu [edx], ymm1
  3244. vmovdqu [edx + 32], ymm2
  3245. lea edx, [edx + 64]
  3246. sub ecx, 16
  3247. jg convertloop
  3248. vzeroupper
  3249. ret
  3250. }
  3251. }
  3252. #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
  3253. #ifdef HAS_SETROW_X86
  3254. // Write 'width' bytes using an 8 bit value repeated.
  3255. // width should be multiple of 4.
  // The byte is replicated into all four bytes of eax by multiplying with
  // 0x01010101, then width / 4 dwords are stored with rep stosd (any remainder
  // of width modulo 4 is not written). edi is parked in edx and restored
  // before returning.
  3256. __declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
  3257. __asm {
  3258. movzx eax, byte ptr [esp + 8] // v8
  3259. mov edx, 0x01010101 // Duplicate byte to all bytes.
  3260. mul edx // overwrites edx with upper part of result.
  3261. mov edx, edi // save edi
  3262. mov edi, [esp + 4] // dst
  3263. mov ecx, [esp + 12] // width
  3264. shr ecx, 2 // byte count -> dword count
  3265. rep stosd
  3266. mov edi, edx // restore edi
  3267. ret
  3268. }
  3269. }
  3270. // Write 'width' bytes using an 8 bit value repeated.
  // Uses a single rep stosb, so any width is handled byte-exactly. A full
  // dword is loaded for v8 but rep stosb only stores al. edi is parked in edx
  // and restored before returning.
  3271. __declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
  3272. __asm {
  3273. mov edx, edi // save edi
  3274. mov edi, [esp + 4] // dst
  3275. mov eax, [esp + 8] // v8
  3276. mov ecx, [esp + 12] // width
  3277. rep stosb
  3278. mov edi, edx // restore edi
  3279. ret
  3280. }
  3281. }
  3282. // Write 'width' 32 bit values.
  // Stores v32 to dst_argb 'width' times with rep stosd. edi is parked in edx
  // and restored before returning.
  3283. __declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
  3284. uint32_t v32,
  3285. int width) {
  3286. __asm {
  3287. mov edx, edi // save edi
  3288. mov edi, [esp + 4] // dst
  3289. mov eax, [esp + 8] // v32
  3290. mov ecx, [esp + 12] // width
  3291. rep stosd
  3292. mov edi, edx // restore edi
  3293. ret
  3294. }
  3295. }
  3296. #endif // HAS_SETROW_X86
  3297. #ifdef HAS_YUY2TOYROW_AVX2
  // Extracts the even bytes (Y in YUY2) of a packed row into dst_y.
  // Each iteration reads 64 source bytes and writes 32 bytes; width drops by 32
  // per iteration. vpermq 0xd8 undoes the per-lane ordering left by vpackuswb.
  3298. __declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
  3299. uint8_t* dst_y,
  3300. int width) {
  3301. __asm {
  3302. mov eax, [esp + 4] // src_yuy2
  3303. mov edx, [esp + 8] // dst_y
  3304. mov ecx, [esp + 12] // width
  3305. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3306. vpsrlw ymm5, ymm5, 8
  3307. convertloop:
  3308. vmovdqu ymm0, [eax]
  3309. vmovdqu ymm1, [eax + 32]
  3310. lea eax, [eax + 64]
  3311. vpand ymm0, ymm0, ymm5 // even bytes are Y
  3312. vpand ymm1, ymm1, ymm5
  3313. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3314. vpermq ymm0, ymm0, 0xd8
  3315. vmovdqu [edx], ymm0
  3316. lea edx, [edx + 32]
  3317. sub ecx, 32
  3318. jg convertloop
  3319. vzeroupper
  3320. ret
  3321. }
  3322. }
  // Produces U and V planes from two YUY2 rows. Each iteration reads 64 bytes
  // from src_yuy2 and 64 bytes from the row stride_yuy2 bytes after it,
  // averages them bytewise (vpavgb), keeps the odd bytes (U/V), then splits
  // those into 16 U bytes (dst_u) and 16 V bytes (dst_v). width drops by 32
  // per iteration.
  // esi and edi are preserved on the stack; edi holds dst_v - dst_u in the
  // loop.
  3323. __declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
  3324. int stride_yuy2,
  3325. uint8_t* dst_u,
  3326. uint8_t* dst_v,
  3327. int width) {
  3328. __asm {
  3329. push esi
  3330. push edi
  3331. mov eax, [esp + 8 + 4] // src_yuy2
  3332. mov esi, [esp + 8 + 8] // stride_yuy2
  3333. mov edx, [esp + 8 + 12] // dst_u
  3334. mov edi, [esp + 8 + 16] // dst_v
  3335. mov ecx, [esp + 8 + 20] // width
  3336. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3337. vpsrlw ymm5, ymm5, 8
  3338. sub edi, edx // edi = dst_v - dst_u
  3339. convertloop:
  3340. vmovdqu ymm0, [eax]
  3341. vmovdqu ymm1, [eax + 32]
  3342. vpavgb ymm0, ymm0, [eax + esi] // average with next row
  3343. vpavgb ymm1, ymm1, [eax + esi + 32]
  3344. lea eax, [eax + 64]
  3345. vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
  3346. vpsrlw ymm1, ymm1, 8
  3347. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3348. vpermq ymm0, ymm0, 0xd8
  3349. vpand ymm1, ymm0, ymm5 // U
  3350. vpsrlw ymm0, ymm0, 8 // V
  3351. vpackuswb ymm1, ymm1, ymm1 // mutates.
  3352. vpackuswb ymm0, ymm0, ymm0 // mutates.
  3353. vpermq ymm1, ymm1, 0xd8
  3354. vpermq ymm0, ymm0, 0xd8
  3355. vextractf128 [edx], ymm1, 0 // U
  3356. vextractf128 [edx + edi], ymm0, 0 // V
  3357. lea edx, [edx + 16]
  3358. sub ecx, 32
  3359. jg convertloop
  3360. pop edi
  3361. pop esi
  3362. vzeroupper
  3363. ret
  3364. }
  3365. }
  // Produces U and V planes from a single YUY2 row (no vertical averaging).
  // Each iteration reads 64 source bytes, keeps the odd bytes (U/V) and splits
  // them into 16 U bytes (dst_u) and 16 V bytes (dst_v). width drops by 32 per
  // iteration. edi is preserved on the stack and holds dst_v - dst_u in the
  // loop.
  3366. __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
  3367. uint8_t* dst_u,
  3368. uint8_t* dst_v,
  3369. int width) {
  3370. __asm {
  3371. push edi
  3372. mov eax, [esp + 4 + 4] // src_yuy2
  3373. mov edx, [esp + 4 + 8] // dst_u
  3374. mov edi, [esp + 4 + 12] // dst_v
  3375. mov ecx, [esp + 4 + 16] // width
  3376. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3377. vpsrlw ymm5, ymm5, 8
  3378. sub edi, edx // edi = dst_v - dst_u
  3379. convertloop:
  3380. vmovdqu ymm0, [eax]
  3381. vmovdqu ymm1, [eax + 32]
  3382. lea eax, [eax + 64]
  3383. vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
  3384. vpsrlw ymm1, ymm1, 8
  3385. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3386. vpermq ymm0, ymm0, 0xd8
  3387. vpand ymm1, ymm0, ymm5 // U
  3388. vpsrlw ymm0, ymm0, 8 // V
  3389. vpackuswb ymm1, ymm1, ymm1 // mutates.
  3390. vpackuswb ymm0, ymm0, ymm0 // mutates.
  3391. vpermq ymm1, ymm1, 0xd8
  3392. vpermq ymm0, ymm0, 0xd8
  3393. vextractf128 [edx], ymm1, 0 // U
  3394. vextractf128 [edx + edi], ymm0, 0 // V
  3395. lea edx, [edx + 16]
  3396. sub ecx, 32
  3397. jg convertloop
  3398. pop edi
  3399. vzeroupper
  3400. ret
  3401. }
  3402. }
  // Extracts the odd bytes (Y in UYVY) of a packed row into dst_y.
  // Each iteration reads 64 source bytes and writes 32 bytes; width drops by 32
  // per iteration. vpermq 0xd8 undoes the per-lane ordering left by vpackuswb.
  3403. __declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
  3404. uint8_t* dst_y,
  3405. int width) {
  3406. __asm {
  3407. mov eax, [esp + 4] // src_uyvy
  3408. mov edx, [esp + 8] // dst_y
  3409. mov ecx, [esp + 12] // width
  3410. convertloop:
  3411. vmovdqu ymm0, [eax]
  3412. vmovdqu ymm1, [eax + 32]
  3413. lea eax, [eax + 64]
  3414. vpsrlw ymm0, ymm0, 8 // odd bytes are Y
  3415. vpsrlw ymm1, ymm1, 8
  3416. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3417. vpermq ymm0, ymm0, 0xd8
  3418. vmovdqu [edx], ymm0
  3419. lea edx, [edx + 32]
  3420. sub ecx, 32
  3421. jg convertloop
  3422. vzeroupper
  3423. ret
  3424. }
  3425. }
  // Produces U and V planes from two UYVY rows. Each iteration reads 64 bytes
  // from src_uyvy and 64 bytes from the row stride_uyvy bytes after it,
  // averages them bytewise (vpavgb), keeps the even bytes (U/V), then splits
  // those into 16 U bytes (dst_u) and 16 V bytes (dst_v). width drops by 32
  // per iteration.
  // esi and edi are preserved on the stack; edi holds dst_v - dst_u in the
  // loop.
  3426. __declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
  3427. int stride_uyvy,
  3428. uint8_t* dst_u,
  3429. uint8_t* dst_v,
  3430. int width) {
  3431. __asm {
  3432. push esi
  3433. push edi
  3434. mov eax, [esp + 8 + 4] // src_uyvy
  3435. mov esi, [esp + 8 + 8] // stride_uyvy
  3436. mov edx, [esp + 8 + 12] // dst_u
  3437. mov edi, [esp + 8 + 16] // dst_v
  3438. mov ecx, [esp + 8 + 20] // width
  3439. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3440. vpsrlw ymm5, ymm5, 8
  3441. sub edi, edx // edi = dst_v - dst_u
  3442. convertloop:
  3443. vmovdqu ymm0, [eax]
  3444. vmovdqu ymm1, [eax + 32]
  3445. vpavgb ymm0, ymm0, [eax + esi] // average with next row
  3446. vpavgb ymm1, ymm1, [eax + esi + 32]
  3447. lea eax, [eax + 64]
  3448. vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
  3449. vpand ymm1, ymm1, ymm5
  3450. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3451. vpermq ymm0, ymm0, 0xd8
  3452. vpand ymm1, ymm0, ymm5 // U
  3453. vpsrlw ymm0, ymm0, 8 // V
  3454. vpackuswb ymm1, ymm1, ymm1 // mutates.
  3455. vpackuswb ymm0, ymm0, ymm0 // mutates.
  3456. vpermq ymm1, ymm1, 0xd8
  3457. vpermq ymm0, ymm0, 0xd8
  3458. vextractf128 [edx], ymm1, 0 // U
  3459. vextractf128 [edx + edi], ymm0, 0 // V
  3460. lea edx, [edx + 16]
  3461. sub ecx, 32
  3462. jg convertloop
  3463. pop edi
  3464. pop esi
  3465. vzeroupper
  3466. ret
  3467. }
  3468. }
  // Produces U and V planes from a single UYVY row (no vertical averaging).
  // Each iteration reads 64 source bytes, keeps the even bytes (U/V) and splits
  // them into 16 U bytes (dst_u) and 16 V bytes (dst_v). width drops by 32 per
  // iteration. edi is preserved on the stack and holds dst_v - dst_u in the
  // loop.
  3469. __declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
  3470. uint8_t* dst_u,
  3471. uint8_t* dst_v,
  3472. int width) {
  3473. __asm {
  3474. push edi
  3475. mov eax, [esp + 4 + 4] // src_uyvy
  3476. mov edx, [esp + 4 + 8] // dst_u
  3477. mov edi, [esp + 4 + 12] // dst_v
  3478. mov ecx, [esp + 4 + 16] // width
  3479. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3480. vpsrlw ymm5, ymm5, 8
  3481. sub edi, edx // edi = dst_v - dst_u
  3482. convertloop:
  3483. vmovdqu ymm0, [eax]
  3484. vmovdqu ymm1, [eax + 32]
  3485. lea eax, [eax + 64]
  3486. vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
  3487. vpand ymm1, ymm1, ymm5
  3488. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3489. vpermq ymm0, ymm0, 0xd8
  3490. vpand ymm1, ymm0, ymm5 // U
  3491. vpsrlw ymm0, ymm0, 8 // V
  3492. vpackuswb ymm1, ymm1, ymm1 // mutates.
  3493. vpackuswb ymm0, ymm0, ymm0 // mutates.
  3494. vpermq ymm1, ymm1, 0xd8
  3495. vpermq ymm0, ymm0, 0xd8
  3496. vextractf128 [edx], ymm1, 0 // U
  3497. vextractf128 [edx + edi], ymm0, 0 // V
  3498. lea edx, [edx + 16]
  3499. sub ecx, 32
  3500. jg convertloop
  3501. pop edi
  3502. vzeroupper
  3503. ret
  3504. }
  3505. }
  3506. #endif // HAS_YUY2TOYROW_AVX2
  3507. #ifdef HAS_YUY2TOYROW_SSE2
  // Extracts the even bytes (Y in YUY2) of a packed row into dst_y.
  // Each iteration reads 32 source bytes and writes 16 bytes; width drops by 16
  // per iteration.
  3508. __declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
  3509. uint8_t* dst_y,
  3510. int width) {
  3511. __asm {
  3512. mov eax, [esp + 4] // src_yuy2
  3513. mov edx, [esp + 8] // dst_y
  3514. mov ecx, [esp + 12] // width
  3515. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3516. psrlw xmm5, 8
  3517. convertloop:
  3518. movdqu xmm0, [eax]
  3519. movdqu xmm1, [eax + 16]
  3520. lea eax, [eax + 32]
  3521. pand xmm0, xmm5 // even bytes are Y
  3522. pand xmm1, xmm5
  3523. packuswb xmm0, xmm1
  3524. movdqu [edx], xmm0
  3525. lea edx, [edx + 16]
  3526. sub ecx, 16
  3527. jg convertloop
  3528. ret
  3529. }
  3530. }
  // Produces U and V planes from two YUY2 rows. Each iteration reads 32 bytes
  // from src_yuy2 and 32 bytes from the row stride_yuy2 bytes after it,
  // averages them bytewise (pavgb), keeps the odd bytes (U/V), then splits
  // those into 8 U bytes (dst_u) and 8 V bytes (dst_v). width drops by 16 per
  // iteration.
  // esi and edi are preserved on the stack; edi holds dst_v - dst_u in the
  // loop.
  3531. __declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
  3532. int stride_yuy2,
  3533. uint8_t* dst_u,
  3534. uint8_t* dst_v,
  3535. int width) {
  3536. __asm {
  3537. push esi
  3538. push edi
  3539. mov eax, [esp + 8 + 4] // src_yuy2
  3540. mov esi, [esp + 8 + 8] // stride_yuy2
  3541. mov edx, [esp + 8 + 12] // dst_u
  3542. mov edi, [esp + 8 + 16] // dst_v
  3543. mov ecx, [esp + 8 + 20] // width
  3544. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3545. psrlw xmm5, 8
  3546. sub edi, edx // edi = dst_v - dst_u
  3547. convertloop:
  3548. movdqu xmm0, [eax]
  3549. movdqu xmm1, [eax + 16]
  3550. movdqu xmm2, [eax + esi] // same columns, next row
  3551. movdqu xmm3, [eax + esi + 16]
  3552. lea eax, [eax + 32]
  3553. pavgb xmm0, xmm2
  3554. pavgb xmm1, xmm3
  3555. psrlw xmm0, 8 // YUYV -> UVUV
  3556. psrlw xmm1, 8
  3557. packuswb xmm0, xmm1
  3558. movdqa xmm1, xmm0
  3559. pand xmm0, xmm5 // U
  3560. packuswb xmm0, xmm0
  3561. psrlw xmm1, 8 // V
  3562. packuswb xmm1, xmm1
  3563. movq qword ptr [edx], xmm0
  3564. movq qword ptr [edx + edi], xmm1
  3565. lea edx, [edx + 8]
  3566. sub ecx, 16
  3567. jg convertloop
  3568. pop edi
  3569. pop esi
  3570. ret
  3571. }
  3572. }
  // Produces U and V planes from a single YUY2 row (no vertical averaging).
  // Each iteration reads 32 source bytes, keeps the odd bytes (U/V) and splits
  // them into 8 U bytes (dst_u) and 8 V bytes (dst_v). width drops by 16 per
  // iteration. edi is preserved on the stack and holds dst_v - dst_u in the
  // loop.
  3573. __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
  3574. uint8_t* dst_u,
  3575. uint8_t* dst_v,
  3576. int width) {
  3577. __asm {
  3578. push edi
  3579. mov eax, [esp + 4 + 4] // src_yuy2
  3580. mov edx, [esp + 4 + 8] // dst_u
  3581. mov edi, [esp + 4 + 12] // dst_v
  3582. mov ecx, [esp + 4 + 16] // width
  3583. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3584. psrlw xmm5, 8
  3585. sub edi, edx // edi = dst_v - dst_u
  3586. convertloop:
  3587. movdqu xmm0, [eax]
  3588. movdqu xmm1, [eax + 16]
  3589. lea eax, [eax + 32]
  3590. psrlw xmm0, 8 // YUYV -> UVUV
  3591. psrlw xmm1, 8
  3592. packuswb xmm0, xmm1
  3593. movdqa xmm1, xmm0
  3594. pand xmm0, xmm5 // U
  3595. packuswb xmm0, xmm0
  3596. psrlw xmm1, 8 // V
  3597. packuswb xmm1, xmm1
  3598. movq qword ptr [edx], xmm0
  3599. movq qword ptr [edx + edi], xmm1
  3600. lea edx, [edx + 8]
  3601. sub ecx, 16
  3602. jg convertloop
  3603. pop edi
  3604. ret
  3605. }
  3606. }
  // Extracts the odd bytes (Y in UYVY) of a packed row into dst_y.
  // Each iteration reads 32 source bytes and writes 16 bytes; width drops by 16
  // per iteration.
  3607. __declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
  3608. uint8_t* dst_y,
  3609. int width) {
  3610. __asm {
  3611. mov eax, [esp + 4] // src_uyvy
  3612. mov edx, [esp + 8] // dst_y
  3613. mov ecx, [esp + 12] // width
  3614. convertloop:
  3615. movdqu xmm0, [eax]
  3616. movdqu xmm1, [eax + 16]
  3617. lea eax, [eax + 32]
  3618. psrlw xmm0, 8 // odd bytes are Y
  3619. psrlw xmm1, 8
  3620. packuswb xmm0, xmm1
  3621. movdqu [edx], xmm0
  3622. lea edx, [edx + 16]
  3623. sub ecx, 16
  3624. jg convertloop
  3625. ret
  3626. }
  3627. }
  // Produces U and V planes from two UYVY rows. Each iteration reads 32 bytes
  // from src_uyvy and 32 bytes from the row stride_uyvy bytes after it,
  // averages them bytewise (pavgb), keeps the even bytes (U/V), then splits
  // those into 8 U bytes (dst_u) and 8 V bytes (dst_v). width drops by 16 per
  // iteration.
  // esi and edi are preserved on the stack; edi holds dst_v - dst_u in the
  // loop.
  3628. __declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
  3629. int stride_uyvy,
  3630. uint8_t* dst_u,
  3631. uint8_t* dst_v,
  3632. int width) {
  3633. __asm {
  3634. push esi
  3635. push edi
  3636. mov eax, [esp + 8 + 4] // src_uyvy
  3637. mov esi, [esp + 8 + 8] // stride_uyvy
  3638. mov edx, [esp + 8 + 12] // dst_u
  3639. mov edi, [esp + 8 + 16] // dst_v
  3640. mov ecx, [esp + 8 + 20] // width
  3641. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3642. psrlw xmm5, 8
  3643. sub edi, edx // edi = dst_v - dst_u
  3644. convertloop:
  3645. movdqu xmm0, [eax]
  3646. movdqu xmm1, [eax + 16]
  3647. movdqu xmm2, [eax + esi] // same columns, next row
  3648. movdqu xmm3, [eax + esi + 16]
  3649. lea eax, [eax + 32]
  3650. pavgb xmm0, xmm2
  3651. pavgb xmm1, xmm3
  3652. pand xmm0, xmm5 // UYVY -> UVUV
  3653. pand xmm1, xmm5
  3654. packuswb xmm0, xmm1
  3655. movdqa xmm1, xmm0
  3656. pand xmm0, xmm5 // U
  3657. packuswb xmm0, xmm0
  3658. psrlw xmm1, 8 // V
  3659. packuswb xmm1, xmm1
  3660. movq qword ptr [edx], xmm0
  3661. movq qword ptr [edx + edi], xmm1
  3662. lea edx, [edx + 8]
  3663. sub ecx, 16
  3664. jg convertloop
  3665. pop edi
  3666. pop esi
  3667. ret
  3668. }
  3669. }
  // Produces U and V planes from a single UYVY row (no vertical averaging).
  // Each iteration reads 32 source bytes, keeps the even bytes (U/V) and splits
  // them into 8 U bytes (dst_u) and 8 V bytes (dst_v). width drops by 16 per
  // iteration. edi is preserved on the stack and holds dst_v - dst_u in the
  // loop.
  3670. __declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
  3671. uint8_t* dst_u,
  3672. uint8_t* dst_v,
  3673. int width) {
  3674. __asm {
  3675. push edi
  3676. mov eax, [esp + 4 + 4] // src_uyvy
  3677. mov edx, [esp + 4 + 8] // dst_u
  3678. mov edi, [esp + 4 + 12] // dst_v
  3679. mov ecx, [esp + 4 + 16] // width
  3680. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3681. psrlw xmm5, 8
  3682. sub edi, edx // edi = dst_v - dst_u
  3683. convertloop:
  3684. movdqu xmm0, [eax]
  3685. movdqu xmm1, [eax + 16]
  3686. lea eax, [eax + 32]
  3687. pand xmm0, xmm5 // UYVY -> UVUV
  3688. pand xmm1, xmm5
  3689. packuswb xmm0, xmm1
  3690. movdqa xmm1, xmm0
  3691. pand xmm0, xmm5 // U
  3692. packuswb xmm0, xmm0
  3693. psrlw xmm1, 8 // V
  3694. packuswb xmm1, xmm1
  3695. movq qword ptr [edx], xmm0
  3696. movq qword ptr [edx + edi], xmm1
  3697. lea edx, [edx + 8]
  3698. sub ecx, 16
  3699. jg convertloop
  3700. pop edi
  3701. ret
  3702. }
  3703. }
  3704. #endif // HAS_YUY2TOYROW_SSE2
  3705. #ifdef HAS_BLENDPLANEROW_SSSE3
  3706. // Blend 8 pixels at a time.
  3707. // unsigned version of math
  3708. // =((A2*C2)+(B2*(255-C2))+255)/256
  3709. // signed version of math
  3710. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  // In the formulas above A2 is a src0 byte, B2 a src1 byte and C2 the alpha
  // byte. The signed form is used because pmaddubsw multiplies unsigned bytes
  // (a, 255-a) by signed bytes (src - 128); the 0x807f word constant adds back
  // 32768 plus 127 for rounding.
  // src0, src1 and dst are rebased relative to the alpha pointer (esi), so
  // advancing esi alone steps all four rows. esi and edi are preserved on the
  // stack.
  3711. __declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
  3712. const uint8_t* src1,
  3713. const uint8_t* alpha,
  3714. uint8_t* dst,
  3715. int width) {
  3716. __asm {
  3717. push esi
  3718. push edi
  3719. pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
  3720. psllw xmm5, 8
  3721. mov eax, 0x80808080 // 128 for biasing image to signed.
  3722. movd xmm6, eax
  3723. pshufd xmm6, xmm6, 0x00
  3724. mov eax, 0x807f807f // 32768 + 127 for unbias and round.
  3725. movd xmm7, eax
  3726. pshufd xmm7, xmm7, 0x00
  3727. mov eax, [esp + 8 + 4] // src0
  3728. mov edx, [esp + 8 + 8] // src1
  3729. mov esi, [esp + 8 + 12] // alpha
  3730. mov edi, [esp + 8 + 16] // dst
  3731. mov ecx, [esp + 8 + 20] // width
  3732. sub eax, esi // src0 - alpha
  3733. sub edx, esi // src1 - alpha
  3734. sub edi, esi // dst - alpha
  3735. // 8 pixel loop.
  3736. convertloop8:
  3737. movq xmm0, qword ptr [esi] // alpha
  3738. punpcklbw xmm0, xmm0
  3739. pxor xmm0, xmm5 // a, 255-a
  3740. movq xmm1, qword ptr [eax + esi] // src0
  3741. movq xmm2, qword ptr [edx + esi] // src1
  3742. punpcklbw xmm1, xmm2
  3743. psubb xmm1, xmm6 // bias src0/1 - 128
  3744. pmaddubsw xmm0, xmm1
  3745. paddw xmm0, xmm7 // unbias result - 32768 and round.
  3746. psrlw xmm0, 8
  3747. packuswb xmm0, xmm0
  3748. movq qword ptr [edi + esi], xmm0
  3749. lea esi, [esi + 8]
  3750. sub ecx, 8
  3751. jg convertloop8
  3752. pop edi
  3753. pop esi
  3754. ret
  3755. }
  3756. }
  3757. #endif // HAS_BLENDPLANEROW_SSSE3
  3758. #ifdef HAS_BLENDPLANEROW_AVX2
  3759. // Blend 32 pixels at a time.
  3760. // unsigned version of math
  3761. // =((A2*C2)+(B2*(255-C2))+255)/256
  3762. // signed version of math
  3763. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  // In the formulas above A2 is a src0 byte, B2 a src1 byte and C2 the alpha
  // byte. The signed form is used because vpmaddubsw multiplies unsigned bytes
  // (a, 255-a) by signed bytes (src - 128); the 0x807f word constant adds back
  // 32768 plus 127 for rounding.
  // The unpack and pack instructions both operate per 128-bit lane, so the
  // packed result is already in linear order and needs no permute.
  // src0, src1 and dst are rebased relative to the alpha pointer (esi), so
  // advancing esi alone steps all four rows. esi and edi are preserved on the
  // stack.
  3764. __declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
  3765. const uint8_t* src1,
  3766. const uint8_t* alpha,
  3767. uint8_t* dst,
  3768. int width) {
  3769. __asm {
  3770. push esi
  3771. push edi
  3772. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
  3773. vpsllw ymm5, ymm5, 8
  3774. mov eax, 0x80808080 // 128 for biasing image to signed.
  3775. vmovd xmm6, eax
  3776. vbroadcastss ymm6, xmm6
  3777. mov eax, 0x807f807f // 32768 + 127 for unbias and round.
  3778. vmovd xmm7, eax
  3779. vbroadcastss ymm7, xmm7
  3780. mov eax, [esp + 8 + 4] // src0
  3781. mov edx, [esp + 8 + 8] // src1
  3782. mov esi, [esp + 8 + 12] // alpha
  3783. mov edi, [esp + 8 + 16] // dst
  3784. mov ecx, [esp + 8 + 20] // width
  3785. sub eax, esi // src0 - alpha
  3786. sub edx, esi // src1 - alpha
  3787. sub edi, esi // dst - alpha
  3788. // 32 pixel loop.
  3789. convertloop32:
  3790. vmovdqu ymm0, [esi] // alpha
  3791. vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
  3792. vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
  3793. vpxor ymm3, ymm3, ymm5 // a, 255-a
  3794. vpxor ymm0, ymm0, ymm5 // a, 255-a
  3795. vmovdqu ymm1, [eax + esi] // src0
  3796. vmovdqu ymm2, [edx + esi] // src1
  3797. vpunpckhbw ymm4, ymm1, ymm2
  3798. vpunpcklbw ymm1, ymm1, ymm2
  3799. vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
  3800. vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
  3801. vpmaddubsw ymm3, ymm3, ymm4
  3802. vpmaddubsw ymm0, ymm0, ymm1
  3803. vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
  3804. vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
  3805. vpsrlw ymm3, ymm3, 8
  3806. vpsrlw ymm0, ymm0, 8
  3807. vpackuswb ymm0, ymm0, ymm3
  3808. vmovdqu [edi + esi], ymm0
  3809. lea esi, [esi + 32]
  3810. sub ecx, 32
  3811. jg convertloop32
  3812. pop edi
  3813. pop esi
  3814. vzeroupper
  3815. ret
  3816. }
  3817. }
  3818. #endif // HAS_BLENDPLANEROW_AVX2
  3819. #ifdef HAS_ARGBBLENDROW_SSSE3
  3820. // Shuffle table for isolating alpha.
  // For each 4-byte pixel, byte 3 is copied into the low byte of both 16-bit
  // words of that pixel; 0x80 indices zero the high byte of each word.
  3821. static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  3822. 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
  3823. // Blend 4 pixels at a time, then any remaining pixels 1 at a time.
  // Per pixel: dst = (src_argb0 with byte 3 forced to 255) + src_argb1 scaled
  // by (256 - alpha) >> 8, where alpha is byte 3 of the src_argb0 pixel and the
  // additions saturate (paddusb). The even and odd bytes of src_argb1 are
  // multiplied separately as 16-bit words and recombined.
  // width is counted in pixels. esi is preserved on the stack.
  3824. __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
  3825. const uint8_t* src_argb1,
  3826. uint8_t* dst_argb,
  3827. int width) {
  3828. __asm {
  3829. push esi
  3830. mov eax, [esp + 4 + 4] // src_argb0
  3831. mov esi, [esp + 4 + 8] // src_argb1
  3832. mov edx, [esp + 4 + 12] // dst_argb
  3833. mov ecx, [esp + 4 + 16] // width
  3834. pcmpeqb xmm7, xmm7 // generate constant 0x0001
  3835. psrlw xmm7, 15
  3836. pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
  3837. psrlw xmm6, 8
  3838. pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
  3839. psllw xmm5, 8
  3840. pcmpeqb xmm4, xmm4 // generate mask 0xff000000
  3841. pslld xmm4, 24
  3842. sub ecx, 4
  3843. jl convertloop4b // less than 4 pixels?
  3844. // 4 pixel loop.
  3845. convertloop4:
  3846. movdqu xmm3, [eax] // src argb
  3847. lea eax, [eax + 16]
  3848. movdqa xmm0, xmm3 // src argb
  3849. pxor xmm3, xmm4 // ~alpha
  3850. movdqu xmm2, [esi] // _r_b
  3851. pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
  3852. pand xmm2, xmm6 // _r_b
  3853. paddw xmm3, xmm7 // 256 - alpha
  3854. pmullw xmm2, xmm3 // _r_b * (256 - alpha)
  3855. movdqu xmm1, [esi] // _a_g
  3856. lea esi, [esi + 16]
  3857. psrlw xmm1, 8 // _a_g
  3858. por xmm0, xmm4 // set alpha to 255
  3859. pmullw xmm1, xmm3 // _a_g * (256 - alpha)
  3860. psrlw xmm2, 8 // _r_b convert to 8 bits again
  3861. paddusb xmm0, xmm2 // + src argb
  3862. pand xmm1, xmm5 // a_g_ convert to 8 bits again
  3863. paddusb xmm0, xmm1 // + src argb
  3864. movdqu [edx], xmm0
  3865. lea edx, [edx + 16]
  3866. sub ecx, 4
  3867. jge convertloop4
  3868. convertloop4b:
  3869. add ecx, 4 - 1 // ecx = remaining pixels - 1
  3870. jl convertloop1b
  3871. // 1 pixel loop.
  3872. convertloop1:
  3873. movd xmm3, [eax] // src argb
  3874. lea eax, [eax + 4]
  3875. movdqa xmm0, xmm3 // src argb
  3876. pxor xmm3, xmm4 // ~alpha
  3877. movd xmm2, [esi] // _r_b
  3878. pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
  3879. pand xmm2, xmm6 // _r_b
  3880. paddw xmm3, xmm7 // 256 - alpha
  3881. pmullw xmm2, xmm3 // _r_b * (256 - alpha)
  3882. movd xmm1, [esi] // _a_g
  3883. lea esi, [esi + 4]
  3884. psrlw xmm1, 8 // _a_g
  3885. por xmm0, xmm4 // set alpha to 255
  3886. pmullw xmm1, xmm3 // _a_g * (256 - alpha)
  3887. psrlw xmm2, 8 // _r_b convert to 8 bits again
  3888. paddusb xmm0, xmm2 // + src argb
  3889. pand xmm1, xmm5 // a_g_ convert to 8 bits again
  3890. paddusb xmm0, xmm1 // + src argb
  3891. movd [edx], xmm0
  3892. lea edx, [edx + 4]
  3893. sub ecx, 1
  3894. jge convertloop1
  3895. convertloop1b:
  3896. pop esi
  3897. ret
  3898. }
  3899. }
  3900. #endif // HAS_ARGBBLENDROW_SSSE3
  3901. #ifdef HAS_ARGBATTENUATEROW_SSSE3
  3902. // Shuffle table duplicating alpha.
        // kShuffleAlpha0 handles pixels 0 and 1, kShuffleAlpha1 pixels 2 and 3.
        // For each pixel the alpha byte is copied into both bytes of the B, G and
        // R words; the 128u entries zero the alpha word.
  3903. static const uvec8 kShuffleAlpha0 = {
  3904. 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
  3905. };
  3906. static const uvec8 kShuffleAlpha1 = {
  3907. 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  3908. 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
  3909. };
        // Attenuate (premultiply) 4 ARGB pixels per iteration.
        // B, G and R are computed as ((c * 0x0101) * (a * 0x0101)) >> 24; the
        // original alpha byte is copied through unchanged.
        // The loop body runs before width is tested and steps by 4, so at least
        // 16 bytes are always read and written.
  3910. __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
  3911. uint8_t* dst_argb,
  3912. int width) {
  3913. __asm {
  3914. mov eax, [esp + 4] // src_argb
  3915. mov edx, [esp + 8] // dst_argb
  3916. mov ecx, [esp + 12] // width
  3917. pcmpeqb xmm3, xmm3 // generate mask 0xff000000
  3918. pslld xmm3, 24
  3919. movdqa xmm4, xmmword ptr kShuffleAlpha0
  3920. movdqa xmm5, xmmword ptr kShuffleAlpha1
  3921. convertloop:
  3922. movdqu xmm0, [eax] // read 4 pixels
  3923. pshufb xmm0, xmm4 // isolate first 2 alphas
  3924. movdqu xmm1, [eax] // read 4 pixels
  3925. punpcklbw xmm1, xmm1 // first 2 pixel rgbs
  3926. pmulhuw xmm0, xmm1 // rgb * a
  3927. movdqu xmm1, [eax] // read 4 pixels
  3928. pshufb xmm1, xmm5 // isolate next 2 alphas
  3929. movdqu xmm2, [eax] // read 4 pixels
  3930. punpckhbw xmm2, xmm2 // next 2 pixel rgbs
  3931. pmulhuw xmm1, xmm2 // rgb * a
  3932. movdqu xmm2, [eax] // mask original alpha
  3933. lea eax, [eax + 16]
  3934. pand xmm2, xmm3
  3935. psrlw xmm0, 8 // back to 8 bit range
  3936. psrlw xmm1, 8
  3937. packuswb xmm0, xmm1
  3938. por xmm0, xmm2 // copy original alpha
  3939. movdqu [edx], xmm0
  3940. lea edx, [edx + 16]
  3941. sub ecx, 4
  3942. jg convertloop
  3943. ret
  3944. }
  3945. }
  3946. #endif // HAS_ARGBATTENUATEROW_SSSE3
  3947. #ifdef HAS_ARGBATTENUATEROW_AVX2
  3948. // Shuffle table duplicating alpha.
        // Applied to pixels whose bytes were already doubled by vpunpck[lh]bw, so
        // bytes 6..7 and 14..15 hold the two alpha words. Each alpha word is
        // copied into the B, G and R words; the 128u entries zero the alpha word.
  3949. static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
  3950. 128u, 128u, 14u, 15u, 14u, 15u,
  3951. 14u, 15u, 128u, 128u};
        // Attenuate (premultiply) 8 ARGB pixels per iteration.
        // B, G and R are computed as ((c * 0x0101) * (a * 0x0101)) >> 24; the
        // original alpha byte is copied through unchanged.
        // edx is turned into the (dst - src) offset so only eax is advanced.
        // The loop body runs before width is tested and steps by 8, so at least
        // 32 bytes are always read and written.
  3952. __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
  3953. uint8_t* dst_argb,
  3954. int width) {
  3955. __asm {
  3956. mov eax, [esp + 4] // src_argb
  3957. mov edx, [esp + 8] // dst_argb
  3958. mov ecx, [esp + 12] // width
  3959. sub edx, eax // edx = dst_argb - src_argb
  3960. vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
  3961. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
  3962. vpslld ymm5, ymm5, 24
  3963. convertloop:
  3964. vmovdqu ymm6, [eax] // read 8 pixels.
  3965. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
  3966. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
  3967. vpshufb ymm2, ymm0, ymm4 // low 4 alphas
  3968. vpshufb ymm3, ymm1, ymm4 // high 4 alphas
  3969. vpmulhuw ymm0, ymm0, ymm2 // rgb * a
  3970. vpmulhuw ymm1, ymm1, ymm3 // rgb * a
  3971. vpand ymm6, ymm6, ymm5 // isolate alpha
  3972. vpsrlw ymm0, ymm0, 8
  3973. vpsrlw ymm1, ymm1, 8
  3974. vpackuswb ymm0, ymm0, ymm1 // unmutated.
  3975. vpor ymm0, ymm0, ymm6 // copy original alpha
  3976. vmovdqu [eax + edx], ymm0
  3977. lea eax, [eax + 32]
  3978. sub ecx, 8
  3979. jg convertloop
  3980. vzeroupper
  3981. ret
  3982. }
  3983. }
  3984. #endif // HAS_ARGBATTENUATEROW_AVX2
  3985. #ifdef HAS_ARGBUNATTENUATEROW_SSE2
  3986. // Unattenuate 4 pixels at a time.
        // Each pixel's alpha byte indexes the dword table fixed_invtbl8 (defined
        // outside this block). The low word of the entry multiplies B, G and R and
        // the high word multiplies A (pshuflw 040h selects words 0,0,0,1).
        // Pixel bytes are doubled (c * 0x0101) before the high-half multiply.
        // ebx, esi and edi are saved and restored by hand. The loop body runs
        // before width is tested and steps by 4.
  3987. __declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
  3988. uint8_t* dst_argb,
  3989. int width) {
  3990. __asm {
  3991. push ebx
  3992. push esi
  3993. push edi
  3994. mov eax, [esp + 12 + 4] // src_argb
  3995. mov edx, [esp + 12 + 8] // dst_argb
  3996. mov ecx, [esp + 12 + 12] // width
  3997. lea ebx, fixed_invtbl8
  3998. convertloop:
  3999. movdqu xmm0, [eax] // read 4 pixels
  4000. movzx esi, byte ptr [eax + 3] // first alpha
  4001. movzx edi, byte ptr [eax + 7] // second alpha
  4002. punpcklbw xmm0, xmm0 // first 2
  4003. movd xmm2, dword ptr [ebx + esi * 4]
  4004. movd xmm3, dword ptr [ebx + edi * 4]
  4005. pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
  4006. pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
  4007. movlhps xmm2, xmm3
  4008. pmulhuw xmm0, xmm2 // rgb * inv_alpha
  4009. movdqu xmm1, [eax] // read 4 pixels
  4010. movzx esi, byte ptr [eax + 11] // third alpha
  4011. movzx edi, byte ptr [eax + 15] // fourth alpha
  4012. punpckhbw xmm1, xmm1 // next 2
  4013. movd xmm2, dword ptr [ebx + esi * 4]
  4014. movd xmm3, dword ptr [ebx + edi * 4]
  4015. pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
  4016. pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
  4017. movlhps xmm2, xmm3
  4018. pmulhuw xmm1, xmm2 // rgb * inv_alpha
  4019. lea eax, [eax + 16]
  4020. packuswb xmm0, xmm1
  4021. movdqu [edx], xmm0
  4022. lea edx, [edx + 16]
  4023. sub ecx, 4
  4024. jg convertloop
  4025. pop edi
  4026. pop esi
  4027. pop ebx
  4028. ret
  4029. }
  4030. }
  4031. #endif // HAS_ARGBUNATTENUATEROW_SSE2
  4032. #ifdef HAS_ARGBUNATTENUATEROW_AVX2
  4033. // Shuffle table duplicating alpha.
        // Within each 8-byte half, word 0 is copied into words 0..2 and word 3 is
        // kept in place (bytes 6..7 and 14..15 map to themselves).
  4034. static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  4035. 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
  4036. // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
  4037. // USE_GATHER is not on by default, due to being a slow instruction.
  4038. #ifdef USE_GATHER
  4039. __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
  4040. uint8_t* dst_argb,
  4041. int width) {
        // USE_GATHER variant: unattenuate 8 ARGB pixels per iteration.
        // vpgatherdd fetches one fixed_invtbl8 dword per pixel, indexed by that
        // pixel's alpha byte. The gather mask (ymm5) is regenerated every
        // iteration because the gather clears it.
        // edx is turned into the (dst - src) offset so only eax is advanced.
        // The loop body runs before width is tested and steps by 8.
  4042. __asm {
  4043. mov eax, [esp + 4] // src_argb
  4044. mov edx, [esp + 8] // dst_argb
  4045. mov ecx, [esp + 12] // width
  4046. sub edx, eax // edx = dst_argb - src_argb
  4047. vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
  4048. convertloop:
  4049. vmovdqu ymm6, [eax] // read 8 pixels.
  4050. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
  4051. vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
  4052. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
  4053. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
  4054. vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
  4055. vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
  4056. vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
  4057. vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
  4058. vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
  4059. vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
  4060. vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
  4061. vpackuswb ymm0, ymm0, ymm1 // unmutated.
  4062. vmovdqu [eax + edx], ymm0
  4063. lea eax, [eax + 32]
  4064. sub ecx, 8
  4065. jg convertloop
  4066. vzeroupper
  4067. ret
  4068. }
  4069. }
  4070. #else // USE_GATHER
  4071. __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
  4072. uint8_t* dst_argb,
  4073. int width) {
        // Non-gather variant: unattenuate 8 ARGB pixels per iteration.
        // The eight fixed_invtbl8 dwords are fetched with scalar loads indexed by
        // each pixel's alpha byte and assembled into ymm3; the rest matches the
        // USE_GATHER version.
        // edx is turned into the (dst - src) offset so only eax is advanced.
        // ebx, esi and edi are saved and restored by hand. The loop body runs
        // before width is tested and steps by 8.
  4074. __asm {
  4075. push ebx
  4076. push esi
  4077. push edi
  4078. mov eax, [esp + 12 + 4] // src_argb
  4079. mov edx, [esp + 12 + 8] // dst_argb
  4080. mov ecx, [esp + 12 + 12] // width
  4081. sub edx, eax // edx = dst_argb - src_argb
  4082. lea ebx, fixed_invtbl8
  4083. vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
  4084. convertloop:
  4085. // replace VPGATHER
  4086. movzx esi, byte ptr [eax + 3] // alpha0
  4087. movzx edi, byte ptr [eax + 7] // alpha1
  4088. vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0]
  4089. vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1]
  4090. movzx esi, byte ptr [eax + 11] // alpha2
  4091. movzx edi, byte ptr [eax + 15] // alpha3
  4092. vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
  4093. vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2]
  4094. vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3]
  4095. movzx esi, byte ptr [eax + 19] // alpha4
  4096. movzx edi, byte ptr [eax + 23] // alpha5
  4097. vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
  4098. vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4]
  4099. vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5]
  4100. movzx esi, byte ptr [eax + 27] // alpha6
  4101. movzx edi, byte ptr [eax + 31] // alpha7
  4102. vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
  4103. vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6]
  4104. vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7]
  4105. vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
  4106. vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
  4107. vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
  4108. vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
  4109. // end of VPGATHER
  4110. vmovdqu ymm6, [eax] // read 8 pixels.
  4111. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
  4112. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
  4113. vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
  4114. vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
  4115. vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
  4116. vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
  4117. vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
  4118. vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
  4119. vpackuswb ymm0, ymm0, ymm1 // unmutated.
  4120. vmovdqu [eax + edx], ymm0
  4121. lea eax, [eax + 32]
  4122. sub ecx, 8
  4123. jg convertloop
  4124. pop edi
  4125. pop esi
  4126. pop ebx
  4127. vzeroupper
  4128. ret
  4129. }
  4130. }
  4131. #endif // USE_GATHER
  4132. #endif // HAS_ARGBUNATTENUATEROW_AVX2
  4133. #ifdef HAS_ARGBGRAYROW_SSSE3
  4134. // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
        // The gray value is a weighted sum of each pixel's bytes using the
        // coefficients in kARGBToYJ, plus kAddYJ64 for rounding, shifted right by
        // 7 (both constants are defined outside this block). The gray value is
        // written to B, G and R; the original alpha is preserved.
        // The loop body runs before width is tested and steps by 8.
  4135. __declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
  4136. uint8_t* dst_argb,
  4137. int width) {
  4138. __asm {
  4139. mov eax, [esp + 4] /* src_argb */
  4140. mov edx, [esp + 8] /* dst_argb */
  4141. mov ecx, [esp + 12] /* width */
  4142. movdqa xmm4, xmmword ptr kARGBToYJ
  4143. movdqa xmm5, xmmword ptr kAddYJ64
  4144. convertloop:
  4145. movdqu xmm0, [eax] // G
  4146. movdqu xmm1, [eax + 16]
  4147. pmaddubsw xmm0, xmm4
  4148. pmaddubsw xmm1, xmm4
  4149. phaddw xmm0, xmm1 // one 16 bit sum per pixel
  4150. paddw xmm0, xmm5 // Add .5 for rounding.
  4151. psrlw xmm0, 7
  4152. packuswb xmm0, xmm0 // 8 G bytes
  4153. movdqu xmm2, [eax] // A
  4154. movdqu xmm3, [eax + 16]
  4155. lea eax, [eax + 32]
  4156. psrld xmm2, 24
  4157. psrld xmm3, 24
  4158. packuswb xmm2, xmm3
  4159. packuswb xmm2, xmm2 // 8 A bytes
  4160. movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
  4161. punpcklbw xmm0, xmm0 // 8 GG words
  4162. punpcklbw xmm3, xmm2 // 8 GA words
  4163. movdqa xmm1, xmm0
  4164. punpcklwd xmm0, xmm3 // GGGA first 4
  4165. punpckhwd xmm1, xmm3 // GGGA next 4
  4166. movdqu [edx], xmm0
  4167. movdqu [edx + 16], xmm1
  4168. lea edx, [edx + 32]
  4169. sub ecx, 8
  4170. jg convertloop
  4171. ret
  4172. }
  4173. }
  4174. #endif // HAS_ARGBGRAYROW_SSSE3
  4175. #ifdef HAS_ARGBSEPIAROW_SSSE3
  4176. // b = (r * 35 + g * 68 + b * 17) >> 7
  4177. // g = (r * 45 + g * 88 + b * 22) >> 7
  4178. // r = (r * 50 + g * 98 + b * 24) >> 7
  4179. // Constant for ARGB color to sepia tone.
        // Coefficients are stored in memory byte order B, G, R, A; the alpha
        // coefficient is 0.
  4180. static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
  4181. 17, 68, 35, 0, 17, 68, 35, 0};
  4182. static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
  4183. 22, 88, 45, 0, 22, 88, 45, 0};
  4184. static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
  4185. 24, 98, 50, 0, 24, 98, 50, 0};
  4186. // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
        // Operates in place on dst_argb. Each output channel is a weighted sum
        // shifted right by 7 (no rounding term is added) and saturated to 8 bits;
        // alpha is preserved.
        // The loop body runs before width is tested and steps by 8.
  4187. __declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
  4188. __asm {
  4189. mov eax, [esp + 4] /* dst_argb */
  4190. mov ecx, [esp + 8] /* width */
  4191. movdqa xmm2, xmmword ptr kARGBToSepiaB
  4192. movdqa xmm3, xmmword ptr kARGBToSepiaG
  4193. movdqa xmm4, xmmword ptr kARGBToSepiaR
  4194. convertloop:
  4195. movdqu xmm0, [eax] // B
  4196. movdqu xmm6, [eax + 16]
  4197. pmaddubsw xmm0, xmm2
  4198. pmaddubsw xmm6, xmm2
  4199. phaddw xmm0, xmm6
  4200. psrlw xmm0, 7
  4201. packuswb xmm0, xmm0 // 8 B values
  4202. movdqu xmm5, [eax] // G
  4203. movdqu xmm1, [eax + 16]
  4204. pmaddubsw xmm5, xmm3
  4205. pmaddubsw xmm1, xmm3
  4206. phaddw xmm5, xmm1
  4207. psrlw xmm5, 7
  4208. packuswb xmm5, xmm5 // 8 G values
  4209. punpcklbw xmm0, xmm5 // 8 BG values
  4210. movdqu xmm5, [eax] // R
  4211. movdqu xmm1, [eax + 16]
  4212. pmaddubsw xmm5, xmm4
  4213. pmaddubsw xmm1, xmm4
  4214. phaddw xmm5, xmm1
  4215. psrlw xmm5, 7
  4216. packuswb xmm5, xmm5 // 8 R values
  4217. movdqu xmm6, [eax] // A
  4218. movdqu xmm1, [eax + 16]
  4219. psrld xmm6, 24
  4220. psrld xmm1, 24
  4221. packuswb xmm6, xmm1
  4222. packuswb xmm6, xmm6 // 8 A values
  4223. punpcklbw xmm5, xmm6 // 8 RA values
  4224. movdqa xmm1, xmm0 // Weave BG, RA together
  4225. punpcklwd xmm0, xmm5 // BGRA first 4
  4226. punpckhwd xmm1, xmm5 // BGRA next 4
  4227. movdqu [eax], xmm0
  4228. movdqu [eax + 16], xmm1
  4229. lea eax, [eax + 32]
  4230. sub ecx, 8
  4231. jg convertloop
  4232. ret
  4233. }
  4234. }
  4235. #endif // HAS_ARGBSEPIAROW_SSSE3
  4236. #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
  4237. // Transform 8 ARGB pixels (32 bytes) with color matrix.
  4238. // Same as Sepia except matrix is provided.
        // matrix_argb supplies 16 signed bytes: 4 coefficients (applied to the
        // source B, G, R, A bytes) for each of the output channels B, G, R, A in
        // that order. Sums are saturated, arithmetically shifted right by 6 and
        // packed with unsigned saturation.
        // The loop body runs before width is tested and steps by 8.
  4239. // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
  4240. // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
  4241. __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
  4242. uint8_t* dst_argb,
  4243. const int8_t* matrix_argb,
  4244. int width) {
  4245. __asm {
  4246. mov eax, [esp + 4] /* src_argb */
  4247. mov edx, [esp + 8] /* dst_argb */
  4248. mov ecx, [esp + 12] /* matrix_argb */
  4249. movdqu xmm5, [ecx]
  4250. pshufd xmm2, xmm5, 0x00 // broadcast B output coefficients
  4251. pshufd xmm3, xmm5, 0x55 // broadcast G output coefficients
  4252. pshufd xmm4, xmm5, 0xaa // broadcast R output coefficients
  4253. pshufd xmm5, xmm5, 0xff // broadcast A output coefficients
  4254. mov ecx, [esp + 16] /* width */
  4255. convertloop:
  4256. movdqu xmm0, [eax] // B
  4257. movdqu xmm7, [eax + 16]
  4258. pmaddubsw xmm0, xmm2
  4259. pmaddubsw xmm7, xmm2
  4260. movdqu xmm6, [eax] // G
  4261. movdqu xmm1, [eax + 16]
  4262. pmaddubsw xmm6, xmm3
  4263. pmaddubsw xmm1, xmm3
  4264. phaddsw xmm0, xmm7 // B
  4265. phaddsw xmm6, xmm1 // G
  4266. psraw xmm0, 6 // B
  4267. psraw xmm6, 6 // G
  4268. packuswb xmm0, xmm0 // 8 B values
  4269. packuswb xmm6, xmm6 // 8 G values
  4270. punpcklbw xmm0, xmm6 // 8 BG values
  4271. movdqu xmm1, [eax] // R
  4272. movdqu xmm7, [eax + 16]
  4273. pmaddubsw xmm1, xmm4
  4274. pmaddubsw xmm7, xmm4
  4275. phaddsw xmm1, xmm7 // R
  4276. movdqu xmm6, [eax] // A
  4277. movdqu xmm7, [eax + 16]
  4278. pmaddubsw xmm6, xmm5
  4279. pmaddubsw xmm7, xmm5
  4280. phaddsw xmm6, xmm7 // A
  4281. psraw xmm1, 6 // R
  4282. psraw xmm6, 6 // A
  4283. packuswb xmm1, xmm1 // 8 R values
  4284. packuswb xmm6, xmm6 // 8 A values
  4285. punpcklbw xmm1, xmm6 // 8 RA values
  4286. movdqa xmm6, xmm0 // Weave BG, RA together
  4287. punpcklwd xmm0, xmm1 // BGRA first 4
  4288. punpckhwd xmm6, xmm1 // BGRA next 4
  4289. movdqu [edx], xmm0
  4290. movdqu [edx + 16], xmm6
  4291. lea eax, [eax + 32]
  4292. lea edx, [edx + 32]
  4293. sub ecx, 8
  4294. jg convertloop
  4295. ret
  4296. }
  4297. }
  4298. #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
  4299. #ifdef HAS_ARGBQUANTIZEROW_SSE2
  4300. // Quantize 4 ARGB pixels (16 bytes).
        // Operates in place on dst_argb. For B, G and R:
        //   c = ((c * scale) >> 16) * interval_size + interval_offset
        // using 16 bit arithmetic, then packed with unsigned saturation. The
        // original alpha byte is OR-ed back into the result.
        // The pshuflw/pshufd pairs put the low 16 bits of each parameter in the
        // B, G, R lanes and its high 16 bits in the A lane.
        // NOTE(review): the final OR only preserves alpha if the A lane computes
        // to 0, i.e. the parameters fit in 16 bits - confirm against callers.
        // The loop body runs before width is tested and steps by 4.
  4301. __declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
  4302. int scale,
  4303. int interval_size,
  4304. int interval_offset,
  4305. int width) {
  4306. __asm {
  4307. mov eax, [esp + 4] /* dst_argb */
  4308. movd xmm2, [esp + 8] /* scale */
  4309. movd xmm3, [esp + 12] /* interval_size */
  4310. movd xmm4, [esp + 16] /* interval_offset */
  4311. mov ecx, [esp + 20] /* width */
  4312. pshuflw xmm2, xmm2, 040h
  4313. pshufd xmm2, xmm2, 044h
  4314. pshuflw xmm3, xmm3, 040h
  4315. pshufd xmm3, xmm3, 044h
  4316. pshuflw xmm4, xmm4, 040h
  4317. pshufd xmm4, xmm4, 044h
  4318. pxor xmm5, xmm5 // constant 0
  4319. pcmpeqb xmm6, xmm6 // generate mask 0xff000000
  4320. pslld xmm6, 24
  4321. convertloop:
  4322. movdqu xmm0, [eax] // read 4 pixels
  4323. punpcklbw xmm0, xmm5 // first 2 pixels
  4324. pmulhuw xmm0, xmm2 // pixel * scale >> 16
  4325. movdqu xmm1, [eax] // read 4 pixels
  4326. punpckhbw xmm1, xmm5 // next 2 pixels
  4327. pmulhuw xmm1, xmm2
  4328. pmullw xmm0, xmm3 // * interval_size
  4329. movdqu xmm7, [eax] // read 4 pixels
  4330. pmullw xmm1, xmm3
  4331. pand xmm7, xmm6 // mask alpha
  4332. paddw xmm0, xmm4 // + interval_offset
  4333. paddw xmm1, xmm4
  4334. packuswb xmm0, xmm1
  4335. por xmm0, xmm7
  4336. movdqu [eax], xmm0
  4337. lea eax, [eax + 16]
  4338. sub ecx, 4
  4339. jg convertloop
  4340. ret
  4341. }
  4342. }
  4343. #endif // HAS_ARGBQUANTIZEROW_SSE2
  4344. #ifdef HAS_ARGBSHADEROW_SSE2
  4345. // Shade 4 pixels at a time by specified value.
        // value holds one scale byte per channel in the same byte order as a
        // pixel. Each output byte is ((c * 0x0101) * (v * 0x0101)) >> 24, where v
        // is the matching byte of value.
        // The loop body runs before width is tested and steps by 4.
  4346. __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
  4347. uint8_t* dst_argb,
  4348. int width,
  4349. uint32_t value) {
  4350. __asm {
  4351. mov eax, [esp + 4] // src_argb
  4352. mov edx, [esp + 8] // dst_argb
  4353. mov ecx, [esp + 12] // width
  4354. movd xmm2, [esp + 16] // value
  4355. punpcklbw xmm2, xmm2 // double each byte into a word
  4356. punpcklqdq xmm2, xmm2 // same 4 words for both pixels
  4357. convertloop:
  4358. movdqu xmm0, [eax] // read 4 pixels
  4359. lea eax, [eax + 16]
  4360. movdqa xmm1, xmm0
  4361. punpcklbw xmm0, xmm0 // first 2
  4362. punpckhbw xmm1, xmm1 // next 2
  4363. pmulhuw xmm0, xmm2 // argb * value
  4364. pmulhuw xmm1, xmm2 // argb * value
  4365. psrlw xmm0, 8
  4366. psrlw xmm1, 8
  4367. packuswb xmm0, xmm1
  4368. movdqu [edx], xmm0
  4369. lea edx, [edx + 16]
  4370. sub ecx, 4
  4371. jg convertloop
  4372. ret
  4373. }
  4374. }
  4375. #endif // HAS_ARGBSHADEROW_SSE2
  4376. #ifdef HAS_ARGBMULTIPLYROW_SSE2
  4377. // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
        // Each output byte is ((a * 0x0101) * b) >> 16, where a is the src_argb0
        // byte (doubled into a word) and b is the src_argb1 byte (zero extended).
        // esi is saved and restored by hand. The loop body runs before width is
        // tested and steps by 4.
  4378. __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
  4379. const uint8_t* src_argb1,
  4380. uint8_t* dst_argb,
  4381. int width) {
  4382. __asm {
  4383. push esi
  4384. mov eax, [esp + 4 + 4] // src_argb0
  4385. mov esi, [esp + 4 + 8] // src_argb1
  4386. mov edx, [esp + 4 + 12] // dst_argb
  4387. mov ecx, [esp + 4 + 16] // width
  4388. pxor xmm5, xmm5 // constant 0
  4389. convertloop:
  4390. movdqu xmm0, [eax] // read 4 pixels from src_argb0
  4391. movdqu xmm2, [esi] // read 4 pixels from src_argb1
  4392. movdqu xmm1, xmm0 // register copy of src_argb0 pixels
  4393. movdqu xmm3, xmm2 // register copy of src_argb1 pixels
  4394. punpcklbw xmm0, xmm0 // first 2
  4395. punpckhbw xmm1, xmm1 // next 2
  4396. punpcklbw xmm2, xmm5 // first 2
  4397. punpckhbw xmm3, xmm5 // next 2
  4398. pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
  4399. pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
  4400. lea eax, [eax + 16]
  4401. lea esi, [esi + 16]
  4402. packuswb xmm0, xmm1
  4403. movdqu [edx], xmm0
  4404. lea edx, [edx + 16]
  4405. sub ecx, 4
  4406. jg convertloop
  4407. pop esi
  4408. ret
  4409. }
  4410. }
  4411. #endif // HAS_ARGBMULTIPLYROW_SSE2
  4412. #ifdef HAS_ARGBADDROW_SSE2
  4413. // Add 2 rows of ARGB pixels together, 4 pixels at a time.
        // Bytes are added with unsigned saturation. Widths that are not a
        // multiple of 4 are finished by a 1 pixel loop; a width below 1 falls
        // through both loops. esi is saved and restored by hand.
  4414. // TODO(fbarchard): Port this to posix, neon and other math functions.
  4415. __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
  4416. const uint8_t* src_argb1,
  4417. uint8_t* dst_argb,
  4418. int width) {
  4419. __asm {
  4420. push esi
  4421. mov eax, [esp + 4 + 4] // src_argb0
  4422. mov esi, [esp + 4 + 8] // src_argb1
  4423. mov edx, [esp + 4 + 12] // dst_argb
  4424. mov ecx, [esp + 4 + 16] // width
  4425. sub ecx, 4
  4426. jl convertloop49 // less than 4 pixels?
  4427. convertloop4:
  4428. movdqu xmm0, [eax] // read 4 pixels from src_argb0
  4429. lea eax, [eax + 16]
  4430. movdqu xmm1, [esi] // read 4 pixels from src_argb1
  4431. lea esi, [esi + 16]
  4432. paddusb xmm0, xmm1 // src_argb0 + src_argb1
  4433. movdqu [edx], xmm0
  4434. lea edx, [edx + 16]
  4435. sub ecx, 4
  4436. jge convertloop4
  4437. convertloop49:
  4438. add ecx, 4 - 1 // ecx = remaining pixels - 1
  4439. jl convertloop19 // no remainder
  4440. convertloop1:
  4441. movd xmm0, [eax] // read 1 pixels from src_argb0
  4442. lea eax, [eax + 4]
  4443. movd xmm1, [esi] // read 1 pixels from src_argb1
  4444. lea esi, [esi + 4]
  4445. paddusb xmm0, xmm1 // src_argb0 + src_argb1
  4446. movd [edx], xmm0
  4447. lea edx, [edx + 4]
  4448. sub ecx, 1
  4449. jge convertloop1
  4450. convertloop19:
  4451. pop esi
  4452. ret
  4453. }
  4454. }
  4455. #endif // HAS_ARGBADDROW_SSE2
  4456. #ifdef HAS_ARGBSUBTRACTROW_SSE2
  4457. // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
        // Computes src_argb0 - src_argb1 per byte with unsigned saturation
        // (results below 0 clamp to 0). esi is saved and restored by hand.
        // The loop body runs before width is tested and steps by 4.
  4458. __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
  4459. const uint8_t* src_argb1,
  4460. uint8_t* dst_argb,
  4461. int width) {
  4462. __asm {
  4463. push esi
  4464. mov eax, [esp + 4 + 4] // src_argb0
  4465. mov esi, [esp + 4 + 8] // src_argb1
  4466. mov edx, [esp + 4 + 12] // dst_argb
  4467. mov ecx, [esp + 4 + 16] // width
  4468. convertloop:
  4469. movdqu xmm0, [eax] // read 4 pixels from src_argb0
  4470. lea eax, [eax + 16]
  4471. movdqu xmm1, [esi] // read 4 pixels from src_argb1
  4472. lea esi, [esi + 16]
  4473. psubusb xmm0, xmm1 // src_argb0 - src_argb1
  4474. movdqu [edx], xmm0
  4475. lea edx, [edx + 16]
  4476. sub ecx, 4
  4477. jg convertloop
  4478. pop esi
  4479. ret
  4480. }
  4481. }
  4482. #endif // HAS_ARGBSUBTRACTROW_SSE2
  4483. #ifdef HAS_ARGBMULTIPLYROW_AVX2
  4484. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
        // Each output byte is ((a * 0x0101) * b) >> 16, where a is the src_argb0
        // byte (doubled into a word) and b is the src_argb1 byte (zero extended).
        // esi is saved and restored by hand. The loop body runs before width is
        // tested and steps by 8.
  4485. __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
  4486. const uint8_t* src_argb1,
  4487. uint8_t* dst_argb,
  4488. int width) {
  4489. __asm {
  4490. push esi
  4491. mov eax, [esp + 4 + 4] // src_argb0
  4492. mov esi, [esp + 4 + 8] // src_argb1
  4493. mov edx, [esp + 4 + 12] // dst_argb
  4494. mov ecx, [esp + 4 + 16] // width
  4495. vpxor ymm5, ymm5, ymm5 // constant 0
  4496. convertloop:
  4497. vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
  4498. lea eax, [eax + 32]
  4499. vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
  4500. lea esi, [esi + 32]
  4501. vpunpcklbw ymm0, ymm1, ymm1 // low 4
  4502. vpunpckhbw ymm1, ymm1, ymm1 // high 4
  4503. vpunpcklbw ymm2, ymm3, ymm5 // low 4
  4504. vpunpckhbw ymm3, ymm3, ymm5 // high 4
  4505. vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
  4506. vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
  4507. vpackuswb ymm0, ymm0, ymm1
  4508. vmovdqu [edx], ymm0
  4509. lea edx, [edx + 32]
  4510. sub ecx, 8
  4511. jg convertloop
  4512. pop esi
  4513. vzeroupper
  4514. ret
  4515. }
  4516. }
  4517. #endif // HAS_ARGBMULTIPLYROW_AVX2
  4518. #ifdef HAS_ARGBADDROW_AVX2
  4519. // Add 2 rows of ARGB pixels together, 8 pixels at a time.
        // Bytes are added with unsigned saturation. esi is saved and restored by
        // hand. The loop body runs before width is tested and steps by 8; there
        // is no remainder loop in this function.
  4520. __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
  4521. const uint8_t* src_argb1,
  4522. uint8_t* dst_argb,
  4523. int width) {
  4524. __asm {
  4525. push esi
  4526. mov eax, [esp + 4 + 4] // src_argb0
  4527. mov esi, [esp + 4 + 8] // src_argb1
  4528. mov edx, [esp + 4 + 12] // dst_argb
  4529. mov ecx, [esp + 4 + 16] // width
  4530. convertloop:
  4531. vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
  4532. lea eax, [eax + 32]
  4533. vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
  4534. lea esi, [esi + 32]
  4535. vmovdqu [edx], ymm0
  4536. lea edx, [edx + 32]
  4537. sub ecx, 8
  4538. jg convertloop
  4539. pop esi
  4540. vzeroupper
  4541. ret
  4542. }
  4543. }
  4544. #endif // HAS_ARGBADDROW_AVX2
  4545. #ifdef HAS_ARGBSUBTRACTROW_AVX2
  4546. // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
        // Computes src_argb0 - src_argb1 per byte with unsigned saturation
        // (results below 0 clamp to 0). esi is saved and restored by hand.
        // The loop body runs before width is tested and steps by 8.
  4547. __declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
  4548. const uint8_t* src_argb1,
  4549. uint8_t* dst_argb,
  4550. int width) {
  4551. __asm {
  4552. push esi
  4553. mov eax, [esp + 4 + 4] // src_argb0
  4554. mov esi, [esp + 4 + 8] // src_argb1
  4555. mov edx, [esp + 4 + 12] // dst_argb
  4556. mov ecx, [esp + 4 + 16] // width
  4557. convertloop:
  4558. vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
  4559. lea eax, [eax + 32]
  4560. vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
  4561. lea esi, [esi + 32]
  4562. vmovdqu [edx], ymm0
  4563. lea edx, [edx + 32]
  4564. sub ecx, 8
  4565. jg convertloop
  4566. pop esi
  4567. vzeroupper
  4568. ret
  4569. }
  4570. }
  4571. #endif // HAS_ARGBSUBTRACTROW_AVX2
  4572. #ifdef HAS_SOBELXROW_SSE2
  4573. // SobelX as a matrix is
  4574. // -1 0 1
  4575. // -2 0 2
  4576. // -1 0 1
        // Processes 8 pixels per iteration. For each pixel i the code computes
        //   (y0[i] - y0[i+2]) + 2 * (y1[i] - y1[i+2]) + (y2[i] - y2[i+2])
        // in 16 bit lanes, takes the absolute value (so the sign convention of
        // the matrix does not matter) and packs to bytes with saturation.
        // esi, edi and edx are turned into offsets from eax so only eax advances.
        // Each 8 byte load at offset +2 reads 2 bytes beyond the current group.
        // The loop body runs before width is tested and steps by 8.
  4577. __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
  4578. const uint8_t* src_y1,
  4579. const uint8_t* src_y2,
  4580. uint8_t* dst_sobelx,
  4581. int width) {
  4582. __asm {
  4583. push esi
  4584. push edi
  4585. mov eax, [esp + 8 + 4] // src_y0
  4586. mov esi, [esp + 8 + 8] // src_y1
  4587. mov edi, [esp + 8 + 12] // src_y2
  4588. mov edx, [esp + 8 + 16] // dst_sobelx
  4589. mov ecx, [esp + 8 + 20] // width
  4590. sub esi, eax // esi = src_y1 - src_y0
  4591. sub edi, eax // edi = src_y2 - src_y0
  4592. sub edx, eax // edx = dst_sobelx - src_y0
  4593. pxor xmm5, xmm5 // constant 0
  4594. convertloop:
  4595. movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
  4596. movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
  4597. punpcklbw xmm0, xmm5
  4598. punpcklbw xmm1, xmm5
  4599. psubw xmm0, xmm1
  4600. movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
  4601. movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
  4602. punpcklbw xmm1, xmm5
  4603. punpcklbw xmm2, xmm5
  4604. psubw xmm1, xmm2
  4605. movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
  4606. movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
  4607. punpcklbw xmm2, xmm5
  4608. punpcklbw xmm3, xmm5
  4609. psubw xmm2, xmm3
  4610. paddw xmm0, xmm2
  4611. paddw xmm0, xmm1 // middle row is added twice (weight 2)
  4612. paddw xmm0, xmm1
  4613. pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
  4614. psubw xmm1, xmm0
  4615. pmaxsw xmm0, xmm1
  4616. packuswb xmm0, xmm0
  4617. movq qword ptr [eax + edx], xmm0
  4618. lea eax, [eax + 8]
  4619. sub ecx, 8
  4620. jg convertloop
  4621. pop edi
  4622. pop esi
  4623. ret
  4624. }
  4625. }
  4626. #endif // HAS_SOBELXROW_SSE2
  4627. #ifdef HAS_SOBELYROW_SSE2
  4628. // SobelY as a matrix is
  4629. // -1 -2 -1
  4630. // 0 0 0
  4631. // 1 2 1
        // Processes 8 pixels per iteration. For each pixel i the code computes
        //   (y0[i] - y1[i]) + 2 * (y0[i+1] - y1[i+1]) + (y0[i+2] - y1[i+2])
        // in 16 bit lanes, takes the absolute value and packs to bytes with
        // saturation. Only two source rows are read; the zero middle row of the
        // matrix is not needed.
        // esi and edx are turned into offsets from eax so only eax advances.
        // Each 8 byte load at offset +2 reads 2 bytes beyond the current group.
        // The loop body runs before width is tested and steps by 8.
  4632. __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
  4633. const uint8_t* src_y1,
  4634. uint8_t* dst_sobely,
  4635. int width) {
  4636. __asm {
  4637. push esi
  4638. mov eax, [esp + 4 + 4] // src_y0
  4639. mov esi, [esp + 4 + 8] // src_y1
  4640. mov edx, [esp + 4 + 12] // dst_sobely
  4641. mov ecx, [esp + 4 + 16] // width
  4642. sub esi, eax // esi = src_y1 - src_y0
  4643. sub edx, eax // edx = dst_sobely - src_y0
  4644. pxor xmm5, xmm5 // constant 0
  4645. convertloop:
  4646. movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
  4647. movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
  4648. punpcklbw xmm0, xmm5
  4649. punpcklbw xmm1, xmm5
  4650. psubw xmm0, xmm1
  4651. movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
  4652. movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
  4653. punpcklbw xmm1, xmm5
  4654. punpcklbw xmm2, xmm5
  4655. psubw xmm1, xmm2
  4656. movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
  4657. movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
  4658. punpcklbw xmm2, xmm5
  4659. punpcklbw xmm3, xmm5
  4660. psubw xmm2, xmm3
  4661. paddw xmm0, xmm2
  4662. paddw xmm0, xmm1 // centre column is added twice (weight 2)
  4663. paddw xmm0, xmm1
  4664. pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
  4665. psubw xmm1, xmm0
  4666. pmaxsw xmm0, xmm1
  4667. packuswb xmm0, xmm0
  4668. movq qword ptr [eax + edx], xmm0
  4669. lea eax, [eax + 8]
  4670. sub ecx, 8
  4671. jg convertloop
  4672. pop esi
  4673. ret
  4674. }
  4675. }
  4676. #endif // HAS_SOBELYROW_SSE2
  4677. #ifdef HAS_SOBELROW_SSE2
  4678. // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
  4679. // A = 255
  4680. // R = Sobel
  4681. // G = Sobel
  4682. // B = Sobel
        // Processes 16 pixels per iteration: reads 16 bytes from each input and
        // writes 64 bytes of ARGB. The sum is an unsigned saturating byte add.
        // esi is turned into the (src_sobely - src_sobelx) offset. The loop body
        // runs before width is tested and steps by 16.
  4683. __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
  4684. const uint8_t* src_sobely,
  4685. uint8_t* dst_argb,
  4686. int width) {
  4687. __asm {
  4688. push esi
  4689. mov eax, [esp + 4 + 4] // src_sobelx
  4690. mov esi, [esp + 4 + 8] // src_sobely
  4691. mov edx, [esp + 4 + 12] // dst_argb
  4692. mov ecx, [esp + 4 + 16] // width
  4693. sub esi, eax // esi = src_sobely - src_sobelx
  4694. pcmpeqb xmm5, xmm5 // alpha 255
  4695. pslld xmm5, 24 // 0xff000000
  4696. convertloop:
  4697. movdqu xmm0, [eax] // read 16 pixels src_sobelx
  4698. movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
  4699. lea eax, [eax + 16]
  4700. paddusb xmm0, xmm1 // sobel = sobelx + sobely
  4701. movdqa xmm2, xmm0 // GG
  4702. punpcklbw xmm2, xmm0 // First 8
  4703. punpckhbw xmm0, xmm0 // Next 8
  4704. movdqa xmm1, xmm2 // GGGG
  4705. punpcklwd xmm1, xmm2 // First 4
  4706. punpckhwd xmm2, xmm2 // Next 4
  4707. por xmm1, xmm5 // GGGA
  4708. por xmm2, xmm5
  4709. movdqa xmm3, xmm0 // GGGG
  4710. punpcklwd xmm3, xmm0 // Next 4
  4711. punpckhwd xmm0, xmm0 // Last 4
  4712. por xmm3, xmm5 // GGGA
  4713. por xmm0, xmm5
  4714. movdqu [edx], xmm1
  4715. movdqu [edx + 16], xmm2
  4716. movdqu [edx + 32], xmm3
  4717. movdqu [edx + 48], xmm0
  4718. lea edx, [edx + 64]
  4719. sub ecx, 16
  4720. jg convertloop
  4721. pop esi
  4722. ret
  4723. }
  4724. }
  4725. #endif // HAS_SOBELROW_SSE2
  4726. #ifdef HAS_SOBELTOPLANEROW_SSE2
  4727. // Adds Sobel X and Sobel Y and stores Sobel into a plane.
        // Processes 16 pixels per iteration with an unsigned saturating byte add.
        // esi is turned into the (src_sobely - src_sobelx) offset. The loop body
        // runs before width is tested and steps by 16.
  4728. __declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
  4729. const uint8_t* src_sobely,
  4730. uint8_t* dst_y,
  4731. int width) {
  4732. __asm {
  4733. push esi
  4734. mov eax, [esp + 4 + 4] // src_sobelx
  4735. mov esi, [esp + 4 + 8] // src_sobely
  4736. mov edx, [esp + 4 + 12] // dst_y
  4737. mov ecx, [esp + 4 + 16] // width
  4738. sub esi, eax // esi = src_sobely - src_sobelx
  4739. convertloop:
  4740. movdqu xmm0, [eax] // read 16 pixels src_sobelx
  4741. movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
  4742. lea eax, [eax + 16]
  4743. paddusb xmm0, xmm1 // sobel = sobelx + sobely
  4744. movdqu [edx], xmm0
  4745. lea edx, [edx + 16]
  4746. sub ecx, 16
  4747. jg convertloop
  4748. pop esi
  4749. ret
  4750. }
  4751. }
  4752. #endif // HAS_SOBELTOPLANEROW_SSE2
  4753. #ifdef HAS_SOBELXYROW_SSE2
  4754. // Mixes Sobel X, Sobel Y and Sobel into ARGB.
  4755. // A = 255
  4756. // R = Sobel X
  4757. // G = Sobel
  4758. // B = Sobel Y
  // Sobel is the unsigned saturating sum (paddusb) of Sobel X and Sobel Y.
  // Each pass reads 16 bytes from both inputs and writes 16 ARGB pixels
  // (64 bytes); the loop always runs at least once.
  4759. __declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
  4760. const uint8_t* src_sobely,
  4761. uint8_t* dst_argb,
  4762. int width) {
  4763. __asm {
  4764. push esi // restored before ret; stack args are now at [esp + 4 + n]
  4765. mov eax, [esp + 4 + 4] // src_sobelx
  4766. mov esi, [esp + 4 + 8] // src_sobely
  4767. mov edx, [esp + 4 + 12] // dst_argb
  4768. mov ecx, [esp + 4 + 16] // width
  4769. sub esi, eax // esi = src_sobely - src_sobelx, so [eax + esi] follows sobely
  4770. pcmpeqb xmm5, xmm5 // alpha 255
  4771. convertloop:
  4772. movdqu xmm0, [eax] // read 16 pixels src_sobelx
  4773. movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
  4774. lea eax, [eax + 16]
  4775. movdqa xmm2, xmm0
  4776. paddusb xmm2, xmm1 // sobel = sobelx + sobely
  4777. movdqa xmm3, xmm0 // XA
  4778. punpcklbw xmm3, xmm5 // low 8: sobelx interleaved with alpha
  4779. punpckhbw xmm0, xmm5 // high 8: sobelx interleaved with alpha
  4780. movdqa xmm4, xmm1 // YS
  4781. punpcklbw xmm4, xmm2 // low 8: sobely interleaved with sobel
  4782. punpckhbw xmm1, xmm2 // high 8: sobely interleaved with sobel
  4783. movdqa xmm6, xmm4 // YSXA
  4784. punpcklwd xmm6, xmm3 // First 4
  4785. punpckhwd xmm4, xmm3 // Next 4
  4786. movdqa xmm7, xmm1 // YSXA
  4787. punpcklwd xmm7, xmm0 // Next 4
  4788. punpckhwd xmm1, xmm0 // Last 4
  4789. movdqu [edx], xmm6
  4790. movdqu [edx + 16], xmm4
  4791. movdqu [edx + 32], xmm7
  4792. movdqu [edx + 48], xmm1
  4793. lea edx, [edx + 64]
  4794. sub ecx, 16
  4795. jg convertloop // continue while pixels remain
  4796. pop esi
  4797. ret
  4798. }
  4799. }
  4800. #endif // HAS_SOBELXYROW_SSE2
  4801. #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
  4802. // Consider float CumulativeSum.
  4803. // Consider calling CumulativeSum one row at time as needed.
  4804. // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
  4805. // Convert cumulative sum for an area to an average for 1 pixel.
  4806. // topleft is pointer to top left of CumulativeSum buffer for area.
  4807. // botleft is pointer to bottom left of CumulativeSum buffer.
  4808. // width is offset from left to right of area in CumulativeSum buffer measured
  4809. // in number of ints.
  4810. // area is the number of pixels in the area being averaged.
  4811. // dst points to pixel to store result to.
  4812. // count is number of averaged pixels to produce.
  4813. // Does 4 pixels at a time.
  4814. // This function requires alignment on accumulation buffer pointers.
  // Per pixel the sum is topleft - topright - botleft + botright, computed on
  // 4 int32 values (16 bytes of cumulative sum), then scaled by an approximate
  // 1 / area (rcpss) and packed to 4 output bytes.
  // Three loops: s4 (4 pixels, 16-bit fixed point, taken when area <= 128),
  // l4 (4 pixels, float multiply) and l1 (1 pixel, float multiply) for the rest.
  // Unlike the naked routines nearby, arguments are referenced by name and
  // there is no explicit ret.
  // NOTE(review): psubd/paddd take memory operands here, which is presumably
  // the reason for the alignment requirement stated above - confirm.
  4815. void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
  4816. const int32_t* botleft,
  4817. int width,
  4818. int area,
  4819. uint8_t* dst,
  4820. int count) {
  4821. __asm {
  4822. mov eax, topleft // eax topleft
  4823. mov esi, botleft // esi botleft
  4824. mov edx, width // edx * 4 = byte offset from left to right column
  4825. movd xmm5, area
  4826. mov edi, dst
  4827. mov ecx, count
  4828. cvtdq2ps xmm5, xmm5 // area as float
  4829. rcpss xmm4, xmm5 // 1.0f / area
  4830. pshufd xmm4, xmm4, 0 // broadcast reciprocal to all 4 lanes
  4831. sub ecx, 4
  4832. jl l4b // fewer than 4 pixels: go straight to the 1 pixel loop
  4833. cmp area, 128 // 128 pixels will not overflow 15 bits.
  4834. ja l4 // larger areas use the float loop
  4835. pshufd xmm5, xmm5, 0 // area
  4836. pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
  4837. psrld xmm6, 16
  4838. cvtdq2ps xmm6, xmm6
  4839. addps xmm5, xmm6 // (65536.0 + area - 1)
  4840. mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
  4841. cvtps2dq xmm5, xmm5 // 0.16 fixed point
  4842. packssdw xmm5, xmm5 // 16 bit shorts
  4843. // 4 pixel loop small blocks.
  4844. s4:
  4845. // top left
  4846. movdqu xmm0, [eax]
  4847. movdqu xmm1, [eax + 16]
  4848. movdqu xmm2, [eax + 32]
  4849. movdqu xmm3, [eax + 48]
  4850. // - top right
  4851. psubd xmm0, [eax + edx * 4]
  4852. psubd xmm1, [eax + edx * 4 + 16]
  4853. psubd xmm2, [eax + edx * 4 + 32]
  4854. psubd xmm3, [eax + edx * 4 + 48]
  4855. lea eax, [eax + 64]
  4856. // - bottom left
  4857. psubd xmm0, [esi]
  4858. psubd xmm1, [esi + 16]
  4859. psubd xmm2, [esi + 32]
  4860. psubd xmm3, [esi + 48]
  4861. // + bottom right
  4862. paddd xmm0, [esi + edx * 4]
  4863. paddd xmm1, [esi + edx * 4 + 16]
  4864. paddd xmm2, [esi + edx * 4 + 32]
  4865. paddd xmm3, [esi + edx * 4 + 48]
  4866. lea esi, [esi + 64]
  4867. packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
  4868. packssdw xmm2, xmm3
  4869. pmulhuw xmm0, xmm5 // sum * fixed point reciprocal, keep high 16 bits
  4870. pmulhuw xmm2, xmm5
  4871. packuswb xmm0, xmm2 // 16 bytes = 4 pixels
  4872. movdqu [edi], xmm0
  4873. lea edi, [edi + 16]
  4874. sub ecx, 4
  4875. jge s4
  4876. jmp l4b // remaining 0..3 pixels use the 1 pixel loop
  4877. // 4 pixel loop
  4878. l4:
  4879. // top left
  4880. movdqu xmm0, [eax]
  4881. movdqu xmm1, [eax + 16]
  4882. movdqu xmm2, [eax + 32]
  4883. movdqu xmm3, [eax + 48]
  4884. // - top right
  4885. psubd xmm0, [eax + edx * 4]
  4886. psubd xmm1, [eax + edx * 4 + 16]
  4887. psubd xmm2, [eax + edx * 4 + 32]
  4888. psubd xmm3, [eax + edx * 4 + 48]
  4889. lea eax, [eax + 64]
  4890. // - bottom left
  4891. psubd xmm0, [esi]
  4892. psubd xmm1, [esi + 16]
  4893. psubd xmm2, [esi + 32]
  4894. psubd xmm3, [esi + 48]
  4895. // + bottom right
  4896. paddd xmm0, [esi + edx * 4]
  4897. paddd xmm1, [esi + edx * 4 + 16]
  4898. paddd xmm2, [esi + edx * 4 + 32]
  4899. paddd xmm3, [esi + edx * 4 + 48]
  4900. lea esi, [esi + 64]
  4901. cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
  4902. cvtdq2ps xmm1, xmm1
  4903. mulps xmm0, xmm4
  4904. mulps xmm1, xmm4
  4905. cvtdq2ps xmm2, xmm2
  4906. cvtdq2ps xmm3, xmm3
  4907. mulps xmm2, xmm4
  4908. mulps xmm3, xmm4
  4909. cvtps2dq xmm0, xmm0 // back to int32
  4910. cvtps2dq xmm1, xmm1
  4911. cvtps2dq xmm2, xmm2
  4912. cvtps2dq xmm3, xmm3
  4913. packssdw xmm0, xmm1 // int32 -> int16
  4914. packssdw xmm2, xmm3
  4915. packuswb xmm0, xmm2 // int16 -> uint8, 4 pixels
  4916. movdqu [edi], xmm0
  4917. lea edi, [edi + 16]
  4918. sub ecx, 4
  4919. jge l4
  4920. l4b:
  4921. add ecx, 4 - 1 // ecx = remaining pixels - 1
  4922. jl l1b // nothing left
  4923. // 1 pixel loop
  4924. l1:
  4925. movdqu xmm0, [eax] // top left
  4926. psubd xmm0, [eax + edx * 4] // - top right
  4927. lea eax, [eax + 16]
  4928. psubd xmm0, [esi] // - bottom left
  4929. paddd xmm0, [esi + edx * 4] // + bottom right
  4930. lea esi, [esi + 16]
  4931. cvtdq2ps xmm0, xmm0
  4932. mulps xmm0, xmm4 // Average = Sum * 1 / Area
  4933. cvtps2dq xmm0, xmm0
  4934. packssdw xmm0, xmm0
  4935. packuswb xmm0, xmm0
  4936. movd dword ptr [edi], xmm0 // store 1 pixel (4 bytes)
  4937. lea edi, [edi + 4]
  4938. sub ecx, 1
  4939. jge l1
  4940. l1b:
  4941. }
  4942. }
  4943. #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
  4944. #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
  4945. // Creates a table of cumulative sums where each value is a sum of all values
  4946. // above and to the left of the value.
  // For every 4-byte pixel of row, the four channel bytes are widened to int32
  // and added to a running per-channel sum (xmm0); the entry written to cumsum
  // is that running sum plus the matching 16-byte entry of previous_cumsum.
  // The 4 pixel loop is used only when cumsum is 16-byte aligned; otherwise the
  // 1 pixel loop processes the whole row.
  // Arguments are referenced by name and there is no explicit ret (the function
  // is not declared naked).
  4947. void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
  4948. int32_t* cumsum,
  4949. const int32_t* previous_cumsum,
  4950. int width) {
  4951. __asm {
  4952. mov eax, row
  4953. mov edx, cumsum
  4954. mov esi, previous_cumsum
  4955. mov ecx, width
  4956. pxor xmm0, xmm0 // running sum of the row so far, 4 x int32
  4957. pxor xmm1, xmm1 // zero, used to widen bytes and words
  4958. sub ecx, 4
  4959. jl l4b // fewer than 4 pixels
  4960. test edx, 15
  4961. jne l4b // cumsum not 16-byte aligned: use the 1 pixel loop only
  4962. // 4 pixel loop
  4963. l4:
  4964. movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
  4965. lea eax, [eax + 16]
  4966. movdqa xmm4, xmm2
  4967. punpcklbw xmm2, xmm1 // pixels 0,1 as 16 bit
  4968. movdqa xmm3, xmm2
  4969. punpcklwd xmm2, xmm1 // pixel 0 as 4 x int32
  4970. punpckhwd xmm3, xmm1 // pixel 1
  4971. punpckhbw xmm4, xmm1 // pixels 2,3 as 16 bit
  4972. movdqa xmm5, xmm4
  4973. punpcklwd xmm4, xmm1 // pixel 2
  4974. punpckhwd xmm5, xmm1 // pixel 3
  4975. paddd xmm0, xmm2 // running sum += pixel 0
  4976. movdqu xmm2, [esi] // previous row above.
  4977. paddd xmm2, xmm0
  4978. paddd xmm0, xmm3 // running sum += pixel 1
  4979. movdqu xmm3, [esi + 16]
  4980. paddd xmm3, xmm0
  4981. paddd xmm0, xmm4 // running sum += pixel 2
  4982. movdqu xmm4, [esi + 32]
  4983. paddd xmm4, xmm0
  4984. paddd xmm0, xmm5 // running sum += pixel 3
  4985. movdqu xmm5, [esi + 48]
  4986. lea esi, [esi + 64]
  4987. paddd xmm5, xmm0
  4988. movdqu [edx], xmm2
  4989. movdqu [edx + 16], xmm3
  4990. movdqu [edx + 32], xmm4
  4991. movdqu [edx + 48], xmm5
  4992. lea edx, [edx + 64]
  4993. sub ecx, 4
  4994. jge l4
  4995. l4b:
  4996. add ecx, 4 - 1 // ecx = remaining pixels - 1
  4997. jl l1b // nothing left
  4998. // 1 pixel loop
  4999. l1:
  5000. movd xmm2, dword ptr [eax] // 1 argb pixel
  5001. lea eax, [eax + 4]
  5002. punpcklbw xmm2, xmm1
  5003. punpcklwd xmm2, xmm1 // pixel as 4 x int32
  5004. paddd xmm0, xmm2 // running sum += pixel
  5005. movdqu xmm2, [esi] // previous row above
  5006. lea esi, [esi + 16]
  5007. paddd xmm2, xmm0
  5008. movdqu [edx], xmm2
  5009. lea edx, [edx + 16]
  5010. sub ecx, 1
  5011. jge l1
  5012. l1b:
  5013. }
  5014. }
  5015. #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
  5016. #ifdef HAS_ARGBAFFINEROW_SSE2
  5017. // Copy ARGB pixels from source image with slope to a row of destination.
  // uv_dudv holds 4 floats: the starting source coordinate (u, v) followed by
  // the per-pixel step (du, dv). For each destination pixel the coordinate is
  // truncated to integers (cvttps2dq), narrowed to int16 with saturation
  // (packssdw), and turned into a byte offset x * 4 + y * stride via pmaddwd.
  // The stride is placed in a 16-bit lane, so only its low 16 bits take part.
  // No bounds checking is performed on the computed source offsets.
  5018. __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
  5019. int src_argb_stride,
  5020. uint8_t* dst_argb,
  5021. const float* uv_dudv,
  5022. int width) {
  5023. __asm {
  5024. push esi
  5025. push edi // two pushes + return address: first arg is at [esp + 12]
  5026. mov eax, [esp + 12] // src_argb
  5027. mov esi, [esp + 16] // stride
  5028. mov edx, [esp + 20] // dst_argb
  5029. mov ecx, [esp + 24] // pointer to uv_dudv
  5030. movq xmm2, qword ptr [ecx] // uv
  5031. movq xmm7, qword ptr [ecx + 8] // dudv
  5032. mov ecx, [esp + 28] // width
  5033. shl esi, 16 // 4, stride
  5034. add esi, 4 // esi = (stride << 16) + 4: word pair (4, stride) for pmaddwd
  5035. movd xmm5, esi
  5036. sub ecx, 4
  5037. jl l4b // fewer than 4 pixels
  5038. // setup for 4 pixel loop
  5039. pshufd xmm7, xmm7, 0x44 // dup dudv
  5040. pshufd xmm5, xmm5, 0 // dup 4, stride
  5041. movdqa xmm0, xmm2 // x0, y0, x1, y1
  5042. addps xmm0, xmm7
  5043. movlhps xmm2, xmm0 // xmm2 = x0, y0, x0 + dx, y0 + dy
  5044. movdqa xmm4, xmm7
  5045. addps xmm4, xmm4 // dudv *= 2
  5046. movdqa xmm3, xmm2 // x2, y2, x3, y3
  5047. addps xmm3, xmm4
  5048. addps xmm4, xmm4 // dudv *= 4
  5049. // 4 pixel loop
  5050. l4:
  5051. cvttps2dq xmm0, xmm2 // x, y float to int first 2
  5052. cvttps2dq xmm1, xmm3 // x, y float to int next 2
  5053. packssdw xmm0, xmm1 // x, y as 8 shorts
  5054. pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
  5055. movd esi, xmm0 // offset of pixel 0
  5056. pshufd xmm0, xmm0, 0x39 // shift right
  5057. movd edi, xmm0 // offset of pixel 1
  5058. pshufd xmm0, xmm0, 0x39 // shift right
  5059. movd xmm1, [eax + esi] // read pixel 0
  5060. movd xmm6, [eax + edi] // read pixel 1
  5061. punpckldq xmm1, xmm6 // combine pixel 0 and 1
  5062. addps xmm2, xmm4 // x, y += dx, dy first 2
  5063. movq qword ptr [edx], xmm1
  5064. movd esi, xmm0 // offset of pixel 2
  5065. pshufd xmm0, xmm0, 0x39 // shift right
  5066. movd edi, xmm0 // offset of pixel 3
  5067. movd xmm6, [eax + esi] // read pixel 2
  5068. movd xmm0, [eax + edi] // read pixel 3
  5069. punpckldq xmm6, xmm0 // combine pixel 2 and 3
  5070. addps xmm3, xmm4 // x, y += dx, dy next 2
  5071. movq qword ptr 8[edx], xmm6
  5072. lea edx, [edx + 16]
  5073. sub ecx, 4
  5074. jge l4
  5075. l4b:
  5076. add ecx, 4 - 1 // ecx = remaining pixels - 1
  5077. jl l1b // nothing left
  5078. // 1 pixel loop
  5079. l1:
  5080. cvttps2dq xmm0, xmm2 // x, y float to int
  5081. packssdw xmm0, xmm0 // x, y as shorts
  5082. pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
  5083. addps xmm2, xmm7 // x, y += dx, dy
  5084. movd esi, xmm0
  5085. movd xmm0, [eax + esi] // copy a pixel
  5086. movd [edx], xmm0
  5087. lea edx, [edx + 4]
  5088. sub ecx, 1
  5089. jge l1
  5090. l1b:
  5091. pop edi
  5092. pop esi
  5093. ret
  5094. }
  5095. }
  5096. #endif // HAS_ARGBAFFINEROW_SSE2
  5097. #ifdef HAS_INTERPOLATEROW_AVX2
  5098. // Bilinear filter 32x2 -> 32x1
  // Blends the row at src_ptr with the row at src_ptr + src_stride into
  // dst_ptr. With f = source_y_fraction, each output byte is
  // (src0 * (256 - f) + src1 * f + 128) >> 8.
  // f == 0 copies the first row with rep movsb (exactly dst_width bytes);
  // f == 128 uses vpavgb. The SIMD loops handle 32 bytes per pass and always
  // run at least once.
  5099. __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
  5100. const uint8_t* src_ptr,
  5101. ptrdiff_t src_stride,
  5102. int dst_width,
  5103. int source_y_fraction) {
  5104. __asm {
  5105. push esi
  5106. push edi // stack args are now at [esp + 8 + n]
  5107. mov edi, [esp + 8 + 4] // dst_ptr
  5108. mov esi, [esp + 8 + 8] // src_ptr
  5109. mov edx, [esp + 8 + 12] // src_stride
  5110. mov ecx, [esp + 8 + 16] // dst_width
  5111. mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
  5112. // Dispatch to specialized filters if applicable.
  5113. cmp eax, 0
  5114. je xloop100 // 0 / 256. Blend 100 / 0.
  5115. sub edi, esi // edi = dst - src; done after the copy dispatch, which needs the real dst
  5116. cmp eax, 128
  5117. je xloop50 // 128 /256 is 0.50. Blend 50 / 50.
  5118. vmovd xmm0, eax // high fraction 0..255
  5119. neg eax
  5120. add eax, 256 // eax = 256 - fraction
  5121. vmovd xmm5, eax // low fraction 255..1 (fraction 0 was dispatched above)
  5122. vpunpcklbw xmm5, xmm5, xmm0 // byte pair (256 - f, f)
  5123. vpunpcklwd xmm5, xmm5, xmm5
  5124. vbroadcastss ymm5, xmm5 // weight pair in every 16-bit lane
  5125. mov eax, 0x80808080 // 128b for bias and rounding.
  5126. vmovd xmm4, eax
  5127. vbroadcastss ymm4, xmm4
  5128. xloop:
  5129. vmovdqu ymm0, [esi] // row 0
  5130. vmovdqu ymm2, [esi + edx] // row 1
  5131. vpunpckhbw ymm1, ymm0, ymm2 // mutates
  5132. vpunpcklbw ymm0, ymm0, ymm2 // interleave row 0 / row 1 bytes
  5133. vpsubb ymm1, ymm1, ymm4 // bias to signed image
  5134. vpsubb ymm0, ymm0, ymm4
  5135. vpmaddubsw ymm1, ymm5, ymm1 // unsigned weights * signed pixels, pairs summed
  5136. vpmaddubsw ymm0, ymm5, ymm0
  5137. vpaddw ymm1, ymm1, ymm4 // unbias and round
  5138. vpaddw ymm0, ymm0, ymm4
  5139. vpsrlw ymm1, ymm1, 8
  5140. vpsrlw ymm0, ymm0, 8
  5141. vpackuswb ymm0, ymm0, ymm1 // unmutates
  5142. vmovdqu [esi + edi], ymm0 // esi + edi = current dst
  5143. lea esi, [esi + 32]
  5144. sub ecx, 32
  5145. jg xloop
  5146. jmp xloop99
  5147. // Blend 50 / 50.
  5148. xloop50:
  5149. vmovdqu ymm0, [esi]
  5150. vpavgb ymm0, ymm0, [esi + edx] // rounded average of the two rows
  5151. vmovdqu [esi + edi], ymm0
  5152. lea esi, [esi + 32]
  5153. sub ecx, 32
  5154. jg xloop50
  5155. jmp xloop99
  5156. // Blend 100 / 0 - Copy row unchanged.
  5157. xloop100:
  5158. rep movsb // copy ecx bytes from [esi] to [edi]
  5159. xloop99:
  5160. pop edi
  5161. pop esi
  5162. vzeroupper
  5163. ret
  5164. }
  5165. }
  5166. #endif // HAS_INTERPOLATEROW_AVX2
  5167. // Bilinear filter 16x2 -> 16x1
  5168. // TODO(fbarchard): Consider allowing 256 using memcpy.
  // Blends the row at src_ptr with the row at src_ptr + src_stride into
  // dst_ptr. With f = source_y_fraction, each output byte is
  // (src0 * (256 - f) + src1 * f + 128) >> 8.
  // f == 0 copies the first row and f == 128 uses pavgb. All three loops
  // handle 16 bytes per pass and always run at least once.
  5169. __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
  5170. const uint8_t* src_ptr,
  5171. ptrdiff_t src_stride,
  5172. int dst_width,
  5173. int source_y_fraction) {
  5174. __asm {
  5175. push esi
  5176. push edi // stack args are now at [esp + 8 + n]
  5177. mov edi, [esp + 8 + 4] // dst_ptr
  5178. mov esi, [esp + 8 + 8] // src_ptr
  5179. mov edx, [esp + 8 + 12] // src_stride
  5180. mov ecx, [esp + 8 + 16] // dst_width
  5181. mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
  5182. sub edi, esi // edi = dst - src, so [esi + edi] addresses dst
  5183. // Dispatch to specialized filters if applicable.
  5184. cmp eax, 0
  5185. je xloop100 // 0 /256. Blend 100 / 0.
  5186. cmp eax, 128
  5187. je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
  5188. movd xmm0, eax // high fraction 0..255
  5189. neg eax
  5190. add eax, 256 // eax = 256 - fraction
  5191. movd xmm5, eax // low fraction 255..1
  5192. punpcklbw xmm5, xmm0 // byte pair (256 - f, f)
  5193. punpcklwd xmm5, xmm5
  5194. pshufd xmm5, xmm5, 0 // weight pair in every 16-bit lane
  5195. mov eax, 0x80808080 // 128 for biasing image to signed.
  5196. movd xmm4, eax
  5197. pshufd xmm4, xmm4, 0x00
  5198. xloop:
  5199. movdqu xmm0, [esi] // row 0
  5200. movdqu xmm2, [esi + edx] // row 1
  5201. movdqu xmm1, xmm0
  5202. punpcklbw xmm0, xmm2 // interleave row 0 / row 1 bytes, low 8
  5203. punpckhbw xmm1, xmm2 // high 8
  5204. psubb xmm0, xmm4 // bias image by -128
  5205. psubb xmm1, xmm4
  5206. movdqa xmm2, xmm5 // pmaddubsw overwrites its destination, so copy the weights
  5207. movdqa xmm3, xmm5
  5208. pmaddubsw xmm2, xmm0 // unsigned weights * signed pixels, pairs summed
  5209. pmaddubsw xmm3, xmm1
  5210. paddw xmm2, xmm4 // unbias and round (adds 0x8080 per word)
  5211. paddw xmm3, xmm4
  5212. psrlw xmm2, 8
  5213. psrlw xmm3, 8
  5214. packuswb xmm2, xmm3
  5215. movdqu [esi + edi], xmm2
  5216. lea esi, [esi + 16]
  5217. sub ecx, 16
  5218. jg xloop
  5219. jmp xloop99
  5220. // Blend 50 / 50.
  5221. xloop50:
  5222. movdqu xmm0, [esi]
  5223. movdqu xmm1, [esi + edx]
  5224. pavgb xmm0, xmm1 // rounded average of the two rows
  5225. movdqu [esi + edi], xmm0
  5226. lea esi, [esi + 16]
  5227. sub ecx, 16
  5228. jg xloop50
  5229. jmp xloop99
  5230. // Blend 100 / 0 - Copy row unchanged.
  5231. xloop100:
  5232. movdqu xmm0, [esi]
  5233. movdqu [esi + edi], xmm0
  5234. lea esi, [esi + 16]
  5235. sub ecx, 16
  5236. jg xloop100
  5237. xloop99:
  5238. pop edi
  5239. pop esi
  5240. ret
  5241. }
  5242. }
  5243. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
  // Reorders bytes with pshufb using the 16-byte control mask at shuffler.
  // Each pass handles 8 pixels (32 bytes, two 16-byte shuffles) and the loop
  // always runs at least once.
  5244. __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
  5245. uint8_t* dst_argb,
  5246. const uint8_t* shuffler,
  5247. int width) {
  5248. __asm {
  5249. mov eax, [esp + 4] // src_argb
  5250. mov edx, [esp + 8] // dst_argb
  5251. mov ecx, [esp + 12] // shuffler
  5252. movdqu xmm5, [ecx] // 16 byte shuffle mask
  5253. mov ecx, [esp + 16] // width
  5254. wloop:
  5255. movdqu xmm0, [eax]
  5256. movdqu xmm1, [eax + 16]
  5257. lea eax, [eax + 32]
  5258. pshufb xmm0, xmm5
  5259. pshufb xmm1, xmm5
  5260. movdqu [edx], xmm0
  5261. movdqu [edx + 16], xmm1
  5262. lea edx, [edx + 32]
  5263. sub ecx, 8
  5264. jg wloop
  5265. ret
  5266. }
  5267. }
  5268. #ifdef HAS_ARGBSHUFFLEROW_AVX2
  // AVX2 byte shuffle of ARGB-sized pixels using the 16-byte control mask at
  // shuffler, applied identically to both 128-bit halves of each register.
  // Each pass handles 16 pixels (64 bytes) and the loop always runs at least
  // once.
  5269. __declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
  5270. uint8_t* dst_argb,
  5271. const uint8_t* shuffler,
  5272. int width) {
  5273. __asm {
  5274. mov eax, [esp + 4] // src_argb
  5275. mov edx, [esp + 8] // dst_argb
  5276. mov ecx, [esp + 12] // shuffler
  5277. vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
  5278. mov ecx, [esp + 16] // width
  5279. wloop:
  5280. vmovdqu ymm0, [eax]
  5281. vmovdqu ymm1, [eax + 32]
  5282. lea eax, [eax + 64]
  5283. vpshufb ymm0, ymm0, ymm5
  5284. vpshufb ymm1, ymm1, ymm5
  5285. vmovdqu [edx], ymm0
  5286. vmovdqu [edx + 32], ymm1
  5287. lea edx, [edx + 64]
  5288. sub ecx, 16
  5289. jg wloop
  5290. vzeroupper
  5291. ret
  5292. }
  5293. }
  5294. #endif // HAS_ARGBSHUFFLEROW_AVX2
  5295. // YUY2 - Macro-pixel = 2 image pixels
  5296. // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
  5297. // UYVY - Macro-pixel = 2 image pixels
  5298. // U0Y0V0Y1
  // Packs planar Y, U and V rows into YUY2. Each pass consumes 16 Y, 8 U and
  // 8 V bytes and writes 32 bytes; the loop always runs at least once.
  5299. __declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
  5300. const uint8_t* src_u,
  5301. const uint8_t* src_v,
  5302. uint8_t* dst_frame,
  5303. int width) {
  5304. __asm {
  5305. push esi
  5306. push edi // stack args are now at [esp + 8 + n]
  5307. mov eax, [esp + 8 + 4] // src_y
  5308. mov esi, [esp + 8 + 8] // src_u
  5309. mov edx, [esp + 8 + 12] // src_v
  5310. mov edi, [esp + 8 + 16] // dst_frame
  5311. mov ecx, [esp + 8 + 20] // width
  5312. sub edx, esi // edx = src_v - src_u, so [esi + edx] follows V
  5313. convertloop:
  5314. movq xmm2, qword ptr [esi] // U
  5315. movq xmm3, qword ptr [esi + edx] // V
  5316. lea esi, [esi + 8]
  5317. punpcklbw xmm2, xmm3 // UV
  5318. movdqu xmm0, [eax] // Y
  5319. lea eax, [eax + 16]
  5320. movdqa xmm1, xmm0
  5321. punpcklbw xmm0, xmm2 // YUYV
  5322. punpckhbw xmm1, xmm2
  5323. movdqu [edi], xmm0
  5324. movdqu [edi + 16], xmm1
  5325. lea edi, [edi + 32]
  5326. sub ecx, 16
  5327. jg convertloop
  5328. pop edi
  5329. pop esi
  5330. ret
  5331. }
  5332. }
  // Packs planar Y, U and V rows into UYVY (chroma byte first, then luma).
  // Each pass consumes 16 Y, 8 U and 8 V bytes and writes 32 bytes; the loop
  // always runs at least once.
  5333. __declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
  5334. const uint8_t* src_u,
  5335. const uint8_t* src_v,
  5336. uint8_t* dst_frame,
  5337. int width) {
  5338. __asm {
  5339. push esi
  5340. push edi // stack args are now at [esp + 8 + n]
  5341. mov eax, [esp + 8 + 4] // src_y
  5342. mov esi, [esp + 8 + 8] // src_u
  5343. mov edx, [esp + 8 + 12] // src_v
  5344. mov edi, [esp + 8 + 16] // dst_frame
  5345. mov ecx, [esp + 8 + 20] // width
  5346. sub edx, esi // edx = src_v - src_u, so [esi + edx] follows V
  5347. convertloop:
  5348. movq xmm2, qword ptr [esi] // U
  5349. movq xmm3, qword ptr [esi + edx] // V
  5350. lea esi, [esi + 8]
  5351. punpcklbw xmm2, xmm3 // UV
  5352. movdqu xmm0, [eax] // Y
  5353. movdqa xmm1, xmm2
  5354. lea eax, [eax + 16]
  5355. punpcklbw xmm1, xmm0 // UYVY
  5356. punpckhbw xmm2, xmm0
  5357. movdqu [edi], xmm1
  5358. movdqu [edi + 16], xmm2
  5359. lea edi, [edi + 32]
  5360. sub ecx, 16
  5361. jg convertloop
  5362. pop edi
  5363. pop esi
  5364. ret
  5365. }
  5366. }
  5367. #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
  // Applies a cubic polynomial to every channel of each pixel:
  //   result = C0 + C1 * X + C2 * X^2 + C3 * X^3
  // poly holds 16 floats: four coefficients C0 at poly[0..3], then C1, C2 and
  // C3, one float per channel in each group. Results are truncated to integers
  // and packed back to bytes. Each pass handles 2 pixels (8 bytes) and the loop
  // always runs at least once.
  // NOTE(review): mulps/addps read poly through memory operands, so poly is
  // presumably required to be 16-byte aligned - confirm.
  5368. __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
  5369. uint8_t* dst_argb,
  5370. const float* poly,
  5371. int width) {
  5372. __asm {
  5373. push esi // restored before ret; stack args are now at [esp + 4 + n]
  5374. mov eax, [esp + 4 + 4] /* src_argb */
  5375. mov edx, [esp + 4 + 8] /* dst_argb */
  5376. mov esi, [esp + 4 + 12] /* poly */
  5377. mov ecx, [esp + 4 + 16] /* width */
  5378. pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
  5379. // 2 pixel loop.
  5380. convertloop:
  5381. // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
  5382. // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
  5383. movq xmm0, qword ptr [eax] // BGRABGRA
  5384. lea eax, [eax + 8]
  5385. punpcklbw xmm0, xmm3
  5386. movdqa xmm4, xmm0
  5387. punpcklwd xmm0, xmm3 // pixel 0
  5388. punpckhwd xmm4, xmm3 // pixel 1
  5389. cvtdq2ps xmm0, xmm0 // 4 floats
  5390. cvtdq2ps xmm4, xmm4
  5391. movdqa xmm1, xmm0 // X
  5392. movdqa xmm5, xmm4
  5393. mulps xmm0, [esi + 16] // C1 * X
  5394. mulps xmm4, [esi + 16]
  5395. addps xmm0, [esi] // result = C0 + C1 * X
  5396. addps xmm4, [esi]
  5397. movdqa xmm2, xmm1
  5398. movdqa xmm6, xmm5
  5399. mulps xmm2, xmm1 // X * X
  5400. mulps xmm6, xmm5
  5401. mulps xmm1, xmm2 // X * X * X
  5402. mulps xmm5, xmm6
  5403. mulps xmm2, [esi + 32] // C2 * X * X
  5404. mulps xmm6, [esi + 32]
  5405. mulps xmm1, [esi + 48] // C3 * X * X * X
  5406. mulps xmm5, [esi + 48]
  5407. addps xmm0, xmm2 // result += C2 * X * X
  5408. addps xmm4, xmm6
  5409. addps xmm0, xmm1 // result += C3 * X * X * X
  5410. addps xmm4, xmm5
  5411. cvttps2dq xmm0, xmm0 // truncate to int32
  5412. cvttps2dq xmm4, xmm4
  5413. packuswb xmm0, xmm4 // applied to int32 lanes viewed as word pairs
  5414. packuswb xmm0, xmm0 // second pack leaves 8 result bytes in the low qword
  5415. movq qword ptr [edx], xmm0
  5416. lea edx, [edx + 8]
  5417. sub ecx, 2
  5418. jg convertloop
  5419. pop esi
  5420. ret
  5421. }
  5422. }
  5423. #endif // HAS_ARGBPOLYNOMIALROW_SSE2
  5424. #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
  // Applies a cubic polynomial to every channel of each pixel:
  //   result = C0 + C1 * X + C2 * X^2 + C3 * X^3
  // poly holds four groups of 4 floats (C0, C1, C2, C3); each group is
  // broadcast to both 128-bit halves so two pixels are evaluated at once.
  // Uses fused multiply-add instructions (vfmadd132ps / vfmadd231ps).
  // Each pass handles 2 pixels (8 bytes) and the loop always runs at least
  // once.
  5425. __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
  5426. uint8_t* dst_argb,
  5427. const float* poly,
  5428. int width) {
  5429. __asm {
  5430. mov eax, [esp + 4] /* src_argb */
  5431. mov edx, [esp + 8] /* dst_argb */
  5432. mov ecx, [esp + 12] /* poly */
  5433. vbroadcastf128 ymm4, [ecx] // C0
  5434. vbroadcastf128 ymm5, [ecx + 16] // C1
  5435. vbroadcastf128 ymm6, [ecx + 32] // C2
  5436. vbroadcastf128 ymm7, [ecx + 48] // C3
  5437. mov ecx, [esp + 16] /* width */
  5438. // 2 pixel loop.
  5439. convertloop:
  5440. vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
  5441. lea eax, [eax + 8]
  5442. vcvtdq2ps ymm0, ymm0 // X 8 floats
  5443. vmulps ymm2, ymm0, ymm0 // X * X
  5444. vmulps ymm3, ymm0, ymm7 // C3 * X
  5445. vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
  5446. vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
  5447. vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
  5448. vcvttps2dq ymm0, ymm0 // truncate to int32
  5449. vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
  5450. vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
  5451. vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
  5452. vmovq qword ptr [edx], xmm0
  5453. lea edx, [edx + 8]
  5454. sub ecx, 2
  5455. jg convertloop
  5456. vzeroupper
  5457. ret
  5458. }
  5459. }
  5460. #endif // HAS_ARGBPOLYNOMIALROW_AVX2
  5461. #ifdef HAS_HALFFLOATROW_SSE2
  // kExpBias is 2^-112. Multiplying a float by it lowers the exponent so that,
  // after the bit pattern is shifted right by 13, the remaining bits form a
  // half-float (5-bit exponent, 10-bit mantissa).
  5462. static float kExpBias = 1.9259299444e-34f;
  // Converts 16-bit unsigned integers to half floats: value * scale, with the
  // mantissa truncated by the shift. Each pass handles 8 values (16 bytes) and
  // the loop always runs at least once.
  5463. __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
  5464. uint16_t* dst,
  5465. float scale,
  5466. int width) {
  5467. __asm {
  5468. mov eax, [esp + 4] /* src */
  5469. mov edx, [esp + 8] /* dst */
  5470. movd xmm4, dword ptr [esp + 12] /* scale */
  5471. mov ecx, [esp + 16] /* width */
  5472. mulss xmm4, kExpBias // fold the exponent adjustment into scale
  5473. pshufd xmm4, xmm4, 0 // broadcast to all 4 lanes
  5474. pxor xmm5, xmm5 // zero for widening shorts to ints
  5475. sub edx, eax // edx = dst - src, so [eax + edx] addresses dst
  5476. // 8 pixel loop.
  5477. convertloop:
  5478. movdqu xmm2, xmmword ptr [eax] // 8 shorts
  5479. add eax, 16
  5480. movdqa xmm3, xmm2
  5481. punpcklwd xmm2, xmm5
  5482. cvtdq2ps xmm2, xmm2 // convert low 4 ints to floats
  5483. punpckhwd xmm3, xmm5
  5484. cvtdq2ps xmm3, xmm3 // convert high 4 ints to floats
  5485. mulps xmm2, xmm4
  5486. mulps xmm3, xmm4
  5487. psrld xmm2, 13 // drop 13 low mantissa bits (truncate)
  5488. psrld xmm3, 13
  5489. packssdw xmm2, xmm3 // 8 half floats
  5490. movdqu [eax + edx - 16], xmm2 // eax already advanced by 16
  5491. sub ecx, 8
  5492. jg convertloop
  5493. ret
  5494. }
  5495. }
  5496. #endif // HAS_HALFFLOATROW_SSE2
  5497. #ifdef HAS_HALFFLOATROW_AVX2
  // Converts 16-bit unsigned integers to half floats: value * scale, with the
  // float exponent adjusted via kExpBias and the mantissa truncated by a 13-bit
  // right shift. Each pass handles 16 values (32 bytes) and the loop always
  // runs at least once.
  // NOTE(review): kExpBias is defined inside the HAS_HALFFLOATROW_SSE2 section
  // above, so this routine relies on that macro also being defined.
  5498. __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
  5499. uint16_t* dst,
  5500. float scale,
  5501. int width) {
  5502. __asm {
  5503. mov eax, [esp + 4] /* src */
  5504. mov edx, [esp + 8] /* dst */
  5505. movd xmm4, dword ptr [esp + 12] /* scale */
  5506. mov ecx, [esp + 16] /* width */
  5507. vmulss xmm4, xmm4, kExpBias // fold the exponent adjustment into scale
  5508. vbroadcastss ymm4, xmm4
  5509. vpxor ymm5, ymm5, ymm5 // zero for widening shorts to ints
  5510. sub edx, eax // edx = dst - src, so [eax + edx] addresses dst
  5511. // 16 pixel loop.
  5512. convertloop:
  5513. vmovdqu ymm2, [eax] // 16 shorts
  5514. add eax, 32
  5515. vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
  5516. vpunpcklwd ymm2, ymm2, ymm5
  5517. vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats (8 per register)
  5518. vcvtdq2ps ymm2, ymm2
  5519. vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
  5520. vmulps ymm2, ymm2, ymm4
  5521. vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
  5522. vpsrld ymm2, ymm2, 13
  5523. vpackssdw ymm2, ymm2, ymm3 // 16 half floats
  5524. vmovdqu [eax + edx - 32], ymm2 // eax already advanced by 32
  5525. sub ecx, 16
  5526. jg convertloop
  5527. vzeroupper
  5528. ret
  5529. }
  5530. }
  5531. #endif // HAS_HALFFLOATROW_AVX2
  5532. #ifdef HAS_HALFFLOATROW_F16C
  // Converts 16-bit unsigned integers to half floats using the F16C
  // instruction vcvtps2ph: dst[i] = half(src[i] * scale), rounding toward zero
  // (immediate 3). Each pass handles 16 values (32 bytes in, 32 bytes out) and
  // the loop always runs at least once.
  // Fix: the stores previously used [eax + edx + 32]. Because edx holds
  // dst - src and eax has already been advanced by 32, that addressed dst + 64.
  // The offset is now - 32, matching HalfFloatRow_SSE2 / HalfFloatRow_AVX2.
  5533. __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
  5534. uint16_t* dst,
  5535. float scale,
  5536. int width) {
  5537. __asm {
  5538. mov eax, [esp + 4] /* src */
  5539. mov edx, [esp + 8] /* dst */
  5540. vbroadcastss ymm4, [esp + 12] /* scale */
  5541. mov ecx, [esp + 16] /* width */
  5542. sub edx, eax // edx = dst - src, so [eax + edx] addresses dst
  5543. // 16 pixel loop.
  5544. convertloop:
  5545. vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
  5546. vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
  5547. add eax, 32
  5548. vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
  5549. vcvtdq2ps ymm3, ymm3
  5550. vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
  5551. vmulps ymm3, ymm3, ymm4
  5552. vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
  5553. vcvtps2ph xmm3, ymm3, 3
  5554. vmovdqu [eax + edx - 32], xmm2 // eax already advanced by 32
  5555. vmovdqu [eax + edx - 32 + 16], xmm3
  5556. sub ecx, 16
  5557. jg convertloop
  5558. vzeroupper
  5559. ret
  5560. }
  5561. }
  5562. #endif // HAS_HALFFLOATROW_F16C
  5563. #ifdef HAS_ARGBCOLORTABLEROW_X86
  5564. // Transform ARGB pixels with color table.
  // Works in place on dst_argb. Each of the 4 bytes of a pixel is replaced by
  // table_argb[value * 4 + channel], i.e. the table holds 4 interleaved
  // per-channel entries for each input value. One pixel per pass; the loop
  // always runs at least once.
  5565. __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
  5566. const uint8_t* table_argb,
  5567. int width) {
  5568. __asm {
  5569. push esi // restored before ret; stack args are now at [esp + 4 + n]
  5570. mov eax, [esp + 4 + 4] /* dst_argb */
  5571. mov esi, [esp + 4 + 8] /* table_argb */
  5572. mov ecx, [esp + 4 + 12] /* width */
  5573. // 1 pixel loop.
  5574. convertloop:
  5575. movzx edx, byte ptr [eax] // channel 0
  5576. lea eax, [eax + 4] // advance early; the pixel is now at [eax - 4]
  5577. movzx edx, byte ptr [esi + edx * 4]
  5578. mov byte ptr [eax - 4], dl
  5579. movzx edx, byte ptr [eax - 4 + 1] // channel 1
  5580. movzx edx, byte ptr [esi + edx * 4 + 1]
  5581. mov byte ptr [eax - 4 + 1], dl
  5582. movzx edx, byte ptr [eax - 4 + 2] // channel 2
  5583. movzx edx, byte ptr [esi + edx * 4 + 2]
  5584. mov byte ptr [eax - 4 + 2], dl
  5585. movzx edx, byte ptr [eax - 4 + 3] // channel 3
  5586. movzx edx, byte ptr [esi + edx * 4 + 3]
  5587. mov byte ptr [eax - 4 + 3], dl
  5588. dec ecx
  5589. jg convertloop
  5590. pop esi
  5591. ret
  5592. }
  5593. }
  5594. #endif // HAS_ARGBCOLORTABLEROW_X86
  5595. #ifdef HAS_RGBCOLORTABLEROW_X86
  5596. // Transform RGB pixels with color table.
  // Works in place on dst_argb. The first 3 bytes of each 4-byte pixel are
  // replaced by table_argb[value * 4 + channel]; the fourth byte is left
  // untouched. One pixel per pass; the loop always runs at least once.
  5597. __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
  5598. const uint8_t* table_argb,
  5599. int width) {
  5600. __asm {
  5601. push esi // restored before ret; stack args are now at [esp + 4 + n]
  5602. mov eax, [esp + 4 + 4] /* dst_argb */
  5603. mov esi, [esp + 4 + 8] /* table_argb */
  5604. mov ecx, [esp + 4 + 12] /* width */
  5605. // 1 pixel loop.
  5606. convertloop:
  5607. movzx edx, byte ptr [eax] // channel 0
  5608. lea eax, [eax + 4] // advance early; the pixel is now at [eax - 4]
  5609. movzx edx, byte ptr [esi + edx * 4]
  5610. mov byte ptr [eax - 4], dl
  5611. movzx edx, byte ptr [eax - 4 + 1] // channel 1
  5612. movzx edx, byte ptr [esi + edx * 4 + 1]
  5613. mov byte ptr [eax - 4 + 1], dl
  5614. movzx edx, byte ptr [eax - 4 + 2] // channel 2
  5615. movzx edx, byte ptr [esi + edx * 4 + 2]
  5616. mov byte ptr [eax - 4 + 2], dl
  5617. dec ecx
  5618. jg convertloop
  5619. pop esi
  5620. ret
  5621. }
  5622. }
  5623. #endif // HAS_RGBCOLORTABLEROW_X86
  5624. #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
  5625. // Transform RGB pixels with luma table.
  // For each pixel a weighted sum of its bytes is formed with the four byte
  // coefficients packed in lumacoeff (pmaddubsw + phaddw). The low 8 bits of
  // that sum are cleared, giving a multiple of 256 that is added to luma to
  // select a 256-byte sub-table. The first three bytes of the pixel are looked
  // up in that sub-table; the fourth (alpha) byte is copied unchanged.
  // Each pass handles 4 pixels (16 bytes) and the loop always runs at least
  // once.
  5626. __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
  5627. uint8_t* dst_argb,
  5628. int width,
  5629. const uint8_t* luma,
  5630. uint32_t lumacoeff) {
  5631. __asm {
  5632. push esi
  5633. push edi // stack args are now at [esp + 8 + n]
  5634. mov eax, [esp + 8 + 4] /* src_argb */
  5635. mov edi, [esp + 8 + 8] /* dst_argb */
  5636. mov ecx, [esp + 8 + 12] /* width */
  5637. movd xmm2, dword ptr [esp + 8 + 16] // luma table
  5638. movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
  5639. pshufd xmm2, xmm2, 0 // broadcast table base pointer
  5640. pshufd xmm3, xmm3, 0 // broadcast coefficients
  5641. pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
  5642. psllw xmm4, 8
  5643. pxor xmm5, xmm5 // zero for widening words to dwords
  5644. // 4 pixel loop.
  5645. convertloop:
  5646. movdqu xmm0, xmmword ptr [eax] // generate luma ptr
  5647. pmaddubsw xmm0, xmm3 // pixel bytes * coefficients, adjacent pairs summed
  5648. phaddw xmm0, xmm0 // one weighted sum per pixel
  5649. pand xmm0, xmm4 // mask out low bits
  5650. punpcklwd xmm0, xmm5 // widen to 32 bit offsets
  5651. paddd xmm0, xmm2 // add table base
  5652. movd esi, xmm0 // sub-table pointer for pixel 0
  5653. pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
  5654. movzx edx, byte ptr [eax]
  5655. movzx edx, byte ptr [esi + edx]
  5656. mov byte ptr [edi], dl
  5657. movzx edx, byte ptr [eax + 1]
  5658. movzx edx, byte ptr [esi + edx]
  5659. mov byte ptr [edi + 1], dl
  5660. movzx edx, byte ptr [eax + 2]
  5661. movzx edx, byte ptr [esi + edx]
  5662. mov byte ptr [edi + 2], dl
  5663. movzx edx, byte ptr [eax + 3] // copy alpha.
  5664. mov byte ptr [edi + 3], dl
  5665. movd esi, xmm0 // sub-table pointer for pixel 1
  5666. pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
  5667. movzx edx, byte ptr [eax + 4]
  5668. movzx edx, byte ptr [esi + edx]
  5669. mov byte ptr [edi + 4], dl
  5670. movzx edx, byte ptr [eax + 5]
  5671. movzx edx, byte ptr [esi + edx]
  5672. mov byte ptr [edi + 5], dl
  5673. movzx edx, byte ptr [eax + 6]
  5674. movzx edx, byte ptr [esi + edx]
  5675. mov byte ptr [edi + 6], dl
  5676. movzx edx, byte ptr [eax + 7] // copy alpha.
  5677. mov byte ptr [edi + 7], dl
  5678. movd esi, xmm0 // sub-table pointer for pixel 2
  5679. pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
  5680. movzx edx, byte ptr [eax + 8]
  5681. movzx edx, byte ptr [esi + edx]
  5682. mov byte ptr [edi + 8], dl
  5683. movzx edx, byte ptr [eax + 9]
  5684. movzx edx, byte ptr [esi + edx]
  5685. mov byte ptr [edi + 9], dl
  5686. movzx edx, byte ptr [eax + 10]
  5687. movzx edx, byte ptr [esi + edx]
  5688. mov byte ptr [edi + 10], dl
  5689. movzx edx, byte ptr [eax + 11] // copy alpha.
  5690. mov byte ptr [edi + 11], dl
  5691. movd esi, xmm0 // sub-table pointer for pixel 3
  5692. movzx edx, byte ptr [eax + 12]
  5693. movzx edx, byte ptr [esi + edx]
  5694. mov byte ptr [edi + 12], dl
  5695. movzx edx, byte ptr [eax + 13]
  5696. movzx edx, byte ptr [esi + edx]
  5697. mov byte ptr [edi + 13], dl
  5698. movzx edx, byte ptr [eax + 14]
  5699. movzx edx, byte ptr [esi + edx]
  5700. mov byte ptr [edi + 14], dl
  5701. movzx edx, byte ptr [eax + 15] // copy alpha.
  5702. mov byte ptr [edi + 15], dl
  5703. lea eax, [eax + 16]
  5704. lea edi, [edi + 16]
  5705. sub ecx, 4
  5706. jg convertloop
  5707. pop edi
  5708. pop esi
  5709. ret
  5710. }
  5711. }
  5712. #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
  5713. #endif // defined(_M_X64)
  5714. #ifdef __cplusplus
  5715. } // extern "C"
  5716. } // namespace libyuv
  5717. #endif
  5718. #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))