scale_win.cc

/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant added before the >> 2 in the 3/4 box filters.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
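
// The kScaleAc33 / kScaleAb2 tables above turn a division into a pmulhuw:
// multiplying a 16-bit sum by 65536 / n and keeping the high 16 bits gives
// approximately sum / n.  A scalar sketch of the same idea (illustrative
// only, not part of the build):
//   uint16 div9(uint16 sum) { return (uint16)(((uint32)sum * (65536 / 9)) >> 16); }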

// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked)
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8  // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop
    ret
  }
}
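
// The Linear and Box variants below use pmaddubsw with a 0x0101 constant to
// add horizontal pairs of pixels, then pavgw against zero to round and halve.
// Scalar sketch of the Linear result (illustrative only, not part of the
// build):
//   dst_ptr[i] = (uint8)((src_ptr[2 * i] + src_ptr[2 * i + 1] + 1) >> 1);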

// Blends 32x1 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    packuswb xmm4, xmm4
    pxor xmm5, xmm5  // constant 0
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pavgw xmm0, xmm5  // (x + 1) / 2
    pavgw xmm1, xmm5
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop
    ret
  }
}

// Blends 32x2 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    packuswb xmm4, xmm4
    pxor xmm5, xmm5  // constant 0
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // vertical add
    paddw xmm1, xmm3
    psrlw xmm0, 1
    psrlw xmm1, 1
    pavgw xmm0, xmm5  // (x + 1) / 2
    pavgw xmm1, xmm5
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop
    pop esi
    ret
  }
}
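
// Scalar reference for the 2x2 box filter above (illustrative only, not part
// of the build; row0 = src_ptr, row1 = src_ptr + src_stride):
//   dst_ptr[i] = (uint8)((row0[2 * i] + row0[2 * i + 1] +
//                         row1[2 * i] + row1[2 * i + 1] + 2) >> 2);
// The SIMD code reaches the same rounding by halving the 4-pixel sum and then
// using pavgw, as the comment before ScaleRowDown2Box_AVX2 notes below.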

#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked)
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpsrlw ymm0, ymm0, 8  // isolate odd pixels.
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop
    vzeroupper
    ret
  }
}

// Blends 64x1 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    vpcmpeqb ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw ymm4, ymm4, 15
    vpackuswb ymm4, ymm4, ymm4
    vpxor ymm5, ymm5, ymm5  // constant 0
  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpavgw ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop
    vzeroupper
    ret
  }
}

// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
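// e.g. sum = 7: (7 + 2) / 4 = 2, and average(7 >> 1, 0) = (3 + 0 + 1) >> 1 = 2.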
// Blends 64x2 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    vpcmpeqb ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw ymm4, ymm4, 15
    vpackuswb ymm4, ymm4, ymm4
    vpxor ymm5, ymm5, ymm5  // constant 0
  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + esi]
    vmovdqu ymm3, [eax + esi + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // vertical add
    vpaddw ymm1, ymm1, ymm3
    vpsrlw ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
    vpsrlw ymm1, ymm1, 1
    vpavgw ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

// Point samples 32 pixels to 8 pixels.
__declspec(naked)
void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff0000
    psrld xmm5, 24
    pslld xmm5, 16
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm0, 8
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg wloop
    ret
  }
}
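
// The 4x4 box filter below sums each 4x4 block of source pixels into a 16-bit
// word, then adds 8 and shifts right by 4 to round.  Scalar sketch for output
// pixel i (illustrative only, not part of the build):
//   int sum = 0;
//   for (int y = 0; y < 4; ++y)
//     for (int x = 0; x < 4; ++x)
//       sum += src_ptr[y * src_stride + 4 * i + x];
//   dst_ptr[i] = (uint8)((sum + 8) >> 4);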

// Blends 32x4 rectangle to 8x1.
__declspec(naked)
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_ptr
    mov esi, [esp + 8 + 8]  // src_stride
    mov edx, [esp + 8 + 12]  // dst_ptr
    mov ecx, [esp + 8 + 16]  // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    movdqa xmm5, xmm4
    packuswb xmm4, xmm4
    psllw xmm5, 3  // constant 0x0008
  wloop:
    movdqu xmm0, [eax]  // average rows
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // vertical add rows 0, 1
    paddw xmm1, xmm3
    movdqu xmm2, [eax + esi * 2]
    movdqu xmm3, [eax + esi * 2 + 16]
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // add row 2
    paddw xmm1, xmm3
    movdqu xmm2, [eax + edi]
    movdqu xmm3, [eax + edi + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // add row 3
    paddw xmm1, xmm3
    phaddw xmm0, xmm1
    paddw xmm0, xmm5  // + 8 for round
    psrlw xmm0, 4  // /16 for average of 4 * 4
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg wloop
    pop edi
    pop esi
    ret
  }
}

#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked)
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff0000
    vpsrld ymm5, ymm5, 24
    vpslld ymm5, ymm5, 16
  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpand ymm0, ymm0, ymm5
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vpsrlw ymm0, ymm0, 8
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop
    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_ptr
    mov esi, [esp + 8 + 8]  // src_stride
    mov edx, [esp + 8 + 12]  // dst_ptr
    mov ecx, [esp + 8 + 16]  // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb ymm4, ymm4, ymm4  // constant 0x0101
    vpsrlw ymm4, ymm4, 15
    vpsllw ymm5, ymm4, 3  // constant 0x0008
    vpackuswb ymm4, ymm4, ymm4
  wloop:
    vmovdqu ymm0, [eax]  // average rows
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + esi]
    vmovdqu ymm3, [eax + esi + 32]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // vertical add rows 0, 1
    vpaddw ymm1, ymm1, ymm3
    vmovdqu ymm2, [eax + esi * 2]
    vmovdqu ymm3, [eax + esi * 2 + 32]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // add row 2
    vpaddw ymm1, ymm1, ymm3
    vmovdqu ymm2, [eax + edi]
    vmovdqu ymm3, [eax + edi + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // add row 3
    vpaddw ymm1, ymm1, ymm3
    vphaddw ymm0, ymm0, ymm1  // mutates
    vpermq ymm0, ymm0, 0xd8  // unmutate vphaddw
    vpaddw ymm0, ymm0, ymm5  // + 8 for round
    vpsrlw ymm0, ymm0, 4  // /16 for average of 4 * 4
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop
    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2
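
// The 3/4 scalers below produce 3 output pixels from every 4 source pixels.
// The box-filtered versions weight neighboring pixels 3:1, 2:2 and 1:3
// (kMadd01/kMadd11/kMadd21), add kRound34 and shift right by 2.  Scalar
// sketch of one output triple (illustrative only, not part of the build):
//   dst[0] = (uint8)((3 * src[0] + 1 * src[1] + 2) >> 2);
//   dst[1] = (uint8)((2 * src[1] + 2 * src[2] + 2) >> 2);
//   dst[2] = (uint8)((1 * src[2] + 3 * src[3] + 2) >> 2);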

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
__declspec(naked)
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    movdqa xmm3, xmmword ptr kShuf0
    movdqa xmm4, xmmword ptr kShuf1
    movdqa xmm5, xmmword ptr kShuf2
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm1
    palignr xmm1, xmm0, 8
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop
    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Register usage:
//   xmm0 src_row 0
//   xmm1 src_row 1
//   xmm2 shuf 0
//   xmm3 shuf 1
//   xmm4 shuf 2
//   xmm5 madd 0
//   xmm6 madd 1
//   xmm7 kRound34
// Note that movdqa+palign may be better than movdqu.
__declspec(naked)
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShuf01
    movdqa xmm3, xmmword ptr kShuf11
    movdqa xmm4, xmmword ptr kShuf21
    movdqa xmm5, xmmword ptr kMadd01
    movdqa xmm6, xmmword ptr kMadd11
    movdqa xmm7, xmmword ptr kRound34
  wloop:
    movdqu xmm0, [eax]  // pixels 0..7
    movdqu xmm1, [eax + esi]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]  // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqu xmm0, [eax + 16]  // pixels 16..23
    movdqu xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, xmmword ptr kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop
    pop esi
    ret
  }
}
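
// ScaleRowDown34_1_Box above blends the two source rows equally with a single
// pavgb; ScaleRowDown34_0_Box below applies pavgb twice, weighting the rows
// roughly 3:1.  Scalar sketch of the vertical step (illustrative only):
//   t = (uint8)((row1[i] + row0[i] + 1) >> 1);  // first pavgb
//   v = (uint8)((row0[i] + t + 1) >> 1);        // second pavgb: ~(3 * row0 + row1) / 4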

// Note that movdqa+palign may be better than movdqu.
__declspec(naked)
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShuf01
    movdqa xmm3, xmmword ptr kShuf11
    movdqa xmm4, xmmword ptr kShuf21
    movdqa xmm5, xmmword ptr kMadd01
    movdqa xmm6, xmmword ptr kMadd11
    movdqa xmm7, xmmword ptr kRound34
  wloop:
    movdqu xmm0, [eax]  // pixels 0..7
    movdqu xmm1, [eax + esi]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]  // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqu xmm0, [eax + 16]  // pixels 16..23
    movdqu xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, xmmword ptr kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop
    pop esi
    ret
  }
}
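
// The 3/8 point sampler below keeps 6 of every 16 source bytes, picking
// offsets 0, 3, 6, 8, 11 and 14 via kShuf38a/kShuf38b.  Scalar sketch
// (illustrative only, not part of the build):
//   static const int kOffsets38[6] = { 0, 3, 6, 8, 11, 14 };
//   dst_ptr[6 * i + j] = src_ptr[16 * i + kOffsets38[j]];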

// 3/8 point sampler
// Scale 32 pixels to 12
__declspec(naked)
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    movdqa xmm4, xmmword ptr kShuf38a
    movdqa xmm5, xmmword ptr kShuf38b
  xloop:
    movdqu xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
    movdqu xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
    lea eax, [eax + 32]
    pshufb xmm0, xmm4
    pshufb xmm1, xmm5
    paddusb xmm0, xmm1
    movq qword ptr [edx], xmm0  // write 12 pixels
    movhlps xmm1, xmm0
    movd [edx + 8], xmm1
    lea edx, [edx + 12]
    sub ecx, 12
    jg xloop
    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked)
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShufAc
    movdqa xmm3, xmmword ptr kShufAc3
    movdqa xmm4, xmmword ptr kScaleAc33
    pxor xmm5, xmm5
  xloop:
    movdqu xmm0, [eax]  // sum up 3 rows into xmm0/1
    movdqu xmm6, [eax + esi]
    movhlps xmm1, xmm0
    movhlps xmm7, xmm6
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7
    movdqu xmm6, [eax + esi * 2]
    lea eax, [eax + 16]
    movhlps xmm7, xmm6
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7
    movdqa xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    pshufb xmm6, xmm2
    movdqa xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    pshufb xmm7, xmm3
    paddusw xmm6, xmm7
    pmulhuw xmm6, xmm4  // divide by 9,9,6, 9,9,6
    packuswb xmm6, xmm6
    movd [edx], xmm6  // write 6 pixels
    psrlq xmm6, 16
    movd [edx + 2], xmm6
    lea edx, [edx + 6]
    sub ecx, 6
    jg xloop
    pop esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked)
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShufAb0
    movdqa xmm3, xmmword ptr kShufAb1
    movdqa xmm4, xmmword ptr kShufAb2
    movdqa xmm5, xmmword ptr kScaleAb2
  xloop:
    movdqu xmm0, [eax]  // average 2 rows into xmm0
    movdqu xmm1, [eax + esi]
    lea eax, [eax + 16]
    pavgb xmm0, xmm1
    movdqa xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb xmm1, xmm2
    movdqa xmm6, xmm0
    pshufb xmm6, xmm3
    paddusw xmm1, xmm6
    pshufb xmm0, xmm4
    paddusw xmm1, xmm0
    pmulhuw xmm1, xmm5  // divide by 3,3,2, 3,3,2
    packuswb xmm1, xmm1
    movd [edx], xmm1  // write 6 pixels
    psrlq xmm1, 16
    movd [edx + 2], xmm1
    lea edx, [edx + 6]
    sub ecx, 6
    jg xloop
    pop esi
    ret
  }
}
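
// ScaleAddRow accumulates a row of 8-bit pixels into a row of 16-bit sums,
// saturating at 65535 (paddusw / vpaddusw).  Scalar sketch (illustrative
// only, not part of the build):
//   for (int i = 0; i < src_width; ++i) {
//     int sum = dst_ptr[i] + src_ptr[i];
//     dst_ptr[i] = (uint16)(sum > 65535 ? 65535 : sum);
//   }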

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked)
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    mov edx, [esp + 8]  // dst_ptr
    mov ecx, [esp + 12]  // src_width
    pxor xmm5, xmm5
    // sum rows
  xloop:
    movdqu xmm3, [eax]  // read 16 bytes
    lea eax, [eax + 16]
    movdqu xmm0, [edx]  // read 16 words from destination
    movdqu xmm1, [edx + 16]
    movdqa xmm2, xmm3
    punpcklbw xmm2, xmm5
    punpckhbw xmm3, xmm5
    paddusw xmm0, xmm2  // sum 16 words
    paddusw xmm1, xmm3
    movdqu [edx], xmm0  // write 16 words to destination
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 16
    jg xloop
    ret
  }
}

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked)
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    mov edx, [esp + 8]  // dst_ptr
    mov ecx, [esp + 12]  // src_width
    vpxor ymm5, ymm5, ymm5
    // sum rows
  xloop:
    vmovdqu ymm3, [eax]  // read 32 bytes
    lea eax, [eax + 32]
    vpermq ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw ymm2, ymm3, ymm5
    vpunpckhbw ymm3, ymm3, ymm5
    vpaddusw ymm0, ymm2, [edx]  // sum 16 words
    vpaddusw ymm1, ymm3, [edx + 32]
    vmovdqu [edx], ymm0  // write 32 words to destination
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 32
    jg xloop
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
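
// The bilinear column filter below biases pixels by -0x80 (kFsub80) so the
// pmaddubsw of pixel * weight cannot saturate, then adds kFadd40 to undo the
// bias and round: with weights f and 128 - f summing to 128, the bias
// contributes 0x80 * 128 = 0x4000, and the extra 0x40 is the rounding half
// added before the final >> 7.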

// Bilinear column filtering. SSSE3 version.
__declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push ebx
    push esi
    push edi
    mov edi, [esp + 12 + 4]  // dst_ptr
    mov esi, [esp + 12 + 8]  // src_ptr
    mov ecx, [esp + 12 + 12]  // dst_width
    movd xmm2, [esp + 12 + 16]  // x
    movd xmm3, [esp + 12 + 20]  // dx
    mov eax, 0x04040000  // shuffle to line up fractions with pixel.
    movd xmm5, eax
    pcmpeqb xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pcmpeqb xmm7, xmm7  // generate 0x0001
    psrlw xmm7, 15
    pextrw eax, xmm2, 1  // get x0 integer. preroll
    sub ecx, 2
    jl xloop29
    movdqa xmm0, xmm2  // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0  // x0 x1
    punpckldq xmm3, xmm3  // dx dx
    paddd xmm3, xmm3  // dx * 2, dx * 2
    pextrw edx, xmm2, 3  // get x1 integer. preroll
    // 2 Pixel loop.
  xloop2:
    movdqa xmm1, xmm2  // x0, x1 fractions.
    paddd xmm2, xmm3  // x += dx
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm1, 9  // 7 bit fractions.
    movzx ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd xmm4, ebx
    pshufb xmm1, xmm5  // 0011
    punpcklwd xmm0, xmm4
    psubb xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor xmm1, xmm6  // 0..7f and 7f..0
    paddusb xmm1, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw xmm1, xmm0  // 16 bit, 2 pixels.
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    paddw xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw xmm1, 7  // 8.7 fixed point to low 8 bits.
    packuswb xmm1, xmm1  // 8 bits, 2 pixels.
    movd ebx, xmm1
    mov [edi], bx
    lea edi, [edi + 2]
    sub ecx, 2  // 2 pixels
    jge xloop2
  xloop29:
    add ecx, 2 - 1
    jl xloop99
    // 1 pixel remainder
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm2, 9  // 7 bit fractions.
    pshufb xmm2, xmm5  // 0011
    psubb xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor xmm2, xmm6  // 0..7f and 7f..0
    paddusb xmm2, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw xmm2, xmm0  // 16 bit
    paddw xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw xmm2, 7  // 8.7 fixed point to low 8 bits.
    packuswb xmm2, xmm2  // 8 bits
    movd ebx, xmm2
    mov [edi], bl
  xloop99:
    pop edi
    pop esi
    pop ebx
    ret
  }
}
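
// x and dx above are 16.16 fixed point source positions: the integer pixel is
// x >> 16 and the top 7 bits of the fraction select the blend weight.  Scalar
// sketch of one output pixel (illustrative only, not part of the build):
//   int xi = x >> 16;
//   int f = (x >> 9) & 0x7f;
//   dst_ptr[i] = (uint8)((src_ptr[xi] * (128 - f) + src_ptr[xi + 1] * f + 64) >> 7);
//   x += dx;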

// Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked)
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov edx, [esp + 4]  // dst_ptr
    mov eax, [esp + 8]  // src_ptr
    mov ecx, [esp + 12]  // dst_width
  wloop:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0
    punpckhbw xmm1, xmm1
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop
    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
__declspec(naked)
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    shufps xmm0, xmm1, 0xdd
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop
    ret
  }
}

// Blends 8x1 rectangle to 4x1.
__declspec(naked)
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop
    ret
  }
}

// Blends 8x2 rectangle to 4x1.
__declspec(naked)
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // dst_width
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop
    pop esi
    ret
  }
}

// Reads 4 pixels at a time.
__declspec(naked)
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push ebx
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    // src_stride ignored
    mov ebx, [esp + 8 + 12]  // src_stepx
    mov edx, [esp + 8 + 16]  // dst_argb
    mov ecx, [esp + 8 + 20]  // dst_width
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]
  wloop:
    movd xmm0, [eax]
    movd xmm1, [eax + ebx]
    punpckldq xmm0, xmm1
    movd xmm2, [eax + ebx * 2]
    movd xmm3, [eax + edi]
    lea eax, [eax + ebx * 4]
    punpckldq xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop
    pop edi
    pop ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
__declspec(naked)
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]  // src_argb
    mov esi, [esp + 12 + 8]  // src_stride
    mov ebx, [esp + 12 + 12]  // src_stepx
    mov edx, [esp + 12 + 16]  // dst_argb
    mov ecx, [esp + 12 + 20]  // dst_width
    lea esi, [eax + esi]  // row1 pointer
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]
  wloop:
    movq xmm0, qword ptr [eax]  // row0 4 pairs
    movhps xmm0, qword ptr [eax + ebx]
    movq xmm1, qword ptr [eax + ebx * 2]
    movhps xmm1, qword ptr [eax + edi]
    lea eax, [eax + ebx * 4]
    movq xmm2, qword ptr [esi]  // row1 4 pairs
    movhps xmm2, qword ptr [esi + ebx]
    movq xmm3, qword ptr [esi + ebx * 2]
    movhps xmm3, qword ptr [esi + edi]
    lea esi, [esi + ebx * 4]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop
    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked)
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push edi
    push esi
    mov edi, [esp + 8 + 4]  // dst_argb
    mov esi, [esp + 8 + 8]  // src_argb
    mov ecx, [esp + 8 + 12]  // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx
    pshufd xmm2, xmm2, 0  // x0 x0 x0 x0
    pshufd xmm0, xmm3, 0x11  // dx 0 dx 0
    paddd xmm2, xmm0
    paddd xmm3, xmm3  // 0, 0, 0, dx * 2
    pshufd xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
    paddd xmm2, xmm0  // x3 x2 x1 x0
    paddd xmm3, xmm3  // 0, 0, 0, dx * 4
    pshufd xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4
    pextrw eax, xmm2, 1  // get x0 integer.
    pextrw edx, xmm2, 3  // get x1 integer.
    cmp ecx, 0
    jle xloop99
    sub ecx, 4
    jl xloop49
    // 4 Pixel loop.
  xloop4:
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5  // get x2 integer.
    pextrw edx, xmm2, 7  // get x3 integer.
    paddd xmm2, xmm3  // x += dx
    punpckldq xmm0, xmm1  // x0 x1
    movd xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    punpckldq xmm1, xmm4  // x2 x3
    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4  // 4 pixels
    jge xloop4
  xloop49:
    test ecx, 2
    je xloop29
    // 2 Pixels.
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5  // get x2 integer.
    punpckldq xmm0, xmm1  // x0 x1
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]
  xloop29:
    test ecx, 1
    je xloop99
    // 1 Pixel.
    movd xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd dword ptr [edi], xmm0
  xloop99:
    pop esi
    pop edi
    ret
  }
}

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
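
// ScaleARGBFilterCols below blends two whole ARGB pixels per output pixel:
// kShuffleColARGB interleaves the channels of the two source pixels so a
// single pmaddubsw applies the 7-bit weights 127 - f and f to every channel,
// where f is the top 7 bits of the 16.16 fraction of x.  Scalar sketch per
// channel (illustrative only, not part of the build):
//   dst_c = (uint8)((src0_c * (127 - f) + src1_c * f) >> 7);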
__declspec(naked)
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]  // dst_argb
    mov esi, [esp + 8 + 8]  // src_argb
    mov ecx, [esp + 8 + 12]  // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx
    movdqa xmm4, xmmword ptr kShuffleColARGB
    movdqa xmm5, xmmword ptr kShuffleFractions
    pcmpeqb xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1  // get x0 integer. preroll
    sub ecx, 2
    jl xloop29
    movdqa xmm0, xmm2  // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0  // x0 x1
    punpckldq xmm3, xmm3  // dx dx
    paddd xmm3, xmm3  // dx * 2, dx * 2
    pextrw edx, xmm2, 3  // get x1 integer. preroll
    // 2 Pixel loop.
  xloop2:
    movdqa xmm1, xmm2  // x0, x1 fractions.
    paddd xmm2, xmm3  // x += dx
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw xmm1, 9  // 7 bit fractions.
    movhps xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb xmm1, xmm5  // 0000000011111111
    pshufb xmm0, xmm4  // arrange pixels into pairs
    pxor xmm1, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    psrlw xmm0, 7  // argb 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]
    sub ecx, 2  // 2 pixels
    jge xloop2
  xloop29:
    add ecx, 2 - 1
    jl xloop99
    // 1 pixel remainder
    psrlw xmm2, 9  // 7 bit fractions.
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb xmm2, xmm5  // 00000000
    pshufb xmm0, xmm4  // arrange pixels into pairs
    pxor xmm2, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2  // argb 16 bit, 1 pixel.
    psrlw xmm0, 7
    packuswb xmm0, xmm0  // argb 8 bits, 1 pixel.
    movd [edi], xmm0
  xloop99:
    pop edi
    pop esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked)
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov edx, [esp + 4]  // dst_argb
    mov eax, [esp + 8]  // src_argb
    mov ecx, [esp + 12]  // dst_width
  wloop:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpckldq xmm0, xmm0
    punpckhdq xmm1, xmm1
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg wloop
    ret
  }
}
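
// FixedDiv returns (num << 16) / div, e.g. FixedDiv_X86(1, 2) == 0x8000 and
// FixedDiv_X86(640, 480) == 0x15555 (about 1.333 in 16.16 fixed point).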
// Divide num by div and return as 16.16 fixed point result.
__declspec(naked)
int FixedDiv_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    cdq  // extend num to 64 bits
    shld edx, eax, 16  // 32.16
    shl eax, 16
    idiv dword ptr [esp + 8]
    ret
  }
}

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked)
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    mov ecx, [esp + 8]  // denom
    cdq  // extend num to 64 bits
    shld edx, eax, 16  // 32.16
    shl eax, 16
    sub eax, 0x00010001
    sbb edx, 0
    sub ecx, 1
    idiv ecx
    ret
  }
}

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif