vp8_asm_stubs.c 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460
  1. /*
  2. * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "vpx_config.h"
  11. #include "vp8_rtcd.h"
  12. #include "vpx_ports/mem.h"
  13. #include "filter_x86.h"
  /*
   * Six-tap coefficient table, defined outside this file. The predictors in
   * this file pick one of its 8 rows with xoffset / yoffset and pass that row
   * to the kernels declared below as their vp8_filter argument.
   */
  extern const short vp8_six_tap_mmx[8][6 * 8];

  /*
   * Filter kernels implemented outside this file (presumably in the x86
   * assembly sources -- confirm against the build). Only their prototypes
   * are visible here.
   */

  /* MMX: 8-bit source -> 16-bit intermediate buffer. */
  extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr,
                                        unsigned short *output_ptr,
                                        unsigned int src_pixels_per_line,
                                        unsigned int pixel_step,
                                        unsigned int output_height,
                                        unsigned int output_width,
                                        const short *vp8_filter);

  /* MMX: 16-bit intermediate buffer -> 8-bit destination. */
  extern void vp8_filter_block1dc_v6_mmx(
      unsigned short *src_ptr, unsigned char *output_ptr, int output_pitch,
      unsigned int pixels_per_line, unsigned int pixel_step,
      unsigned int output_height, unsigned int output_width,
      const short *vp8_filter);

  /* SSE2: 8-bit source -> 16-bit intermediate buffer. */
  extern void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr,
                                          unsigned short *output_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned int pixel_step,
                                          unsigned int output_height,
                                          unsigned int output_width,
                                          const short *vp8_filter);
  extern void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr,
                                           unsigned short *output_ptr,
                                           unsigned int src_pixels_per_line,
                                           unsigned int pixel_step,
                                           unsigned int output_height,
                                           unsigned int output_width,
                                           const short *vp8_filter);

  /* SSE2: 16-bit intermediate buffer -> 8-bit destination. */
  extern void vp8_filter_block1d8_v6_sse2(
      unsigned short *src_ptr, unsigned char *output_ptr, int dst_pitch,
      unsigned int pixels_per_line, unsigned int pixel_step,
      unsigned int output_height, unsigned int output_width,
      const short *vp8_filter);
  extern void vp8_filter_block1d16_v6_sse2(
      unsigned short *src_ptr, unsigned char *output_ptr, int dst_pitch,
      unsigned int pixels_per_line, unsigned int pixel_step,
      unsigned int output_height, unsigned int output_width,
      const short *vp8_filter);

  /*
   * SSE2: fills a 16-bit buffer from an 8-bit source without taking a filter
   * argument; used by the 16x16 predictor in place of the first pass when
   * xoffset is 0.
   */
  extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
                                           unsigned short *output_ptr,
                                           unsigned int src_pixels_per_line,
                                           unsigned int output_height,
                                           unsigned int output_width);

  /*
   * SSE2 single-pass kernels: 8-bit source straight to the 8-bit
   * destination, with no intermediate buffer in the signature.
   */
  extern void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
                                               unsigned int src_pixels_per_line,
                                               unsigned char *output_ptr,
                                               int dst_pitch,
                                               unsigned int output_height,
                                               const short *vp8_filter);
  extern void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
                                                unsigned int src_pixels_per_line,
                                                unsigned char *output_ptr,
                                                int dst_pitch,
                                                unsigned int output_height,
                                                const short *vp8_filter);
  extern void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
                                               unsigned int src_pixels_per_line,
                                               unsigned char *output_ptr,
                                               int dst_pitch,
                                               unsigned int output_height,
                                               const short *vp8_filter);
  74. #if HAVE_MMX
  75. void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
  76. int xoffset, int yoffset, unsigned char *dst_ptr,
  77. int dst_pitch) {
  78. DECLARE_ALIGNED(16, unsigned short,
  79. FData2[16 * 16]); /* Temp data bufffer used in filtering */
  80. const short *HFilter, *VFilter;
  81. HFilter = vp8_six_tap_mmx[xoffset];
  82. vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
  83. src_pixels_per_line, 1, 9, 8, HFilter);
  84. VFilter = vp8_six_tap_mmx[yoffset];
  85. vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4,
  86. VFilter);
  87. }
  88. void vp8_sixtap_predict16x16_mmx(unsigned char *src_ptr,
  89. int src_pixels_per_line, int xoffset,
  90. int yoffset, unsigned char *dst_ptr,
  91. int dst_pitch) {
  92. DECLARE_ALIGNED(16, unsigned short,
  93. FData2[24 * 24]); /* Temp data bufffer used in filtering */
  94. const short *HFilter, *VFilter;
  95. HFilter = vp8_six_tap_mmx[xoffset];
  96. vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
  97. src_pixels_per_line, 1, 21, 32, HFilter);
  98. vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4,
  99. src_pixels_per_line, 1, 21, 32, HFilter);
  100. vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8,
  101. src_pixels_per_line, 1, 21, 32, HFilter);
  102. vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,
  103. FData2 + 12, src_pixels_per_line, 1, 21, 32,
  104. HFilter);
  105. VFilter = vp8_six_tap_mmx[yoffset];
  106. vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, 16,
  107. VFilter);
  108. vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16, 16,
  109. 16, VFilter);
  110. vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16, 16,
  111. 16, VFilter);
  112. vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16, 16,
  113. 16, VFilter);
  114. }
  115. void vp8_sixtap_predict8x8_mmx(unsigned char *src_ptr, int src_pixels_per_line,
  116. int xoffset, int yoffset, unsigned char *dst_ptr,
  117. int dst_pitch) {
  118. DECLARE_ALIGNED(16, unsigned short,
  119. FData2[256]); /* Temp data bufffer used in filtering */
  120. const short *HFilter, *VFilter;
  121. HFilter = vp8_six_tap_mmx[xoffset];
  122. vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
  123. src_pixels_per_line, 1, 13, 16, HFilter);
  124. vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4,
  125. src_pixels_per_line, 1, 13, 16, HFilter);
  126. VFilter = vp8_six_tap_mmx[yoffset];
  127. vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8, 8,
  128. VFilter);
  129. vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8, 8, 8,
  130. VFilter);
  131. }
  132. void vp8_sixtap_predict8x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
  133. int xoffset, int yoffset, unsigned char *dst_ptr,
  134. int dst_pitch) {
  135. DECLARE_ALIGNED(16, unsigned short,
  136. FData2[256]); /* Temp data bufffer used in filtering */
  137. const short *HFilter, *VFilter;
  138. HFilter = vp8_six_tap_mmx[xoffset];
  139. vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
  140. src_pixels_per_line, 1, 9, 16, HFilter);
  141. vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4,
  142. src_pixels_per_line, 1, 9, 16, HFilter);
  143. VFilter = vp8_six_tap_mmx[yoffset];
  144. vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4, 8,
  145. VFilter);
  146. vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8, 4, 8,
  147. VFilter);
  148. }
  /*
   * 16x16 bilinear prediction built from four calls to
   * vp8_bilinear_predict8x8_mmx, one per 8x8 quadrant. Quadrants are visited
   * top-left, top-right, bottom-left, bottom-right; every call receives the
   * same xoffset, yoffset and pitches.
   */
  void vp8_bilinear_predict16x16_mmx(unsigned char *src_ptr,
                                     int src_pixels_per_line, int xoffset,
                                     int yoffset, unsigned char *dst_ptr,
                                     int dst_pitch) {
    int row;
    int col;

    for (row = 0; row < 16; row += 8) {
      for (col = 0; col < 16; col += 8) {
        vp8_bilinear_predict8x8_mmx(src_ptr + row * src_pixels_per_line + col,
                                    src_pixels_per_line, xoffset, yoffset,
                                    dst_ptr + row * dst_pitch + col, dst_pitch);
      }
    }
  }
  164. #endif
  165. #if HAVE_SSE2
  166. void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
  167. int src_pixels_per_line, int xoffset,
  168. int yoffset, unsigned char *dst_ptr,
  169. int dst_pitch
  170. ) {
  171. DECLARE_ALIGNED(16, unsigned short,
  172. FData2[24 * 24]); /* Temp data bufffer used in filtering */
  173. const short *HFilter, *VFilter;
  174. if (xoffset) {
  175. if (yoffset) {
  176. HFilter = vp8_six_tap_mmx[xoffset];
  177. vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
  178. src_pixels_per_line, 1, 21, 32, HFilter);
  179. VFilter = vp8_six_tap_mmx[yoffset];
  180. vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
  181. dst_pitch, VFilter);
  182. } else {
  183. /* First-pass only */
  184. HFilter = vp8_six_tap_mmx[xoffset];
  185. vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
  186. dst_pitch, 16, HFilter);
  187. }
  188. } else {
  189. /* Second-pass only */
  190. VFilter = vp8_six_tap_mmx[yoffset];
  191. vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
  192. src_pixels_per_line, 21, 32);
  193. vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
  194. dst_pitch, VFilter);
  195. }
  196. }
  197. void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line,
  198. int xoffset, int yoffset,
  199. unsigned char *dst_ptr, int dst_pitch) {
  200. DECLARE_ALIGNED(16, unsigned short,
  201. FData2[256]); /* Temp data bufffer used in filtering */
  202. const short *HFilter, *VFilter;
  203. if (xoffset) {
  204. if (yoffset) {
  205. HFilter = vp8_six_tap_mmx[xoffset];
  206. vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
  207. src_pixels_per_line, 1, 13, 16, HFilter);
  208. VFilter = vp8_six_tap_mmx[yoffset];
  209. vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8,
  210. dst_pitch, VFilter);
  211. } else {
  212. /* First-pass only */
  213. HFilter = vp8_six_tap_mmx[xoffset];
  214. vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
  215. dst_pitch, 8, HFilter);
  216. }
  217. } else {
  218. /* Second-pass only */
  219. VFilter = vp8_six_tap_mmx[yoffset];
  220. vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
  221. src_pixels_per_line, dst_ptr, dst_pitch, 8,
  222. VFilter);
  223. }
  224. }
  225. void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line,
  226. int xoffset, int yoffset,
  227. unsigned char *dst_ptr, int dst_pitch) {
  228. DECLARE_ALIGNED(16, unsigned short,
  229. FData2[256]); /* Temp data bufffer used in filtering */
  230. const short *HFilter, *VFilter;
  231. if (xoffset) {
  232. if (yoffset) {
  233. HFilter = vp8_six_tap_mmx[xoffset];
  234. vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
  235. src_pixels_per_line, 1, 9, 16, HFilter);
  236. VFilter = vp8_six_tap_mmx[yoffset];
  237. vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4,
  238. dst_pitch, VFilter);
  239. } else {
  240. /* First-pass only */
  241. HFilter = vp8_six_tap_mmx[xoffset];
  242. vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
  243. dst_pitch, 4, HFilter);
  244. }
  245. } else {
  246. /* Second-pass only */
  247. VFilter = vp8_six_tap_mmx[yoffset];
  248. vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
  249. src_pixels_per_line, dst_ptr, dst_pitch, 4,
  250. VFilter);
  251. }
  252. }
  253. #endif
  254. #if HAVE_SSSE3
  /*
   * SSSE3 filter kernels implemented outside this file (presumably in the
   * x86 assembly sources -- confirm against the build). Unlike the MMX/SSE2
   * kernels, they read and write 8-bit data on both sides and take a filter
   * index as the last argument; the callers in this file pass xoffset or
   * yoffset there.
   */

  /* Kernels the callers in this file drive with xoffset. */
  void vp8_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
                                    unsigned int src_pixels_per_line,
                                    unsigned char *output_ptr,
                                    unsigned int output_pitch,
                                    unsigned int output_height,
                                    unsigned int vp8_filter_index);
  void vp8_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
                                    unsigned int src_pixels_per_line,
                                    unsigned char *output_ptr,
                                    unsigned int output_pitch,
                                    unsigned int output_height,
                                    unsigned int vp8_filter_index);
  void vp8_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
                                     unsigned int src_pixels_per_line,
                                     unsigned char *output_ptr,
                                     unsigned int output_pitch,
                                     unsigned int output_height,
                                     unsigned int vp8_filter_index);

  /* Kernels the callers in this file drive with yoffset. */
  void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
                                    unsigned int src_pitch,
                                    unsigned char *output_ptr,
                                    unsigned int out_pitch,
                                    unsigned int output_height,
                                    unsigned int vp8_filter_index);
  void vp8_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
                                    unsigned int src_pitch,
                                    unsigned char *output_ptr,
                                    unsigned int out_pitch,
                                    unsigned int output_height,
                                    unsigned int vp8_filter_index);
  void vp8_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
                                     unsigned int src_pitch,
                                     unsigned char *output_ptr,
                                     unsigned int out_pitch,
                                     unsigned int output_height,
                                     unsigned int vp8_filter_index);
  291. void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
  292. int src_pixels_per_line, int xoffset,
  293. int yoffset, unsigned char *dst_ptr,
  294. int dst_pitch
  295. ) {
  296. DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]);
  297. if (xoffset) {
  298. if (yoffset) {
  299. vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
  300. src_pixels_per_line, FData2, 16, 21,
  301. xoffset);
  302. vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16,
  303. yoffset);
  304. } else {
  305. /* First-pass only */
  306. vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
  307. dst_pitch, 16, xoffset);
  308. }
  309. } else {
  310. if (yoffset) {
  311. /* Second-pass only */
  312. vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
  313. src_pixels_per_line, dst_ptr, dst_pitch, 16,
  314. yoffset);
  315. } else {
  316. /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
  317. * yoffset==0) case correctly. Add copy function here to guarantee
  318. * six-tap function handles all possible offsets. */
  319. vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
  320. }
  321. }
  322. }
  323. void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
  324. int src_pixels_per_line, int xoffset,
  325. int yoffset, unsigned char *dst_ptr,
  326. int dst_pitch) {
  327. DECLARE_ALIGNED(16, unsigned char, FData2[256]);
  328. if (xoffset) {
  329. if (yoffset) {
  330. vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
  331. src_pixels_per_line, FData2, 8, 13, xoffset);
  332. vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
  333. } else {
  334. vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
  335. dst_pitch, 8, xoffset);
  336. }
  337. } else {
  338. if (yoffset) {
  339. /* Second-pass only */
  340. vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
  341. src_pixels_per_line, dst_ptr, dst_pitch, 8,
  342. yoffset);
  343. } else {
  344. /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
  345. * yoffset==0) case correctly. Add copy function here to guarantee
  346. * six-tap function handles all possible offsets. */
  347. vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
  348. }
  349. }
  350. }
  351. void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
  352. int src_pixels_per_line, int xoffset,
  353. int yoffset, unsigned char *dst_ptr,
  354. int dst_pitch) {
  355. DECLARE_ALIGNED(16, unsigned char, FData2[256]);
  356. if (xoffset) {
  357. if (yoffset) {
  358. vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
  359. src_pixels_per_line, FData2, 8, 9, xoffset);
  360. vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
  361. } else {
  362. /* First-pass only */
  363. vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
  364. dst_pitch, 4, xoffset);
  365. }
  366. } else {
  367. if (yoffset) {
  368. /* Second-pass only */
  369. vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
  370. src_pixels_per_line, dst_ptr, dst_pitch, 4,
  371. yoffset);
  372. } else {
  373. /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
  374. * yoffset==0) case correctly. Add copy function here to guarantee
  375. * six-tap function handles all possible offsets. */
  376. vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
  377. }
  378. }
  379. }
  380. void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
  381. int src_pixels_per_line, int xoffset,
  382. int yoffset, unsigned char *dst_ptr,
  383. int dst_pitch) {
  384. DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]);
  385. if (xoffset) {
  386. if (yoffset) {
  387. vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
  388. src_pixels_per_line, FData2, 4, 9, xoffset);
  389. vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
  390. } else {
  391. vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
  392. dst_pitch, 4, xoffset);
  393. }
  394. } else {
  395. if (yoffset) {
  396. vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
  397. src_pixels_per_line, dst_ptr, dst_pitch, 4,
  398. yoffset);
  399. } else {
  400. /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
  401. * yoffset==0) case correctly. Add copy function here to guarantee
  402. * six-tap function handles all possible offsets. */
  403. int r;
  404. for (r = 0; r < 4; ++r) {
  405. dst_ptr[0] = src_ptr[0];
  406. dst_ptr[1] = src_ptr[1];
  407. dst_ptr[2] = src_ptr[2];
  408. dst_ptr[3] = src_ptr[3];
  409. dst_ptr += dst_pitch;
  410. src_ptr += src_pixels_per_line;
  411. }
  412. }
  413. }
  414. }
  415. #endif