vp8_asm_stubs.c 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365
  1. /*
  2. * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "vpx_config.h"
  11. #include "vp8_rtcd.h"
  12. #include "vpx_ports/mem.h"
  13. extern const short vp8_six_tap_x86[8][6 * 8];
  14. extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr,
  15. unsigned short *output_ptr,
  16. unsigned int src_pixels_per_line,
  17. unsigned int pixel_step,
  18. unsigned int output_height,
  19. unsigned int output_width,
  20. const short *vp8_filter);
  21. extern void vp8_filter_block1dc_v6_mmx(
  22. unsigned short *src_ptr, unsigned char *output_ptr, int output_pitch,
  23. unsigned int pixels_per_line, unsigned int pixel_step,
  24. unsigned int output_height, unsigned int output_width,
  25. const short *vp8_filter);
  26. extern void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr,
  27. unsigned short *output_ptr,
  28. unsigned int src_pixels_per_line,
  29. unsigned int pixel_step,
  30. unsigned int output_height,
  31. unsigned int output_width,
  32. const short *vp8_filter);
  33. extern void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr,
  34. unsigned short *output_ptr,
  35. unsigned int src_pixels_per_line,
  36. unsigned int pixel_step,
  37. unsigned int output_height,
  38. unsigned int output_width,
  39. const short *vp8_filter);
  40. extern void vp8_filter_block1d8_v6_sse2(
  41. unsigned short *src_ptr, unsigned char *output_ptr, int dst_ptich,
  42. unsigned int pixels_per_line, unsigned int pixel_step,
  43. unsigned int output_height, unsigned int output_width,
  44. const short *vp8_filter);
  45. extern void vp8_filter_block1d16_v6_sse2(
  46. unsigned short *src_ptr, unsigned char *output_ptr, int dst_ptich,
  47. unsigned int pixels_per_line, unsigned int pixel_step,
  48. unsigned int output_height, unsigned int output_width,
  49. const short *vp8_filter);
  50. extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
  51. unsigned short *output_ptr,
  52. unsigned int src_pixels_per_line,
  53. unsigned int output_height,
  54. unsigned int output_width);
  55. extern void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
  56. unsigned int src_pixels_per_line,
  57. unsigned char *output_ptr,
  58. int dst_ptich,
  59. unsigned int output_height,
  60. const short *vp8_filter);
  61. extern void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
  62. unsigned int src_pixels_per_line,
  63. unsigned char *output_ptr,
  64. int dst_ptich,
  65. unsigned int output_height,
  66. const short *vp8_filter);
  67. extern void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
  68. unsigned int src_pixels_per_line,
  69. unsigned char *output_ptr,
  70. int dst_ptich,
  71. unsigned int output_height,
  72. const short *vp8_filter);
  73. #if HAVE_MMX
  74. void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
  75. int xoffset, int yoffset, unsigned char *dst_ptr,
  76. int dst_pitch) {
  77. DECLARE_ALIGNED(16, unsigned short,
  78. FData2[16 * 16]); /* Temp data bufffer used in filtering */
  79. const short *HFilter, *VFilter;
  80. HFilter = vp8_six_tap_x86[xoffset];
  81. vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
  82. src_pixels_per_line, 1, 9, 8, HFilter);
  83. VFilter = vp8_six_tap_x86[yoffset];
  84. vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4,
  85. VFilter);
  86. }
  87. #endif
  88. #if HAVE_SSE2
  89. void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
  90. int src_pixels_per_line, int xoffset,
  91. int yoffset, unsigned char *dst_ptr,
  92. int dst_pitch) {
  93. DECLARE_ALIGNED(16, unsigned short,
  94. FData2[24 * 24]); /* Temp data bufffer used in filtering */
  95. const short *HFilter, *VFilter;
  96. if (xoffset) {
  97. if (yoffset) {
  98. HFilter = vp8_six_tap_x86[xoffset];
  99. vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
  100. src_pixels_per_line, 1, 21, 32, HFilter);
  101. VFilter = vp8_six_tap_x86[yoffset];
  102. vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
  103. dst_pitch, VFilter);
  104. } else {
  105. /* First-pass only */
  106. HFilter = vp8_six_tap_x86[xoffset];
  107. vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
  108. dst_pitch, 16, HFilter);
  109. }
  110. } else {
  111. /* Second-pass only */
  112. VFilter = vp8_six_tap_x86[yoffset];
  113. vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
  114. src_pixels_per_line, 21, 32);
  115. vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
  116. dst_pitch, VFilter);
  117. }
  118. }
  119. void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line,
  120. int xoffset, int yoffset,
  121. unsigned char *dst_ptr, int dst_pitch) {
  122. DECLARE_ALIGNED(16, unsigned short,
  123. FData2[256]); /* Temp data bufffer used in filtering */
  124. const short *HFilter, *VFilter;
  125. if (xoffset) {
  126. if (yoffset) {
  127. HFilter = vp8_six_tap_x86[xoffset];
  128. vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
  129. src_pixels_per_line, 1, 13, 16, HFilter);
  130. VFilter = vp8_six_tap_x86[yoffset];
  131. vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8,
  132. dst_pitch, VFilter);
  133. } else {
  134. /* First-pass only */
  135. HFilter = vp8_six_tap_x86[xoffset];
  136. vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
  137. dst_pitch, 8, HFilter);
  138. }
  139. } else {
  140. /* Second-pass only */
  141. VFilter = vp8_six_tap_x86[yoffset];
  142. vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
  143. src_pixels_per_line, dst_ptr, dst_pitch, 8,
  144. VFilter);
  145. }
  146. }
  147. void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line,
  148. int xoffset, int yoffset,
  149. unsigned char *dst_ptr, int dst_pitch) {
  150. DECLARE_ALIGNED(16, unsigned short,
  151. FData2[256]); /* Temp data bufffer used in filtering */
  152. const short *HFilter, *VFilter;
  153. if (xoffset) {
  154. if (yoffset) {
  155. HFilter = vp8_six_tap_x86[xoffset];
  156. vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
  157. src_pixels_per_line, 1, 9, 16, HFilter);
  158. VFilter = vp8_six_tap_x86[yoffset];
  159. vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4,
  160. dst_pitch, VFilter);
  161. } else {
  162. /* First-pass only */
  163. HFilter = vp8_six_tap_x86[xoffset];
  164. vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
  165. dst_pitch, 4, HFilter);
  166. }
  167. } else {
  168. /* Second-pass only */
  169. VFilter = vp8_six_tap_x86[yoffset];
  170. vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
  171. src_pixels_per_line, dst_ptr, dst_pitch, 4,
  172. VFilter);
  173. }
  174. }
  175. #endif
  176. #if HAVE_SSSE3
  177. extern void vp8_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
  178. unsigned int src_pixels_per_line,
  179. unsigned char *output_ptr,
  180. unsigned int output_pitch,
  181. unsigned int output_height,
  182. unsigned int vp8_filter_index);
  183. extern void vp8_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
  184. unsigned int src_pixels_per_line,
  185. unsigned char *output_ptr,
  186. unsigned int output_pitch,
  187. unsigned int output_height,
  188. unsigned int vp8_filter_index);
  189. extern void vp8_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
  190. unsigned int src_pitch,
  191. unsigned char *output_ptr,
  192. unsigned int out_pitch,
  193. unsigned int output_height,
  194. unsigned int vp8_filter_index);
  195. extern void vp8_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
  196. unsigned int src_pitch,
  197. unsigned char *output_ptr,
  198. unsigned int out_pitch,
  199. unsigned int output_height,
  200. unsigned int vp8_filter_index);
  201. extern void vp8_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
  202. unsigned int src_pixels_per_line,
  203. unsigned char *output_ptr,
  204. unsigned int output_pitch,
  205. unsigned int output_height,
  206. unsigned int vp8_filter_index);
  207. extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
  208. unsigned int src_pitch,
  209. unsigned char *output_ptr,
  210. unsigned int out_pitch,
  211. unsigned int output_height,
  212. unsigned int vp8_filter_index);
  213. void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
  214. int src_pixels_per_line, int xoffset,
  215. int yoffset, unsigned char *dst_ptr,
  216. int dst_pitch) {
  217. DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]);
  218. if (xoffset) {
  219. if (yoffset) {
  220. vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
  221. src_pixels_per_line, FData2, 16, 21,
  222. xoffset);
  223. vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16,
  224. yoffset);
  225. } else {
  226. /* First-pass only */
  227. vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
  228. dst_pitch, 16, xoffset);
  229. }
  230. } else {
  231. if (yoffset) {
  232. /* Second-pass only */
  233. vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
  234. src_pixels_per_line, dst_ptr, dst_pitch, 16,
  235. yoffset);
  236. } else {
  237. /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
  238. * yoffset==0) case correctly. Add copy function here to guarantee
  239. * six-tap function handles all possible offsets. */
  240. vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
  241. }
  242. }
  243. }
  244. void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
  245. int src_pixels_per_line, int xoffset,
  246. int yoffset, unsigned char *dst_ptr,
  247. int dst_pitch) {
  248. DECLARE_ALIGNED(16, unsigned char, FData2[256]);
  249. if (xoffset) {
  250. if (yoffset) {
  251. vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
  252. src_pixels_per_line, FData2, 8, 13, xoffset);
  253. vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
  254. } else {
  255. vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
  256. dst_pitch, 8, xoffset);
  257. }
  258. } else {
  259. if (yoffset) {
  260. /* Second-pass only */
  261. vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
  262. src_pixels_per_line, dst_ptr, dst_pitch, 8,
  263. yoffset);
  264. } else {
  265. /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
  266. * yoffset==0) case correctly. Add copy function here to guarantee
  267. * six-tap function handles all possible offsets. */
  268. vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
  269. }
  270. }
  271. }
  272. void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
  273. int src_pixels_per_line, int xoffset,
  274. int yoffset, unsigned char *dst_ptr,
  275. int dst_pitch) {
  276. DECLARE_ALIGNED(16, unsigned char, FData2[256]);
  277. if (xoffset) {
  278. if (yoffset) {
  279. vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
  280. src_pixels_per_line, FData2, 8, 9, xoffset);
  281. vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
  282. } else {
  283. /* First-pass only */
  284. vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
  285. dst_pitch, 4, xoffset);
  286. }
  287. } else {
  288. if (yoffset) {
  289. /* Second-pass only */
  290. vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
  291. src_pixels_per_line, dst_ptr, dst_pitch, 4,
  292. yoffset);
  293. } else {
  294. /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
  295. * yoffset==0) case correctly. Add copy function here to guarantee
  296. * six-tap function handles all possible offsets. */
  297. vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
  298. }
  299. }
  300. }
  301. void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
  302. int src_pixels_per_line, int xoffset,
  303. int yoffset, unsigned char *dst_ptr,
  304. int dst_pitch) {
  305. DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]);
  306. if (xoffset) {
  307. if (yoffset) {
  308. vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
  309. src_pixels_per_line, FData2, 4, 9, xoffset);
  310. vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
  311. } else {
  312. vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
  313. dst_pitch, 4, xoffset);
  314. }
  315. } else {
  316. if (yoffset) {
  317. vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
  318. src_pixels_per_line, dst_ptr, dst_pitch, 4,
  319. yoffset);
  320. } else {
  321. /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
  322. * yoffset==0) case correctly. Add copy function here to guarantee
  323. * six-tap function handles all possible offsets. */
  324. int r;
  325. for (r = 0; r < 4; ++r) {
  326. dst_ptr[0] = src_ptr[0];
  327. dst_ptr[1] = src_ptr[1];
  328. dst_ptr[2] = src_ptr[2];
  329. dst_ptr[3] = src_ptr[3];
  330. dst_ptr += dst_pitch;
  331. src_ptr += src_pixels_per_line;
  332. }
  333. }
  334. }
  335. }
  336. #endif