; subpixel_mmx.asm
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
  ; ABI helper macros. The sym(), arg(), GLOBAL(), GET_GOT, SHADOW_ARGS_TO_STACK
  ; and related names used below are not defined in this file; presumably they
  ; all come from this include.
  %include "vpx_ports/x86_abi_support.asm"

  ; Not referenced by the code visible in this file.
  %define BLOCK_HEIGHT_WIDTH  4

  ; Not referenced by the code visible in this file. Each six-tap row in
  ; vp8_six_tap_x86 sums to this value.
  %define vp8_filter_weight   128

  ; Arithmetic right shift applied to every filter sum after the rounding
  ; term has been added (i.e. divide by 128).
  %define VP8_FILTER_SHIFT    7

  SECTION .text
  ;-----------------------------------------------------------------------
  ; void vp8_filter_block1d_h6_mmx
  ; (
  ;     unsigned char  *src_ptr,
  ;     unsigned short *output_ptr,
  ;     unsigned int    src_pixels_per_line,
  ;     unsigned int    pixel_step,
  ;     unsigned int    output_height,
  ;     unsigned int    output_width,
  ;     short          *vp8_filter
  ; )
  ;
  ; Horizontal six-tap filter. For every row, four results are produced:
  ;
  ;     out[x] = clamp_0_255((sum_{k=0..5} tap[k] * p[x + k - 2] + rd)
  ;                           >> VP8_FILTER_SHIFT),   x = 0..3
  ;
  ; where p[] are the source bytes of the current row (p[0] is at src_ptr),
  ; tap[k] is the 4-word vector at vp8_filter + 16*k, and the partial sums
  ; are accumulated with signed saturating 16-bit adds. The four results are
  ; stored as 16-bit words (8 bytes) at output_ptr.
  ;
  ; Per row, src_ptr advances by src_pixels_per_line and output_ptr advances
  ; by output_width; both are applied as raw byte offsets.
  ; pixel_step is never read.
  ;
  ; The row loop tests its counter at the bottom, so output_height must be
  ; non-zero. Each row reads source bytes src_ptr-2 .. src_ptr+6.
  ;
  ; Registers written: rax, rcx, rdx, r8 (64-bit builds only), mm0-mm7, flags.
  ; rsi and rdi are saved and restored. No emms is executed here; presumably
  ; that is left to the caller.
  ;-----------------------------------------------------------------------
  global sym(vp8_filter_block1d_h6_mmx) PRIVATE
  sym(vp8_filter_block1d_h6_mmx):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      ; ---- argument setup ------------------------------------------------
      mov         rsi, arg(0)                 ; rsi = src_ptr
      mov         rdi, arg(1)                 ; rdi = output_ptr
      movsxd      rcx, dword ptr arg(4)       ; rcx = output_height (rows left)
      movsxd      rax, dword ptr arg(5)       ; rax = output_width (dest step)
      mov         rdx, arg(6)                 ; rdx = vp8_filter

      ; The four middle taps stay in registers for the whole loop; taps 0
      ; and 5 are used straight from memory because no MMX register is left.
      movq        mm1, [rdx + 16]             ; mm1 = tap 1
      movq        mm2, [rdx + 32]             ; mm2 = tap 2
      movq        mm6, [rdx + 48]             ; mm6 = tap 3
      movq        mm7, [rdx + 64]             ; mm7 = tap 4

      pxor        mm0, mm0                    ; mm0 = 0, used to widen bytes

  .h6_next_row:
      ; ---- tap 1 * p[-1..2] ----------------------------------------------
      movq        mm3, [rsi-2]                ; mm3 = bytes p-2..p5
      movq        mm4, mm3                    ; mm4 = bytes p-2..p5 (copy)
      psrlq       mm3, 8                      ; shift out p-2
      punpcklbw   mm3, mm0                    ; mm3 = words p-1..p2
      pmullw      mm3, mm1                    ; mm3 = p[-1..2] * tap 1

      ; ---- tap 4 * p[2..5] -----------------------------------------------
      movq        mm5, mm4                    ; mm5 = bytes p-2..p5 (copy)
      punpckhbw   mm4, mm0                    ; mm4 = words p2..p5
      pmullw      mm4, mm7                    ; mm4 *= tap 4
      paddsw      mm3, mm4                    ; accumulate (saturating)

      ; ---- tap 2 * p[0..3] -----------------------------------------------
      movq        mm4, mm5                    ; mm4 = bytes p-2..p5 (copy)
      psrlq       mm5, 16                     ; shift out p-2, p-1
      punpcklbw   mm5, mm0                    ; mm5 = words p0..p3
      pmullw      mm5, mm2                    ; mm5 *= tap 2
      paddsw      mm3, mm5                    ; accumulate

      ; ---- tap 3 * p[1..4] -----------------------------------------------
      movq        mm5, mm4                    ; mm5 = bytes p-2..p5 (copy)
      psrlq       mm4, 24                     ; shift out p-2..p0
      punpcklbw   mm4, mm0                    ; mm4 = words p1..p4
      pmullw      mm4, mm6                    ; mm4 *= tap 3
      paddsw      mm3, mm4                    ; accumulate

      ; ---- tap 5 * p[3..6] (needs a second load for p6) --------------------
      movd        mm4, [rsi+3]                ; mm4 = bytes p3..p6
      punpcklbw   mm4, mm0                    ; mm4 = words p3..p6
      pmullw      mm4, [rdx+80]               ; mm4 *= tap 5
      paddsw      mm3, mm4                    ; accumulate

      ; ---- tap 0 * p[-2..1] ------------------------------------------------
      punpcklbw   mm5, mm0                    ; mm5 = words p-2..p1
      pmullw      mm5, [rdx]                  ; mm5 *= tap 0
      paddsw      mm3, mm5                    ; accumulate

      ; ---- round, scale, clamp, store --------------------------------------
      paddsw      mm3, [GLOBAL(rd)]           ; add rounding term
      psraw       mm3, VP8_FILTER_SHIFT       ; arithmetic shift right by 7
      packuswb    mm3, mm0                    ; clamp each word to 0..255 ...
      punpcklbw   mm3, mm0                    ; ... and widen back to words
      movq        [rdi], mm3                  ; store four 16-bit results

      ; ---- step to the next row ----------------------------------------------
      add         rdi, rax                    ; output_ptr += output_width
  %if ABI_IS_32BIT
      add         rsi, dword ptr arg(2)       ; src_ptr += src_pixels_per_line
  %else
      movsxd      r8, dword ptr arg(2)        ; r8 = src_pixels_per_line
      add         rsi, r8                     ; src_ptr += src_pixels_per_line
  %endif

      dec         rcx                         ; one row done
      jnz         .h6_next_row                ; loop while rows remain

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;-----------------------------------------------------------------------
  ; void vp8_filter_block1dc_v6_mmx
  ; (
  ;     short          *src_ptr,
  ;     unsigned char  *output_ptr,
  ;     int             output_pitch,
  ;     unsigned int    pixels_per_line,
  ;     unsigned int    pixel_step,
  ;     unsigned int    output_height,
  ;     unsigned int    output_width,
  ;     short          *vp8_filter
  ; )
  ;
  ; Vertical six-tap filter over rows of 16-bit samples. For every output
  ; row, four results are produced:
  ;
  ;     out[x] = clamp_0_255((sum_{k=0..5} tap[k] * row[k - 2][x] + rd)
  ;                           >> VP8_FILTER_SHIFT),   x = 0..3
  ;
  ; where row[r] is the 4-word vector at src_ptr + r * pixels_per_line
  ; (pixels_per_line is applied as a raw byte offset), tap[k] is the 4-word
  ; vector at vp8_filter + 16*k, and partial sums are accumulated with signed
  ; saturating 16-bit adds. The four results are stored as bytes (4 bytes)
  ; at output_ptr.
  ;
  ; Per output row, the source advances by pixels_per_line and output_ptr
  ; advances by output_pitch. pixel_step and output_width are never read.
  ;
  ; The row loop tests its counter at the bottom, so output_height must be
  ; non-zero.
  ;
  ; Registers written: rax, rcx, rdx, mm0-mm7, flags. rsi, rdi and rbx are
  ; saved and restored. No emms is executed here; presumably that is left to
  ; the caller.
  ;-----------------------------------------------------------------------
  global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
  sym(vp8_filter_block1dc_v6_mmx):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 8
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      ; Fetch the rounding term while rbx still holds whatever GET_GOT put
      ; there; rbx is reused as the filter pointer right after.
      ; NOTE(review): assumes GLOBAL() may address through rbx on PIC
      ; builds -- confirm against x86_abi_support.asm.
      movq        mm5, [GLOBAL(rd)]           ; mm5 = rounding term

      push        rbx                         ; keep GET_GOT's rbx for RESTORE_GOT

      ; ---- argument setup ------------------------------------------------
      mov         rsi, arg(0)                 ; rsi = src_ptr
      mov         rdi, arg(1)                 ; rdi = output_ptr
      movsxd      rax, DWORD PTR arg(2)       ; rax = output_pitch (dest step)
      movsxd      rdx, dword ptr arg(3)       ; rdx = pixels_per_line (row step)
      movsxd      rcx, DWORD PTR arg(5)       ; rcx = output_height (rows left)
      mov         rbx, arg(7)                 ; rbx = vp8_filter

      ; The four middle taps stay in registers; taps 0 and 5 are used
      ; straight from memory.
      movq        mm1, [rbx + 16]             ; mm1 = tap 1
      movq        mm2, [rbx + 32]             ; mm2 = tap 2
      movq        mm6, [rbx + 48]             ; mm6 = tap 3
      movq        mm7, [rbx + 64]             ; mm7 = tap 4

      ; Back the source up two rows so that rsi addresses row -2.
      sub         rsi, rdx
      sub         rsi, rdx

      pxor        mm0, mm0                    ; mm0 = 0, high half for the pack

  .v6_next_row:
      ; Row numbers below are relative to the output row being produced;
      ; on entry rsi addresses row -2.
      movq        mm3, [rsi+rdx]              ; mm3 = row -1 (4 words)
      pmullw      mm3, mm1                    ; mm3 = row -1 * tap 1

      movq        mm4, [rsi + 4*rdx]          ; mm4 = row 2
      pmullw      mm4, mm7                    ; mm4 *= tap 4
      paddsw      mm3, mm4                    ; accumulate (saturating)

      movq        mm4, [rsi + 2*rdx]          ; mm4 = row 0
      pmullw      mm4, mm2                    ; mm4 *= tap 2
      paddsw      mm3, mm4                    ; accumulate

      movq        mm4, [rsi]                  ; mm4 = row -2
      pmullw      mm4, [rbx]                  ; mm4 *= tap 0
      paddsw      mm3, mm4                    ; accumulate

      ; Advancing rsi one row here serves two purposes: rows 1 and 3 become
      ; reachable with the 2x / 4x index scales (no 3x or 5x scale exists),
      ; and rsi is already positioned for the next iteration.
      add         rsi, rdx                    ; rsi now addresses row -1

      movq        mm4, [rsi + 2*rdx]          ; mm4 = row 1
      pmullw      mm4, mm6                    ; mm4 *= tap 3
      paddsw      mm3, mm4                    ; accumulate

      movq        mm4, [rsi + 4*rdx]          ; mm4 = row 3
      pmullw      mm4, [rbx +80]              ; mm4 *= tap 5
      paddsw      mm3, mm4                    ; accumulate

      ; ---- round, scale, clamp, store --------------------------------------
      paddsw      mm3, mm5                    ; add rounding term
      psraw       mm3, VP8_FILTER_SHIFT       ; arithmetic shift right by 7
      packuswb    mm3, mm0                    ; clamp each word to 0..255
      movd        [rdi],mm3                   ; store four result bytes

      ; Each iteration re-reads five of the six source rows the previous
      ; iteration loaded; that repetition is avoidable.
      lea         rdi, [rdi+rax]              ; output_ptr += output_pitch

      dec         rcx                         ; one row done
      jnz         .v6_next_row                ; loop while rows remain

      pop         rbx                         ; back to the value GET_GOT left

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      UNSHADOW_ARGS
      pop         rbp
      ret
  SECTION_RODATA

  ; Rounding term added to each filter sum before the VP8_FILTER_SHIFT right
  ; shift: four 16-bit words of 0x40.
  align 16
  rd:
      times 4 dw 0x40

  ; Emits one six-tap filter. Every tap value is replicated into eight 16-bit
  ; words (16 bytes), so tap k of a filter sits at byte offset 16*k from the
  ; start of that filter and one filter occupies 96 bytes.
  %macro VP8_SIX_TAP_FILTER 6
      times 8 dw %1
      times 8 dw %2
      times 8 dw %3
      times 8 dw %4
      times 8 dw %5
      times 8 dw %6
  %endmacro

  ; Eight six-tap filters. The taps of every filter sum to 128.
  align 16
  global HIDDEN_DATA(sym(vp8_six_tap_x86))
  sym(vp8_six_tap_x86):
      ;                   tap0 tap1 tap2 tap3 tap4 tap5
      VP8_SIX_TAP_FILTER     0,   0, 128,   0,   0,   0
      VP8_SIX_TAP_FILTER     0,  -6, 123,  12,  -1,   0
      VP8_SIX_TAP_FILTER     2, -11, 108,  36,  -8,   1
      VP8_SIX_TAP_FILTER     0,  -9,  93,  50,  -6,   0
      VP8_SIX_TAP_FILTER     3, -16,  77,  77, -16,   3
      VP8_SIX_TAP_FILTER     0,  -6,  50,  93,  -9,   0
      VP8_SIX_TAP_FILTER     1,  -8,  36, 108, -11,   2
      VP8_SIX_TAP_FILTER     0,  -1,  12, 123,  -6,   0