; subpixel_mmx.asm
  ;
  ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  ;
  ;  Use of this source code is governed by a BSD-style license
  ;  that can be found in the LICENSE file in the root of the source
  ;  tree. An additional intellectual property rights grant can be found
  ;  in the file PATENTS.  All contributing project authors may
  ;  be found in the AUTHORS file in the root of the source tree.
  ;
  %include "vpx_ports/x86_abi_support.asm"

  ; Bilinear tap table defined outside this file; the bilinear predictors
  ; below index it as base + offset * 32 bytes.
  extern sym(vp8_bilinear_filters_x86_8)

  ; NOTE(review): BLOCK_HEIGHT_WIDTH and vp8_filter_weight are not referenced
  ; by the routines visible in this file section; kept unchanged.
  %define BLOCK_HEIGHT_WIDTH 4
  %define vp8_filter_weight 128

  ; Arithmetic right shift applied to every filtered sum (psraw by 7, i.e. /128).
  %define VP8_FILTER_SHIFT  7
  ;void vp8_filter_block1d_h6_mmx
  ;(
  ;    unsigned char   *src_ptr,
  ;    unsigned short  *output_ptr,
  ;    unsigned int     src_pixels_per_line,
  ;    unsigned int     pixel_step,
  ;    unsigned int     output_height,
  ;    unsigned int     output_width,
  ;    short           *vp8_filter
  ;)
  ;
  ; Horizontal 6-tap filter pass producing 4 samples per row.
  ;
  ; For every row and for x = 0..3:
  ;     sum[x] = f0[x]*p[x-2] + f1[x]*p[x-1] + f2[x]*p[x]
  ;            + f3[x]*p[x+1] + f4[x]*p[x+2] + f5[x]*p[x+3]
  ;     out[x] = clamp_0_255((sum[x] + 64) >> VP8_FILTER_SHIFT)
  ; where p[] are source bytes relative to src_ptr and fN[x] is word x of the
  ; four words loaded from vp8_filter + 16*N.  Products keep the low 16 bits
  ; (pmullw) and are accumulated with signed saturation (paddsw).
  ;
  ; Each row reads source bytes p[-2]..p[6] and stores four 16-bit values at
  ; output_ptr.  After each row src_ptr advances by src_pixels_per_line bytes
  ; and output_ptr advances by output_width bytes (the value is added to the
  ; pointer unscaled).  pixel_step (arg 3) is not read.
  ;
  ; The row loop is bottom-tested (dec/jnz), so output_height == 0 is not
  ; handled.  SHADOW_ARGS_TO_STACK, GET_GOT, arg(), GLOBAL() and the matching
  ; epilog macros are not defined in this file (presumably they come from
  ; x86_abi_support.asm).
  global sym(vp8_filter_block1d_h6_mmx) PRIVATE
  sym(vp8_filter_block1d_h6_mmx):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 7
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

          mov         rdx,    arg(6)              ;vp8_filter

          ; Taps 1..4 stay in registers for the whole loop; taps 0 and 5 are
          ; read from memory ([rdx] and [rdx+80]) inside the loop.
          movq        mm1,    [rdx + 16]          ; mm1 = tap 1
          movq        mm2,    [rdx + 32]          ; mm2 = tap 2
          movq        mm6,    [rdx + 48]          ; mm6 = tap 3
          movq        mm7,    [rdx + 64]          ; mm7 = tap 4

          mov         rdi,    arg(1)              ;output_ptr
          mov         rsi,    arg(0)              ;src_ptr

          movsxd      rcx,    dword ptr arg(4)    ;output_height (row counter)
          movsxd      rax,    dword ptr arg(5)    ;output_width (byte step of rdi per row)

          pxor        mm0,    mm0                 ; mm0 = 0, used for byte->word unpacking

  .nextrow:
          movq        mm3,    [rsi-2]             ; mm3 = p-2..p5 (8 bytes)
          movq        mm4,    mm3                 ; mm4 = p-2..p5

          psrlq       mm3,    8                   ; mm3 = p-1..p5
          punpcklbw   mm3,    mm0                 ; mm3 = p-1..p2 as words
          pmullw      mm3,    mm1                 ; mm3 = tap 1 products

          movq        mm5,    mm4                 ; mm5 = p-2..p5
          punpckhbw   mm4,    mm0                 ; mm4 = p2..p5 as words
          pmullw      mm4,    mm7                 ; mm4 = tap 4 products
          paddsw      mm3,    mm4                 ; mm3 += mm4

          movq        mm4,    mm5                 ; mm4 = p-2..p5
          psrlq       mm5,    16                  ; mm5 = p0..p5
          punpcklbw   mm5,    mm0                 ; mm5 = p0..p3 as words
          pmullw      mm5,    mm2                 ; mm5 = tap 2 products
          paddsw      mm3,    mm5                 ; mm3 += mm5

          movq        mm5,    mm4                 ; mm5 = p-2..p5 (kept for tap 0 below)
          psrlq       mm4,    24                  ; mm4 = p1..p5
          punpcklbw   mm4,    mm0                 ; mm4 = p1..p4 as words
          pmullw      mm4,    mm6                 ; mm4 = tap 3 products
          paddsw      mm3,    mm4                 ; mm3 += mm4

          ; outer taps, applied straight from the filter table
          movd        mm4,    [rsi+3]             ; mm4 = p3..p6 (4 bytes)
          punpcklbw   mm4,    mm0                 ; mm4 = p3..p6 as words
          pmullw      mm4,    [rdx+80]            ; mm4 = tap 5 products
          paddsw      mm3,    mm4                 ; mm3 += mm4

          punpcklbw   mm5,    mm0                 ; mm5 = p-2..p1 as words
          pmullw      mm5,    [rdx]               ; mm5 = tap 0 products
          paddsw      mm3,    mm5                 ; mm3 += mm5

          paddsw      mm3,    [GLOBAL(rd)]        ; mm3 += rounding constant
          psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7 (arithmetic)
          packuswb    mm3,    mm0                 ; clamp each word to 0..255 ...
          punpcklbw   mm3,    mm0                 ; ... and widen back to 4 words

          movq        [rdi],  mm3                 ; store 4 x 16-bit results

  %if ABI_IS_32BIT
          add         rsi,    dword ptr arg(2)    ;src_pixels_per_line ; next source row
          add         rdi,    rax                 ; next output row
  %else
          movsxd      r8,     dword ptr arg(2)    ;src_pixels_per_line
          add         rdi,    rax                 ; next output row
          add         rsi,    r8                  ; next source row
  %endif

          dec         rcx                         ; one row done
          jnz         .nextrow                    ; loop while rows remain

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;void vp8_filter_block1dc_v6_mmx
  ;(
  ;    short           *src_ptr,
  ;    unsigned char   *output_ptr,
  ;    int              output_pitch,
  ;    unsigned int     pixels_per_line,
  ;    unsigned int     pixel_step,
  ;    unsigned int     output_height,
  ;    unsigned int     output_width,
  ;    short           *vp8_filter
  ;)
  ;
  ; Vertical 6-tap filter pass producing 4 output bytes per row.
  ;
  ; With row(r) = the four 16-bit values at src_ptr + r * pixels_per_line
  ; (pixels_per_line is applied as an unscaled byte offset), each output row is
  ;     sum = f0*row(-2) + f1*row(-1) + f2*row(0)
  ;         + f3*row(1)  + f4*row(2)  + f5*row(3)
  ;     out = clamp_0_255((sum + 64) >> VP8_FILTER_SHIFT)
  ; computed lane-wise, where fN is the four words at vp8_filter + 16*N.
  ; Products keep the low 16 bits (pmullw); sums saturate (paddsw).
  ;
  ; Four bytes are stored at output_ptr per row; output_ptr then advances by
  ; output_pitch and the source by one row.  pixel_step (arg 4) and
  ; output_width (arg 6) are not read.  The loop is bottom-tested, so
  ; output_height == 0 is not handled.
  global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
  sym(vp8_filter_block1dc_v6_mmx):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 8
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

          ; The rounding constant is fetched before rbx is reused as the filter
          ; pointer.  NOTE(review): presumably GLOBAL() may address through the
          ; register set up by GET_GOT - confirm in x86_abi_support.asm.
          movq        mm5,    [GLOBAL(rd)]        ; mm5 = rounding constant
          push        rbx                         ; preserved around its use as filter pointer
          mov         rbx,    arg(7)              ;vp8_filter

          ; Taps 1..4 live in registers; taps 0 and 5 are read from [rbx] and
          ; [rbx+80] inside the loop.
          movq        mm1,    [rbx + 16]          ; mm1 = tap 1
          movq        mm2,    [rbx + 32]          ; mm2 = tap 2
          movq        mm6,    [rbx + 48]          ; mm6 = tap 3
          movq        mm7,    [rbx + 64]          ; mm7 = tap 4

          movsxd      rdx,    dword ptr arg(3)    ;pixels_per_line (row stride, bytes)
          mov         rdi,    arg(1)              ;output_ptr
          mov         rsi,    arg(0)              ;src_ptr

          sub         rsi,    rdx
          sub         rsi,    rdx                 ; rsi -> row -2

          movsxd      rcx,    DWORD PTR arg(5)    ;output_height (row counter)
          movsxd      rax,    DWORD PTR arg(2)    ;output_pitch
          pxor        mm0,    mm0                 ; mm0 = 0 (upper half for packuswb)

  .nextrow_cv:
          ; rsi -> row -2 of the current output row
          movq        mm3,    [rsi+rdx]           ; mm3 = row -1 (4 words)
          pmullw      mm3,    mm1                 ; mm3 = tap 1 products

          movq        mm4,    [rsi + 4*rdx]       ; mm4 = row 2
          pmullw      mm4,    mm7                 ; mm4 = tap 4 products
          paddsw      mm3,    mm4                 ; mm3 += mm4

          movq        mm4,    [rsi + 2*rdx]       ; mm4 = row 0
          pmullw      mm4,    mm2                 ; mm4 = tap 2 products
          paddsw      mm3,    mm4                 ; mm3 += mm4

          movq        mm4,    [rsi]               ; mm4 = row -2
          pmullw      mm4,    [rbx]               ; mm4 = tap 0 products
          paddsw      mm3,    mm4                 ; mm3 += mm4

          ; Step one row down: rows 1 and 3 become reachable with scale
          ; factors 2 and 4 (x86 addressing has no *3 / *5), and rsi is
          ; already positioned for the next output row.
          add         rsi,    rdx                 ; rsi -> row -1
          movq        mm4,    [rsi + 2*rdx]       ; mm4 = row 1
          pmullw      mm4,    mm6                 ; mm4 = tap 3 products
          paddsw      mm3,    mm4                 ; mm3 += mm4

          movq        mm4,    [rsi + 4*rdx]       ; mm4 = row 3
          pmullw      mm4,    [rbx + 80]          ; mm4 = tap 5 products
          paddsw      mm3,    mm4                 ; mm3 += mm4

          paddsw      mm3,    mm5                 ; mm3 += rounding constant
          psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7 (arithmetic)
          packuswb    mm3,    mm0                 ; clamp to 0..255, pack to bytes

          movd        [rdi],  mm3                 ; store 4 output bytes

          ; Each iteration re-reads five of the six source rows used by the
          ; previous one; the original author notes this is avoidable.
          lea         rdi,    [rdi+rax]           ; next output row
          dec         rcx                         ; one row done
          jnz         .nextrow_cv                 ; loop while rows remain

          pop         rbx

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;void vp8_bilinear_predict8x8_mmx
  ;(
  ;    unsigned char  *src_ptr,
  ;    int             src_pixels_per_line,
  ;    int             xoffset,
  ;    int             yoffset,
  ;    unsigned char  *dst_ptr,
  ;    int             dst_pitch
  ;)
  ;
  ; Two-pass bilinear prediction of an 8-wide, 8-row block.
  ;
  ; HFilter / VFilter are the 32-byte entries of vp8_bilinear_filters_x86_8
  ; selected by xoffset / yoffset; each entry is used as two tap vectors, at
  ; byte offsets 0 and 16.
  ;
  ;   horizontal:  H[r][c] = (src[r][c]*HFilter0 + src[r][c+1]*HFilter1 + 64) >> 7
  ;   vertical:    dst[r][c] = (H[r][c]*VFilter0 + H[r+1][c]*VFilter1 + 64) >> 7
  ;
  ; The previous horizontal row is carried between iterations in mm7, packed
  ; to bytes with unsigned saturation; the final result is packed the same way.
  ; Arithmetic is 16-bit (pmullw / paddw).  Nine source rows of nine bytes each
  ; are read.  The loop ends when the destination pointer equals
  ; dst_ptr + 8 * dst_pitch.
  global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
  sym(vp8_bilinear_predict8x8_mmx):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
      ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

          movsxd      rax,    dword ptr arg(2)    ;xoffset
          mov         rdi,    arg(4)              ;dst_ptr

          shl         rax,    5                   ; xoffset * 32 (bytes per table entry)
          lea         rcx,    [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

          add         rax,    rcx                 ; rax = HFilter
          mov         rsi,    arg(0)              ;src_ptr

          movsxd      rdx,    dword ptr arg(5)    ;dst_pitch
          movq        mm1,    [rax]               ; mm1 = HFilter tap 0
          movq        mm2,    [rax+16]            ; mm2 = HFilter tap 1

          movsxd      rax,    dword ptr arg(3)    ;yoffset
          pxor        mm0,    mm0                 ; mm0 = 0 for unpacking

          shl         rax,    5                   ; yoffset * 32
          add         rax,    rcx                 ; rax = VFilter

          lea         rcx,    [rdi+rdx*8]         ; rcx = dst_ptr + 8 * dst_pitch (loop end)
          movsxd      rdx,    dword ptr arg(1)    ;src_pixels_per_line

          ; horizontal pass for the first source row
          movq        mm3,    [rsi]               ; 8 source bytes src[0..7]
          movq        mm4,    mm3                 ; copy of the row

          punpcklbw   mm3,    mm0                 ; mm3 = src[0..3] as words
          punpckhbw   mm4,    mm0                 ; mm4 = src[4..7] as words

          pmullw      mm3,    mm1                 ; * HFilter tap 0
          pmullw      mm4,    mm1

          movq        mm5,    [rsi+1]             ; 8 source bytes src[1..8]
          movq        mm6,    mm5

          punpcklbw   mm5,    mm0                 ; mm5 = src[1..4] as words
          punpckhbw   mm6,    mm0                 ; mm6 = src[5..8] as words

          pmullw      mm5,    mm2                 ; * HFilter tap 1
          pmullw      mm6,    mm2

          paddw       mm3,    mm5
          paddw       mm4,    mm6

          paddw       mm3,    [GLOBAL(rd)]        ; mm3 += rounding constant
          psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7

          paddw       mm4,    [GLOBAL(rd)]
          psraw       mm4,    VP8_FILTER_SHIFT

          movq        mm7,    mm3
          packuswb    mm7,    mm4                 ; mm7 = first horizontal row, 8 bytes

          add         rsi,    rdx                 ; next source row

  .next_row_8x8:
          ; horizontal pass for the current source row (sum only, not yet rounded)
          movq        mm3,    [rsi]               ; 8 source bytes src[0..7]
          movq        mm4,    mm3                 ; copy of the row

          punpcklbw   mm3,    mm0                 ; mm3 = src[0..3] as words
          punpckhbw   mm4,    mm0                 ; mm4 = src[4..7] as words

          pmullw      mm3,    mm1                 ; * HFilter tap 0
          pmullw      mm4,    mm1

          movq        mm5,    [rsi+1]             ; 8 source bytes src[1..8]
          movq        mm6,    mm5

          punpcklbw   mm5,    mm0
          punpckhbw   mm6,    mm0

          pmullw      mm5,    mm2                 ; * HFilter tap 1
          pmullw      mm6,    mm2

          paddw       mm3,    mm5
          paddw       mm4,    mm6

          ; previous horizontal row * VFilter tap 0
          movq        mm5,    mm7
          movq        mm6,    mm7

          punpcklbw   mm5,    mm0
          punpckhbw   mm6,    mm0

          pmullw      mm5,    [rax]
          pmullw      mm6,    [rax]

          ; finish the horizontal result for the current row
          paddw       mm3,    [GLOBAL(rd)]        ; mm3 += rounding constant
          psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7

          paddw       mm4,    [GLOBAL(rd)]
          psraw       mm4,    VP8_FILTER_SHIFT

          movq        mm7,    mm3
          packuswb    mm7,    mm4                 ; mm7 = current row, kept for next iteration

          ; current horizontal row * VFilter tap 1, combined with previous row
          pmullw      mm3,    [rax+16]
          pmullw      mm4,    [rax+16]

          paddw       mm3,    mm5
          paddw       mm4,    mm6

          paddw       mm3,    [GLOBAL(rd)]        ; mm3 += rounding constant
          psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7

          paddw       mm4,    [GLOBAL(rd)]
          psraw       mm4,    VP8_FILTER_SHIFT

          packuswb    mm3,    mm4                 ; saturate to 8 bytes

          movq        [rdi],  mm3                 ; store 8 output bytes

  %if ABI_IS_32BIT
          add         rsi,    rdx                 ; next source row
          add         rdi,    dword ptr arg(5)    ;dst_pitch
  %else
          movsxd      r8,     dword ptr arg(5)    ;dst_pitch
          add         rsi,    rdx                 ; next source row
          add         rdi,    r8                  ; next destination row
  %endif
          cmp         rdi,    rcx                 ; reached dst_ptr + 8 * dst_pitch?
          jne         .next_row_8x8

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;void vp8_bilinear_predict8x4_mmx
  ;(
  ;    unsigned char  *src_ptr,
  ;    int             src_pixels_per_line,
  ;    int             xoffset,
  ;    int             yoffset,
  ;    unsigned char  *dst_ptr,
  ;    int             dst_pitch
  ;)
  ;
  ; Two-pass bilinear prediction of an 8-wide, 4-row block.
  ;
  ; HFilter / VFilter are the 32-byte entries of vp8_bilinear_filters_x86_8
  ; selected by xoffset / yoffset; each entry is used as two tap vectors, at
  ; byte offsets 0 and 16.
  ;
  ;   horizontal:  H[r][c] = (src[r][c]*HFilter0 + src[r][c+1]*HFilter1 + 64) >> 7
  ;   vertical:    dst[r][c] = (H[r][c]*VFilter0 + H[r+1][c]*VFilter1 + 64) >> 7
  ;
  ; The previous horizontal row is carried between iterations in mm7, packed
  ; to bytes with unsigned saturation; the final result is packed the same way.
  ; Arithmetic is 16-bit (pmullw / paddw).  Five source rows of nine bytes each
  ; are read.  The loop ends when the destination pointer equals
  ; dst_ptr + 4 * dst_pitch.
  global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
  sym(vp8_bilinear_predict8x4_mmx):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
      ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

          movsxd      rax,    dword ptr arg(2)    ;xoffset
          mov         rdi,    arg(4)              ;dst_ptr

          lea         rcx,    [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
          shl         rax,    5                   ; xoffset * 32 (bytes per table entry)

          mov         rsi,    arg(0)              ;src_ptr
          add         rax,    rcx                 ; rax = HFilter

          movsxd      rdx,    dword ptr arg(5)    ;dst_pitch
          movq        mm1,    [rax]               ; mm1 = HFilter tap 0
          movq        mm2,    [rax+16]            ; mm2 = HFilter tap 1

          movsxd      rax,    dword ptr arg(3)    ;yoffset
          pxor        mm0,    mm0                 ; mm0 = 0 for unpacking

          shl         rax,    5                   ; yoffset * 32
          add         rax,    rcx                 ; rax = VFilter

          lea         rcx,    [rdi+rdx*4]         ; rcx = dst_ptr + 4 * dst_pitch (loop end)
          movsxd      rdx,    dword ptr arg(1)    ;src_pixels_per_line

          ; horizontal pass for the first source row
          movq        mm3,    [rsi]               ; 8 source bytes src[0..7]
          movq        mm4,    mm3                 ; copy of the row

          punpcklbw   mm3,    mm0                 ; mm3 = src[0..3] as words
          punpckhbw   mm4,    mm0                 ; mm4 = src[4..7] as words

          pmullw      mm3,    mm1                 ; * HFilter tap 0
          pmullw      mm4,    mm1

          movq        mm5,    [rsi+1]             ; 8 source bytes src[1..8]
          movq        mm6,    mm5

          punpcklbw   mm5,    mm0                 ; mm5 = src[1..4] as words
          punpckhbw   mm6,    mm0                 ; mm6 = src[5..8] as words

          pmullw      mm5,    mm2                 ; * HFilter tap 1
          pmullw      mm6,    mm2

          paddw       mm3,    mm5
          paddw       mm4,    mm6

          paddw       mm3,    [GLOBAL(rd)]        ; mm3 += rounding constant
          psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7

          paddw       mm4,    [GLOBAL(rd)]
          psraw       mm4,    VP8_FILTER_SHIFT

          movq        mm7,    mm3
          packuswb    mm7,    mm4                 ; mm7 = first horizontal row, 8 bytes

          add         rsi,    rdx                 ; next source row

  .next_row_8x4:
          ; horizontal pass for the current source row (sum only, not yet rounded)
          movq        mm3,    [rsi]               ; 8 source bytes src[0..7]
          movq        mm4,    mm3                 ; copy of the row

          punpcklbw   mm3,    mm0                 ; mm3 = src[0..3] as words
          punpckhbw   mm4,    mm0                 ; mm4 = src[4..7] as words

          pmullw      mm3,    mm1                 ; * HFilter tap 0
          pmullw      mm4,    mm1

          movq        mm5,    [rsi+1]             ; 8 source bytes src[1..8]
          movq        mm6,    mm5

          punpcklbw   mm5,    mm0
          punpckhbw   mm6,    mm0

          pmullw      mm5,    mm2                 ; * HFilter tap 1
          pmullw      mm6,    mm2

          paddw       mm3,    mm5
          paddw       mm4,    mm6

          ; previous horizontal row * VFilter tap 0
          movq        mm5,    mm7
          movq        mm6,    mm7

          punpcklbw   mm5,    mm0
          punpckhbw   mm6,    mm0

          pmullw      mm5,    [rax]
          pmullw      mm6,    [rax]

          ; finish the horizontal result for the current row
          paddw       mm3,    [GLOBAL(rd)]        ; mm3 += rounding constant
          psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7

          paddw       mm4,    [GLOBAL(rd)]
          psraw       mm4,    VP8_FILTER_SHIFT

          movq        mm7,    mm3
          packuswb    mm7,    mm4                 ; mm7 = current row, kept for next iteration

          ; current horizontal row * VFilter tap 1, combined with previous row
          pmullw      mm3,    [rax+16]
          pmullw      mm4,    [rax+16]

          paddw       mm3,    mm5
          paddw       mm4,    mm6

          paddw       mm3,    [GLOBAL(rd)]        ; mm3 += rounding constant
          psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7

          paddw       mm4,    [GLOBAL(rd)]
          psraw       mm4,    VP8_FILTER_SHIFT

          packuswb    mm3,    mm4                 ; saturate to 8 bytes

          movq        [rdi],  mm3                 ; store 8 output bytes

  %if ABI_IS_32BIT
          add         rsi,    rdx                 ; next source row
          add         rdi,    dword ptr arg(5)    ;dst_pitch
  %else
          movsxd      r8,     dword ptr arg(5)    ;dst_pitch
          add         rsi,    rdx                 ; next source row
          add         rdi,    r8                  ; next destination row
  %endif
          cmp         rdi,    rcx                 ; reached dst_ptr + 4 * dst_pitch?
          jne         .next_row_8x4

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      UNSHADOW_ARGS
      pop         rbp
      ret
  ;void vp8_bilinear_predict4x4_mmx
  ;(
  ;    unsigned char  *src_ptr,
  ;    int             src_pixels_per_line,
  ;    int             xoffset,
  ;    int             yoffset,
  ;    unsigned char  *dst_ptr,
  ;    int             dst_pitch
  ;)
  ;
  ; Two-pass bilinear prediction of a 4-wide, 4-row block.
  ;
  ; HFilter / VFilter are the 32-byte entries of vp8_bilinear_filters_x86_8
  ; selected by xoffset / yoffset; each entry is used as two tap vectors, at
  ; byte offsets 0 and 16.
  ;
  ;   horizontal:  H[r][c] = (src[r][c]*HFilter0 + src[r][c+1]*HFilter1 + 64) >> 7
  ;   vertical:    dst[r][c] = (H[r][c]*VFilter0 + H[r+1][c]*VFilter1 + 64) >> 7
  ;
  ; The previous horizontal row is carried between iterations in mm7, packed
  ; to bytes with unsigned saturation; the final result is packed the same way.
  ; Arithmetic is 16-bit (pmullw / paddw).  Five source rows of five bytes each
  ; are read.  The loop ends when the destination pointer equals
  ; dst_ptr + 4 * dst_pitch.
  global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
  sym(vp8_bilinear_predict4x4_mmx):
      push        rbp
      mov         rbp, rsp
      SHADOW_ARGS_TO_STACK 6
      GET_GOT     rbx
      push        rsi
      push        rdi
      ; end prolog

      ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
      ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

          movsxd      rax,    dword ptr arg(2)    ;xoffset
          mov         rdi,    arg(4)              ;dst_ptr

          lea         rcx,    [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
          shl         rax,    5                   ; xoffset * 32 (bytes per table entry)

          add         rax,    rcx                 ; rax = HFilter
          mov         rsi,    arg(0)              ;src_ptr

          movsxd      rdx,    dword ptr arg(5)    ;dst_pitch
          movq        mm1,    [rax]               ; mm1 = HFilter tap 0
          movq        mm2,    [rax+16]            ; mm2 = HFilter tap 1

          movsxd      rax,    dword ptr arg(3)    ;yoffset
          pxor        mm0,    mm0                 ; mm0 = 0 for unpacking

          shl         rax,    5                   ; yoffset * 32
          add         rax,    rcx                 ; rax = VFilter

          lea         rcx,    [rdi+rdx*4]         ; rcx = dst_ptr + 4 * dst_pitch (loop end)
          movsxd      rdx,    dword ptr arg(1)    ;src_pixels_per_line

          ; horizontal pass for the first source row
          movd        mm3,    [rsi]               ; 4 source bytes src[0..3]
          punpcklbw   mm3,    mm0                 ; mm3 = src[0..3] as words
          pmullw      mm3,    mm1                 ; * HFilter tap 0

          movd        mm5,    [rsi+1]             ; 4 source bytes src[1..4]
          punpcklbw   mm5,    mm0                 ; mm5 = src[1..4] as words
          pmullw      mm5,    mm2                 ; * HFilter tap 1

          paddw       mm3,    mm5
          paddw       mm3,    [GLOBAL(rd)]        ; mm3 += rounding constant

          psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7

          movq        mm7,    mm3
          packuswb    mm7,    mm0                 ; mm7 = first horizontal row, 4 bytes

          add         rsi,    rdx                 ; next source row

  .next_row_4x4:
          ; horizontal pass for the current source row (sum only, not yet rounded)
          movd        mm3,    [rsi]               ; 4 source bytes src[0..3]
          punpcklbw   mm3,    mm0                 ; mm3 = src[0..3] as words
          pmullw      mm3,    mm1                 ; * HFilter tap 0

          movd        mm5,    [rsi+1]             ; 4 source bytes src[1..4]
          punpcklbw   mm5,    mm0
          pmullw      mm5,    mm2                 ; * HFilter tap 1

          paddw       mm3,    mm5

          ; previous horizontal row * VFilter tap 0
          movq        mm5,    mm7
          punpcklbw   mm5,    mm0
          pmullw      mm5,    [rax]

          ; finish the horizontal result for the current row
          paddw       mm3,    [GLOBAL(rd)]        ; mm3 += rounding constant
          psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7

          movq        mm7,    mm3
          packuswb    mm7,    mm0                 ; mm7 = current row, kept for next iteration

          ; current horizontal row * VFilter tap 1, combined with previous row
          pmullw      mm3,    [rax+16]
          paddw       mm3,    mm5

          paddw       mm3,    [GLOBAL(rd)]        ; mm3 += rounding constant
          psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7

          packuswb    mm3,    mm0                 ; saturate to bytes
          movd        [rdi],  mm3                 ; store 4 output bytes

  %if ABI_IS_32BIT
          add         rsi,    rdx                 ; next source row
          add         rdi,    dword ptr arg(5)    ;dst_pitch
  %else
          movsxd      r8,     dword ptr arg(5)    ;dst_pitch
          add         rsi,    rdx                 ; next source row
          add         rdi,    r8                  ; next destination row
  %endif

          cmp         rdi,    rcx                 ; reached dst_ptr + 4 * dst_pitch?
          jne         .next_row_4x4

      ; begin epilog
      pop         rdi
      pop         rsi
      RESTORE_GOT
      UNSHADOW_ARGS
      pop         rbp
      ret
  SECTION_RODATA

  ; Rounding constant: four words of 0x40 (64), added to each filtered sum
  ; before the arithmetic shift right by VP8_FILTER_SHIFT.
  align 16
  rd:
      times 4 dw 0x40
  ; Six-tap filter table: 8 filters x 6 taps.  Every tap is replicated into
  ; 8 words (16 bytes), so one filter occupies 96 bytes and tap N of a filter
  ; sits at byte offset 16*N.  The taps of each filter sum to 128.
  ; NOTE(review): presumably rows of this table are what callers pass as the
  ; vp8_filter argument of the six-tap routines - confirm against the callers.
  align 16
  global HIDDEN_DATA(sym(vp8_six_tap_mmx))
  sym(vp8_six_tap_mmx):
      ; filter 0
      times 8 dw 0
      times 8 dw 0
      times 8 dw 128
      times 8 dw 0
      times 8 dw 0
      times 8 dw 0

      ; filter 1
      times 8 dw 0
      times 8 dw -6
      times 8 dw 123
      times 8 dw 12
      times 8 dw -1
      times 8 dw 0

      ; filter 2
      times 8 dw 2
      times 8 dw -11
      times 8 dw 108
      times 8 dw 36
      times 8 dw -8
      times 8 dw 1

      ; filter 3
      times 8 dw 0
      times 8 dw -9
      times 8 dw 93
      times 8 dw 50
      times 8 dw -6
      times 8 dw 0

      ; filter 4
      times 8 dw 3
      times 8 dw -16
      times 8 dw 77
      times 8 dw 77
      times 8 dw -16
      times 8 dw 3

      ; filter 5
      times 8 dw 0
      times 8 dw -6
      times 8 dw 50
      times 8 dw 93
      times 8 dw -9
      times 8 dw 0

      ; filter 6
      times 8 dw 1
      times 8 dw -8
      times 8 dw 36
      times 8 dw 108
      times 8 dw -11
      times 8 dw 2

      ; filter 7
      times 8 dw 0
      times 8 dw -1
      times 8 dw 12
      times 8 dw 123
      times 8 dw -6
      times 8 dw 0