;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
  10. %include "third_party/x86inc/x86inc.asm"
  11. SECTION .text
  12. %macro HIGH_SAD_FN 4
  13. %if %4 == 0
  14. %if %3 == 5
  15. cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
  16. %else ; %3 == 7
  17. cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
  18. src_stride3, ref_stride3, n_rows
  19. %endif ; %3 == 5/7
  20. %else ; avg
  21. %if %3 == 5
  22. cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
  23. second_pred, n_rows
  24. %else ; %3 == 7
  25. cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
  26. ref, ref_stride, \
  27. second_pred, \
  28. src_stride3, ref_stride3
  29. %if ARCH_X86_64
  30. %define n_rowsd r7d
  31. %else ; x86-32
  32. %define n_rowsd dword r0m
  33. %endif ; x86-32/64
  34. %endif ; %3 == 5/7
  35. %endif ; avg/sad
  36. movsxdifnidn src_strideq, src_strided
  37. movsxdifnidn ref_strideq, ref_strided
  38. %if %3 == 7
  39. lea src_stride3q, [src_strideq*3]
  40. lea ref_stride3q, [ref_strideq*3]
  41. %endif ; %3 == 7
  42. ; convert src, ref & second_pred to short ptrs (from byte ptrs)
  43. shl srcq, 1
  44. shl refq, 1
  45. %if %4 == 1
  46. shl second_predq, 1
  47. %endif
  48. %endmacro
  49. ; unsigned int vpx_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
  50. ; uint8_t *ref, int ref_stride);
  51. %macro HIGH_SAD64XN 1-2 0
  52. HIGH_SAD_FN 64, %1, 5, %2
  53. mov n_rowsd, %1
  54. pxor m0, m0
  55. pxor m6, m6
  56. .loop:
  57. ; first half of each row
  58. movu m1, [refq]
  59. movu m2, [refq+16]
  60. movu m3, [refq+32]
  61. movu m4, [refq+48]
  62. %if %2 == 1
  63. pavgw m1, [second_predq+mmsize*0]
  64. pavgw m2, [second_predq+mmsize*1]
  65. pavgw m3, [second_predq+mmsize*2]
  66. pavgw m4, [second_predq+mmsize*3]
  67. lea second_predq, [second_predq+mmsize*4]
  68. %endif
  69. mova m5, [srcq]
  70. psubusw m5, m1
  71. psubusw m1, [srcq]
  72. por m1, m5
  73. mova m5, [srcq+16]
  74. psubusw m5, m2
  75. psubusw m2, [srcq+16]
  76. por m2, m5
  77. mova m5, [srcq+32]
  78. psubusw m5, m3
  79. psubusw m3, [srcq+32]
  80. por m3, m5
  81. mova m5, [srcq+48]
  82. psubusw m5, m4
  83. psubusw m4, [srcq+48]
  84. por m4, m5
  85. paddw m1, m2
  86. paddw m3, m4
  87. movhlps m2, m1
  88. movhlps m4, m3
  89. paddw m1, m2
  90. paddw m3, m4
  91. punpcklwd m1, m6
  92. punpcklwd m3, m6
  93. paddd m0, m1
  94. paddd m0, m3
  95. ; second half of each row
  96. movu m1, [refq+64]
  97. movu m2, [refq+80]
  98. movu m3, [refq+96]
  99. movu m4, [refq+112]
  100. %if %2 == 1
  101. pavgw m1, [second_predq+mmsize*0]
  102. pavgw m2, [second_predq+mmsize*1]
  103. pavgw m3, [second_predq+mmsize*2]
  104. pavgw m4, [second_predq+mmsize*3]
  105. lea second_predq, [second_predq+mmsize*4]
  106. %endif
  107. mova m5, [srcq+64]
  108. psubusw m5, m1
  109. psubusw m1, [srcq+64]
  110. por m1, m5
  111. mova m5, [srcq+80]
  112. psubusw m5, m2
  113. psubusw m2, [srcq+80]
  114. por m2, m5
  115. mova m5, [srcq+96]
  116. psubusw m5, m3
  117. psubusw m3, [srcq+96]
  118. por m3, m5
  119. mova m5, [srcq+112]
  120. psubusw m5, m4
  121. psubusw m4, [srcq+112]
  122. por m4, m5
  123. paddw m1, m2
  124. paddw m3, m4
  125. movhlps m2, m1
  126. movhlps m4, m3
  127. paddw m1, m2
  128. paddw m3, m4
  129. punpcklwd m1, m6
  130. punpcklwd m3, m6
  131. lea refq, [refq+ref_strideq*2]
  132. paddd m0, m1
  133. lea srcq, [srcq+src_strideq*2]
  134. paddd m0, m3
  135. dec n_rowsd
  136. jg .loop
  137. movhlps m1, m0
  138. paddd m0, m1
  139. punpckldq m0, m6
  140. movhlps m1, m0
  141. paddd m0, m1
  142. movd eax, m0
  143. RET
  144. %endmacro
  145. INIT_XMM sse2
  146. HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
  147. HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
  148. HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
  149. HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
  150. ; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
  151. ; uint8_t *ref, int ref_stride);
  152. %macro HIGH_SAD32XN 1-2 0
  153. HIGH_SAD_FN 32, %1, 5, %2
  154. mov n_rowsd, %1
  155. pxor m0, m0
  156. pxor m6, m6
  157. .loop:
  158. movu m1, [refq]
  159. movu m2, [refq+16]
  160. movu m3, [refq+32]
  161. movu m4, [refq+48]
  162. %if %2 == 1
  163. pavgw m1, [second_predq+mmsize*0]
  164. pavgw m2, [second_predq+mmsize*1]
  165. pavgw m3, [second_predq+mmsize*2]
  166. pavgw m4, [second_predq+mmsize*3]
  167. lea second_predq, [second_predq+mmsize*4]
  168. %endif
  169. mova m5, [srcq]
  170. psubusw m5, m1
  171. psubusw m1, [srcq]
  172. por m1, m5
  173. mova m5, [srcq+16]
  174. psubusw m5, m2
  175. psubusw m2, [srcq+16]
  176. por m2, m5
  177. mova m5, [srcq+32]
  178. psubusw m5, m3
  179. psubusw m3, [srcq+32]
  180. por m3, m5
  181. mova m5, [srcq+48]
  182. psubusw m5, m4
  183. psubusw m4, [srcq+48]
  184. por m4, m5
  185. paddw m1, m2
  186. paddw m3, m4
  187. movhlps m2, m1
  188. movhlps m4, m3
  189. paddw m1, m2
  190. paddw m3, m4
  191. punpcklwd m1, m6
  192. punpcklwd m3, m6
  193. lea refq, [refq+ref_strideq*2]
  194. paddd m0, m1
  195. lea srcq, [srcq+src_strideq*2]
  196. paddd m0, m3
  197. dec n_rowsd
  198. jg .loop
  199. movhlps m1, m0
  200. paddd m0, m1
  201. punpckldq m0, m6
  202. movhlps m1, m0
  203. paddd m0, m1
  204. movd eax, m0
  205. RET
  206. %endmacro
  207. INIT_XMM sse2
  208. HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
  209. HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
  210. HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
  211. HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
  212. HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
  213. HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
  214. ; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
  215. ; uint8_t *ref, int ref_stride);
  216. %macro HIGH_SAD16XN 1-2 0
  217. HIGH_SAD_FN 16, %1, 5, %2
  218. mov n_rowsd, %1/2
  219. pxor m0, m0
  220. pxor m6, m6
  221. .loop:
  222. movu m1, [refq]
  223. movu m2, [refq+16]
  224. movu m3, [refq+ref_strideq*2]
  225. movu m4, [refq+ref_strideq*2+16]
  226. %if %2 == 1
  227. pavgw m1, [second_predq+mmsize*0]
  228. pavgw m2, [second_predq+16]
  229. pavgw m3, [second_predq+mmsize*2]
  230. pavgw m4, [second_predq+mmsize*2+16]
  231. lea second_predq, [second_predq+mmsize*4]
  232. %endif
  233. mova m5, [srcq]
  234. psubusw m5, m1
  235. psubusw m1, [srcq]
  236. por m1, m5
  237. mova m5, [srcq+16]
  238. psubusw m5, m2
  239. psubusw m2, [srcq+16]
  240. por m2, m5
  241. mova m5, [srcq+src_strideq*2]
  242. psubusw m5, m3
  243. psubusw m3, [srcq+src_strideq*2]
  244. por m3, m5
  245. mova m5, [srcq+src_strideq*2+16]
  246. psubusw m5, m4
  247. psubusw m4, [srcq+src_strideq*2+16]
  248. por m4, m5
  249. paddw m1, m2
  250. paddw m3, m4
  251. movhlps m2, m1
  252. movhlps m4, m3
  253. paddw m1, m2
  254. paddw m3, m4
  255. punpcklwd m1, m6
  256. punpcklwd m3, m6
  257. lea refq, [refq+ref_strideq*4]
  258. paddd m0, m1
  259. lea srcq, [srcq+src_strideq*4]
  260. paddd m0, m3
  261. dec n_rowsd
  262. jg .loop
  263. movhlps m1, m0
  264. paddd m0, m1
  265. punpckldq m0, m6
  266. movhlps m1, m0
  267. paddd m0, m1
  268. movd eax, m0
  269. RET
  270. %endmacro
  271. INIT_XMM sse2
  272. HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
  273. HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
  274. HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
  275. HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
  276. HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
  277. HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
  278. ; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
  279. ; uint8_t *ref, int ref_stride);
  280. %macro HIGH_SAD8XN 1-2 0
  281. HIGH_SAD_FN 8, %1, 7, %2
  282. mov n_rowsd, %1/4
  283. pxor m0, m0
  284. pxor m6, m6
  285. .loop:
  286. movu m1, [refq]
  287. movu m2, [refq+ref_strideq*2]
  288. movu m3, [refq+ref_strideq*4]
  289. movu m4, [refq+ref_stride3q*2]
  290. %if %2 == 1
  291. pavgw m1, [second_predq+mmsize*0]
  292. pavgw m2, [second_predq+mmsize*1]
  293. pavgw m3, [second_predq+mmsize*2]
  294. pavgw m4, [second_predq+mmsize*3]
  295. lea second_predq, [second_predq+mmsize*4]
  296. %endif
  297. mova m5, [srcq]
  298. psubusw m5, m1
  299. psubusw m1, [srcq]
  300. por m1, m5
  301. mova m5, [srcq+src_strideq*2]
  302. psubusw m5, m2
  303. psubusw m2, [srcq+src_strideq*2]
  304. por m2, m5
  305. mova m5, [srcq+src_strideq*4]
  306. psubusw m5, m3
  307. psubusw m3, [srcq+src_strideq*4]
  308. por m3, m5
  309. mova m5, [srcq+src_stride3q*2]
  310. psubusw m5, m4
  311. psubusw m4, [srcq+src_stride3q*2]
  312. por m4, m5
  313. paddw m1, m2
  314. paddw m3, m4
  315. movhlps m2, m1
  316. movhlps m4, m3
  317. paddw m1, m2
  318. paddw m3, m4
  319. punpcklwd m1, m6
  320. punpcklwd m3, m6
  321. lea refq, [refq+ref_strideq*8]
  322. paddd m0, m1
  323. lea srcq, [srcq+src_strideq*8]
  324. paddd m0, m3
  325. dec n_rowsd
  326. jg .loop
  327. movhlps m1, m0
  328. paddd m0, m1
  329. punpckldq m0, m6
  330. movhlps m1, m0
  331. paddd m0, m1
  332. movd eax, m0
  333. RET
  334. %endmacro
  335. INIT_XMM sse2
  336. HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
  337. HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
  338. HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
  339. HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
  340. HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
  341. HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2