highbd_subpel_variance_impl_sse2.asm

;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_8: times 8 dw 8
bilin_filter_m_sse2: times 8 dw 16
                     times 8 dw 0
                     times 8 dw 14
                     times 8 dw 2
                     times 8 dw 12
                     times 8 dw 4
                     times 8 dw 10
                     times 8 dw 6
                     times 16 dw 8
                     times 8 dw 6
                     times 8 dw 10
                     times 8 dw 4
                     times 8 dw 12
                     times 8 dw 2
                     times 8 dw 14

SECTION .text

; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *ref, ptrdiff_t ref_stride,
;                               int height, unsigned int *sse);
;
; This function returns the SE and stores SSE in the given pointer.
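;
; x_offset/y_offset select the horizontal/vertical sub-pel phase: an offset of
; 0 means no filtering in that direction, an offset of 8 is handled with a
; simple pavgw average of two neighboring samples, and any other offset picks a
; pair of weights (w0, w1) from bilin_filter_m_sse2 with w0 + w1 == 16, so that
; out = (w0 * a + w1 * b + 8) >> 4.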

%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
  psubw %3, %4
  psubw %1, %2
  mova %4, %3   ; make copies to manipulate to calc sum
  mova %2, %1   ; use originals for calc sse
  pmaddwd %3, %3
  paddw %4, %2
  pmaddwd %1, %1
  movhlps %2, %4
  paddd %6, %3
  paddw %4, %2
  pxor %2, %2
  pcmpgtw %2, %4     ; mask for 0 > %4 (sum)
  punpcklwd %4, %2   ; sign-extend word to dword
  paddd %6, %1
  paddd %5, %4
%endmacro
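
; In scalar terms, one SUM_SSE invocation consumes two src/ref register pairs
; (16 words in total) and is equivalent to, per pixel:
;   d = src - ref; sum += d; sse += d * d;
; with sum kept as four sign-extended dword partial sums in %5 and sse as four
; dword partial sums in %6.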

%macro STORE_AND_RET 0
%if mmsize == 16
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputting to a dword.
  movhlps m3, m7
  movhlps m4, m6
  paddd m7, m3
  paddd m6, m4
  pshufd m3, m7, 0x1
  pshufd m4, m6, 0x1
  paddd m7, m3
  paddd m6, m4
  mov r1, ssem    ; r1 = unsigned int *sse
  movd [r1], m7   ; store sse
  movd eax, m6    ; store sum as return value
%endif
  RET
%endmacro

%macro INC_SRC_BY_SRC_STRIDE 0
%if ARCH_X86=1 && CONFIG_PIC=1
  add srcq, src_stridemp
  add srcq, src_stridemp
%else
  lea srcq, [srcq + src_strideq*2]
%endif
%endmacro
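
; INC_SRC_BY_SRC_STRIDE advances srcq by one row: samples are 16 bits wide, so
; the byte step is src_stride * 2. On 32-bit PIC builds the stride is read from
; its memory slot (src_stridemp) rather than a register, so it is added twice
; instead of using a scaled lea.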

%macro SUBPEL_VARIANCE 1-2 0 ; W
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5

%if ARCH_X86_64
%if %2 == 1 ; avg
cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
        x_offset, y_offset, \
        ref, ref_stride, \
        second_pred, second_stride, height, sse
%define second_str second_strideq
%else
cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
        x_offset, y_offset, \
        ref, ref_stride, height, sse
%endif
%define block_height heightd
%define bilin_filter sseq
%else
%if CONFIG_PIC=1
%if %2 == 1 ; avg
cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
        x_offset, y_offset, \
        ref, ref_stride, \
        second_pred, second_stride, height, sse
%define block_height dword heightm
%define second_str second_stridemp
%else
cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
        x_offset, y_offset, \
        ref, ref_stride, height, sse
%define block_height heightd
%endif

  ; reuse argument stack space
%define g_bilin_filterm x_offsetm
%define g_pw_8m y_offsetm

  ; Store bilin_filter and pw_8 location in stack
%if GET_GOT_DEFINED == 1
  GET_GOT eax
  add esp, 4 ; restore esp
%endif
  lea ecx, [GLOBAL(bilin_filter_m)]
  mov g_bilin_filterm, ecx
  lea ecx, [GLOBAL(pw_8)]
  mov g_pw_8m, ecx
  LOAD_IF_USED 0, 1 ; load eax, ecx back
%else
%if %2 == 1 ; avg
cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
        x_offset, y_offset, \
        ref, ref_stride, \
        second_pred, second_stride, height, sse
%define block_height dword heightm
%define second_str second_stridemp
%else
cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
        x_offset, y_offset, \
        ref, ref_stride, height, sse
%define block_height heightd
%endif
%define bilin_filter bilin_filter_m
%endif
%endif

  ASSERT %1 <= 16 ; m6 overflows if w > 16
  pxor m6, m6 ; sum
  pxor m7, m7 ; sse
%if %1 < 16
  sar block_height, 1
%endif
%if %2 == 1 ; avg
  shl second_str, 1
%endif

  ; FIXME(rbultje) replace by jumptable?
  test x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m2, [srcq + 16]
  mova m1, [refq]
  mova m3, [refq + 16]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m2, [second_predq+16]
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7
  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq + src_strideq*2]
  mova m1, [refq]
  mova m3, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m2, [second_predq]
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7
  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp y_offsetd, 8
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m4, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*2+16]
  mova m2, [refq]
  mova m3, [refq+16]
  pavgw m0, m4
  pavgw m1, m5
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7
  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m1, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*4]
  mova m2, [refq]
  mova m3, [refq+ref_strideq*2]
  pavgw m0, m1
  pavgw m1, m5
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m1, [second_predq]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7
  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+y_offsetq]
  mova m9, [bilin_filter+y_offsetq+16]
  mova m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
  ; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq + 16]
  movu m4, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*2+16]
  mova m2, [refq]
  mova m3, [refq+16]
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
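  ; (The two forms are equal because (num-x)*in1 + x*in2 == num*in1 + x*(in2-in1),
  ; and num*in1 >> log2(num) is exactly in1.)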
  pmullw m1, filter_y_a
  pmullw m5, filter_y_b
  paddw m1, filter_rnd
  pmullw m0, filter_y_a
  pmullw m4, filter_y_b
  paddw m0, filter_rnd
  paddw m1, m5
  paddw m0, m4
  psrlw m1, 4
  psrlw m0, 4
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7
  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m1, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*4]
  mova m4, m1
  mova m2, [refq]
  mova m3, [refq+ref_strideq*2]
  pmullw m1, filter_y_a
  pmullw m5, filter_y_b
  paddw m1, filter_rnd
  pmullw m0, filter_y_a
  pmullw m4, filter_y_b
  paddw m0, filter_rnd
  paddw m1, m5
  paddw m0, m4
  psrlw m1, 4
  psrlw m0, 4
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m1, [second_predq]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7
  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp x_offsetd, 8
  jne .x_nonhalf
  ; x_offset == 0.5
  test y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq + 16]
  movu m4, [srcq + 2]
  movu m5, [srcq + 18]
  mova m2, [refq]
  mova m3, [refq + 16]
  pavgw m0, m4
  pavgw m1, m5
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7
  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m1, [srcq + src_strideq*2]
  movu m4, [srcq + 2]
  movu m5, [srcq + src_strideq*2 + 2]
  mova m2, [refq]
  mova m3, [refq + ref_strideq*2]
  pavgw m0, m4
  pavgw m1, m5
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m1, [second_predq]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7
  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp y_offsetd, 8
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+2]
  movu m3, [srcq+18]
  lea srcq, [srcq + src_strideq*2]
  pavgw m0, m2
  pavgw m1, m3
.x_half_y_half_loop:
  movu m2, [srcq]
  movu m3, [srcq + 16]
  movu m4, [srcq + 2]
  movu m5, [srcq + 18]
  pavgw m2, m4
  pavgw m3, m5
  pavgw m0, m2
  pavgw m1, m3
  mova m4, [refq]
  mova m5, [refq + 16]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m4, m1, m5, m6, m7
  mova m0, m2
  mova m1, m3
  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  lea srcq, [srcq + src_strideq*2]
  pavgw m0, m2
.x_half_y_half_loop:
  movu m2, [srcq]
  movu m3, [srcq + src_strideq*2]
  movu m4, [srcq + 2]
  movu m5, [srcq + src_strideq*2 + 2]
  pavgw m2, m4
  pavgw m3, m5
  pavgw m0, m2
  pavgw m2, m3
  mova m4, [refq]
  mova m5, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m2, [second_predq]
%endif
  SUM_SSE m0, m4, m2, m5, m6, m7
  mova m0, m3
  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_half_y_half_loop
  STORE_AND_RET

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+y_offsetq]
  mova m9, [bilin_filter+y_offsetq+16]
  mova m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86_32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+2]
  movu m3, [srcq+18]
  lea srcq, [srcq + src_strideq*2]
  pavgw m0, m2
  pavgw m1, m3
.x_half_y_other_loop:
  movu m2, [srcq]
  movu m3, [srcq+16]
  movu m4, [srcq+2]
  movu m5, [srcq+18]
  pavgw m2, m4
  pavgw m3, m5
  mova m4, m2
  mova m5, m3
  pmullw m1, filter_y_a
  pmullw m3, filter_y_b
  paddw m1, filter_rnd
  paddw m1, m3
  pmullw m0, filter_y_a
  pmullw m2, filter_y_b
  paddw m0, filter_rnd
  psrlw m1, 4
  paddw m0, m2
  mova m2, [refq]
  psrlw m0, 4
  mova m3, [refq+16]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7
  mova m0, m4
  mova m1, m5
  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  lea srcq, [srcq + src_strideq*2]
  pavgw m0, m2
.x_half_y_other_loop:
  movu m2, [srcq]
  movu m3, [srcq+src_strideq*2]
  movu m4, [srcq+2]
  movu m5, [srcq+src_strideq*2+2]
  pavgw m2, m4
  pavgw m3, m5
  mova m4, m2
  mova m5, m3
  pmullw m4, filter_y_a
  pmullw m3, filter_y_b
  paddw m4, filter_rnd
  paddw m4, m3
  pmullw m0, filter_y_a
  pmullw m2, filter_y_b
  paddw m0, filter_rnd
  psrlw m4, 4
  paddw m0, m2
  mova m2, [refq]
  psrlw m0, 4
  mova m3, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m4, [second_predq]
%endif
  SUM_SSE m0, m2, m4, m3, m6, m7
  mova m0, m5
  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+x_offsetq]
  mova m9, [bilin_filter+x_offsetq+16]
  mova m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+2]
  movu m3, [srcq+18]
  mova m4, [refq]
  mova m5, [refq+16]
  pmullw m1, filter_x_a
  pmullw m3, filter_x_b
  paddw m1, filter_rnd
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  paddw m1, m3
  paddw m0, m2
  psrlw m1, 4
  psrlw m0, 4
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m4, m1, m5, m6, m7
  lea srcq, [srcq+src_strideq*2]
  lea refq, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m1, [srcq+src_strideq*2]
  movu m2, [srcq+2]
  movu m3, [srcq+src_strideq*2+2]
  mova m4, [refq]
  mova m5, [refq+ref_strideq*2]
  pmullw m1, filter_x_a
  pmullw m3, filter_x_b
  paddw m1, filter_rnd
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  paddw m1, m3
  paddw m0, m2
  psrlw m1, 4
  psrlw m0, 4
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m1, [second_predq]
%endif
  SUM_SSE m0, m4, m1, m5, m6, m7
  lea srcq, [srcq+src_strideq*4]
  lea refq, [refq+ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonzero:
  cmp y_offsetd, 8
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+x_offsetq]
  mova m9, [bilin_filter+x_offsetq+16]
  mova m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+2]
  movu m3, [srcq+18]
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  pmullw m1, filter_x_a
  pmullw m3, filter_x_b
  paddw m1, filter_rnd
  paddw m0, m2
  paddw m1, m3
  psrlw m0, 4
  psrlw m1, 4
  lea srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu m2, [srcq]
  movu m3, [srcq+16]
  movu m4, [srcq+2]
  movu m5, [srcq+18]
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m2, filter_rnd
  pmullw m3, filter_x_a
  pmullw m5, filter_x_b
  paddw m3, filter_rnd
  paddw m2, m4
  paddw m3, m5
  mova m4, [refq]
  mova m5, [refq+16]
  psrlw m2, 4
  psrlw m3, 4
  pavgw m0, m2
  pavgw m1, m3
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m4, m1, m5, m6, m7
  mova m0, m2
  mova m1, m3
  lea srcq, [srcq+src_strideq*2]
  lea refq, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  paddw m0, m2
  psrlw m0, 4
  lea srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu m2, [srcq]
  movu m3, [srcq+src_strideq*2]
  movu m4, [srcq+2]
  movu m5, [srcq+src_strideq*2+2]
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m2, filter_rnd
  pmullw m3, filter_x_a
  pmullw m5, filter_x_b
  paddw m3, filter_rnd
  paddw m2, m4
  paddw m3, m5
  mova m4, [refq]
  mova m5, [refq+ref_strideq*2]
  psrlw m2, 4
  psrlw m3, 4
  pavgw m0, m2
  pavgw m2, m3
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m2, [second_predq]
%endif
  SUM_SSE m0, m4, m2, m5, m6, m7
  mova m0, m3
  lea srcq, [srcq+src_strideq*4]
  lea refq, [refq+ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonhalf:
  ; load the filters - this is the same as in the 8-bit-depth version
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+x_offsetq]
  mova m9, [bilin_filter+x_offsetq+16]
  mova m10, [bilin_filter+y_offsetq]
  mova m11, [bilin_filter+y_offsetq+16]
  mova m12, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; In this case there is no unused register, so reuse the src_stride register.
  ; src_stride then has to be reloaded from the stack whenever it is needed.
%define tempq src_strideq
  mov tempq, g_bilin_filterm
  add x_offsetq, tempq
  add y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
  add y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif
  ; end of load filter

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  movu m1, [srcq+16]
  movu m3, [srcq+18]
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  pmullw m1, filter_x_a
  pmullw m3, filter_x_b
  paddw m1, filter_rnd
  paddw m0, m2
  paddw m1, m3
  psrlw m0, 4
  psrlw m1, 4
  INC_SRC_BY_SRC_STRIDE
.x_other_y_other_loop:
  movu m2, [srcq]
  movu m4, [srcq+2]
  movu m3, [srcq+16]
  movu m5, [srcq+18]
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m2, filter_rnd
  pmullw m3, filter_x_a
  pmullw m5, filter_x_b
  paddw m3, filter_rnd
  paddw m2, m4
  paddw m3, m5
  psrlw m2, 4
  psrlw m3, 4
  mova m4, m2
  mova m5, m3
  pmullw m0, filter_y_a
  pmullw m2, filter_y_b
  paddw m0, filter_rnd
  pmullw m1, filter_y_a
  pmullw m3, filter_y_b
  paddw m0, m2
  paddw m1, filter_rnd
  mova m2, [refq]
  paddw m1, m3
  psrlw m0, 4
  psrlw m1, 4
  mova m3, [refq+16]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7
  mova m0, m4
  mova m1, m5
  INC_SRC_BY_SRC_STRIDE
  lea refq, [refq + ref_strideq * 2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  paddw m0, m2
  psrlw m0, 4
  INC_SRC_BY_SRC_STRIDE
.x_other_y_other_loop:
  movu m2, [srcq]
  movu m4, [srcq+2]
  INC_SRC_BY_SRC_STRIDE
  movu m3, [srcq]
  movu m5, [srcq+2]
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m2, filter_rnd
  pmullw m3, filter_x_a
  pmullw m5, filter_x_b
  paddw m3, filter_rnd
  paddw m2, m4
  paddw m3, m5
  psrlw m2, 4
  psrlw m3, 4
  mova m4, m2
  mova m5, m3
  pmullw m0, filter_y_a
  pmullw m2, filter_y_b
  paddw m0, filter_rnd
  pmullw m4, filter_y_a
  pmullw m3, filter_y_b
  paddw m0, m2
  paddw m4, filter_rnd
  mova m2, [refq]
  paddw m4, m3
  psrlw m0, 4
  psrlw m4, 4
  mova m3, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m4, [second_predq]
%endif
  SUM_SSE m0, m2, m4, m3, m6, m7
  mova m0, m5
  INC_SRC_BY_SRC_STRIDE
  lea refq, [refq + ref_strideq * 4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

INIT_XMM sse2
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16

INIT_XMM sse2
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
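
; For reference: these functions return the sum of differences (SE) in eax and
; store the sum of squared differences through *sse; they do not compute the
; variance themselves. A caller would typically derive it roughly as in the
; sketch below (illustrative only - the exact rounding and bit-depth scaling
; are handled by the C wrappers, and the symbol/parameter names here are just
; placeholders matching the prototype comment at the top of this file):
;
;   uint32_t sse;
;   int sum = vpx_highbd_sub_pixel_variance16xh(src, src_stride,
;                                               x_offset, y_offset,
;                                               ref, ref_stride, height, &sse);
;   uint32_t var = sse - (uint32_t)(((int64_t)sum * sum) / (16 * height));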