2
0

variance_mmi.c 65 KB


  1. /*
  2. * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "./vpx_dsp_rtcd.h"
  11. #include "vpx_dsp/variance.h"
  12. #include "vpx_ports/mem.h"
  13. #include "vpx/vpx_integer.h"
  14. #include "vpx_ports/asmdefs_mmi.h"
  15. static const uint8_t bilinear_filters[8][2] = {
  16. { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  17. { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
  18. };
/* Accumulates the squared error and the signed pixel difference of 8 pixels,
 * keeping the running sum in 32-bit lanes.
 *
 * Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64, vpx_variance64x32 and
 * vpx_variance32x64. VARIANCE_SSE_SUM_8 keeps its sums in 16-bit lanes, which
 * would overflow for those block sizes.
 *
 * Expects: ftmp1 = 8 source bytes, ftmp2 = 8 reference bytes, ftmp0 = 0.
 * Updates: ftmp10 += squared absolute differences (two 32-bit lanes),
 *          ftmp9  += (src - ref), widened to 32 bits (two 32-bit lanes).
 * Clobbers: ftmp1 through ftmp8 (ftmp1/ftmp2 are reused as scratch by the
 * sum part, so the inputs do not survive the macro). */
#define VARIANCE_SSE_SUM_8_FOR_W64 \
  /* sse */ \
  "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
  "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
  "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
  "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
  "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
  "paddw %[ftmp10], %[ftmp10], %[ftmp6] \n\t" \
  "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \
  \
  /* sum */ \
  "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
  "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
  "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \
  "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
  "punpcklhw %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
  "punpckhhw %[ftmp2], %[ftmp3], %[ftmp0] \n\t" \
  "punpcklhw %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \
  "punpckhhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \
  "psubw %[ftmp3], %[ftmp1], %[ftmp7] \n\t" \
  "psubw %[ftmp5], %[ftmp2], %[ftmp8] \n\t" \
  "punpcklhw %[ftmp1], %[ftmp4], %[ftmp0] \n\t" \
  "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t" \
  "punpcklhw %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
  "punpckhhw %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \
  "psubw %[ftmp4], %[ftmp1], %[ftmp7] \n\t" \
  "psubw %[ftmp6], %[ftmp2], %[ftmp8] \n\t" \
  "paddw %[ftmp9], %[ftmp9], %[ftmp3] \n\t" \
  "paddw %[ftmp9], %[ftmp9], %[ftmp4] \n\t" \
  "paddw %[ftmp9], %[ftmp9], %[ftmp5] \n\t" \
  "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t"
/* Accumulates the squared error and the per-operand pixel sums of 4 pixels.
 * Only the low four bytes of each input register are used.
 *
 * Expects: ftmp1 = source bytes, ftmp2 = reference bytes, ftmp0 = 0.
 * Updates: ftmp6 += squared absolute differences (two 32-bit lanes),
 *          ftmp7 += source pixels    (four 16-bit lanes),
 *          ftmp8 += reference pixels (four 16-bit lanes).
 * Clobbers: ftmp3, ftmp4, ftmp5.
 * The source and reference sums are kept separately; the caller has to
 * subtract them to obtain the sum of differences. */
#define VARIANCE_SSE_SUM_4 \
  /* sse */ \
  "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
  "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
  "pmaddhw %[ftmp5], %[ftmp4], %[ftmp4] \n\t" \
  "paddw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" \
  \
  /* sum */ \
  "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
  "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
  "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" \
  "paddh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
/* Accumulates the squared error and the per-operand pixel sums of 8 pixels,
 * keeping the sums in 16-bit lanes.
 *
 * Expects: ftmp1 = 8 source bytes, ftmp2 = 8 reference bytes, ftmp0 = 0.
 * Updates: ftmp8  += squared absolute differences (two 32-bit lanes),
 *          ftmp10 += source pixels    (four 16-bit lanes, low and high
 *                    halves of the input folded into the same lanes),
 *          ftmp12 += reference pixels (four 16-bit lanes, same folding).
 * Clobbers: ftmp3 through ftmp7.
 * Each invocation adds two bytes to every 16-bit lane, so the caller must
 * bound the number of invocations to keep the lanes from wrapping. */
#define VARIANCE_SSE_SUM_8 \
  /* sse */ \
  "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
  "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
  "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
  "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
  "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
  "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
  "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" \
  \
  /* sum */ \
  "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
  "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
  "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \
  "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \
  "paddh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" \
  "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" \
  "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" \
  "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t"
/* Loads 8 bytes from src_ptr and 8 bytes from ref_ptr (unaligned left/right
 * load pairs) and accumulates their squared differences. No pixel sum is
 * tracked.
 *
 * Expects: ftmp0 = 0.
 * Updates: ftmp8 += squared absolute differences (two 32-bit lanes).
 * Clobbers: ftmp1 through ftmp7. */
#define VARIANCE_SSE_8 \
  "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
  "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
  "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" \
  "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" \
  "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
  "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
  "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
  "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
  "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
  "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
  "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
/* Accumulates the squared differences of 16 pixels: VARIANCE_SSE_8 handles
 * bytes 0..7, then the same sequence is repeated for bytes 8..15 of src_ptr
 * and ref_ptr.
 *
 * Expects: ftmp0 = 0.
 * Updates: ftmp8 += squared absolute differences (two 32-bit lanes).
 * Clobbers: ftmp1 through ftmp7. */
#define VARIANCE_SSE_16 \
  VARIANCE_SSE_8 \
  "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \
  "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
  "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" \
  "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" \
  "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \
  "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \
  "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \
  "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \
  "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \
  "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \
  "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
/* Horizontal (first-pass) bilinear filter for 4 pixels, variant A.
 * Per 16-bit lane:
 *   ftmp2 = (src[i] * filter_x0 + ff_ph_40 + src[i + 1] * filter_x1) >> ftmp6
 *
 * Expects: ftmp0 = 0, ftmp6 = shift count; filter_x0, filter_x1 and ff_ph_40
 * are operands bound by the enclosing asm statement (not visible here;
 * ff_ph_40 is presumably the rounding constant -- confirm at the call site).
 * Result: ftmp2 (four 16-bit lanes). Clobbers: ftmp1, ftmp3.
 * The two loads fetch 8 bytes each, covering src_ptr[0..8], although only
 * the low four bytes of each load are used. */
#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \
  /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \
  "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
  "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
  "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
  "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \
  "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
  "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \
  "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
  "pmullh %[ftmp3], %[ftmp3], %[filter_x1] \n\t" \
  "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
  "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
/* Horizontal (first-pass) bilinear filter for 4 pixels, variant B.
 * Same computation as the _A variant but the result lands in ftmp4, so that
 * two consecutive rows can be held in ftmp2 and ftmp4 at the same time.
 * Per 16-bit lane:
 *   ftmp4 = (src[i] * filter_x0 + ff_ph_40 + src[i + 1] * filter_x1) >> ftmp6
 *
 * Expects: ftmp0 = 0, ftmp6 = shift count.
 * Result: ftmp4 (four 16-bit lanes). Clobbers: ftmp1, ftmp5. */
#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \
  /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \
  "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
  "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
  "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
  "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
  "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \
  "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
  "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \
  "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
  "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \
  "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
  "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t"
/* Vertical (second-pass) bilinear filter for 4 pixels, variant A.
 * Combines the first-pass row held in ftmp2 with the row held in ftmp4.
 * Per 16-bit lane:
 *   ftmp2 = (ftmp2 * filter_y0 + ff_ph_40 + ftmp4 * filter_y1) >> ftmp6
 * The result is ANDed with `mask`, packed to unsigned bytes and stored at
 * temp2_ptr.
 *
 * Expects: ftmp0 = 0, ftmp6 = shift count; `mask` is bound by the enclosing
 * asm statement (presumably keeps the low byte of each lane -- confirm).
 * Overwrites ftmp2 (the row in ftmp4 is preserved). Clobbers: ftmp1.
 * NOTE(review): only gssdrc1 is issued, without the matching gssdlc1; confirm
 * how many bytes this writes for the alignment of temp2_ptr in use. */
#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A \
  /* calculate: temp2[0] ~ temp2[3] */ \
  "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \
  "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
  "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \
  "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \
  "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \
  \
  /* store: temp2[0] ~ temp2[3] */ \
  "and %[ftmp2], %[ftmp2], %[mask] \n\t" \
  "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
  "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t"
/* Vertical (second-pass) bilinear filter for 4 pixels, variant B.
 * Mirror image of the _A variant with the register roles swapped.
 * Per 16-bit lane:
 *   ftmp4 = (ftmp4 * filter_y0 + ff_ph_40 + ftmp2 * filter_y1) >> ftmp6
 * The result is ANDed with `mask`, packed to unsigned bytes and stored at
 * temp2_ptr.
 *
 * Expects: ftmp0 = 0, ftmp6 = shift count.
 * Overwrites ftmp4 (the row in ftmp2 is preserved). Clobbers: ftmp1.
 * NOTE(review): only gssdrc1 is issued, without the matching gssdlc1; confirm
 * how many bytes this writes for the alignment of temp2_ptr in use. */
#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B \
  /* calculate: temp2[0] ~ temp2[3] */ \
  "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \
  "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
  "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \
  "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
  "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \
  \
  /* store: temp2[0] ~ temp2[3] */ \
  "and %[ftmp4], %[ftmp4], %[mask] \n\t" \
  "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
  "gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t"
/* Horizontal (first-pass) bilinear filter for 8 pixels, variant A.
 * Per 16-bit lane:
 *   out = (src[i] * filter_x0 + ff_ph_40 + src[i + 1] * filter_x1) >> ftmp14
 *
 * Expects: ftmp0 = 0, ftmp14 = shift count.
 * Result: ftmp2 = pixels 0..3, ftmp3 = pixels 4..7 (16-bit lanes).
 * Clobbers: ftmp1, ftmp4, ftmp5.
 * The two 8-byte loads together read src_ptr[0..8]. */
#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \
  /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \
  "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
  "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
  "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
  "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
  "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
  "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \
  "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
  "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
  "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \
  "pmullh %[ftmp3], %[ftmp3], %[filter_x0] \n\t" \
  "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
  "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \
  "pmullh %[ftmp4], %[ftmp4], %[filter_x1] \n\t" \
  "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \
  "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
  "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
  "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \
  "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t"
/* Horizontal (first-pass) bilinear filter for 8 pixels, variant B.
 * Same computation as the _A variant, writing to a second register set so
 * that two consecutive rows can be live at once.
 * Per 16-bit lane:
 *   out = (src[i] * filter_x0 + ff_ph_40 + src[i + 1] * filter_x1) >> ftmp14
 *
 * Expects: ftmp0 = 0, ftmp14 = shift count.
 * Result: ftmp8 = pixels 0..3, ftmp9 = pixels 4..7 (16-bit lanes).
 * Clobbers: ftmp1, ftmp10, ftmp11. */
#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \
  /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \
  "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \
  "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \
  "punpcklbh %[ftmp8], %[ftmp1], %[ftmp0] \n\t" \
  "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \
  "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
  "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \
  "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \
  "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \
  "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \
  "pmullh %[ftmp9], %[ftmp9], %[filter_x0] \n\t" \
  "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \
  "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \
  "pmullh %[ftmp10], %[ftmp10], %[filter_x1] \n\t" \
  "pmullh %[ftmp11], %[ftmp11], %[filter_x1] \n\t" \
  "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" \
  "paddh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" \
  "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \
  "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t"
/* Vertical (second-pass) bilinear filter for 8 pixels, variant A.
 * Combines the first-pass row in ftmp2/ftmp3 with the row in ftmp8/ftmp9.
 * Per 16-bit lane:
 *   ftmp2 = (ftmp2 * filter_y0 + ff_ph_40 + ftmp8 * filter_y1) >> ftmp14
 *   ftmp3 = (ftmp3 * filter_y0 + ff_ph_40 + ftmp9 * filter_y1) >> ftmp14
 * Both halves are ANDed with `mask`, packed into 8 unsigned bytes and written
 * to temp2_ptr[0..7] with an unaligned left/right store pair.
 *
 * Expects: ftmp14 = shift count; `mask` is bound by the enclosing asm
 * statement (not visible here).
 * Overwrites ftmp2 and ftmp3 (ftmp8/ftmp9 are preserved). Clobbers: ftmp1. */
#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \
  /* calculate: temp2[0] ~ temp2[3] */ \
  "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \
  "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \
  "pmullh %[ftmp1], %[ftmp8], %[filter_y1] \n\t" \
  "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \
  "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \
  \
  /* calculate: temp2[4] ~ temp2[7] */ \
  "pmullh %[ftmp3], %[ftmp3], %[filter_y0] \n\t" \
  "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \
  "pmullh %[ftmp1], %[ftmp9], %[filter_y1] \n\t" \
  "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \
  "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \
  \
  /* store: temp2[0] ~ temp2[7] */ \
  "and %[ftmp2], %[ftmp2], %[mask] \n\t" \
  "and %[ftmp3], %[ftmp3], %[mask] \n\t" \
  "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
  "gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \
  "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t"
/* Vertical (second-pass) bilinear filter for 8 pixels, variant B.
 * Mirror image of the _A variant with the register roles swapped.
 * Per 16-bit lane:
 *   ftmp8 = (ftmp8 * filter_y0 + ff_ph_40 + ftmp2 * filter_y1) >> ftmp14
 *   ftmp9 = (ftmp9 * filter_y0 + ff_ph_40 + ftmp3 * filter_y1) >> ftmp14
 * Both halves are ANDed with `mask`, packed into 8 unsigned bytes and written
 * to temp2_ptr[0..7] with an unaligned left/right store pair.
 *
 * Expects: ftmp14 = shift count.
 * Overwrites ftmp8 and ftmp9 (ftmp2/ftmp3 are preserved). Clobbers: ftmp1. */
#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \
  /* calculate: temp2[0] ~ temp2[3] */ \
  "pmullh %[ftmp8], %[ftmp8], %[filter_y0] \n\t" \
  "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \
  "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \
  "paddh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" \
  "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \
  \
  /* calculate: temp2[4] ~ temp2[7] */ \
  "pmullh %[ftmp9], %[ftmp9], %[filter_y0] \n\t" \
  "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \
  "pmullh %[ftmp1], %[ftmp3], %[filter_y1] \n\t" \
  "paddh %[ftmp9], %[ftmp9], %[ftmp1] \n\t" \
  "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \
  \
  /* store: temp2[0] ~ temp2[7] */ \
  "and %[ftmp8], %[ftmp8], %[mask] \n\t" \
  "and %[ftmp9], %[ftmp9], %[mask] \n\t" \
  "packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
  "gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \
  "gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t"
/* Horizontal (first-pass) bilinear filter for 16 pixels, variant A.
 * Runs VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A for pixels 0..7, then repeats
 * the same computation for pixels 8..15.
 *
 * Expects: ftmp0 = 0, ftmp14 = shift count.
 * Result: ftmp2/ftmp3 = pixels 0..7, ftmp4/ftmp5 = pixels 8..15 (16-bit
 * lanes). ftmp4/ftmp5 are scratch inside the 8_A part and only hold results
 * once the second half has run.
 * Clobbers: ftmp1, ftmp6, ftmp7.
 * The loads together read src_ptr[0..16]. */
#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A \
  /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \
  VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \
  \
  /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \
  "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \
  "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
  "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \
  "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
  "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \
  "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \
  "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \
  "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \
  "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \
  "pmullh %[ftmp5], %[ftmp5], %[filter_x0] \n\t" \
  "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
  "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \
  "pmullh %[ftmp6], %[ftmp6], %[filter_x1] \n\t" \
  "pmullh %[ftmp7], %[ftmp7], %[filter_x1] \n\t" \
  "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \
  "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
  "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \
  "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t"
/* Horizontal (first-pass) bilinear filter for 16 pixels, variant B.
 * Runs VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B for pixels 0..7, then repeats
 * the same computation for pixels 8..15.
 *
 * Expects: ftmp0 = 0, ftmp14 = shift count.
 * Result: ftmp8/ftmp9 = pixels 0..7, ftmp10/ftmp11 = pixels 8..15 (16-bit
 * lanes). ftmp10/ftmp11 are scratch inside the 8_B part and only hold results
 * once the second half has run.
 * Clobbers: ftmp1, ftmp12, ftmp13.
 * The loads together read src_ptr[0..16]. */
#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B \
  /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \
  VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \
  \
  /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \
  "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \
  "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \
  "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \
  "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \
  "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \
  "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \
  "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \
  "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \
  "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \
  "pmullh %[ftmp11], %[ftmp11], %[filter_x0] \n\t" \
  "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \
  "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \
  "pmullh %[ftmp12], %[ftmp12], %[filter_x1] \n\t" \
  "pmullh %[ftmp13], %[ftmp13], %[filter_x1] \n\t" \
  "paddh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" \
  "paddh %[ftmp11], %[ftmp11], %[ftmp13] \n\t" \
  "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \
  "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t"
/* Vertical (second-pass) bilinear filter for 16 pixels, variant A.
 * Runs VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A for pixels 0..7, then applies
 * the same formula to pixels 8..15:
 *   ftmp4 = (ftmp4 * filter_y0 + ff_ph_40 + ftmp10 * filter_y1) >> ftmp14
 *   ftmp5 = (ftmp5 * filter_y0 + ff_ph_40 + ftmp11 * filter_y1) >> ftmp14
 * and stores the packed bytes at temp2_ptr[8..15].
 *
 * Expects: ftmp14 = shift count; rows laid out as produced by
 * VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A (ftmp2..ftmp5) and
 * VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B (ftmp8..ftmp11).
 * Overwrites ftmp2..ftmp5 (ftmp8..ftmp11 are preserved). Clobbers: ftmp1. */
#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A \
  VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \
  \
  /* calculate: temp2[8] ~ temp2[11] */ \
  "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \
  "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \
  "pmullh %[ftmp1], %[ftmp10], %[filter_y1] \n\t" \
  "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
  "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \
  \
  /* calculate: temp2[12] ~ temp2[15] */ \
  "pmullh %[ftmp5], %[ftmp5], %[filter_y0] \n\t" \
  "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \
  "pmullh %[ftmp1], %[ftmp11], %[filter_y1] \n\t" \
  "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
  "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \
  \
  /* store: temp2[8] ~ temp2[15] */ \
  "and %[ftmp4], %[ftmp4], %[mask] \n\t" \
  "and %[ftmp5], %[ftmp5], %[mask] \n\t" \
  "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
  "gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \
  "gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t"
/* Vertical (second-pass) bilinear filter for 16 pixels, variant B.
 * Runs VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B for pixels 0..7, then applies
 * the same formula to pixels 8..15 with the register roles swapped:
 *   ftmp10 = (ftmp10 * filter_y0 + ff_ph_40 + ftmp4 * filter_y1) >> ftmp14
 *   ftmp11 = (ftmp11 * filter_y0 + ff_ph_40 + ftmp5 * filter_y1) >> ftmp14
 * and stores the packed bytes at temp2_ptr[8..15].
 *
 * Expects: ftmp14 = shift count; rows laid out as produced by
 * VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B (ftmp8..ftmp11) and
 * VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A (ftmp2..ftmp5).
 * Overwrites ftmp8..ftmp11 (ftmp2..ftmp5 are preserved). Clobbers: ftmp1. */
#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B \
  VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \
  \
  /* calculate: temp2[8] ~ temp2[11] */ \
  "pmullh %[ftmp10], %[ftmp10], %[filter_y0] \n\t" \
  "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \
  "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \
  "paddh %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \
  "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \
  \
  /* calculate: temp2[12] ~ temp2[15] */ \
  "pmullh %[ftmp11], %[ftmp11], %[filter_y0] \n\t" \
  "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \
  "pmullh %[ftmp1], %[ftmp5], %[filter_y1] \n\t" \
  "paddh %[ftmp11], %[ftmp11], %[ftmp1] \n\t" \
  "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \
  \
  /* store: temp2[8] ~ temp2[15] */ \
  "and %[ftmp10], %[ftmp10], %[mask] \n\t" \
  "and %[ftmp11], %[ftmp11], %[mask] \n\t" \
  "packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \
  "gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \
  "gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t"
  332. // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
  333. // or vertical direction to produce the filtered output block. Used to implement
  334. // the first-pass of 2-D separable filter.
  335. //
  336. // Produces int16_t output to retain precision for the next pass. Two filter
  337. // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
  338. // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
  339. // It defines the offset required to move from one input to the next.
  340. static void var_filter_block2d_bil_first_pass(
  341. const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line,
  342. int pixel_step, unsigned int output_height, unsigned int output_width,
  343. const uint8_t *filter) {
  344. unsigned int i, j;
  345. for (i = 0; i < output_height; ++i) {
  346. for (j = 0; j < output_width; ++j) {
  347. ref_ptr[j] = ROUND_POWER_OF_TWO(
  348. (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
  349. FILTER_BITS);
  350. ++src_ptr;
  351. }
  352. src_ptr += src_pixels_per_line - output_width;
  353. ref_ptr += output_width;
  354. }
  355. }
  356. // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
  357. // or vertical direction to produce the filtered output block. Used to implement
  358. // the second-pass of 2-D separable filter.
  359. //
  360. // Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
  361. // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
  362. // filter is applied horizontally (pixel_step = 1) or vertically
  363. // (pixel_step = stride). It defines the offset required to move from one input
  364. // to the next. Output is 8-bit.
  365. static void var_filter_block2d_bil_second_pass(
  366. const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line,
  367. unsigned int pixel_step, unsigned int output_height,
  368. unsigned int output_width, const uint8_t *filter) {
  369. unsigned int i, j;
  370. for (i = 0; i < output_height; ++i) {
  371. for (j = 0; j < output_width; ++j) {
  372. ref_ptr[j] = ROUND_POWER_OF_TWO(
  373. (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
  374. FILTER_BITS);
  375. ++src_ptr;
  376. }
  377. src_ptr += src_pixels_per_line - output_width;
  378. ref_ptr += output_width;
  379. }
  380. }
/* Variance of a 64-pixel-wide block that is `high` rows tall.
 *
 * src_ptr/src_stride and ref_ptr/ref_stride describe the two blocks being
 * compared. On return *sse holds the sum of squared differences, and the
 * return value is *sse - sum * sum / (64 * high), where sum is the signed sum
 * of (src - ref) over the block.
 *
 * The row loop tests its counter after the body, so `high` must be at least 1.
 * The sum is kept in 32-bit lanes (VARIANCE_SSE_SUM_8_FOR_W64) because 16-bit
 * lanes would overflow at this width.
 *
 * NOTE(review): the row counter is loaded from &high with MMI_L, which is
 * defined in asmdefs_mmi.h (not visible here), into a uint32_t operand;
 * confirm the load width matches the 4-byte `int` on every target. */
static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       uint32_t *sse, int high) {
  int sum;
  double ftmp[12];
  uint32_t tmp[3];
  *sse = 0;
  __asm__ volatile (
    /* ftmp11 = 32: shift count for the final horizontal add. */
    "li %[tmp0], 0x20 \n\t"
    "mtc1 %[tmp0], %[ftmp11] \n\t"
    /* tmp0 = number of rows left. */
    MMI_L(%[tmp0], %[high], 0x00)
    /* ftmp0 = 0 (unpack helper), ftmp9 = sum, ftmp10 = sse accumulator. */
    "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
    "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
    /* One iteration per row: eight groups of 8 pixels. */
    "1: \n\t"
    "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64
    "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64
    "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64
    "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64
    "gsldlc1 %[ftmp1], 0x27(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x20(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x27(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x20(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64
    "gsldlc1 %[ftmp1], 0x2f(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x28(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x2f(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x28(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64
    "gsldlc1 %[ftmp1], 0x37(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x30(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x37(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x30(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64
    "gsldlc1 %[ftmp1], 0x3f(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x38(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x3f(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x38(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64
    /* Next row. */
    "addiu %[tmp0], %[tmp0], -0x01 \n\t"
    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
    MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
    "bnez %[tmp0], 1b \n\t"
    /* sum = low 32-bit lane of ftmp9 + high 32-bit lane of ftmp9. */
    "mfc1 %[tmp1], %[ftmp9] \n\t"
    "mfhc1 %[tmp2], %[ftmp9] \n\t"
    "addu %[sum], %[tmp1], %[tmp2] \n\t"
    /* Fold the two sse lanes together and store the low word to *sse. */
    "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t"
    "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
    "swc1 %[ftmp1], 0x00(%[sse]) \n\t"
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
      [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
      [tmp2]"=&r"(tmp[2]),
      [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr),
      [sum]"=&r"(sum)
    : [src_stride]"r"((mips_reg)src_stride),
      [ref_stride]"r"((mips_reg)ref_stride),
      [high]"r"(&high), [sse]"r"(sse)
    : "memory"
  );
  /* 64 * high is the pixel count of the block. */
  return *sse - (((int64_t)sum * sum) / (64 * high));
}
/* Defines vpx_variance64x<n>_mmi(), which forwards to vpx_variance64x() with
 * the block height fixed to n. */
#define VPX_VARIANCE64XN(n) \
  uint32_t vpx_variance64x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
                                    const uint8_t *ref_ptr, int ref_stride, \
                                    uint32_t *sse) { \
    return vpx_variance64x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
  }
/* Instantiated for heights 64 and 32. */
VPX_VARIANCE64XN(64)
VPX_VARIANCE64XN(32)
/* Variance of a 32x64 block.
 *
 * On return *sse holds the sum of squared differences between the source and
 * reference blocks, and the return value is *sse - sum * sum / 2048, where
 * sum is the signed sum of (src - ref) and 2048 = 32 * 64 pixels.
 *
 * Uses VARIANCE_SSE_SUM_8_FOR_W64 so the sum is kept in 32-bit lanes; the
 * 16-bit lanes of VARIANCE_SSE_SUM_8 would overflow over 64 rows. */
uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *ref_ptr, int ref_stride,
                               uint32_t *sse) {
  int sum;
  double ftmp[12];
  uint32_t tmp[3];
  *sse = 0;
  __asm__ volatile (
    /* ftmp11 = 32: shift count for the final horizontal add. */
    "li %[tmp0], 0x20 \n\t"
    "mtc1 %[tmp0], %[ftmp11] \n\t"
    /* tmp0 = 64 rows left. */
    "li %[tmp0], 0x40 \n\t"
    /* ftmp0 = 0 (unpack helper), ftmp9 = sum, ftmp10 = sse accumulator. */
    "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
    "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
    /* One iteration per row: four groups of 8 pixels. */
    "1: \n\t"
    "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64
    "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64
    "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64
    "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8_FOR_W64
    /* Next row. */
    "addiu %[tmp0], %[tmp0], -0x01 \n\t"
    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
    MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
    "bnez %[tmp0], 1b \n\t"
    /* sum = low 32-bit lane of ftmp9 + high 32-bit lane of ftmp9. */
    "mfc1 %[tmp1], %[ftmp9] \n\t"
    "mfhc1 %[tmp2], %[ftmp9] \n\t"
    "addu %[sum], %[tmp1], %[tmp2] \n\t"
    /* Fold the two sse lanes together and store the low word to *sse. */
    "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t"
    "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t"
    "swc1 %[ftmp1], 0x00(%[sse]) \n\t"
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
      [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
      [tmp2]"=&r"(tmp[2]),
      [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr),
      [sum]"=&r"(sum)
    : [src_stride]"r"((mips_reg)src_stride),
      [ref_stride]"r"((mips_reg)ref_stride),
      [sse]"r"(sse)
    : "memory"
  );
  /* 2048 = 32 * 64 pixels. */
  return *sse - (((int64_t)sum * sum) / 2048);
}
/* Variance of a 32-pixel-wide block that is `high` rows tall.
 *
 * On return *sse holds the sum of squared differences, and the return value
 * is *sse - sum * sum / (32 * high), where sum is the signed sum of
 * (src - ref) over the block.
 *
 * Source and reference pixel sums are accumulated separately in 16-bit lanes
 * (VARIANCE_SSE_SUM_8). Each lane receives 8 bytes per row, so for the
 * heights instantiated in this file (at most 32 rows) a lane holds at most
 * 8 * 32 * 255 = 65280, which still fits in an unsigned 16-bit lane.
 * The row loop tests its counter after the body, so `high` must be at least 1.
 *
 * NOTE(review): the row counter is loaded from &high with MMI_L, which is
 * defined in asmdefs_mmi.h (not visible here), into a uint32_t operand;
 * confirm the load width matches the 4-byte `int` on every target. */
static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       uint32_t *sse, int high) {
  int sum;
  double ftmp[13];
  uint32_t tmp[3];
  *sse = 0;
  __asm__ volatile (
    /* ftmp11 = 32: shift count for the final horizontal adds. */
    "li %[tmp0], 0x20 \n\t"
    "mtc1 %[tmp0], %[ftmp11] \n\t"
    /* tmp0 = number of rows left. */
    MMI_L(%[tmp0], %[high], 0x00)
    /* ftmp0 = 0, ftmp8 = sse, ftmp10 = source sum, ftmp12 = reference sum. */
    "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
    "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
    "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
    /* One iteration per row: four groups of 8 pixels. */
    "1: \n\t"
    "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8
    "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8
    "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8
    "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8
    /* Next row. */
    "addiu %[tmp0], %[tmp0], -0x01 \n\t"
    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
    MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
    "bnez %[tmp0], 1b \n\t"
    /* Fold the two sse lanes together and store the low word to *sse. */
    "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
    "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
    "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
    /* Zero-extend the 16-bit sums to 32 bits, form source - reference, */
    /* fold the two lanes together and store the low word to sum. */
    "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
    "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
    "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
    "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
    "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
    "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
    "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
    "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
    "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
    "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
      [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
      [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
    : [src_stride]"r"((mips_reg)src_stride),
      [ref_stride]"r"((mips_reg)ref_stride),
      [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
    : "memory"
  );
  /* 32 * high is the pixel count of the block. */
  return *sse - (((int64_t)sum * sum) / (32 * high));
}
/* Defines vpx_variance32x<n>_mmi(), which forwards to vpx_variance32x() with
 * the block height fixed to n. */
#define VPX_VARIANCE32XN(n) \
  uint32_t vpx_variance32x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
                                    const uint8_t *ref_ptr, int ref_stride, \
                                    uint32_t *sse) { \
    return vpx_variance32x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
  }
/* Instantiated for heights 32 and 16. */
VPX_VARIANCE32XN(32)
VPX_VARIANCE32XN(16)
/* Variance of a 16-pixel-wide block that is `high` rows tall.
 *
 * On return *sse holds the sum of squared differences, and the return value
 * is *sse - sum * sum / (16 * high), where sum is the signed sum of
 * (src - ref) over the block.
 *
 * Source and reference pixel sums are accumulated separately in 16-bit lanes
 * (VARIANCE_SSE_SUM_8); each lane receives 4 bytes per row.
 * The row loop tests its counter after the body, so `high` must be at least 1.
 *
 * NOTE(review): the row counter is loaded from &high with MMI_L, which is
 * defined in asmdefs_mmi.h (not visible here), into a uint32_t operand;
 * confirm the load width matches the 4-byte `int` on every target. */
static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       uint32_t *sse, int high) {
  int sum;
  double ftmp[13];
  uint32_t tmp[3];
  *sse = 0;
  __asm__ volatile (
    /* ftmp11 = 32: shift count for the final horizontal adds. */
    "li %[tmp0], 0x20 \n\t"
    "mtc1 %[tmp0], %[ftmp11] \n\t"
    /* tmp0 = number of rows left. */
    MMI_L(%[tmp0], %[high], 0x00)
    /* ftmp0 = 0, ftmp8 = sse, ftmp10 = source sum, ftmp12 = reference sum. */
    "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
    "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
    "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
    /* One iteration per row: two groups of 8 pixels. */
    "1: \n\t"
    "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8
    "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8
    /* Next row. */
    "addiu %[tmp0], %[tmp0], -0x01 \n\t"
    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
    MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
    "bnez %[tmp0], 1b \n\t"
    /* Fold the two sse lanes together and store the low word to *sse. */
    "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
    "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
    "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
    /* Zero-extend the 16-bit sums to 32 bits, form source - reference, */
    /* fold the two lanes together and store the low word to sum. */
    "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
    "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
    "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
    "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
    "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
    "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
    "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
    "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
    "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
    "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
      [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
      [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
    : [src_stride]"r"((mips_reg)src_stride),
      [ref_stride]"r"((mips_reg)ref_stride),
      [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
    : "memory"
  );
  /* 16 * high is the pixel count of the block. */
  return *sse - (((int64_t)sum * sum) / (16 * high));
}
/* Defines vpx_variance16x<n>_mmi(), which forwards to vpx_variance16x() with
 * the block height fixed to n. */
#define VPX_VARIANCE16XN(n) \
  uint32_t vpx_variance16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
                                    const uint8_t *ref_ptr, int ref_stride, \
                                    uint32_t *sse) { \
    return vpx_variance16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
  }
/* Instantiated for heights 32, 16 and 8. */
VPX_VARIANCE16XN(32)
VPX_VARIANCE16XN(16)
VPX_VARIANCE16XN(8)
/* Variance of an 8-pixel-wide block that is `high` rows tall.
 *
 * On return *sse holds the sum of squared differences, and the return value
 * is *sse - sum * sum / (8 * high), where sum is the signed sum of
 * (src - ref) over the block.
 *
 * Source and reference pixel sums are accumulated separately in 16-bit lanes
 * (VARIANCE_SSE_SUM_8); each lane receives 2 bytes per row.
 * The row loop tests its counter after the body, so `high` must be at least 1.
 *
 * NOTE(review): the row counter is loaded from &high with MMI_L, which is
 * defined in asmdefs_mmi.h (not visible here), into a uint32_t operand;
 * confirm the load width matches the 4-byte `int` on every target. */
static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *ref_ptr, int ref_stride,
                                      uint32_t *sse, int high) {
  int sum;
  double ftmp[13];
  uint32_t tmp[3];
  *sse = 0;
  __asm__ volatile (
    /* ftmp11 = 32: shift count for the final horizontal adds. */
    "li %[tmp0], 0x20 \n\t"
    "mtc1 %[tmp0], %[ftmp11] \n\t"
    /* tmp0 = number of rows left. */
    MMI_L(%[tmp0], %[high], 0x00)
    /* ftmp0 = 0, ftmp8 = sse, ftmp10 = source sum, ftmp12 = reference sum. */
    "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
    "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
    "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
    /* One iteration per row: a single group of 8 pixels. */
    "1: \n\t"
    "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
    "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
    "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
    "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
    VARIANCE_SSE_SUM_8
    /* Next row. */
    "addiu %[tmp0], %[tmp0], -0x01 \n\t"
    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
    MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
    "bnez %[tmp0], 1b \n\t"
    /* Fold the two sse lanes together and store the low word to *sse. */
    "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
    "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
    "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
    /* Zero-extend the 16-bit sums to 32 bits, form source - reference, */
    /* fold the two lanes together and store the low word to sum. */
    "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t"
    "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t"
    "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t"
    "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t"
    "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
    "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
    "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
    "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t"
    "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
    "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
      [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
      [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
    : [src_stride]"r"((mips_reg)src_stride),
      [ref_stride]"r"((mips_reg)ref_stride),
      [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
    : "memory"
  );
  /* 8 * high is the pixel count of the block. */
  return *sse - (((int64_t)sum * sum) / (8 * high));
}
  729. #define VPX_VARIANCE8XN(n) \
  730. uint32_t vpx_variance8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
  731. const uint8_t *ref_ptr, int ref_stride, \
  732. uint32_t *sse) { \
  733. return vpx_variance8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
  734. }
  735. VPX_VARIANCE8XN(16)
  736. VPX_VARIANCE8XN(8)
  737. VPX_VARIANCE8XN(4)
  738. static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
  739. const uint8_t *ref_ptr, int ref_stride,
  740. uint32_t *sse, int high) {
  741. int sum;
  742. double ftmp[12];
  743. uint32_t tmp[3];
  744. *sse = 0;
  745. __asm__ volatile (
  746. "li %[tmp0], 0x20 \n\t"
  747. "mtc1 %[tmp0], %[ftmp10] \n\t"
  748. MMI_L(%[tmp0], %[high], 0x00)
  749. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  750. "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t"
  751. "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t"
  752. "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
  753. "1: \n\t"
  754. "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t"
  755. "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t"
  756. "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t"
  757. "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t"
  758. VARIANCE_SSE_SUM_4
  759. "addiu %[tmp0], %[tmp0], -0x01 \n\t"
  760. MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
  761. MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
  762. "bnez %[tmp0], 1b \n\t"
  763. "dsrl %[ftmp9], %[ftmp6], %[ftmp10] \n\t"
  764. "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t"
  765. "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
  766. "punpcklhw %[ftmp3], %[ftmp7], %[ftmp0] \n\t"
  767. "punpckhhw %[ftmp4], %[ftmp7], %[ftmp0] \n\t"
  768. "punpcklhw %[ftmp5], %[ftmp8], %[ftmp0] \n\t"
  769. "punpckhhw %[ftmp6], %[ftmp8], %[ftmp0] \n\t"
  770. "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t"
  771. "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
  772. "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t"
  773. "dsrl %[ftmp0], %[ftmp3], %[ftmp10] \n\t"
  774. "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
  775. "swc1 %[ftmp0], 0x00(%[sum]) \n\t"
  776. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  777. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  778. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  779. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  780. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  781. [ftmp10]"=&f"(ftmp[10]),
  782. [tmp0]"=&r"(tmp[0]),
  783. [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
  784. : [src_stride]"r"((mips_reg)src_stride),
  785. [ref_stride]"r"((mips_reg)ref_stride),
  786. [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
  787. : "memory"
  788. );
  789. return *sse - (((int64_t)sum * sum) / (4 * high));
  790. }
  791. #define VPX_VARIANCE4XN(n) \
  792. uint32_t vpx_variance4x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
  793. const uint8_t *ref_ptr, int ref_stride, \
  794. uint32_t *sse) { \
  795. return vpx_variance4x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
  796. }
  797. VPX_VARIANCE4XN(8)
  798. VPX_VARIANCE4XN(4)
  799. static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
  800. const uint8_t *ref_ptr, int ref_stride,
  801. uint32_t *sse, uint64_t high) {
  802. double ftmp[12];
  803. uint32_t tmp[1];
  804. *sse = 0;
  805. __asm__ volatile (
  806. "li %[tmp0], 0x20 \n\t"
  807. "mtc1 %[tmp0], %[ftmp11] \n\t"
  808. MMI_L(%[tmp0], %[high], 0x00)
  809. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  810. "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
  811. "1: \n\t"
  812. VARIANCE_SSE_16
  813. "addiu %[tmp0], %[tmp0], -0x01 \n\t"
  814. MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
  815. MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
  816. "bnez %[tmp0], 1b \n\t"
  817. "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
  818. "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
  819. "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
  820. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  821. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  822. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  823. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  824. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  825. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  826. [tmp0]"=&r"(tmp[0]),
  827. [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
  828. : [src_stride]"r"((mips_reg)src_stride),
  829. [ref_stride]"r"((mips_reg)ref_stride),
  830. [high]"r"(&high), [sse]"r"(sse)
  831. : "memory"
  832. );
  833. return *sse;
  834. }
  835. #define vpx_mse16xN(n) \
  836. uint32_t vpx_mse16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
  837. const uint8_t *ref_ptr, int ref_stride, \
  838. uint32_t *sse) { \
  839. return vpx_mse16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
  840. }
  841. vpx_mse16xN(16);
  842. vpx_mse16xN(8);
  843. static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
  844. const uint8_t *ref_ptr, int ref_stride,
  845. uint32_t *sse, uint64_t high) {
  846. double ftmp[12];
  847. uint32_t tmp[1];
  848. *sse = 0;
  849. __asm__ volatile (
  850. "li %[tmp0], 0x20 \n\t"
  851. "mtc1 %[tmp0], %[ftmp11] \n\t"
  852. MMI_L(%[tmp0], %[high], 0x00)
  853. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  854. "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t"
  855. "1: \n\t"
  856. VARIANCE_SSE_8
  857. "addiu %[tmp0], %[tmp0], -0x01 \n\t"
  858. MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
  859. MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
  860. "bnez %[tmp0], 1b \n\t"
  861. "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t"
  862. "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t"
  863. "swc1 %[ftmp9], 0x00(%[sse]) \n\t"
  864. : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
  865. [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
  866. [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
  867. [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
  868. [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
  869. [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
  870. [tmp0]"=&r"(tmp[0]),
  871. [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr)
  872. : [src_stride]"r"((mips_reg)src_stride),
  873. [ref_stride]"r"((mips_reg)ref_stride),
  874. [high]"r"(&high), [sse]"r"(sse)
  875. : "memory"
  876. );
  877. return *sse;
  878. }
  879. #define vpx_mse8xN(n) \
  880. uint32_t vpx_mse8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \
  881. const uint8_t *ref_ptr, int ref_stride, \
  882. uint32_t *sse) { \
  883. return vpx_mse8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
  884. }
  885. vpx_mse8xN(16);
  886. vpx_mse8xN(8);
  887. #define SUBPIX_VAR(W, H) \
  888. uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \
  889. const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
  890. const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
  891. uint16_t fdata3[((H) + 1) * (W)]; \
  892. uint8_t temp2[(H) * (W)]; \
  893. \
  894. var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \
  895. W, bilinear_filters[x_offset]); \
  896. var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
  897. bilinear_filters[y_offset]); \
  898. \
  899. return vpx_variance##W##x##H##_mmi(temp2, W, ref_ptr, ref_stride, sse); \
  900. }
  901. SUBPIX_VAR(64, 64)
  902. SUBPIX_VAR(64, 32)
  903. SUBPIX_VAR(32, 64)
  904. SUBPIX_VAR(32, 32)
  905. SUBPIX_VAR(32, 16)
  906. SUBPIX_VAR(16, 32)
  907. static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr,
  908. int src_stride, int x_offset,
  909. int y_offset, uint8_t *temp2,
  910. int counter) {
  911. uint8_t *temp2_ptr = temp2;
  912. mips_reg l_counter = counter;
  913. double ftmp[15];
  914. mips_reg tmp[2];
  915. DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
  916. DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
  917. const uint8_t *filter_x = bilinear_filters[x_offset];
  918. const uint8_t *filter_y = bilinear_filters[y_offset];
  919. __asm__ volatile (
  920. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  921. MMI_LI(%[tmp0], 0x07)
  922. MMI_MTC1(%[tmp0], %[ftmp14])
  923. "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
  924. "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
  925. "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
  926. "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
  927. // fdata3: fdata3[0] ~ fdata3[15]
  928. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
  929. // fdata3 +src_stride*1: fdata3[0] ~ fdata3[15]
  930. MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
  931. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
  932. // temp2: temp2[0] ~ temp2[15]
  933. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
  934. // fdata3 +src_stride*2: fdata3[0] ~ fdata3[15]
  935. MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
  936. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
  937. // temp2+16*1: temp2[0] ~ temp2[15]
  938. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
  939. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
  940. "1: \n\t"
  941. MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
  942. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
  943. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
  944. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
  945. MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
  946. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
  947. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
  948. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
  949. "addiu %[counter], %[counter], -0x01 \n\t"
  950. "bnez %[counter], 1b \n\t"
  951. : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
  952. [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
  953. [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
  954. [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
  955. [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
  956. [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
  957. [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
  958. [counter]"+&r"(l_counter)
  959. : [filter_x0] "f"((uint64_t)filter_x[0]),
  960. [filter_x1] "f"((uint64_t)filter_x[1]),
  961. [filter_y0] "f"((uint64_t)filter_y[0]),
  962. [filter_y1] "f"((uint64_t)filter_y[1]),
  963. [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
  964. [mask] "f"(mask)
  965. : "memory"
  966. );
  967. }
  968. #define SUBPIX_VAR16XN(H) \
  969. uint32_t vpx_sub_pixel_variance16x##H##_mmi( \
  970. const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
  971. const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
  972. uint8_t temp2[16 * (H)]; \
  973. var_filter_block2d_bil_16x(src_ptr, src_stride, x_offset, y_offset, temp2, \
  974. ((H)-2) / 2); \
  975. \
  976. return vpx_variance16x##H##_mmi(temp2, 16, ref_ptr, ref_stride, sse); \
  977. }
  978. SUBPIX_VAR16XN(16)
  979. SUBPIX_VAR16XN(8)
  980. static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr,
  981. int src_stride, int x_offset,
  982. int y_offset, uint8_t *temp2,
  983. int counter) {
  984. uint8_t *temp2_ptr = temp2;
  985. mips_reg l_counter = counter;
  986. double ftmp[15];
  987. mips_reg tmp[2];
  988. DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
  989. DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
  990. const uint8_t *filter_x = bilinear_filters[x_offset];
  991. const uint8_t *filter_y = bilinear_filters[y_offset];
  992. __asm__ volatile (
  993. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  994. MMI_LI(%[tmp0], 0x07)
  995. MMI_MTC1(%[tmp0], %[ftmp14])
  996. "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
  997. "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
  998. "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
  999. "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
  1000. // fdata3: fdata3[0] ~ fdata3[7]
  1001. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
  1002. // fdata3 +src_stride*1: fdata3[0] ~ fdata3[7]
  1003. MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
  1004. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
  1005. // temp2: temp2[0] ~ temp2[7]
  1006. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
  1007. // fdata3 +src_stride*2: fdata3[0] ~ fdata3[7]
  1008. MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
  1009. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
  1010. // temp2+8*1: temp2[0] ~ temp2[7]
  1011. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
  1012. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
  1013. "1: \n\t"
  1014. MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
  1015. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
  1016. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
  1017. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
  1018. MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
  1019. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
  1020. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
  1021. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
  1022. "addiu %[counter], %[counter], -0x01 \n\t"
  1023. "bnez %[counter], 1b \n\t"
  1024. : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
  1025. [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
  1026. [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
  1027. [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
  1028. [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
  1029. [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
  1030. [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
  1031. [counter]"+&r"(l_counter)
  1032. : [filter_x0] "f"((uint64_t)filter_x[0]),
  1033. [filter_x1] "f"((uint64_t)filter_x[1]),
  1034. [filter_y0] "f"((uint64_t)filter_y[0]),
  1035. [filter_y1] "f"((uint64_t)filter_y[1]),
  1036. [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
  1037. [mask] "f"(mask)
  1038. : "memory"
  1039. );
  1040. }
  1041. #define SUBPIX_VAR8XN(H) \
  1042. uint32_t vpx_sub_pixel_variance8x##H##_mmi( \
  1043. const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
  1044. const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
  1045. uint8_t temp2[8 * (H)]; \
  1046. var_filter_block2d_bil_8x(src_ptr, src_stride, x_offset, y_offset, temp2, \
  1047. ((H)-2) / 2); \
  1048. \
  1049. return vpx_variance8x##H##_mmi(temp2, 8, ref_ptr, ref_stride, sse); \
  1050. }
  1051. SUBPIX_VAR8XN(16)
  1052. SUBPIX_VAR8XN(8)
  1053. SUBPIX_VAR8XN(4)
  1054. static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr,
  1055. int src_stride, int x_offset,
  1056. int y_offset, uint8_t *temp2,
  1057. int counter) {
  1058. uint8_t *temp2_ptr = temp2;
  1059. mips_reg l_counter = counter;
  1060. double ftmp[7];
  1061. mips_reg tmp[2];
  1062. DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
  1063. DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL };
  1064. const uint8_t *filter_x = bilinear_filters[x_offset];
  1065. const uint8_t *filter_y = bilinear_filters[y_offset];
  1066. __asm__ volatile (
  1067. "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
  1068. MMI_LI(%[tmp0], 0x07)
  1069. MMI_MTC1(%[tmp0], %[ftmp6])
  1070. "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t"
  1071. "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t"
  1072. "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t"
  1073. "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t"
  1074. // fdata3: fdata3[0] ~ fdata3[3]
  1075. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
  1076. // fdata3 +src_stride*1: fdata3[0] ~ fdata3[3]
  1077. MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
  1078. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
  1079. // temp2: temp2[0] ~ temp2[7]
  1080. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
  1081. // fdata3 +src_stride*2: fdata3[0] ~ fdata3[3]
  1082. MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
  1083. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
  1084. // temp2+4*1: temp2[0] ~ temp2[7]
  1085. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
  1086. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
  1087. "1: \n\t"
  1088. MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
  1089. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
  1090. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
  1091. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
  1092. MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
  1093. VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
  1094. MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
  1095. VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
  1096. "addiu %[counter], %[counter], -0x01 \n\t"
  1097. "bnez %[counter], 1b \n\t"
  1098. : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
  1099. [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
  1100. [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr),
  1101. [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter)
  1102. : [filter_x0] "f"((uint64_t)filter_x[0]),
  1103. [filter_x1] "f"((uint64_t)filter_x[1]),
  1104. [filter_y0] "f"((uint64_t)filter_y[0]),
  1105. [filter_y1] "f"((uint64_t)filter_y[1]),
  1106. [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40),
  1107. [mask] "f"(mask)
  1108. : "memory"
  1109. );
  1110. }
  1111. #define SUBPIX_VAR4XN(H) \
  1112. uint32_t vpx_sub_pixel_variance4x##H##_mmi( \
  1113. const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
  1114. const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
  1115. uint8_t temp2[4 * (H)]; \
  1116. var_filter_block2d_bil_4x(src_ptr, src_stride, x_offset, y_offset, temp2, \
  1117. ((H)-2) / 2); \
  1118. \
  1119. return vpx_variance4x##H##_mmi(temp2, 4, ref_ptr, ref_stride, sse); \
  1120. }
  1121. SUBPIX_VAR4XN(8)
  1122. SUBPIX_VAR4XN(4)
  1123. #define SUBPIX_AVG_VAR(W, H) \
  1124. uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \
  1125. const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
  1126. const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
  1127. const uint8_t *second_pred) { \
  1128. uint16_t fdata3[((H) + 1) * (W)]; \
  1129. uint8_t temp2[(H) * (W)]; \
  1130. DECLARE_ALIGNED(16, uint8_t, temp3[(H) * (W)]); \
  1131. \
  1132. var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \
  1133. W, bilinear_filters[x_offset]); \
  1134. var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
  1135. bilinear_filters[y_offset]); \
  1136. \
  1137. vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \
  1138. \
  1139. return vpx_variance##W##x##H##_mmi(temp3, W, ref_ptr, ref_stride, sse); \
  1140. }
  1141. SUBPIX_AVG_VAR(64, 64)
  1142. SUBPIX_AVG_VAR(64, 32)
  1143. SUBPIX_AVG_VAR(32, 64)
  1144. SUBPIX_AVG_VAR(32, 32)
  1145. SUBPIX_AVG_VAR(32, 16)
  1146. SUBPIX_AVG_VAR(16, 32)
  1147. SUBPIX_AVG_VAR(16, 16)
  1148. SUBPIX_AVG_VAR(16, 8)
  1149. SUBPIX_AVG_VAR(8, 16)
  1150. SUBPIX_AVG_VAR(8, 8)
  1151. SUBPIX_AVG_VAR(8, 4)
  1152. SUBPIX_AVG_VAR(4, 8)
  1153. SUBPIX_AVG_VAR(4, 4)