;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
  10. %include "third_party/x86inc/x86inc.asm"
  11. SECTION .text
  12. ; void vpx_subtract_block(int rows, int cols,
  13. ; int16_t *diff, ptrdiff_t diff_stride,
  14. ; const uint8_t *src, ptrdiff_t src_stride,
  15. ; const uint8_t *pred, ptrdiff_t pred_stride)
  16. INIT_XMM sse2
  17. cglobal subtract_block, 7, 7, 8, \
  18. rows, cols, diff, diff_stride, src, src_stride, \
  19. pred, pred_stride
  20. %define pred_str colsq
  21. pxor m7, m7 ; dedicated zero register
  22. cmp colsd, 4
  23. je .case_4
  24. cmp colsd, 8
  25. je .case_8
  26. cmp colsd, 16
  27. je .case_16
  28. cmp colsd, 32
  29. je .case_32
  30. %macro loop16 6
  31. mova m0, [srcq+%1]
  32. mova m4, [srcq+%2]
  33. mova m1, [predq+%3]
  34. mova m5, [predq+%4]
  35. punpckhbw m2, m0, m7
  36. punpckhbw m3, m1, m7
  37. punpcklbw m0, m7
  38. punpcklbw m1, m7
  39. psubw m2, m3
  40. psubw m0, m1
  41. punpckhbw m1, m4, m7
  42. punpckhbw m3, m5, m7
  43. punpcklbw m4, m7
  44. punpcklbw m5, m7
  45. psubw m1, m3
  46. psubw m4, m5
  47. mova [diffq+mmsize*0+%5], m0
  48. mova [diffq+mmsize*1+%5], m2
  49. mova [diffq+mmsize*0+%6], m4
  50. mova [diffq+mmsize*1+%6], m1
  51. %endmacro
  52. mov pred_str, pred_stridemp
  53. .loop_64:
  54. loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  55. loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  56. lea diffq, [diffq+diff_strideq*2]
  57. add predq, pred_str
  58. add srcq, src_strideq
  59. dec rowsd
  60. jg .loop_64
  61. RET
  62. .case_32:
  63. mov pred_str, pred_stridemp
  64. .loop_32:
  65. loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  66. lea diffq, [diffq+diff_strideq*2]
  67. add predq, pred_str
  68. add srcq, src_strideq
  69. dec rowsd
  70. jg .loop_32
  71. RET
  72. .case_16:
  73. mov pred_str, pred_stridemp
  74. .loop_16:
  75. loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  76. lea diffq, [diffq+diff_strideq*4]
  77. lea predq, [predq+pred_str*2]
  78. lea srcq, [srcq+src_strideq*2]
  79. sub rowsd, 2
  80. jg .loop_16
  81. RET
  82. %macro loop_h 0
  83. movh m0, [srcq]
  84. movh m2, [srcq+src_strideq]
  85. movh m1, [predq]
  86. movh m3, [predq+pred_str]
  87. punpcklbw m0, m7
  88. punpcklbw m1, m7
  89. punpcklbw m2, m7
  90. punpcklbw m3, m7
  91. psubw m0, m1
  92. psubw m2, m3
  93. mova [diffq], m0
  94. mova [diffq+diff_strideq*2], m2
  95. %endmacro
  96. .case_8:
  97. mov pred_str, pred_stridemp
  98. .loop_8:
  99. loop_h
  100. lea diffq, [diffq+diff_strideq*4]
  101. lea srcq, [srcq+src_strideq*2]
  102. lea predq, [predq+pred_str*2]
  103. sub rowsd, 2
  104. jg .loop_8
  105. RET
  106. INIT_MMX
  107. .case_4:
  108. mov pred_str, pred_stridemp
  109. .loop_4:
  110. loop_h
  111. lea diffq, [diffq+diff_strideq*4]
  112. lea srcq, [srcq+src_strideq*2]
  113. lea predq, [predq+pred_str*2]
  114. sub rowsd, 2
  115. jg .loop_4
  116. RET