; idct4x4_add_neon.asm
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
  10. EXPORT |vpx_idct4x4_16_add_neon|
  11. ARM
  12. REQUIRE8
  13. PRESERVE8
  14. AREA ||.text||, CODE, READONLY, ALIGN=2
  15. INCLUDE vpx_dsp/arm/idct_neon.asm.S
  16. AREA Block, CODE, READONLY ; name this block of code
  17. ;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride)
  18. ;
  19. ; r0 int16_t input
  20. ; r1 uint8_t *dest
  21. ; r2 int stride)
  22. |vpx_idct4x4_16_add_neon| PROC
  23. ; The 2D transform is done with two passes which are actually pretty
  24. ; similar. We first transform the rows. This is done by transposing
  25. ; the inputs, doing an SIMD column transform (the columns are the
  26. ; transposed rows) and then transpose the results (so that it goes back
  27. ; in normal/row positions). Then, we transform the columns by doing
  28. ; another SIMD column transform.
  29. ; So, two passes of a transpose followed by a column transform.
  30. ; load the inputs into q8-q9, d16-d19
  31. LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0
  32. ; generate scalar constants
  33. ; cospi_8_64 = 15137
  34. movw r0, #0x3b21
  35. ; cospi_16_64 = 11585
  36. movw r3, #0x2d41
  37. ; cospi_24_64 = 6270
  38. movw r12, #0x187e
  39. ; transpose the input data
  40. ; 00 01 02 03 d16
  41. ; 10 11 12 13 d17
  42. ; 20 21 22 23 d18
  43. ; 30 31 32 33 d19
  44. vtrn.16 d16, d17
  45. vtrn.16 d18, d19
  46. ; generate constant vectors
  47. vdup.16 d20, r0 ; replicate cospi_8_64
  48. vdup.16 d21, r3 ; replicate cospi_16_64
  49. ; 00 10 02 12 d16
  50. ; 01 11 03 13 d17
  51. ; 20 30 22 32 d18
  52. ; 21 31 23 33 d19
  53. vtrn.32 q8, q9
  54. ; 00 10 20 30 d16
  55. ; 01 11 21 31 d17
  56. ; 02 12 22 32 d18
  57. ; 03 13 23 33 d19
  58. vdup.16 d22, r12 ; replicate cospi_24_64
  59. ; do the transform on transposed rows
  60. ; stage 1
  61. vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64
  62. vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64
  63. ; (input[0] + input[2]) * cospi_16_64;
  64. ; (input[0] - input[2]) * cospi_16_64;
  65. vmull.s16 q8, d16, d21
  66. vmull.s16 q14, d18, d21
  67. vadd.s32 q13, q8, q14
  68. vsub.s32 q14, q8, q14
  69. ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
  70. ; input[1] * cospi_8_64 + input[3] * cospi_24_64;
  71. vmlsl.s16 q15, d19, d20
  72. vmlal.s16 q1, d19, d22
  73. ; dct_const_round_shift
  74. vrshrn.s32 d26, q13, #14
  75. vrshrn.s32 d27, q14, #14
  76. vrshrn.s32 d29, q15, #14
  77. vrshrn.s32 d28, q1, #14
  78. ; stage 2
  79. ; output[0] = step[0] + step[3];
  80. ; output[1] = step[1] + step[2];
  81. ; output[3] = step[0] - step[3];
  82. ; output[2] = step[1] - step[2];
  83. vadd.s16 q8, q13, q14
  84. vsub.s16 q9, q13, q14
  85. vswp d18, d19
  86. ; transpose the results
  87. ; 00 01 02 03 d16
  88. ; 10 11 12 13 d17
  89. ; 20 21 22 23 d18
  90. ; 30 31 32 33 d19
  91. vtrn.16 d16, d17
  92. vtrn.16 d18, d19
  93. ; 00 10 02 12 d16
  94. ; 01 11 03 13 d17
  95. ; 20 30 22 32 d18
  96. ; 21 31 23 33 d19
  97. vtrn.32 q8, q9
  98. ; 00 10 20 30 d16
  99. ; 01 11 21 31 d17
  100. ; 02 12 22 32 d18
  101. ; 03 13 23 33 d19
  102. ; do the transform on columns
  103. ; stage 1
  104. vadd.s16 d23, d16, d18 ; (input[0] + input[2])
  105. vsub.s16 d24, d16, d18 ; (input[0] - input[2])
  106. vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64
  107. vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64
  108. ; (input[0] + input[2]) * cospi_16_64;
  109. ; (input[0] - input[2]) * cospi_16_64;
  110. vmull.s16 q13, d23, d21
  111. vmull.s16 q14, d24, d21
  112. ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
  113. ; input[1] * cospi_8_64 + input[3] * cospi_24_64;
  114. vmlsl.s16 q15, d19, d20
  115. vmlal.s16 q1, d19, d22
  116. ; dct_const_round_shift
  117. vrshrn.s32 d26, q13, #14
  118. vrshrn.s32 d27, q14, #14
  119. vrshrn.s32 d29, q15, #14
  120. vrshrn.s32 d28, q1, #14
  121. ; stage 2
  122. ; output[0] = step[0] + step[3];
  123. ; output[1] = step[1] + step[2];
  124. ; output[3] = step[0] - step[3];
  125. ; output[2] = step[1] - step[2];
  126. vadd.s16 q8, q13, q14
  127. vsub.s16 q9, q13, q14
  128. ; The results are in two registers, one of them being swapped. This will
  129. ; be taken care of by loading the 'dest' value in a swapped fashion and
  130. ; also storing them in the same swapped fashion.
  131. ; temp_out[0, 1] = d16, d17 = q8
  132. ; temp_out[2, 3] = d19, d18 = q9 swapped
  133. ; ROUND_POWER_OF_TWO(temp_out[j], 4)
  134. vrshr.s16 q8, q8, #4
  135. vrshr.s16 q9, q9, #4
  136. vld1.32 {d26[0]}, [r1], r2
  137. vld1.32 {d26[1]}, [r1], r2
  138. vld1.32 {d27[1]}, [r1], r2
  139. vld1.32 {d27[0]}, [r1] ; no post-increment
  140. ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i]
  141. vaddw.u8 q8, q8, d26
  142. vaddw.u8 q9, q9, d27
  143. ; clip_pixel
  144. vqmovun.s16 d26, q8
  145. vqmovun.s16 d27, q9
  146. ; do the stores in reverse order with negative post-increment, by changing
  147. ; the sign of the stride
  148. rsb r2, r2, #0
  149. vst1.32 {d27[0]}, [r1], r2
  150. vst1.32 {d27[1]}, [r1], r2
  151. vst1.32 {d26[1]}, [r1], r2
  152. vst1.32 {d26[0]}, [r1] ; no post-increment
  153. bx lr
  154. ENDP ; |vpx_idct4x4_16_add_neon|
  155. END