/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
#define VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_

#include <arm_neon.h>

#include "./vpx_config.h"

// Transpose 64 bit elements as follows:
// a0: 00 01 02 03 04 05 06 07
// a1: 16 17 18 19 20 21 22 23
//
// b0.val[0]: 00 01 02 03 16 17 18 19
// b0.val[1]: 04 05 06 07 20 21 22 23
static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t b0;
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
  return b0;
}

static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b0;
  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
  return b0;
}

static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
  int64x2x2_t b0;
  b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)),
                           vreinterpret_s64_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)),
                           vreinterpret_s64_s32(vget_high_s32(a1)));
  return b0;
}

static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
  uint8x16x2_t b0;
  b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)),
                          vreinterpret_u8_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)),
                          vreinterpret_u8_u32(vget_high_u32(a1)));
  return b0;
}

static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  uint16x8x2_t b0;
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
  return b0;
}

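// Illustrative note (not part of the upstream file): 32-bit Neon has no
// 64-bit vtrn intrinsic, which is why the helpers above emulate it with
// vget/vcombine. On an AArch64-only build the same 64-bit transpose could be
// written directly with vtrn1q_s64/vtrn2q_s64; a minimal sketch, kept
// compiled out:
#if 0
static INLINE int64x2x2_t example_vtrnq_s64(int64x2_t a0, int64x2_t a1) {
  int64x2x2_t b0;
  b0.val[0] = vtrn1q_s64(a0, a1);  // low 64-bit lanes of a0 and a1
  b0.val[1] = vtrn2q_s64(a0, a1);  // high 64-bit lanes of a0 and a1
  return b0;
}
#endif
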
static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33
  const uint16x4x2_t b0 =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));
  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 02 03 22 23
  // c0.val[1]: 10 11 30 31 12 13 32 33
  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b0.val[1]));
  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33
  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));
  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

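// A minimal usage sketch (illustrative only, not part of the upstream API):
// transpose a flat 4x4 byte block with the helper above. Note the helper
// returns the result rows interleaved as {0, 2} and {1, 3}, so a flat store
// is not row-major yet. Kept compiled out; the names are hypothetical.
#if 0
static INLINE void example_transpose_u8_4x4(const uint8_t src[16],
                                            uint8_t rows02[8],
                                            uint8_t rows13[8]) {
  uint8x8_t r01 = vld1_u8(src);      // 00 01 02 03 10 11 12 13
  uint8x8_t r23 = vld1_u8(src + 8);  // 20 21 22 23 30 31 32 33
  transpose_u8_4x4(&r01, &r23);
  vst1_u8(rows02, r01);  // 00 10 20 30 02 12 22 32
  vst1_u8(rows13, r23);  // 01 11 21 31 03 13 23 33
}
#endif
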
static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
                                      int16x4_t *a2, int16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);
  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));
  *a0 = vreinterpret_s16_s32(c0.val[0]);
  *a1 = vreinterpret_s16_s32(c1.val[0]);
  *a2 = vreinterpret_s16_s32(c0.val[1]);
  *a3 = vreinterpret_s16_s32(c1.val[1]);
}

static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33
  const int32x4x2_t b0 =
      vtrnq_s32(vreinterpretq_s32_s16(*a0), vreinterpretq_s32_s16(*a1));
  // Swap 64 bit elements resulting in:
  // c0: 00 01 20 21 02 03 22 23
  // c1: 10 11 30 31 12 13 32 33
  const int32x4_t c0 =
      vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b0.val[1]));
  const int32x4_t c1 =
      vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b0.val[1]));
  // Swap 16 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33
  const int16x8x2_t d0 =
      vtrnq_s16(vreinterpretq_s16_s32(c0), vreinterpretq_s16_s32(c1));
  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33
  const uint32x4x2_t b0 =
      vtrnq_u32(vreinterpretq_u32_u16(*a0), vreinterpretq_u32_u16(*a1));
  // Swap 64 bit elements resulting in:
  // c0: 00 01 20 21 02 03 22 23
  // c1: 10 11 30 31 12 13 32 33
  const uint32x4_t c0 =
      vcombine_u32(vget_low_u32(b0.val[0]), vget_low_u32(b0.val[1]));
  const uint32x4_t c1 =
      vcombine_u32(vget_high_u32(b0.val[0]), vget_high_u32(b0.val[1]));
  // Swap 16 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33
  const uint16x8x2_t d0 =
      vtrnq_u16(vreinterpretq_u16_u32(c0), vreinterpretq_u16_u32(c1));
  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, const uint8x8_t a4,
                                    const uint8x8_t a5, const uint8x8_t a6,
                                    const uint8x8_t a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // to:
  // b0.val[0]: 00 01 02 03 40 41 42 43
  // b1.val[0]: 10 11 12 13 50 51 52 53
  // b2.val[0]: 20 21 22 23 60 61 62 63
  // b3.val[0]: 30 31 32 33 70 71 72 73
  const uint32x2x2_t b0 =
      vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t b1 =
      vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t b2 =
      vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t b3 =
      vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));
  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 40 41 60 61
  // c0.val[1]: 02 03 22 23 42 43 62 63
  // c1.val[0]: 10 11 30 31 50 51 70 71
  // c1.val[1]: 12 13 32 33 52 53 72 73
  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
                                   vreinterpret_u16_u32(b2.val[0]));
  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
                                   vreinterpret_u16_u32(b3.val[0]));
  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 01 11 21 31 41 51 61 71
  // d1.val[0]: 02 12 22 32 42 52 62 72
  // d1.val[1]: 03 13 23 33 43 53 63 73
  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
  const uint8x8x2_t d1 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));
  *a0 = d0.val[0];
  *a1 = d0.val[1];
  *a2 = d1.val[0];
  *a3 = d1.val[1];
}

static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                     int32x4_t *a2, int32x4_t *a3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
  *a0 = c0.val[0];
  *a1 = c1.val[0];
  *a2 = c0.val[1];
  *a3 = c1.val[1];
}

static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
                                     const int16x4_t a2, const int16x4_t a3,
                                     const int16x4_t a4, const int16x4_t a5,
                                     const int16x4_t a6, const int16x4_t a7,
                                     int16x8_t *const o0, int16x8_t *const o1,
                                     int16x8_t *const o2, int16x8_t *const o3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73
  const int16x4x2_t b0 = vtrn_s16(a0, a1);
  const int16x4x2_t b1 = vtrn_s16(a2, a3);
  const int16x4x2_t b2 = vtrn_s16(a4, a5);
  const int16x4x2_t b3 = vtrn_s16(a6, a7);
  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73
  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));
  const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
                                  vreinterpret_s32_s16(b3.val[0]));
  const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
                                  vreinterpret_s32_s16(b3.val[1]));
  // Swap 64 bit elements resulting in:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73
  *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
                     vreinterpret_s16_s32(c2.val[0]));
  *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
                     vreinterpret_s16_s32(c3.val[0]));
  *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
                     vreinterpret_s16_s32(c2.val[1]));
  *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
                     vreinterpret_s16_s32(c3.val[1]));
}

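// A minimal usage sketch (illustrative only, not part of the upstream API):
// gather eight 4-tap rows from a strided int16 buffer and emit them as four
// 8-lane row vectors via transpose_s16_4x8() above, the typical pattern for
// feeding the 4-wide output of a first 1-D transform pass into the second
// pass. Kept compiled out; the name is hypothetical.
#if 0
static INLINE void example_load_and_transpose_s16_4x8(
    const int16_t *a, const int a_stride, int16x8_t *const o0,
    int16x8_t *const o1, int16x8_t *const o2, int16x8_t *const o3) {
  const int16x4_t r0 = vld1_s16(a + 0 * a_stride);
  const int16x4_t r1 = vld1_s16(a + 1 * a_stride);
  const int16x4_t r2 = vld1_s16(a + 2 * a_stride);
  const int16x4_t r3 = vld1_s16(a + 3 * a_stride);
  const int16x4_t r4 = vld1_s16(a + 4 * a_stride);
  const int16x4_t r5 = vld1_s16(a + 5 * a_stride);
  const int16x4_t r6 = vld1_s16(a + 6 * a_stride);
  const int16x4_t r7 = vld1_s16(a + 7 * a_stride);
  transpose_s16_4x8(r0, r1, r2, r3, r4, r5, r6, r7, o0, o1, o2, o3);
}
#endif
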
static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1,
                                     int32x4_t *const a2, int32x4_t *const a3,
                                     int32x4_t *const a4, int32x4_t *const a5,
                                     int32x4_t *const a6, int32x4_t *const a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73
  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
  const int32x4x2_t b2 = vtrnq_s32(*a4, *a5);
  const int32x4x2_t b3 = vtrnq_s32(*a6, *a7);
  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73
  const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b1.val[0]);
  const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b1.val[1]);
  const int64x2x2_t c2 = vpx_vtrnq_s64(b2.val[0], b3.val[0]);
  const int64x2x2_t c3 = vpx_vtrnq_s64(b2.val[1], b3.val[1]);
  *a0 = vreinterpretq_s32_s64(c0.val[0]);
  *a1 = vreinterpretq_s32_s64(c2.val[0]);
  *a2 = vreinterpretq_s32_s64(c1.val[0]);
  *a3 = vreinterpretq_s32_s64(c3.val[0]);
  *a4 = vreinterpretq_s32_s64(c0.val[1]);
  *a5 = vreinterpretq_s32_s64(c2.val[1]);
  *a6 = vreinterpretq_s32_s64(c1.val[1]);
  *a7 = vreinterpretq_s32_s64(c3.val[1]);
}

static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}

static INLINE void transpose_u16_8x4(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  *a0 = vreinterpretq_u16_u32(c0.val[0]);
  *a1 = vreinterpretq_u16_u32(c1.val[0]);
  *a2 = vreinterpretq_u16_u32(c0.val[1]);
  *a3 = vreinterpretq_u16_u32(c1.val[1]);
}

static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1,
                                     int32x4_t *const a2, int32x4_t *const a3,
                                     int32x4_t *const a4, int32x4_t *const a5,
                                     int32x4_t *const a6, int32x4_t *const a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 04 05 06 07
  // a2: 10 11 12 13
  // a3: 14 15 16 17
  // a4: 20 21 22 23
  // a5: 24 25 26 27
  // a6: 30 31 32 33
  // a7: 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 04 14 06 16
  // b1.val[1]: 05 15 07 17
  // b2.val[0]: 20 30 22 32
  // b2.val[1]: 21 31 23 33
  // b3.val[0]: 24 34 26 36
  // b3.val[1]: 25 35 27 37
  const int32x4x2_t b0 = vtrnq_s32(*a0, *a2);
  const int32x4x2_t b1 = vtrnq_s32(*a1, *a3);
  const int32x4x2_t b2 = vtrnq_s32(*a4, *a6);
  const int32x4x2_t b3 = vtrnq_s32(*a5, *a7);
  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 04 14 24 34
  // c2.val[1]: 06 16 26 36
  // c3.val[0]: 05 15 25 35
  // c3.val[1]: 07 17 27 37
  const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b2.val[0]);
  const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b2.val[1]);
  const int64x2x2_t c2 = vpx_vtrnq_s64(b1.val[0], b3.val[0]);
  const int64x2x2_t c3 = vpx_vtrnq_s64(b1.val[1], b3.val[1]);
  *a0 = vreinterpretq_s32_s64(c0.val[0]);
  *a1 = vreinterpretq_s32_s64(c1.val[0]);
  *a2 = vreinterpretq_s32_s64(c0.val[1]);
  *a3 = vreinterpretq_s32_s64(c1.val[1]);
  *a4 = vreinterpretq_s32_s64(c2.val[0]);
  *a5 = vreinterpretq_s32_s64(c3.val[0]);
  *a6 = vreinterpretq_s32_s64(c2.val[1]);
  *a7 = vreinterpretq_s32_s64(c3.val[1]);
}

// Note: Using 'd' registers or 'q' registers has almost identical speed. We
// use 'q' registers here to save some instructions.
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                    uint8x8_t *a6, uint8x8_t *a7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
  const uint8x16x2_t b0 =
      vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
  const uint8x16x2_t b1 =
      vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));
  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));
  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));
  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}

static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                     int16x8_t *a2, int16x8_t *a3,
                                     int16x8_t *a4, int16x8_t *a5,
                                     int16x8_t *a6, int16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77
  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);
  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77
  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));
  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77
  const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3,
                                     uint16x8_t *a4, uint16x8_t *a5,
                                     uint16x8_t *a6, uint16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77
  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);
  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77
  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
                                    vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
                                    vreinterpretq_u32_u16(b3.val[1]));
  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77
  const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
  const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
  const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
  const uint16x8x2_t d3 = vpx_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);
  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
                                     int32x4x2_t *a2, int32x4x2_t *a3,
                                     int32x4x2_t *a4, int32x4x2_t *a5,
                                     int32x4x2_t *a6, int32x4x2_t *a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0: 00 10 02 12 01 11 03 13
  // b1: 20 30 22 32 21 31 23 33
  // b2: 40 50 42 52 41 51 43 53
  // b3: 60 70 62 72 61 71 63 73
  // b4: 04 14 06 16 05 15 07 17
  // b5: 24 34 26 36 25 35 27 37
  // b6: 44 54 46 56 45 55 47 57
  // b7: 64 74 66 76 65 75 67 77
  const int32x4x2_t b0 = vtrnq_s32(a0->val[0], a1->val[0]);
  const int32x4x2_t b1 = vtrnq_s32(a2->val[0], a3->val[0]);
  const int32x4x2_t b2 = vtrnq_s32(a4->val[0], a5->val[0]);
  const int32x4x2_t b3 = vtrnq_s32(a6->val[0], a7->val[0]);
  const int32x4x2_t b4 = vtrnq_s32(a0->val[1], a1->val[1]);
  const int32x4x2_t b5 = vtrnq_s32(a2->val[1], a3->val[1]);
  const int32x4x2_t b6 = vtrnq_s32(a4->val[1], a5->val[1]);
  const int32x4x2_t b7 = vtrnq_s32(a6->val[1], a7->val[1]);
  // Swap 64 bit elements resulting in:
  // c0: 00 10 20 30 02 12 22 32
  // c1: 01 11 21 31 03 13 23 33
  // c2: 40 50 60 70 42 52 62 72
  // c3: 41 51 61 71 43 53 63 73
  // c4: 04 14 24 34 06 16 26 36
  // c5: 05 15 25 35 07 17 27 37
  // c6: 44 54 64 74 46 56 66 76
  // c7: 45 55 65 75 47 57 67 77
  const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
  const int32x4x2_t c2 = vpx_vtrnq_s64_to_s32(b2.val[0], b3.val[0]);
  const int32x4x2_t c3 = vpx_vtrnq_s64_to_s32(b2.val[1], b3.val[1]);
  const int32x4x2_t c4 = vpx_vtrnq_s64_to_s32(b4.val[0], b5.val[0]);
  const int32x4x2_t c5 = vpx_vtrnq_s64_to_s32(b4.val[1], b5.val[1]);
  const int32x4x2_t c6 = vpx_vtrnq_s64_to_s32(b6.val[0], b7.val[0]);
  const int32x4x2_t c7 = vpx_vtrnq_s64_to_s32(b6.val[1], b7.val[1]);
  // Swap 128 bit elements resulting in:
  // a0: 00 10 20 30 40 50 60 70
  // a1: 01 11 21 31 41 51 61 71
  // a2: 02 12 22 32 42 52 62 72
  // a3: 03 13 23 33 43 53 63 73
  // a4: 04 14 24 34 44 54 64 74
  // a5: 05 15 25 35 45 55 65 75
  // a6: 06 16 26 36 46 56 66 76
  // a7: 07 17 27 37 47 57 67 77
  a0->val[0] = c0.val[0];
  a0->val[1] = c2.val[0];
  a1->val[0] = c1.val[0];
  a1->val[1] = c3.val[0];
  a2->val[0] = c0.val[1];
  a2->val[1] = c2.val[1];
  a3->val[0] = c1.val[1];
  a3->val[1] = c3.val[1];
  a4->val[0] = c4.val[0];
  a4->val[1] = c6.val[0];
  a5->val[0] = c5.val[0];
  a5->val[1] = c7.val[0];
  a6->val[0] = c4.val[1];
  a6->val[1] = c6.val[1];
  a7->val[0] = c5.val[1];
  a7->val[1] = c7.val[1];
}

static INLINE void transpose_u8_16x8(
    const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
    const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
    const uint8x16_t i6, const uint8x16_t i7, uint8x8_t *o0, uint8x8_t *o1,
    uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
    uint8x8_t *o7, uint8x8_t *o8, uint8x8_t *o9, uint8x8_t *o10, uint8x8_t *o11,
    uint8x8_t *o12, uint8x8_t *o13, uint8x8_t *o14, uint8x8_t *o15) {
  // Swap 8 bit elements. Goes from:
  // i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
  // i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F
  // i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F
  // i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F
  // i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F
  // i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F
  // i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F
  // i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
  // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
  // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
  // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
  // b2.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
  // b2.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
  // b3.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
  // b3.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
  const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
  const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
  const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
  const uint8x16x2_t b3 = vtrnq_u8(i6, i7);
  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
  // c0.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
  // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
  // c1.val[1]: 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
  // c2.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
  // c2.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
  // c3.val[0]: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
  // c3.val[1]: 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));
  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
                                    vreinterpretq_u16_u8(b3.val[0]));
  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
                                    vreinterpretq_u16_u8(b3.val[1]));
  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
  // d0.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
  // d1.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
  // d1.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
  // d2.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
  // d2.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
  // d3.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
  // d3.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c2.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c2.val[1]));
  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
                                    vreinterpretq_u32_u16(c3.val[0]));
  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
                                    vreinterpretq_u32_u16(c3.val[1]));
  // Output:
  // o0 : 00 10 20 30 40 50 60 70
  // o1 : 01 11 21 31 41 51 61 71
  // o2 : 02 12 22 32 42 52 62 72
  // o3 : 03 13 23 33 43 53 63 73
  // o4 : 04 14 24 34 44 54 64 74
  // o5 : 05 15 25 35 45 55 65 75
  // o6 : 06 16 26 36 46 56 66 76
  // o7 : 07 17 27 37 47 57 67 77
  // o8 : 08 18 28 38 48 58 68 78
  // o9 : 09 19 29 39 49 59 69 79
  // o10: 0A 1A 2A 3A 4A 5A 6A 7A
  // o11: 0B 1B 2B 3B 4B 5B 6B 7B
  // o12: 0C 1C 2C 3C 4C 5C 6C 7C
  // o13: 0D 1D 2D 3D 4D 5D 6D 7D
  // o14: 0E 1E 2E 3E 4E 5E 6E 7E
  // o15: 0F 1F 2F 3F 4F 5F 6F 7F
  *o0 = vget_low_u8(vreinterpretq_u8_u32(d0.val[0]));
  *o1 = vget_low_u8(vreinterpretq_u8_u32(d2.val[0]));
  *o2 = vget_low_u8(vreinterpretq_u8_u32(d1.val[0]));
  *o3 = vget_low_u8(vreinterpretq_u8_u32(d3.val[0]));
  *o4 = vget_low_u8(vreinterpretq_u8_u32(d0.val[1]));
  *o5 = vget_low_u8(vreinterpretq_u8_u32(d2.val[1]));
  *o6 = vget_low_u8(vreinterpretq_u8_u32(d1.val[1]));
  *o7 = vget_low_u8(vreinterpretq_u8_u32(d3.val[1]));
  *o8 = vget_high_u8(vreinterpretq_u8_u32(d0.val[0]));
  *o9 = vget_high_u8(vreinterpretq_u8_u32(d2.val[0]));
  *o10 = vget_high_u8(vreinterpretq_u8_u32(d1.val[0]));
  *o11 = vget_high_u8(vreinterpretq_u8_u32(d3.val[0]));
  *o12 = vget_high_u8(vreinterpretq_u8_u32(d0.val[1]));
  *o13 = vget_high_u8(vreinterpretq_u8_u32(d2.val[1]));
  *o14 = vget_high_u8(vreinterpretq_u8_u32(d1.val[1]));
  *o15 = vget_high_u8(vreinterpretq_u8_u32(d3.val[1]));
}

static INLINE void transpose_u8_8x16(
    const uint8x8_t i0, const uint8x8_t i1, const uint8x8_t i2,
    const uint8x8_t i3, const uint8x8_t i4, const uint8x8_t i5,
    const uint8x8_t i6, const uint8x8_t i7, const uint8x8_t i8,
    const uint8x8_t i9, const uint8x8_t i10, const uint8x8_t i11,
    const uint8x8_t i12, const uint8x8_t i13, const uint8x8_t i14,
    const uint8x8_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
    uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
    uint8x16_t *o7) {
  // Combine 8 bit elements. Goes from:
  // i0 : 00 01 02 03 04 05 06 07
  // i1 : 10 11 12 13 14 15 16 17
  // i2 : 20 21 22 23 24 25 26 27
  // i3 : 30 31 32 33 34 35 36 37
  // i4 : 40 41 42 43 44 45 46 47
  // i5 : 50 51 52 53 54 55 56 57
  // i6 : 60 61 62 63 64 65 66 67
  // i7 : 70 71 72 73 74 75 76 77
  // i8 : 80 81 82 83 84 85 86 87
  // i9 : 90 91 92 93 94 95 96 97
  // i10: A0 A1 A2 A3 A4 A5 A6 A7
  // i11: B0 B1 B2 B3 B4 B5 B6 B7
  // i12: C0 C1 C2 C3 C4 C5 C6 C7
  // i13: D0 D1 D2 D3 D4 D5 D6 D7
  // i14: E0 E1 E2 E3 E4 E5 E6 E7
  // i15: F0 F1 F2 F3 F4 F5 F6 F7
  // to:
  // a0: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87
  // a1: 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97
  // a2: 20 21 22 23 24 25 26 27 A0 A1 A2 A3 A4 A5 A6 A7
  // a3: 30 31 32 33 34 35 36 37 B0 B1 B2 B3 B4 B5 B6 B7
  // a4: 40 41 42 43 44 45 46 47 C0 C1 C2 C3 C4 C5 C6 C7
  // a5: 50 51 52 53 54 55 56 57 D0 D1 D2 D3 D4 D5 D6 D7
  // a6: 60 61 62 63 64 65 66 67 E0 E1 E2 E3 E4 E5 E6 E7
  // a7: 70 71 72 73 74 75 76 77 F0 F1 F2 F3 F4 F5 F6 F7
  const uint8x16_t a0 = vcombine_u8(i0, i8);
  const uint8x16_t a1 = vcombine_u8(i1, i9);
  const uint8x16_t a2 = vcombine_u8(i2, i10);
  const uint8x16_t a3 = vcombine_u8(i3, i11);
  const uint8x16_t a4 = vcombine_u8(i4, i12);
  const uint8x16_t a5 = vcombine_u8(i5, i13);
  const uint8x16_t a6 = vcombine_u8(i6, i14);
  const uint8x16_t a7 = vcombine_u8(i7, i15);
  // Swap 8 bit elements resulting in:
  // b0.val[0]: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
  // b0.val[1]: 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
  // b1.val[0]: 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6
  // b1.val[1]: 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7
  // b2.val[0]: 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6
  // b2.val[1]: 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7
  // b3.val[0]: 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6
  // b3.val[1]: 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7
  const uint8x16x2_t b0 = vtrnq_u8(a0, a1);
  const uint8x16x2_t b1 = vtrnq_u8(a2, a3);
  const uint8x16x2_t b2 = vtrnq_u8(a4, a5);
  const uint8x16x2_t b3 = vtrnq_u8(a6, a7);
  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4
  // c0.val[1]: 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6
  // c1.val[0]: 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5
  // c1.val[1]: 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7
  // c2.val[0]: 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4
  // c2.val[1]: 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6
  // c3.val[0]: 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5
  // c3.val[1]: 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7
  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));
  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
                                    vreinterpretq_u16_u8(b3.val[0]));
  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
                                    vreinterpretq_u16_u8(b3.val[1]));
  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
  // d0.val[1]: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
  // d1.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
  // d1.val[1]: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
  // d2.val[0]: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
  // d2.val[1]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
  // d3.val[0]: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
  // d3.val[1]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c2.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c2.val[1]));
  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
                                    vreinterpretq_u32_u16(c3.val[0]));
  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
                                    vreinterpretq_u32_u16(c3.val[1]));
  // Output:
  // o0: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
  // o1: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
  // o2: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
  // o3: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
  // o4: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
  // o5: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
  // o6: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
  // o7: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
  *o0 = vreinterpretq_u8_u32(d0.val[0]);
  *o1 = vreinterpretq_u8_u32(d2.val[0]);
  *o2 = vreinterpretq_u8_u32(d1.val[0]);
  *o3 = vreinterpretq_u8_u32(d3.val[0]);
  *o4 = vreinterpretq_u8_u32(d0.val[1]);
  *o5 = vreinterpretq_u8_u32(d2.val[1]);
  *o6 = vreinterpretq_u8_u32(d1.val[1]);
  *o7 = vreinterpretq_u8_u32(d3.val[1]);
}

static INLINE void transpose_u8_16x16(
    const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
    const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
    const uint8x16_t i6, const uint8x16_t i7, const uint8x16_t i8,
    const uint8x16_t i9, const uint8x16_t i10, const uint8x16_t i11,
    const uint8x16_t i12, const uint8x16_t i13, const uint8x16_t i14,
    const uint8x16_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
    uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
    uint8x16_t *o7, uint8x16_t *o8, uint8x16_t *o9, uint8x16_t *o10,
    uint8x16_t *o11, uint8x16_t *o12, uint8x16_t *o13, uint8x16_t *o14,
    uint8x16_t *o15) {
  // Swap 8 bit elements. Goes from:
  // i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
  // i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F
  // i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F
  // i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F
  // i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F
  // i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F
  // i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F
  // i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F
  // i8: 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F
  // i9: 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F
  // i10: A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF
  // i11: B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF
  // i12: C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF
  // i13: D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF
  // i14: E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF
  // i15: F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
  // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
  // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
  // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
  // b2.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
  // b2.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
  // b3.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
  // b3.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
  // b4.val[0]: 80 90 82 92 84 94 86 96 88 98 8A 9A 8C 9C 8E 9E
  // b4.val[1]: 81 91 83 93 85 95 87 97 89 99 8B 9B 8D 9D 8F 9F
  // b5.val[0]: A0 B0 A2 B2 A4 B4 A6 B6 A8 B8 AA BA AC BC AE BE
  // b5.val[1]: A1 B1 A3 B3 A5 B5 A7 B7 A9 B9 AB BB AD BD AF BF
  // b6.val[0]: C0 D0 C2 D2 C4 D4 C6 D6 C8 D8 CA DA CC DC CE DE
  // b6.val[1]: C1 D1 C3 D3 C5 D5 C7 D7 C9 D9 CB DB CD DD CF DF
  // b7.val[0]: E0 F0 E2 F2 E4 F4 E6 F6 E8 F8 EA FA EC FC EE FE
  // b7.val[1]: E1 F1 E3 F3 E5 F5 E7 F7 E9 F9 EB FB ED FD EF FF
  const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
  const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
  const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
  const uint8x16x2_t b3 = vtrnq_u8(i6, i7);
  const uint8x16x2_t b4 = vtrnq_u8(i8, i9);
  const uint8x16x2_t b5 = vtrnq_u8(i10, i11);
  const uint8x16x2_t b6 = vtrnq_u8(i12, i13);
  const uint8x16x2_t b7 = vtrnq_u8(i14, i15);
  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
  // c0.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
  // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
  // c1.val[1]: 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
  // c2.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
  // c2.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
  // c3.val[0]: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
  // c3.val[1]: 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
  // c4.val[0]: 80 90 A0 B0 84 94 A4 B4 88 98 A8 B8 8C 9C AC BC
  // c4.val[1]: 82 92 A2 B2 86 96 A6 B6 8A 9A AA BA 8E 9E AE BE
  // c5.val[0]: 81 91 A1 B1 85 95 A5 B5 89 99 A9 B9 8D 9D AD BD
  // c5.val[1]: 83 93 A3 B3 87 97 A7 B7 8B 9B AB BB 8F 9F AF BF
  // c6.val[0]: C0 D0 E0 F0 C4 D4 E4 F4 C8 D8 E8 F8 CC DC EC FC
  // c6.val[1]: C2 D2 E2 F2 C6 D6 E6 F6 CA DA EA FA CE DE EE FE
  // c7.val[0]: C1 D1 E1 F1 C5 D5 E5 F5 C9 D9 E9 F9 CD DD ED FD
  // c7.val[1]: C3 D3 E3 F3 C7 D7 E7 F7 CB DB EB FB CF DF EF FF
  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));
  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
                                    vreinterpretq_u16_u8(b3.val[0]));
  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
                                    vreinterpretq_u16_u8(b3.val[1]));
  const uint16x8x2_t c4 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[0]),
                                    vreinterpretq_u16_u8(b5.val[0]));
  const uint16x8x2_t c5 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[1]),
                                    vreinterpretq_u16_u8(b5.val[1]));
  const uint16x8x2_t c6 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[0]),
                                    vreinterpretq_u16_u8(b7.val[0]));
  const uint16x8x2_t c7 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[1]),
                                    vreinterpretq_u16_u8(b7.val[1]));
  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
  // d0.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
  // d1.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
  // d1.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
  // d2.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
  // d2.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
  // d3.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
  // d3.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
  // d4.val[0]: 80 90 A0 B0 C0 D0 E0 F0 88 98 A8 B8 C8 D8 E8 F8
  // d4.val[1]: 84 94 A4 B4 C4 D4 E4 F4 8C 9C AC BC CC DC EC FC
  // d5.val[0]: 82 92 A2 B2 C2 D2 E2 F2 8A 9A AA BA CA DA EA FA
  // d5.val[1]: 86 96 A6 B6 C6 D6 E6 F6 8E 9E AE BE CE DE EE FE
  // d6.val[0]: 81 91 A1 B1 C1 D1 E1 F1 89 99 A9 B9 C9 D9 E9 F9
  // d6.val[1]: 85 95 A5 B5 C5 D5 E5 F5 8D 9D AD BD CD DD ED FD
  // d7.val[0]: 83 93 A3 B3 C3 D3 E3 F3 8B 9B AB BB CB DB EB FB
  // d7.val[1]: 87 97 A7 B7 C7 D7 E7 F7 8F 9F AF BF CF DF EF FF
  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c2.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c2.val[1]));
  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
                                    vreinterpretq_u32_u16(c3.val[0]));
  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
                                    vreinterpretq_u32_u16(c3.val[1]));
  const uint32x4x2_t d4 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[0]),
                                    vreinterpretq_u32_u16(c6.val[0]));
  const uint32x4x2_t d5 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[1]),
                                    vreinterpretq_u32_u16(c6.val[1]));
  const uint32x4x2_t d6 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[0]),
                                    vreinterpretq_u32_u16(c7.val[0]));
  const uint32x4x2_t d7 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[1]),
                                    vreinterpretq_u32_u16(c7.val[1]));
  // Swap 64 bit elements resulting in:
  // e0.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
  // e0.val[1]: 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8
  // e1.val[0]: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
  // e1.val[1]: 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9
  // e2.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
  // e2.val[1]: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA
  // e3.val[0]: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
  // e3.val[1]: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB
  // e4.val[0]: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
  // e4.val[1]: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC
  // e5.val[0]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
  // e5.val[1]: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD
  // e6.val[0]: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
  // e6.val[1]: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE
  // e7.val[0]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
  // e7.val[1]: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF
  const uint8x16x2_t e0 = vpx_vtrnq_u64_to_u8(d0.val[0], d4.val[0]);
  const uint8x16x2_t e1 = vpx_vtrnq_u64_to_u8(d2.val[0], d6.val[0]);
  const uint8x16x2_t e2 = vpx_vtrnq_u64_to_u8(d1.val[0], d5.val[0]);
  const uint8x16x2_t e3 = vpx_vtrnq_u64_to_u8(d3.val[0], d7.val[0]);
  const uint8x16x2_t e4 = vpx_vtrnq_u64_to_u8(d0.val[1], d4.val[1]);
  const uint8x16x2_t e5 = vpx_vtrnq_u64_to_u8(d2.val[1], d6.val[1]);
  const uint8x16x2_t e6 = vpx_vtrnq_u64_to_u8(d1.val[1], d5.val[1]);
  const uint8x16x2_t e7 = vpx_vtrnq_u64_to_u8(d3.val[1], d7.val[1]);
  // Output:
  // o0 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
  // o1 : 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
  // o2 : 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
  // o3 : 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
  // o4 : 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
  // o5 : 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
  // o6 : 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
  // o7 : 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
  // o8 : 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8
  // o9 : 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9
  // o10: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA
  // o11: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB
  // o12: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC
  // o13: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD
  // o14: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE
  // o15: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF
  *o0 = e0.val[0];
  *o1 = e1.val[0];
  *o2 = e2.val[0];
  *o3 = e3.val[0];
  *o4 = e4.val[0];
  *o5 = e5.val[0];
  *o6 = e6.val[0];
  *o7 = e7.val[0];
  *o8 = e0.val[1];
  *o9 = e1.val[1];
  *o10 = e2.val[1];
  *o11 = e3.val[1];
  *o12 = e4.val[1];
  *o13 = e5.val[1];
  *o14 = e6.val[1];
  *o15 = e7.val[1];
}

static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
                                             const int a_stride, uint8x8_t *a0,
                                             uint8x8_t *a1, uint8x8_t *a2,
                                             uint8x8_t *a3) {
  uint8x8_t a4, a5, a6, a7;
  *a0 = vld1_u8(a);
  a += a_stride;
  *a1 = vld1_u8(a);
  a += a_stride;
  *a2 = vld1_u8(a);
  a += a_stride;
  *a3 = vld1_u8(a);
  a += a_stride;
  a4 = vld1_u8(a);
  a += a_stride;
  a5 = vld1_u8(a);
  a += a_stride;
  a6 = vld1_u8(a);
  a += a_stride;
  a7 = vld1_u8(a);
  transpose_u8_4x8(a0, a1, a2, a3, a4, a5, a6, a7);
}

static INLINE void load_and_transpose_u8_8x8(const uint8_t *a,
                                             const int a_stride, uint8x8_t *a0,
                                             uint8x8_t *a1, uint8x8_t *a2,
                                             uint8x8_t *a3, uint8x8_t *a4,
                                             uint8x8_t *a5, uint8x8_t *a6,
                                             uint8x8_t *a7) {
  *a0 = vld1_u8(a);
  a += a_stride;
  *a1 = vld1_u8(a);
  a += a_stride;
  *a2 = vld1_u8(a);
  a += a_stride;
  *a3 = vld1_u8(a);
  a += a_stride;
  *a4 = vld1_u8(a);
  a += a_stride;
  *a5 = vld1_u8(a);
  a += a_stride;
  *a6 = vld1_u8(a);
  a += a_stride;
  *a7 = vld1_u8(a);
  transpose_u8_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}

static INLINE void transpose_and_store_u8_8x8(uint8_t *a, const int a_stride,
                                              uint8x8_t a0, uint8x8_t a1,
                                              uint8x8_t a2, uint8x8_t a3,
                                              uint8x8_t a4, uint8x8_t a5,
                                              uint8x8_t a6, uint8x8_t a7) {
  transpose_u8_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
  vst1_u8(a, a0);
  a += a_stride;
  vst1_u8(a, a1);
  a += a_stride;
  vst1_u8(a, a2);
  a += a_stride;
  vst1_u8(a, a3);
  a += a_stride;
  vst1_u8(a, a4);
  a += a_stride;
  vst1_u8(a, a5);
  a += a_stride;
  vst1_u8(a, a6);
  a += a_stride;
  vst1_u8(a, a7);
}

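// A minimal end-to-end sketch (illustrative only, not part of the upstream
// API): copy an 8x8 byte tile from src to dst transposed, combining plain
// row loads with transpose_and_store_u8_8x8() above. Kept compiled out; the
// name is hypothetical.
#if 0
static INLINE void example_copy_transpose_u8_8x8(const uint8_t *src,
                                                 const int src_stride,
                                                 uint8_t *dst,
                                                 const int dst_stride) {
  const uint8x8_t r0 = vld1_u8(src + 0 * src_stride);
  const uint8x8_t r1 = vld1_u8(src + 1 * src_stride);
  const uint8x8_t r2 = vld1_u8(src + 2 * src_stride);
  const uint8x8_t r3 = vld1_u8(src + 3 * src_stride);
  const uint8x8_t r4 = vld1_u8(src + 4 * src_stride);
  const uint8x8_t r5 = vld1_u8(src + 5 * src_stride);
  const uint8x8_t r6 = vld1_u8(src + 6 * src_stride);
  const uint8x8_t r7 = vld1_u8(src + 7 * src_stride);
  transpose_and_store_u8_8x8(dst, dst_stride, r0, r1, r2, r3, r4, r5, r6, r7);
}
#endif
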
static INLINE void load_and_transpose_s16_8x8(const int16_t *a,
                                              const int a_stride,
                                              int16x8_t *a0, int16x8_t *a1,
                                              int16x8_t *a2, int16x8_t *a3,
                                              int16x8_t *a4, int16x8_t *a5,
                                              int16x8_t *a6, int16x8_t *a7) {
  *a0 = vld1q_s16(a);
  a += a_stride;
  *a1 = vld1q_s16(a);
  a += a_stride;
  *a2 = vld1q_s16(a);
  a += a_stride;
  *a3 = vld1q_s16(a);
  a += a_stride;
  *a4 = vld1q_s16(a);
  a += a_stride;
  *a5 = vld1q_s16(a);
  a += a_stride;
  *a6 = vld1q_s16(a);
  a += a_stride;
  *a7 = vld1q_s16(a);
  transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}

static INLINE void load_and_transpose_s32_8x8(
    const int32_t *a, const int a_stride, int32x4x2_t *const a0,
    int32x4x2_t *const a1, int32x4x2_t *const a2, int32x4x2_t *const a3,
    int32x4x2_t *const a4, int32x4x2_t *const a5, int32x4x2_t *const a6,
    int32x4x2_t *const a7) {
  a0->val[0] = vld1q_s32(a);
  a0->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a1->val[0] = vld1q_s32(a);
  a1->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a2->val[0] = vld1q_s32(a);
  a2->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a3->val[0] = vld1q_s32(a);
  a3->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a4->val[0] = vld1q_s32(a);
  a4->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a5->val[0] = vld1q_s32(a);
  a5->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a6->val[0] = vld1q_s32(a);
  a6->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a7->val[0] = vld1q_s32(a);
  a7->val[1] = vld1q_s32(a + 4);
  transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}

#endif  // VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_