vp8_macros_msa.h 83 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758
  1. /*
  2. * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #ifndef VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
  11. #define VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
  12. #include <msa.h>
  13. #include "./vpx_config.h"
  14. #include "vpx/vpx_integer.h"
  /* Description : Whole-vector load and store primitives
     Arguments   : LD_x - RTYPE (vector type), psrc (source address)
                   ST_x - RTYPE (vector type), in (value), pdst (destination)
     Details     : LD_x reads one object of type RTYPE through 'psrc'.
                   ST_x writes 'in' as one object of type RTYPE through 'pdst'.
                   The B, H and W spellings expand identically; the typed
                   wrappers below only fix RTYPE.
                   Every expansion is wrapped in parentheses so that it acts
                   as a single expression next to postfix or comparison
                   operators at the call site.
  */
  #define LD_B(RTYPE, psrc) (*((const RTYPE *)(psrc)))
  #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
  #define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

  #define LD_H(RTYPE, psrc) (*((const RTYPE *)(psrc)))
  #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
  #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

  #define LD_W(RTYPE, psrc) (*((const RTYPE *)(psrc)))
  #define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
  #define LD_SW(...) LD_W(v4i32, __VA_ARGS__)

  #define ST_B(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
  #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
  #define ST_SB(...) ST_B(v16i8, __VA_ARGS__)

  #define ST_H(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
  #define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
  #define ST_SH(...) ST_H(v8i16, __VA_ARGS__)

  #define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
  #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
  32. #if (__mips_isa_rev >= 6)
  /* Description : Load one 32-bit word into a GP value (MIPS R6 build)
     Arguments   : Input - psrc
                   Return Type - uint32_t
     Details     : Issues a single 'lw' whose memory operand is the byte that
                   'psrc' points at, and yields the loaded register value.
                   NOTE(review): the pre-R6 branch uses 'ulw' instead, so this
                   presumably relies on R6 handling unaligned 'lw' - confirm.
                   LD() forwards its own temporary into this macro, so the
                   local name 'psrc_m' must stay distinct from the name used
                   there.
  */
  #define LW(psrc) \
    ({ \
      const uint8_t *psrc_m = (const uint8_t *)(psrc); \
      uint32_t val_m; \
      \
      asm volatile("lw %[val_m], %[psrc_m] \n\t" \
                   \
                   : [val_m] "=r"(val_m) \
                   : [psrc_m] "m"(*psrc_m)); \
      \
      val_m; \
    })
  /* Description : Load one 64-bit double word into a GP value (MIPS R6 build)
     Arguments   : Input - psrc
                   Return Type - uint64_t
     Details     : On 64-bit targets a single 'ld' is issued.
                   Otherwise two LW() loads are merged: the word at (psrc)
                   fills bits 0..31 and the word at (psrc + 4) fills bits
                   32..63.
                   The 32-bit variant names its pointer 'psrc_m1' because LW()
                   declares its own 'psrc_m'; forwarding a variable with that
                   same name would make LW() initialize its pointer from
                   itself.
  */
  #if (__mips == 64)
  #define LD(psrc) \
    ({ \
      const uint8_t *psrc_m = (const uint8_t *)(psrc); \
      uint64_t val_m = 0; \
      \
      asm volatile("ld %[val_m], %[psrc_m] \n\t" \
                   \
                   : [val_m] "=r"(val_m) \
                   : [psrc_m] "m"(*psrc_m)); \
      \
      val_m; \
    })
  #else  // !(__mips == 64)
  #define LD(psrc) \
    ({ \
      const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
      uint32_t val0_m, val1_m; \
      uint64_t val_m = 0; \
      \
      val0_m = LW(psrc_m1); \
      val1_m = LW(psrc_m1 + 4); \
      \
      val_m = ((uint64_t)val1_m << 32) | (uint64_t)val0_m; \
      \
      val_m; \
    })
  #endif  // (__mips == 64)
  /* Description : Store one 16-bit halfword from a GP value (MIPS R6 build)
     Arguments   : Inputs - val, pdst
     Details     : 'val' is converted to uint16_t and written with a single
                   'sh'; the memory operand is the byte that 'pdst' points at.
  */
  #define SH(val, pdst) \
    { \
      uint8_t *pdst_m = (uint8_t *)(pdst); \
      const uint16_t val_m = (val); \
      \
      asm volatile("sh %[val_m], %[pdst_m] \n\t" \
                   \
                   : [pdst_m] "=m"(*pdst_m) \
                   : [val_m] "r"(val_m)); \
    }
  /* Description : Store one 32-bit word from a GP value (MIPS R6 build)
     Arguments   : Inputs - val, pdst
     Details     : 'val' is converted to uint32_t and written with a single
                   'sw'; the memory operand is the byte that 'pdst' points at.
  */
  #define SW(val, pdst) \
    { \
      uint8_t *pdst_m = (uint8_t *)(pdst); \
      const uint32_t val_m = (val); \
      \
      asm volatile("sw %[val_m], %[pdst_m] \n\t" \
                   \
                   : [pdst_m] "=m"(*pdst_m) \
                   : [val_m] "r"(val_m)); \
    }
  /* Description : Store one 64-bit double word from a GP value (MIPS R6 build)
     Arguments   : Inputs - val, pdst
     Details     : 'val' is converted to uint64_t and written with a single
                   'sd'; the memory operand is the byte that 'pdst' points at.
                   NOTE(review): 'sd' is emitted whatever __mips is, whereas
                   LD() in this branch has a separate 32-bit variant - confirm
                   that this assembles as intended on 32-bit R6 targets.
  */
  #define SD(val, pdst) \
    { \
      uint8_t *pdst_m = (uint8_t *)(pdst); \
      const uint64_t val_m = (val); \
      \
      asm volatile("sd %[val_m], %[pdst_m] \n\t" \
                   \
                   : [pdst_m] "=m"(*pdst_m) \
                   : [val_m] "r"(val_m)); \
    }
  105. #else // !(__mips_isa_rev >= 6)
  /* Description : Load one 32-bit word into a GP value (pre-R6 build)
     Arguments   : Input - psrc
                   Return Type - uint32_t
     Details     : Issues 'ulw' with the byte that 'psrc' points at as the
                   memory operand and yields the loaded register value.
                   LD() forwards its own temporary ('psrc_m1') into this
                   macro, so the local name 'psrc_m' must stay distinct from
                   it.
  */
  #define LW(psrc) \
    ({ \
      const uint8_t *psrc_m = (const uint8_t *)(psrc); \
      uint32_t val_m; \
      \
      asm volatile("ulw %[val_m], %[psrc_m] \n\t" \
                   \
                   : [val_m] "=r"(val_m) \
                   : [psrc_m] "m"(*psrc_m)); \
      \
      val_m; \
    })
  /* Description : Load one 64-bit double word into a GP value (pre-R6 build)
     Arguments   : Input - psrc
                   Return Type - uint64_t
     Details     : On 64-bit targets a single 'uld' is issued.
                   Otherwise two LW() loads are merged: the word at (psrc)
                   fills bits 0..31 and the word at (psrc + 4) fills bits
                   32..63.
                   The 32-bit variant keeps its pointer in 'psrc_m1' so that
                   it does not collide with the 'psrc_m' declared inside LW().
  */
  #if (__mips == 64)
  #define LD(psrc) \
    ({ \
      const uint8_t *psrc_m = (const uint8_t *)(psrc); \
      uint64_t val_m = 0; \
      \
      asm volatile("uld %[val_m], %[psrc_m] \n\t" \
                   \
                   : [val_m] "=r"(val_m) \
                   : [psrc_m] "m"(*psrc_m)); \
      \
      val_m; \
    })
  #else  // !(__mips == 64)
  #define LD(psrc) \
    ({ \
      const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
      uint32_t val0_m, val1_m; \
      uint64_t val_m = 0; \
      \
      val0_m = LW(psrc_m1); \
      val1_m = LW(psrc_m1 + 4); \
      \
      val_m = ((uint64_t)val1_m << 32) | (uint64_t)val0_m; \
      \
      val_m; \
    })
  #endif  // (__mips == 64)
  /* Description : Store one 16-bit halfword from a GP value (pre-R6 build)
     Arguments   : Inputs - val, pdst
     Details     : 'val' is converted to uint16_t and written with 'ush'; the
                   memory operand is the byte that 'pdst' points at.
  */
  #define SH(val, pdst) \
    { \
      uint8_t *pdst_m = (uint8_t *)(pdst); \
      const uint16_t val_m = (val); \
      \
      asm volatile("ush %[val_m], %[pdst_m] \n\t" \
                   \
                   : [pdst_m] "=m"(*pdst_m) \
                   : [val_m] "r"(val_m)); \
    }
  /* Description : Store one 32-bit word from a GP value (pre-R6 build)
     Arguments   : Inputs - val, pdst
     Details     : 'val' is converted to uint32_t and written with 'usw'; the
                   memory operand is the byte that 'pdst' points at.
                   SD() forwards its own temporaries into this macro, so the
                   local names 'pdst_m' and 'val_m' must stay distinct from
                   the ones used there.
  */
  #define SW(val, pdst) \
    { \
      uint8_t *pdst_m = (uint8_t *)(pdst); \
      const uint32_t val_m = (val); \
      \
      asm volatile("usw %[val_m], %[pdst_m] \n\t" \
                   \
                   : [pdst_m] "=m"(*pdst_m) \
                   : [val_m] "r"(val_m)); \
    }
  /* Description : Store one 64-bit double word from a GP value (pre-R6 build)
     Arguments   : Inputs - val, pdst
     Details     : 'val' is evaluated exactly once and converted to uint64_t.
                   Bits 0..31 are stored with SW() at (pdst) and bits 32..63
                   with SW() at (pdst + 4).
                   The pointer lives in 'pdst_m1' so that it does not collide
                   with the 'pdst_m' declared inside SW().
  */
  #define SD(val, pdst) \
    { \
      uint8_t *pdst_m1 = (uint8_t *)(pdst); \
      const uint64_t sd_val_m = (val); \
      const uint32_t val0_m = (uint32_t)(sd_val_m & 0x00000000FFFFFFFF); \
      const uint32_t val1_m = \
          (uint32_t)((sd_val_m >> 32) & 0x00000000FFFFFFFF); \
      \
      SW(val0_m, pdst_m1); \
      SW(val1_m, pdst_m1 + 4); \
    }
  179. #endif // (__mips_isa_rev >= 6)
  /* Description : Load 4 words with stride
     Arguments   : Inputs  - psrc, stride
                   Outputs - out0, out1, out2, out3
     Details     : Load word in 'out0' from (psrc)
                   Load word in 'out1' from (psrc + stride)
                   Load word in 'out2' from (psrc + 2 * stride)
                   Load word in 'out3' from (psrc + 3 * stride)
                   'stride' is parenthesized, so an expression argument is
                   scaled as a whole.
  */
  #define LW4(psrc, stride, out0, out1, out2, out3) \
    { \
      out0 = LW((psrc)); \
      out1 = LW((psrc) + (stride)); \
      out2 = LW((psrc) + 2 * (stride)); \
      out3 = LW((psrc) + 3 * (stride)); \
    }
  /* Description : Load double words with stride
     Arguments   : Inputs  - psrc, stride
                   Outputs - out0, out1 (LD2); out0, out1, out2, out3 (LD4)
     Details     : Load double word in 'out0' from (psrc)
                   Load double word in 'out1' from (psrc + stride)
                   LD4 continues with 'out2' from (psrc + 2 * stride) and
                   'out3' from (psrc + 3 * stride)
                   'stride' is parenthesized, so an expression argument is
                   scaled as a whole.
  */
  #define LD2(psrc, stride, out0, out1) \
    { \
      out0 = LD((psrc)); \
      out1 = LD((psrc) + (stride)); \
    }
  #define LD4(psrc, stride, out0, out1, out2, out3) \
    { \
      LD2((psrc), (stride), out0, out1); \
      LD2((psrc) + 2 * (stride), (stride), out2, out3); \
    }
  /* Description : Store 4 words with stride
     Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
     Details     : Store word from 'in0' to (pdst)
                   Store word from 'in1' to (pdst + stride)
                   Store word from 'in2' to (pdst + 2 * stride)
                   Store word from 'in3' to (pdst + 3 * stride)
                   'stride' is parenthesized, so an expression argument is
                   scaled as a whole.
  */
  #define SW4(in0, in1, in2, in3, pdst, stride) \
    { \
      SW(in0, (pdst)); \
      SW(in1, (pdst) + (stride)); \
      SW(in2, (pdst) + 2 * (stride)); \
      SW(in3, (pdst) + 3 * (stride)); \
    }
  /* Description : Store 4 double words with stride
     Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
     Details     : Store double word from 'in0' to (pdst)
                   Store double word from 'in1' to (pdst + stride)
                   Store double word from 'in2' to (pdst + 2 * stride)
                   Store double word from 'in3' to (pdst + 3 * stride)
                   'stride' is parenthesized, so an expression argument is
                   scaled as a whole.
  */
  #define SD4(in0, in1, in2, in3, pdst, stride) \
    { \
      SD(in0, (pdst)); \
      SD(in1, (pdst) + (stride)); \
      SD(in2, (pdst) + 2 * (stride)); \
      SD(in3, (pdst) + 3 * (stride)); \
    }
  /* Description : Load vectors with 16 byte elements with stride
     Arguments   : Inputs  - psrc, stride
                   Outputs - out0, out1 (up to out7 for the wider variants)
                   Return Type - as per RTYPE
     Details     : Load 16 byte elements in 'out0' from (psrc)
                   Load 16 byte elements in 'out1' from (psrc + stride)
                   Each further output 'outN' is loaded from
                   (psrc + N * stride).
                   'stride' is parenthesized, so an expression argument is
                   scaled as a whole.
  */
  #define LD_B2(RTYPE, psrc, stride, out0, out1) \
    { \
      out0 = LD_B(RTYPE, (psrc)); \
      out1 = LD_B(RTYPE, (psrc) + (stride)); \
    }
  #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
  #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

  #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
    { \
      LD_B2(RTYPE, (psrc), (stride), out0, out1); \
      out2 = LD_B(RTYPE, (psrc) + 2 * (stride)); \
    }
  #define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
  #define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

  #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
    { \
      LD_B2(RTYPE, (psrc), (stride), out0, out1); \
      LD_B2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
    }
  #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
  #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

  #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
    { \
      LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
      out4 = LD_B(RTYPE, (psrc) + 4 * (stride)); \
    }
  #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
  #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)

  #define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
                out7) \
    { \
      LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
      LD_B4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7); \
    }
  #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
  #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
  /* Description : Load vectors with 8 halfword elements with stride
     Arguments   : Inputs  - psrc, stride
                   Outputs - out0, out1 (LD_H2); out0..out3 (LD_H4)
                   Return Type - as per RTYPE
     Details     : Load 8 halfword elements in 'out0' from (psrc)
                   Load 8 halfword elements in 'out1' from (psrc + stride)
                   LD_H4 continues with 'out2' from (psrc + 2 * stride) and
                   'out3' from (psrc + 3 * stride)
                   'stride' is parenthesized everywhere, so an expression
                   argument is scaled as a whole.
  */
  #define LD_H2(RTYPE, psrc, stride, out0, out1) \
    { \
      out0 = LD_H(RTYPE, (psrc)); \
      out1 = LD_H(RTYPE, (psrc) + (stride)); \
    }
  #define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

  #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
    { \
      LD_H2(RTYPE, (psrc), (stride), out0, out1); \
      LD_H2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
    }
  #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
  /* Description : Load 2 vectors of signed word elements with stride
     Arguments   : Inputs  - psrc, stride
                   Outputs - out0, out1
                   Return Type - signed word
     Details     : Load word elements in 'out0' from (psrc)
                   Load word elements in 'out1' from (psrc + stride)
  */
  #define LD_SW2(psrc, stride, out0, out1) \
    { \
      out0 = LD_SW((psrc)); \
      out1 = LD_SW((psrc) + (stride)); \
    }
  /* Description : Store vectors of 16 byte elements with stride
     Arguments   : Inputs - in0, in1 (up to in7 for the wider variants),
                            pdst, stride
     Details     : Store 16 byte elements from 'in0' to (pdst)
                   Store 16 byte elements from 'in1' to (pdst + stride)
                   Each further input 'inN' is stored to (pdst + N * stride).
                   'pdst' and 'stride' are parenthesized, so expression
                   arguments are used as a whole.
  */
  #define ST_B2(RTYPE, in0, in1, pdst, stride) \
    { \
      ST_B(RTYPE, in0, (pdst)); \
      ST_B(RTYPE, in1, (pdst) + (stride)); \
    }
  #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)

  #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
    { \
      ST_B2(RTYPE, in0, in1, (pdst), (stride)); \
      ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
    }
  #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
  #define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

  #define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
    { \
      ST_B4(RTYPE, in0, in1, in2, in3, (pdst), (stride)); \
      ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride)); \
    }
  #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
  /* Description : Store vectors of 8 halfword elements with stride
     Arguments   : Inputs - in0, in1, pdst, stride
     Details     : Store 8 halfword elements from 'in0' to (pdst)
                   Store 8 halfword elements from 'in1' to (pdst + stride)
  */
  #define ST_H2(RTYPE, in0, in1, pdst, stride) \
    { \
      ST_H(RTYPE, in0, (pdst)); \
      ST_H(RTYPE, in1, (pdst) + (stride)); \
    }
  #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
  /* Description : Store vectors of word elements with stride
     Arguments   : Inputs - in0, in1, pdst, stride
     Details     : Store 4 word elements from 'in0' to (pdst)
                   Store 4 word elements from 'in1' to (pdst + stride)
  */
  #define ST_SW2(in0, in1, pdst, stride) \
    { \
      ST_SW(in0, (pdst)); \
      ST_SW(in1, (pdst) + (stride)); \
    }
  /* Description : Store 2x4 byte block to destination memory from input vector
     Arguments   : Inputs - in, stidx, pdst, stride
     Details     : Index 'stidx' halfword element from 'in' vector is copied to
                   the GP register and stored to (pdst)
                   Index 'stidx+1' halfword element from 'in' vector is copied
                   to the GP register and stored to (pdst + stride)
                   Index 'stidx+2' halfword element from 'in' vector is copied
                   to the GP register and stored to (pdst + 2 * stride)
                   Index 'stidx+3' halfword element from 'in' vector is copied
                   to the GP register and stored to (pdst + 3 * stride)
                   'in', 'stidx' and 'stride' are parenthesized, so expression
                   arguments are used as a whole.
  */
  #define ST2x4_UB(in, stidx, pdst, stride) \
    { \
      uint16_t out0_m, out1_m, out2_m, out3_m; \
      uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
      \
      out0_m = __msa_copy_u_h((v8i16)(in), (stidx)); \
      out1_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 1)); \
      out2_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 2)); \
      out3_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 3)); \
      \
      SH(out0_m, pblk_2x4_m); \
      SH(out1_m, pblk_2x4_m + (stride)); \
      SH(out2_m, pblk_2x4_m + 2 * (stride)); \
      SH(out3_m, pblk_2x4_m + 3 * (stride)); \
    }
  /* Description : Store 4x4 byte block to destination memory from input
                   vectors
     Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
     Details     : 'idx0' word element from input vector 'in0' is copied to the
                   GP register and stored to (pdst)
                   'idx1' word element from input vector 'in0' is copied to the
                   GP register and stored to (pdst + stride)
                   'idx2' word element from input vector 'in1' is copied to the
                   GP register and stored to (pdst + 2 * stride)
                   'idx3' word element from input vector 'in1' is copied to the
                   GP register and stored to (pdst + 3 * stride)
                   ST4x8_UB stores word elements 0..3 of 'in0' to rows 0..3 and
                   word elements 0..3 of 'in1' to rows 4..7.
  */
  #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
    { \
      uint32_t out0_m, out1_m, out2_m, out3_m; \
      uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
      \
      out0_m = __msa_copy_u_w((v4i32)(in0), (idx0)); \
      out1_m = __msa_copy_u_w((v4i32)(in0), (idx1)); \
      out2_m = __msa_copy_u_w((v4i32)(in1), (idx2)); \
      out3_m = __msa_copy_u_w((v4i32)(in1), (idx3)); \
      \
      SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, (stride)); \
    }
  #define ST4x8_UB(in0, in1, pdst, stride) \
    { \
      uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
      \
      ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, (stride)); \
      ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * (stride), (stride)); \
    }
  /* Description : Store 8x1 byte block to destination memory from input vector
     Arguments   : Inputs - in, pdst
     Details     : Index 0 double word element from 'in' vector is copied to the
                   GP register and stored to (pdst)
                   'in' is parenthesized before the vector cast, so an
                   expression argument is cast as a whole.
  */
  #define ST8x1_UB(in, pdst) \
    { \
      uint64_t out0_m; \
      \
      out0_m = __msa_copy_u_d((v2i64)(in), 0); \
      SD(out0_m, (pdst)); \
    }
  /* Description : Store 8x2 byte block to destination memory from input vector
     Arguments   : Inputs - in, pdst, stride
     Details     : Index 0 double word element from 'in' vector is copied to the
                   GP register and stored to (pdst)
                   Index 1 double word element from 'in' vector is copied to the
                   GP register and stored to (pdst + stride)
                   'in' and 'stride' are parenthesized, so expression arguments
                   are used as a whole.
  */
  #define ST8x2_UB(in, pdst, stride) \
    { \
      uint64_t out0_m, out1_m; \
      uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
      \
      out0_m = __msa_copy_u_d((v2i64)(in), 0); \
      out1_m = __msa_copy_u_d((v2i64)(in), 1); \
      \
      SD(out0_m, pblk_8x2_m); \
      SD(out1_m, pblk_8x2_m + (stride)); \
    }
  /* Description : Store 8x4 byte block to destination memory from input
                   vectors
     Arguments   : Inputs - in0, in1, pdst, stride
     Details     : Index 0 double word element from 'in0' vector is copied to
                   the GP register and stored to (pdst)
                   Index 1 double word element from 'in0' vector is copied to
                   the GP register and stored to (pdst + stride)
                   Index 0 double word element from 'in1' vector is copied to
                   the GP register and stored to (pdst + 2 * stride)
                   Index 1 double word element from 'in1' vector is copied to
                   the GP register and stored to (pdst + 3 * stride)
                   'in0', 'in1' and 'stride' are parenthesized, so expression
                   arguments are used as a whole.
  */
  #define ST8x4_UB(in0, in1, pdst, stride) \
    { \
      uint64_t out0_m, out1_m, out2_m, out3_m; \
      uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
      \
      out0_m = __msa_copy_u_d((v2i64)(in0), 0); \
      out1_m = __msa_copy_u_d((v2i64)(in0), 1); \
      out2_m = __msa_copy_u_d((v2i64)(in1), 0); \
      out3_m = __msa_copy_u_d((v2i64)(in1), 1); \
      \
      SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, (stride)); \
    }
  /* Description : Immediate number of elements to slide with zero
     Arguments   : Inputs  - in0, in1, slide_val
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
                   value specified in the 'slide_val'; the result goes to
                   'out0'. 'in1' is processed the same way into 'out1'.
                   Inputs are parenthesized before the vector cast, so
                   expression arguments are cast as a whole.
  */
  #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \
    { \
      v16i8 zero_m = { 0 }; \
      \
      out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in0), (slide_val)); \
      out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in1), (slide_val)); \
    }
  #define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
  /* Description : Immediate number of elements to slide
     Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                             (SLDI_B3 adds in0_2, in1_2)
                   Outputs - out0, out1 (SLDI_B3 adds out2)
                   Return Type - as per RTYPE
     Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                   value specified in the 'slide_val'; the result goes to
                   'out0'. Each further pair (in0_N, in1_N) produces 'outN'
                   the same way.
                   Inputs are parenthesized before the vector cast, so
                   expression arguments are cast as a whole.
  */
  #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
    { \
      out0 = (RTYPE)__msa_sldi_b((v16i8)(in0_0), (v16i8)(in1_0), (slide_val)); \
      out1 = (RTYPE)__msa_sldi_b((v16i8)(in0_1), (v16i8)(in1_1), (slide_val)); \
    }

  #define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
                  out2, slide_val) \
    { \
      SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, (slide_val)); \
      out2 = (RTYPE)__msa_sldi_b((v16i8)(in0_2), (v16i8)(in1_2), (slide_val)); \
    }
  #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
  /* Description : Shuffle byte vector elements as per mask vector
     Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                             (VSHF_B3 adds in4, in5, mask2)
                   Outputs - out0, out1 (VSHF_B3 adds out2)
                   Return Type - as per RTYPE
     Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                   'out0' as per control vector 'mask0'. 'out1' is built from
                   'in2' & 'in3' with 'mask1', and 'out2' from 'in4' & 'in5'
                   with 'mask2'.
                   Inputs are parenthesized before the vector cast, so
                   expression arguments are cast as a whole.
  */
  #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_vshf_b((v16i8)(mask0), (v16i8)(in1), (v16i8)(in0)); \
      out1 = (RTYPE)__msa_vshf_b((v16i8)(mask1), (v16i8)(in3), (v16i8)(in2)); \
    }
  #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
  #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
  #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)

  #define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
                  out0, out1, out2) \
    { \
      VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \
      out2 = (RTYPE)__msa_vshf_b((v16i8)(mask2), (v16i8)(in5), (v16i8)(in4)); \
    }
  #define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
  /* Description : Shuffle halfword vector elements as per mask vector
     Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Halfword elements from 'in0' & 'in1' are copied selectively
                   to 'out0' as per control vector 'mask0'. 'out1' is built
                   from 'in2' & 'in3' with 'mask1'.
                   Inputs are parenthesized before the vector cast, so
                   expression arguments are cast as a whole.
  */
  #define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_vshf_h((v8i16)(mask0), (v8i16)(in1), (v8i16)(in0)); \
      out1 = (RTYPE)__msa_vshf_h((v8i16)(mask1), (v8i16)(in3), (v8i16)(in2)); \
    }
  #define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
  /* Description : Dot product of byte vector elements
     Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                             (DOTP_UB4 adds mult2, mult3, cnst2, cnst3)
                   Outputs - out0, out1 (DOTP_UB4 adds out2, out3)
                   Return Type - as per RTYPE
     Details     : Unsigned byte elements from 'mult0' are multiplied with
                   unsigned byte elements from 'cnst0' producing a result
                   twice the size of input i.e. unsigned halfword.
                   The multiplication result of adjacent odd-even elements
                   are added together and written to the 'out0' vector.
                   Each further (multN, cnstN) pair produces 'outN'.
                   Inputs are parenthesized before the vector cast, so
                   expression arguments are cast as a whole.
  */
  #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_dotp_u_h((v16u8)(mult0), (v16u8)(cnst0)); \
      out1 = (RTYPE)__msa_dotp_u_h((v16u8)(mult1), (v16u8)(cnst1)); \
    }
  #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

  #define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                   cnst3, out0, out1, out2, out3) \
    { \
      DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
      DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
    }
  #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
  /* Description : Dot product of byte vector elements
     Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                             (DOTP_SB4 adds mult2, mult3, cnst2, cnst3)
                   Outputs - out0, out1 (DOTP_SB4 adds out2, out3)
                   Return Type - as per RTYPE
     Details     : Signed byte elements from 'mult0' are multiplied with
                   signed byte elements from 'cnst0' producing a result
                   twice the size of input i.e. signed halfword.
                   The multiplication result of adjacent odd-even elements
                   are added together and written to the 'out0' vector.
                   Each further (multN, cnstN) pair produces 'outN'.
                   Inputs are parenthesized before the vector cast, so
                   expression arguments are cast as a whole.
  */
  #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0)); \
      out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1)); \
    }
  #define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

  #define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                   cnst3, out0, out1, out2, out3) \
    { \
      DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
      DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
    }
  #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
  /* Description : Dot product of halfword vector elements
     Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                             (DOTP_SH4 adds mult2, mult3, cnst2, cnst3)
                   Outputs - out0, out1 (DOTP_SH4 adds out2, out3)
                   Return Type - as per RTYPE
     Details     : Signed halfword elements from 'mult0' are multiplied with
                   signed halfword elements from 'cnst0' producing a result
                   twice the size of input i.e. signed word.
                   The multiplication result of adjacent odd-even elements
                   are added together and written to the 'out0' vector.
                   Each further (multN, cnstN) pair produces 'outN'.
                   Inputs are parenthesized before the vector cast, so
                   expression arguments are cast as a whole.
  */
  #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0)); \
      out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1)); \
    }

  #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                   cnst3, out0, out1, out2, out3) \
    { \
      DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
      DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
    }
  #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
  /* Description : Dot product of word vector elements
     Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Signed word elements from 'mult0' are multiplied with
                   signed word elements from 'cnst0' producing a result
                   twice the size of input i.e. signed double word.
                   The multiplication result of adjacent odd-even elements
                   are added together and written to the 'out0' vector.
                   'out1' is produced the same way from 'mult1' and 'cnst1'.
                   Inputs are parenthesized before the vector cast, so
                   expression arguments are cast as a whole.
  */
  #define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0)); \
      out1 = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1)); \
    }
  #define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
  /* Description : Dot product & addition of byte vector elements
     Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                             (DPADD_SB4 adds mult2, mult3, cnst2, cnst3)
                   Outputs - out0, out1 (DPADD_SB4 adds out2, out3);
                             each output is also read as the accumulator
                   Return Type - as per RTYPE
     Details     : Signed byte elements from 'mult0' are multiplied with
                   signed byte elements from 'cnst0' producing a result
                   twice the size of input i.e. signed halfword.
                   The multiplication result of adjacent odd-even elements
                   are added to the 'out0' vector.
                   Each further (multN, cnstN) pair accumulates into 'outN'.
                   Inputs are parenthesized before the vector cast, so
                   expression arguments are cast as a whole.
  */
  #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_dpadd_s_h((v8i16)(out0), (v16i8)(mult0), \
                                    (v16i8)(cnst0)); \
      out1 = (RTYPE)__msa_dpadd_s_h((v8i16)(out1), (v16i8)(mult1), \
                                    (v16i8)(cnst1)); \
    }
  #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

  #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                    cnst3, out0, out1, out2, out3) \
    { \
      DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
      DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
    }
  #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
  /* Description : Dot product & addition of halfword vector elements
     Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                             (DPADD_SH4 adds mult2, mult3, cnst2, cnst3)
                   Outputs - out0, out1 (DPADD_SH4 adds out2, out3);
                             each output is also read as the accumulator
                   Return Type - as per RTYPE
     Details     : Signed halfword elements from 'mult0' are multiplied with
                   signed halfword elements from 'cnst0' producing a result
                   twice the size of input i.e. signed word.
                   The multiplication result of adjacent odd-even elements
                   are added to the 'out0' vector.
                   Each further (multN, cnstN) pair accumulates into 'outN'.
                   Inputs are parenthesized before the vector cast, so
                   expression arguments are cast as a whole.
  */
  #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_dpadd_s_w((v4i32)(out0), (v8i16)(mult0), \
                                    (v8i16)(cnst0)); \
      out1 = (RTYPE)__msa_dpadd_s_w((v4i32)(out1), (v8i16)(mult1), \
                                    (v8i16)(cnst1)); \
    }
  #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

  #define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                    cnst3, out0, out1, out2, out3) \
    { \
      DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
      DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
    }
  #define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
  /* Description : Dot product of word vector elements with themselves,
                   accumulated into double word elements
     Arguments   : Inputs  - mult0, mult1
                   Outputs - out0, out1 (also read as the accumulators)
                   Return Type - as per RTYPE
     Details     : Each signed word element from 'mult0' is multiplied with
                   itself producing an intermediate result twice the size of
                   it i.e. signed double word.
                   The multiplication result of adjacent odd-even elements
                   are added to the 'out0' vector.
                   'mult1' and 'out1' are processed the same way.
                   'mult0' and 'mult1' each appear twice in the expansion, so
                   arguments with side effects are evaluated twice.
  */
  #define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_dpadd_s_d((v2i64)(out0), (v4i32)(mult0), \
                                    (v4i32)(mult0)); \
      out1 = (RTYPE)__msa_dpadd_s_d((v2i64)(out1), (v4i32)(mult1), \
                                    (v4i32)(mult1)); \
    }
  #define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
  /* Description : Clips all signed halfword elements of input vector
                   between 0 & 255
     Arguments   : Input  - in
                   Output - out_m (value of the statement expression)
                   Return Type - signed halfword
     Details     : The elements of 'in' are first raised to at least 0 and
                   then limited to at most 255.
                   'in' is parenthesized before casting so an expression
                   argument is converted as a whole.
                   CLIP_SH2_0_255 / CLIP_SH4_0_255 clip 2 / 4 vectors in
                   place.
  */
  #define CLIP_SH_0_255(in) \
    ({ \
      v8i16 max_m = __msa_ldi_h(255); \
      v8i16 out_m; \
      \
      out_m = __msa_maxi_s_h((v8i16)(in), 0); \
      out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
      out_m; \
    })
  #define CLIP_SH2_0_255(in0, in1) \
    { \
      in0 = CLIP_SH_0_255(in0); \
      in1 = CLIP_SH_0_255(in1); \
    }
  #define CLIP_SH4_0_255(in0, in1, in2, in3) \
    { \
      CLIP_SH2_0_255(in0, in1); \
      CLIP_SH2_0_255(in2, in3); \
    }
  /* Description : Clips all signed word elements of input vector
                   between 0 & 255
     Arguments   : Input  - in
                   Output - out_m (value of the statement expression)
                   Return Type - signed word
     Details     : The elements of 'in' are first raised to at least 0 and
                   then limited to at most 255.
                   'in' is parenthesized before casting so an expression
                   argument is converted as a whole.
  */
  #define CLIP_SW_0_255(in) \
    ({ \
      v4i32 max_m = __msa_ldi_w(255); \
      v4i32 out_m; \
      \
      out_m = __msa_maxi_s_w((v4i32)(in), 0); \
      out_m = __msa_min_s_w((v4i32)max_m, (v4i32)out_m); \
      out_m; \
    })
  /* Description : Horizontal addition of 4 signed word elements of input vector
     Arguments   : Input  - in (signed word vector)
                   Output - sum_m (i32 sum, value of the statement expression)
                   Return Type - signed word (GP)
     Details     : 4 signed word elements of 'in' vector are added together and
                   the resulting integer sum is returned.
                   The partial sums are held in double word elements and
                   word 0 of the final sum is copied out.
                   'in' appears twice in the expansion, so an argument with
                   side effects is evaluated twice. It is parenthesized
                   before casting so expression arguments are converted as a
                   whole.
  */
  #define HADD_SW_S32(in) \
    ({ \
      v2i64 res0_m, res1_m; \
      int32_t sum_m; \
      \
      res0_m = __msa_hadd_s_d((v4i32)(in), (v4i32)(in)); \
      res1_m = __msa_splati_d(res0_m, 1); \
      res0_m = res0_m + res1_m; \
      sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \
      sum_m; \
    })
  /* Description : Horizontal addition of 8 unsigned halfword elements
     Arguments   : Input  - in (unsigned halfword vector)
                   Output - sum_m (u32 sum, value of the statement expression)
                   Return Type - unsigned word
     Details     : 8 unsigned halfword elements of input vector are added
                   together and the resulting integer sum is returned.
                   The reduction widens halfword -> word -> double word and
                   word 0 of the final sum is copied out.
                   'in' appears twice in the expansion, so an argument with
                   side effects is evaluated twice. It is parenthesized
                   before casting so expression arguments are converted as a
                   whole.
  */
  #define HADD_UH_U32(in) \
    ({ \
      v4u32 res_m; \
      v2u64 res0_m, res1_m; \
      uint32_t sum_m; \
      \
      res_m = __msa_hadd_u_w((v8u16)(in), (v8u16)(in)); \
      res0_m = __msa_hadd_u_d(res_m, res_m); \
      res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
      res0_m = res0_m + res1_m; \
      sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \
      sum_m; \
    })
  /* Description : Horizontal addition of unsigned byte vector elements
     Arguments   : Inputs  - in0, in1
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Each unsigned odd byte element from 'in0' is added to
                   even unsigned byte element from 'in0' (pairwise) and the
                   halfword result is written to 'out0'.
                   'in1' is reduced the same way into 'out1'.
                   Each input appears twice in the expansion and is
                   parenthesized before casting.
  */
  #define HADD_UB2(RTYPE, in0, in1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_hadd_u_h((v16u8)(in0), (v16u8)(in0)); \
      out1 = (RTYPE)__msa_hadd_u_h((v16u8)(in1), (v16u8)(in1)); \
    }
  #define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
  /* Description : Horizontal subtraction of unsigned byte vector elements
     Arguments   : Inputs  - in0, in1
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Adjacent odd/even unsigned byte elements of 'in0' are
                   subtracted pairwise and the halfword result is written to
                   'out0'. 'in1' is processed the same way into 'out1'.
                   NOTE(review): the MSA HSUB_U.H definition appears to be
                   (odd element - even element); confirm the operand order
                   against the ISA manual before relying on the sign.
                   Each input appears twice in the expansion and is
                   parenthesized before casting.
  */
  #define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0)); \
      out1 = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1)); \
    }
  #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
  /* Description : Horizontal subtraction of signed halfword vector elements
     Arguments   : Inputs  - in0, in1
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Adjacent odd/even signed halfword elements of 'in0' are
                   subtracted pairwise and the word result is written to
                   'out0'. 'in1' is processed the same way into 'out1'.
                   Despite the 'UH' in the macro name, the inputs are cast to
                   signed halfword vectors and the signed intrinsic
                   (__msa_hsub_s_w) is used.
                   NOTE(review): the MSA HSUB_S.W definition appears to be
                   (odd element - even element); confirm against the ISA
                   manual.
                   Each input appears twice in the expansion and is
                   parenthesized before casting.
  */
  #define HSUB_UH2(RTYPE, in0, in1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_hsub_s_w((v8i16)(in0), (v8i16)(in0)); \
      out1 = (RTYPE)__msa_hsub_s_w((v8i16)(in1), (v8i16)(in1)); \
    }
  #define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
  /* Description : Set both double word elements of a vector from GPR values
     Arguments   : Inputs - in0, in1
                   Output - out (updated in place)
                   Return Type - as per RTYPE
     Details     : Element 0 of vector 'out' is set to the value given in
                   'in0', then element 1 is set to the value given in 'in1'.
  */
  #define INSERT_D2(RTYPE, in0, in1, out) \
    { \
      out = (RTYPE)__msa_insert_d((v2i64)(out), 0, (in0)); \
      out = (RTYPE)__msa_insert_d((v2i64)(out), 1, (in1)); \
    }
  #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
  /* Description : Interleave even byte elements from vectors
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Even byte elements of 'in0' and 'in1' are interleaved
                   and written to 'out0'; 'in2' and 'in3' likewise produce
                   'out1'.
                   The intrinsic is invoked with the pair swapped
                   (in1, in0) / (in3, in2).
                   Inputs are parenthesized before casting so expression
                   arguments are converted as a whole.
  */
  #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0)); \
      out1 = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2)); \
    }
  #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
  #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
  #define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
  /* Description : Interleave even halfword elements from vectors
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                   and written to 'out0'; 'in2' and 'in3' likewise produce
                   'out1'.
                   The intrinsic is invoked with the pair swapped
                   (in1, in0) / (in3, in2).
                   Inputs are parenthesized before casting so expression
                   arguments are converted as a whole.
  */
  #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0)); \
      out1 = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2)); \
    }
  #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
  #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
  /* Description : Interleave even word elements from vectors
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Even word elements of 'in0' and 'in1' are interleaved
                   and written to 'out0'; 'in2' and 'in3' likewise produce
                   'out1'.
                   The intrinsic is invoked with the pair swapped
                   (in1, in0) / (in3, in2).
                   Inputs are parenthesized before casting so expression
                   arguments are converted as a whole.
  */
  #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0)); \
      out1 = (RTYPE)__msa_ilvev_w((v4i32)(in3), (v4i32)(in2)); \
    }
  #define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
  /* Description : Interleave even double word elements from vectors
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Even double word elements of 'in0' and 'in1' are
                   interleaved and written to 'out0'; 'in2' and 'in3'
                   likewise produce 'out1'.
                   The intrinsic is invoked with the pair swapped
                   (in1, in0) / (in3, in2).
                   Inputs are parenthesized before casting so expression
                   arguments are converted as a whole.
  */
  #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0)); \
      out1 = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2)); \
    }
  #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
  /* Description : Interleave left half of byte elements from vectors
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Left half of byte elements of 'in0' and 'in1' are
                   interleaved and written to 'out0'; 'in2' and 'in3'
                   likewise produce 'out1'.
                   ILVL_B4 applies ILVL_B2 to two such groups
                   (in0..in3 -> out0, out1 and in4..in7 -> out2, out3).
                   Inputs are parenthesized before casting so expression
                   arguments are converted as a whole.
  */
  #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
      out1 = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3)); \
    }
  #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
  #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
  #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
  #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                  out2, out3) \
    { \
      ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
      ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
    }
  #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
  #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
  /* Description : Interleave left half of halfword elements from vectors
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Left half of halfword elements of 'in0' and 'in1' are
                   interleaved and written to 'out0'; 'in2' and 'in3'
                   likewise produce 'out1'.
                   Inputs are parenthesized before casting so expression
                   arguments are converted as a whole.
  */
  #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
      out1 = (RTYPE)__msa_ilvl_h((v8i16)(in2), (v8i16)(in3)); \
    }
  #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
  #define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
  /* Description : Interleave left half of word elements from vectors
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Left half of word elements of 'in0' and 'in1' are
                   interleaved and written to 'out0'; 'in2' and 'in3'
                   likewise produce 'out1'.
                   Inputs are parenthesized before casting so expression
                   arguments are converted as a whole.
  */
  #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
      out1 = (RTYPE)__msa_ilvl_w((v4i32)(in2), (v4i32)(in3)); \
    }
  #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
  /* Description : Interleave right half of byte elements from vectors
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Right half of byte elements of 'in0' and 'in1' are
                   interleaved and written to 'out0'; 'in2' and 'in3'
                   likewise produce 'out1'.
                   ILVR_B4 applies ILVR_B2 to two such groups
                   (in0..in3 -> out0, out1 and in4..in7 -> out2, out3).
                   Inputs are parenthesized before casting so expression
                   arguments are converted as a whole.
  */
  #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
      out1 = (RTYPE)__msa_ilvr_b((v16i8)(in2), (v16i8)(in3)); \
    }
  #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
  #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
  #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
  #define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
  #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                  out2, out3) \
    { \
      ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
      ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
    }
  #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
  #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
  #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
  #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
  #define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
  /* Description : Interleave right half of halfword elements from vectors
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Right half of halfword elements of 'in0' and 'in1' are
                   interleaved and written to 'out0'; 'in2' and 'in3'
                   likewise produce 'out1'.
                   ILVR_H4 applies ILVR_H2 to two such groups
                   (in0..in3 -> out0, out1 and in4..in7 -> out2, out3).
                   Inputs are parenthesized before casting so expression
                   arguments are converted as a whole.
  */
  #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
      out1 = (RTYPE)__msa_ilvr_h((v8i16)(in2), (v8i16)(in3)); \
    }
  #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
  #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
  #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                  out2, out3) \
    { \
      ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
      ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
    }
  #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
  #define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
  /* Description : Interleave right half of word elements from vectors
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : 'out0' receives __msa_ilvr_w applied to ('in0', 'in1') and
                   'out1' receives it applied to ('in2', 'in3').
                   Inputs are parenthesized before casting so expression
                   arguments are converted as a whole.
  */
  #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
      out1 = (RTYPE)__msa_ilvr_w((v4i32)(in2), (v4i32)(in3)); \
    }
  #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
  /* Description : Interleave right half of double word elements from vectors
     Arguments   : Inputs  - in0, in1, in2, in3 (ILVR_D4: in0 .. in7)
                   Outputs - out0, out1 (ILVR_D4: out0 .. out3)
                   Return Type - as per RTYPE
     Details     : Right half of double word elements of 'in0' and 'in1' are
                   interleaved and written to 'out0'. Each following input
                   pair fills the next output in the same way.
  */
  #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
      out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
    }
  #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
  #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
  #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
  #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                  out2, out3) \
    { \
      out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
      out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
      out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \
      out3 = (RTYPE)__msa_ilvr_d((v2i64)(in6), (v2i64)(in7)); \
    }
  #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
  #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
  /* Description : Interleave both left and right half of input vectors
     Arguments   : Inputs  - in0, in1
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Right half of byte elements from 'in0' and 'in1' are
                   interleaved and written to 'out0'.
                   Left half of byte elements from 'in0' and 'in1' are
                   interleaved and written to 'out1'.
                   ILVRL_H2 and ILVRL_W2 do the same on halfword and word
                   elements respectively.
                   Each input appears twice in the expansion and is
                   parenthesized before casting so expression arguments are
                   converted as a whole.
  */
  #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
      out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
    }
  #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
  #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
  #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
  #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
  #define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
      out1 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
    }
  #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
  #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
  #define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
      out1 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
    }
  #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
  #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
  #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
  /* Description : Maximum values between signed elements of vector and
                   5-bit signed immediate value are copied to the output vector
     Arguments   : Inputs  - in0, in1, max_val
                   Outputs - in place operation
                   Return Type - as per RTYPE
     Details     : Maximum of signed halfword element values from 'in0' and
                   'max_val' are written in place; 'in1' is updated the same
                   way.
  */
  #define MAXI_SH2(RTYPE, in0, in1, max_val) \
    { \
      in0 = (RTYPE)__msa_maxi_s_h((v8i16)(in0), (max_val)); \
      in1 = (RTYPE)__msa_maxi_s_h((v8i16)(in1), (max_val)); \
    }
  #define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
  /* Description : Saturate the halfword element values to the max
                   unsigned value of (sat_val + 1) bits
                   The element data width remains unchanged
     Arguments   : Inputs  - in0, in1, sat_val
                   Outputs - in place operation
                   Return Type - as per RTYPE
     Details     : Each unsigned halfword element from 'in0' is saturated to
                   the value generated with (sat_val + 1) bit range.
                   The results are written in place; 'in1' is updated the
                   same way.
  */
  #define SAT_UH2(RTYPE, in0, in1, sat_val) \
    { \
      in0 = (RTYPE)__msa_sat_u_h((v8u16)(in0), (sat_val)); \
      in1 = (RTYPE)__msa_sat_u_h((v8u16)(in1), (sat_val)); \
    }
  #define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
  /* Description : Saturate the signed halfword element values to the
                   signed range of (sat_val + 1) bits
                   The element data width remains unchanged
     Arguments   : Inputs  - in0, in1, sat_val (SAT_SH4: in0 .. in3, sat_val)
                   Outputs - in place operation
                   Return Type - as per RTYPE
     Details     : Each signed halfword element from 'in0' is saturated
                   (via __msa_sat_s_h) to the value generated with
                   (sat_val + 1) bit range.
                   The results are written in place; the remaining vectors
                   are updated the same way.
  */
  #define SAT_SH2(RTYPE, in0, in1, sat_val) \
    { \
      in0 = (RTYPE)__msa_sat_s_h((v8i16)(in0), (sat_val)); \
      in1 = (RTYPE)__msa_sat_s_h((v8i16)(in1), (sat_val)); \
    }
  #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
  #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
    { \
      in0 = (RTYPE)__msa_sat_s_h((v8i16)(in0), (sat_val)); \
      in1 = (RTYPE)__msa_sat_s_h((v8i16)(in1), (sat_val)); \
      in2 = (RTYPE)__msa_sat_s_h((v8i16)(in2), (sat_val)); \
      in3 = (RTYPE)__msa_sat_s_h((v8i16)(in3), (sat_val)); \
    }
  #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
  /* Description : Indexed halfword element values are replicated to all
                   elements in output vector
     Arguments   : Inputs  - in, idx0, idx1 (SPLATI_H3: also idx2)
                   Outputs - out0, out1 (SPLATI_H3: also out2)
                   Return Type - as per RTYPE
     Details     : 'idx0' element value from 'in' vector is replicated to all
                   elements in 'out0' vector; 'idx1' fills 'out1' and, for
                   SPLATI_H3, 'idx2' fills 'out2'.
                   Valid index range for halfword operation is 0-7.
                   'in' appears once per output in the expansion and is
                   parenthesized before casting so an expression argument is
                   converted as a whole.
  */
  #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
    { \
      out0 = (RTYPE)__msa_splati_h((v8i16)(in), (idx0)); \
      out1 = (RTYPE)__msa_splati_h((v8i16)(in), (idx1)); \
    }
  #define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
  #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
  #define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, out0, out1, out2) \
    { \
      SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
      out2 = (RTYPE)__msa_splati_h((v8i16)(in), (idx2)); \
    }
  #define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
  #define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
  /* Description : Indexed word element values are replicated to all
                   elements in output vector
     Arguments   : Inputs  - in, stidx
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : 'stidx' element value from 'in' vector is replicated to all
                   elements in 'out0' vector
                   'stidx + 1' element value from 'in' vector is replicated to
                   all elements in 'out1' vector
                   Valid index range for word operation is 0-3
                   'stidx' is parenthesized before 1 is added, so an argument
                   such as 'i & 1' yields (i & 1) + 1 rather than i & 2.
                   'in' is parenthesized before casting for the same reason.
  */
  #define SPLATI_W2(RTYPE, in, stidx, out0, out1) \
    { \
      out0 = (RTYPE)__msa_splati_w((v4i32)(in), (stidx)); \
      out1 = (RTYPE)__msa_splati_w((v4i32)(in), ((stidx) + 1)); \
    }
  #define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
  /* Description : Pack even byte elements of vector pairs
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Even byte elements of 'in0' are copied to the left half of
                   'out0' & even byte elements of 'in1' are copied to the right
                   half of 'out0'. 'in2' and 'in3' likewise produce 'out1'.
                   PCKEV_B4 applies PCKEV_B2 to two such groups
                   (in0..in3 -> out0, out1 and in4..in7 -> out2, out3).
                   Inputs are parenthesized before casting so expression
                   arguments are converted as a whole.
  */
  #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (RTYPE)__msa_pckev_b((v16i8)(in0), (v16i8)(in1)); \
      out1 = (RTYPE)__msa_pckev_b((v16i8)(in2), (v16i8)(in3)); \
    }
  #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
  #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
  #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                   out2, out3) \
    { \
      PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
      PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
    }
  #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
  #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
  #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
  /* Description : Pack even halfword elements of vector pairs
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Even halfword elements of 'in0' are copied to the left half
                   of 'out0' & even halfword elements of 'in1' are copied to
                   the right half of 'out0'. 'in2' and 'in3' likewise produce
                   'out1'.
                   PCKEV_H4 applies PCKEV_H2 to two such groups
                   (in0..in3 -> out0, out1 and in4..in7 -> out2, out3).
                   Inputs are parenthesized before casting so expression
                   arguments are converted as a whole.
  */
  #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1)); \
      out1 = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3)); \
    }
  #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
  #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                   out2, out3) \
    { \
      PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
      PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
    }
  #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
  /* Description : Pack even double word elements of vector pairs
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Even double word elements of 'in0' are copied to the left
                   half of 'out0' & even double word elements of 'in1' are
                   copied to the right half of 'out0'. 'in2' and 'in3'
                   likewise produce 'out1'.
                   Inputs are parenthesized before casting so expression
                   arguments are converted as a whole.
  */
  #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (RTYPE)__msa_pckev_d((v2i64)(in0), (v2i64)(in1)); \
      out1 = (RTYPE)__msa_pckev_d((v2i64)(in2), (v2i64)(in3)); \
    }
  #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
  #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
  /* Description : Pack odd double word elements of vector pairs
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1
                   Return Type - as per RTYPE
     Details     : Odd double word elements of 'in0' are copied to the left
                   half of 'out0' & odd double word elements of 'in1' are
                   copied to the right half of 'out0'. 'in2' and 'in3'
                   likewise produce 'out1'.
                   Inputs are parenthesized before casting so expression
                   arguments are converted as a whole.
  */
  #define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (RTYPE)__msa_pckod_d((v2i64)(in0), (v2i64)(in1)); \
      out1 = (RTYPE)__msa_pckod_d((v2i64)(in2), (v2i64)(in3)); \
    }
  #define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
  #define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
  /* Description : Each byte element is logically xor'ed with immediate 128
     Arguments   : Inputs  - in0, in1 (up to in7 for the wider variants)
                   Outputs - in place operation
                   Return Type - as per RTYPE
     Details     : Each unsigned byte element from input vector 'in0' is
                   logically xor'ed with 128 and the result is stored
                   in-place. Every further vector argument is updated the
                   same way, in argument order.
                   The B3/B4/B5/B8 variants handle 3/4/5/8 vectors.
  */
  #define XORI_B2_128(RTYPE, in0, in1) \
    { \
      in0 = (RTYPE)__msa_xori_b((v16u8)(in0), 128); \
      in1 = (RTYPE)__msa_xori_b((v16u8)(in1), 128); \
    }
  #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
  #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
  #define XORI_B3_128(RTYPE, in0, in1, in2) \
    { \
      in0 = (RTYPE)__msa_xori_b((v16u8)(in0), 128); \
      in1 = (RTYPE)__msa_xori_b((v16u8)(in1), 128); \
      in2 = (RTYPE)__msa_xori_b((v16u8)(in2), 128); \
    }
  #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
  #define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
    { \
      XORI_B2_128(RTYPE, in0, in1); \
      XORI_B2_128(RTYPE, in2, in3); \
    }
  #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
  #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
  #define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \
    { \
      XORI_B2_128(RTYPE, in0, in1); \
      XORI_B3_128(RTYPE, in2, in3, in4); \
    }
  #define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)
  #define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \
    { \
      XORI_B4_128(RTYPE, in0, in1, in2, in3); \
      XORI_B4_128(RTYPE, in4, in5, in6, in7); \
    }
  #define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
  /* Description : Shift left all elements of vector (generic for all data
                   types)
     Arguments   : Inputs  - in0, in1, in2, in3, shift
                   Outputs - in place operation
                   Return Type - as per input vector RTYPE
     Details     : Each element of vector 'in0' is left shifted by 'shift' and
                   the result is written in-place; 'in1' .. 'in3' are updated
                   the same way.
                   'shift' is parenthesized so that an expression argument
                   (e.g. 'a | b') is used as the whole shift amount instead of
                   being split by operator precedence.
  */
  #define SLLI_4V(in0, in1, in2, in3, shift) \
    { \
      in0 = (in0) << (shift); \
      in1 = (in1) << (shift); \
      in2 = (in2) << (shift); \
      in3 = (in3) << (shift); \
    }
  /* Description : Arithmetic shift right all elements of vector
                   (generic for all data types)
     Arguments   : Inputs  - in0, in1, in2, in3, shift
                   Outputs - in place operation
                   Return Type - as per input vector RTYPE
     Details     : Each element of vector 'in0' is right shifted by 'shift' and
                   the result is written in-place. 'shift' is a GP variable.
                   'in1' .. 'in3' are updated the same way.
                   'shift' is parenthesized so that an expression argument
                   (e.g. 'a | b') is used as the whole shift amount instead of
                   being split by operator precedence.
  */
  #define SRA_4V(in0, in1, in2, in3, shift) \
    { \
      in0 = (in0) >> (shift); \
      in1 = (in1) >> (shift); \
      in2 = (in2) >> (shift); \
      in3 = (in3) >> (shift); \
    }
  /* Description : Shift right arithmetic rounded words
     Arguments   : Inputs  - in0, in1, shift (SRAR_W4: in0 .. in3, shift)
                   Outputs - in place operation
                   Return Type - as per RTYPE
     Details     : Each element of vector 'in0' is shifted right arithmetically
                   by the number of bits in the corresponding element in the
                   vector 'shift'. The last discarded bit is added to shifted
                   value for rounding and the result is written in-place.
                   'shift' is a vector.
                   The remaining input vectors are updated the same way.
                   'shift' is parenthesized before casting so an expression
                   argument is converted as a whole.
  */
  #define SRAR_W2(RTYPE, in0, in1, shift) \
    { \
      in0 = (RTYPE)__msa_srar_w((v4i32)(in0), (v4i32)(shift)); \
      in1 = (RTYPE)__msa_srar_w((v4i32)(in1), (v4i32)(shift)); \
    }
  #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
    { \
      SRAR_W2(RTYPE, in0, in1, shift); \
      SRAR_W2(RTYPE, in2, in3, shift); \
    }
  #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
  /* Description : Shift right arithmetic rounded (immediate)
     Arguments   : Inputs  - in0, in1, shift (the *4 variants: in0 .. in3)
                   Outputs - in place operation
                   Return Type - as per RTYPE
     Details     : Each element of vector 'in0' is shifted right arithmetically
                   by the value in 'shift'. The last discarded bit is added to
                   the shifted value for rounding and the result is written
                   in-place. 'shift' is an immediate value.
                   SRARI_H* operate on halfword elements, SRARI_W* on word
                   elements; every listed vector is updated the same way.
  */
  #define SRARI_H2(RTYPE, in0, in1, shift) \
    { \
      in0 = (RTYPE)__msa_srari_h((v8i16)(in0), (shift)); \
      in1 = (RTYPE)__msa_srari_h((v8i16)(in1), (shift)); \
    }
  #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
  #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
  #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
    { \
      in0 = (RTYPE)__msa_srari_h((v8i16)(in0), (shift)); \
      in1 = (RTYPE)__msa_srari_h((v8i16)(in1), (shift)); \
      in2 = (RTYPE)__msa_srari_h((v8i16)(in2), (shift)); \
      in3 = (RTYPE)__msa_srari_h((v8i16)(in3), (shift)); \
    }
  #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
  #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
  #define SRARI_W2(RTYPE, in0, in1, shift) \
    { \
      in0 = (RTYPE)__msa_srari_w((v4i32)(in0), (shift)); \
      in1 = (RTYPE)__msa_srari_w((v4i32)(in1), (shift)); \
    }
  #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
    { \
      in0 = (RTYPE)__msa_srari_w((v4i32)(in0), (shift)); \
      in1 = (RTYPE)__msa_srari_w((v4i32)(in1), (shift)); \
      in2 = (RTYPE)__msa_srari_w((v4i32)(in2), (shift)); \
      in3 = (RTYPE)__msa_srari_w((v4i32)(in3), (shift)); \
    }
  #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
  /* Description : Multiplication of pairs of vectors
     Arguments   : Inputs  - in0, in1, in2, in3 (MUL4: in0 .. in7)
                   Outputs - out0, out1 (MUL4: out0 .. out3)
     Details     : Each element from 'in0' is multiplied with elements from
                   'in1' and the result is written to 'out0'; each following
                   input pair produces the next output.
                   Operands are parenthesized so an expression argument such
                   as 'a + b' is multiplied as a whole.
  */
  #define MUL2(in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (in0) * (in1); \
      out1 = (in2) * (in3); \
    }
  #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
    { \
      MUL2(in0, in1, in2, in3, out0, out1); \
      MUL2(in4, in5, in6, in7, out2, out3); \
    }
  /* Description : Addition of 2 pairs of vectors
     Arguments   : Inputs  - in0, in1, in2, in3 (ADD4: in0 .. in7)
                   Outputs - out0, out1 (ADD4: out0 .. out3)
     Details     : Each element in 'in0' is added to 'in1' and result is
                   written to 'out0'; each following input pair produces the
                   next output.
                   Operands are parenthesized so an expression argument using
                   a lower-precedence operator (e.g. 'a & b', 'a << b') is
                   added as a whole.
  */
  #define ADD2(in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (in0) + (in1); \
      out1 = (in2) + (in3); \
    }
  #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
    { \
      ADD2(in0, in1, in2, in3, out0, out1); \
      ADD2(in4, in5, in6, in7, out2, out3); \
    }
  /* Description : Subtraction of 2 pairs of vectors
     Arguments   : Inputs  - in0, in1, in2, in3 (SUB4: in0 .. in7)
                   Outputs - out0, out1 (SUB4: out0 .. out3)
     Details     : Each element in 'in1' is subtracted from 'in0' and result is
                   written to 'out0'; each following input pair produces the
                   next output.
                   Operands are parenthesized so a subtrahend such as 'a + b'
                   or 'a - b' is subtracted as a whole.
                   SUB4 is built from SUB2, like ADD4 / MUL4.
  */
  #define SUB2(in0, in1, in2, in3, out0, out1) \
    { \
      out0 = (in0) - (in1); \
      out1 = (in2) - (in3); \
    }
  #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
    { \
      SUB2(in0, in1, in2, in3, out0, out1); \
      SUB2(in4, in5, in6, in7, out2, out3); \
    }
  /* Description : Sign extend halfword elements from right half of the vector
     Arguments   : Input  - in (halfword vector)
                   Output - out (sign extended word vector)
                   Return Type - signed word
     Details     : A sign vector is built by comparing each halfword element
                   of 'in' against 0, and it is interleaved (right half) with
                   'in' itself to generate 4 word elements keeping sign
                   intact.
                   'in' appears twice in the expansion and is parenthesized
                   before casting so an expression argument is converted as
                   a whole.
  */
  #define UNPCK_R_SH_SW(in, out) \
    { \
      v8i16 sign_m; \
      \
      sign_m = __msa_clti_s_h((v8i16)(in), 0); \
      out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in)); \
    }
  /* Description : Zero extend unsigned byte elements to halfword elements
     Arguments   : Input   - in (unsigned byte vector)
                   Outputs - out0, out1 (unsigned halfword vectors)
                   Return Type - signed halfword
     Details     : Zero extended right half of vector is returned in 'out0'
                   Zero extended left half of vector is returned in 'out1'
                   The extension is done by interleaving 'in' with an
                   all-zero byte vector via ILVRL_B2_SH.
                   'in' is forwarded in parentheses so an expression argument
                   is cast as a whole inside the helper macro.
  */
  #define UNPCK_UB_SH(in, out0, out1) \
    { \
      v16i8 zero_m = { 0 }; \
      \
      ILVRL_B2_SH(zero_m, (in), out0, out1); \
    }
  /* Description : Sign extend halfword elements from input vector and return
                   the result in pair of vectors
     Arguments   : Input   - in (halfword vector)
                   Outputs - out0, out1 (sign extended word vectors)
                   Return Type - signed word
     Details     : A per-element sign mask of 'in' is built by comparing each
                   halfword against 0.
                   The mask is interleaved right with 'in' to generate
                   4 signed word elements in 'out0', then interleaved left
                   with 'in' to generate 4 signed word elements in 'out1'.
                   'in' is parenthesized before casting so the cast applies to
                   the whole argument expression.
  */
  #define UNPCK_SH_SW(in, out0, out1) \
    { \
      v8i16 tmp_m; \
      \
      tmp_m = __msa_clti_s_h((v8i16)(in), 0); \
      ILVRL_H2_SW(tmp_m, in, out0, out1); \
    }
  /* Description : Butterfly of 4 input vectors
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1, out2, out3
     Details     : Butterfly operation
                   out0 = in0 + in3, out1 = in1 + in2,
                   out2 = in1 - in2, out3 = in0 - in3.
                   Inputs are parenthesized so expression arguments are
                   added/subtracted as a whole.
                   Outputs are written in the order out0..out3 while inputs
                   are still being read, so an output must not name the same
                   object as an input that is read later.
  */
  #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
    { \
      out0 = (in0) + (in3); \
      out1 = (in1) + (in2); \
      \
      out2 = (in1) - (in2); \
      out3 = (in0) - (in3); \
    }
  /* Description : Transpose input 8x8 byte block
     Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                   Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                   Return Type - as per RTYPE
     Details     : Three stages, all delegated to helper macros:
                   1. byte interleave of row pairs (0,2) (1,3) (4,6) (5,7)
                   2. byte interleave (right and left) of those pairs
                   3. word interleave (right and left) into the even outputs,
                      after which each odd output is derived from its even
                      neighbour by SLDI_B2_0 with a shift of 8.
  */
  #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                          out1, out2, out3, out4, out5, out6, out7) \
    { \
      v16i8 pair02_m, pair13_m, pair46_m, pair57_m; \
      v16i8 quad_r0_m, quad_l0_m, quad_r1_m, quad_l1_m; \
      \
      ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, pair02_m, pair13_m, \
                 pair46_m, pair57_m); \
      ILVRL_B2_SB(pair13_m, pair02_m, quad_r0_m, quad_l0_m); \
      ILVRL_B2_SB(pair57_m, pair46_m, quad_r1_m, quad_l1_m); \
      ILVRL_W2(RTYPE, quad_r1_m, quad_r0_m, out0, out2); \
      ILVRL_W2(RTYPE, quad_l1_m, quad_l0_m, out4, out6); \
      SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
      SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
    }
  /* Unsigned-byte flavour: fixes RTYPE to v16u8. */
  #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
  /* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
     Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                             in8, in9, in10, in11, in12, in13, in14, in15
                   Outputs - out0, out1, out2, out3
                   Return Type - unsigned byte
     Details     : Inputs are gathered four at a time (stride 4) with even
                   word / even doubleword interleaves. 'out1' and 'out3' hold
                   the first two gathered vectors as scratch until the final
                   byte/halfword interleaves overwrite them with their real
                   results, so the statement order below is significant.
  */
  #define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                              in10, in11, in12, in13, in14, in15, out0, out1, \
                              out2, out3) \
    { \
      v2i64 work0_m, work1_m, hold2_m, hold3_m; \
      \
      /* gather inputs 0/4/8/12 -> out1 (scratch) */ \
      ILVEV_W2_SD(in0, in4, in8, in12, work0_m, work1_m); \
      out1 = (v16u8)__msa_ilvev_d(work1_m, work0_m); \
      \
      /* gather inputs 1/5/9/13 -> out3 (scratch) */ \
      ILVEV_W2_SD(in1, in5, in9, in13, work0_m, work1_m); \
      out3 = (v16u8)__msa_ilvev_d(work1_m, work0_m); \
      \
      /* gather inputs 2/6/10/14 -> hold2_m */ \
      ILVEV_W2_SD(in2, in6, in10, in14, work0_m, work1_m); \
      hold2_m = __msa_ilvev_d(work1_m, work0_m); \
      \
      /* gather inputs 3/7/11/15 -> hold3_m */ \
      ILVEV_W2_SD(in3, in7, in11, in15, work0_m, work1_m); \
      hold3_m = __msa_ilvev_d(work1_m, work0_m); \
      \
      /* even bytes of the gathered vectors give out0 / out2 */ \
      ILVEV_B2_SD(out1, out3, hold2_m, hold3_m, work0_m, work1_m); \
      out0 = (v16u8)__msa_ilvev_h((v8i16)work1_m, (v8i16)work0_m); \
      out2 = (v16u8)__msa_ilvod_h((v8i16)work1_m, (v8i16)work0_m); \
      \
      /* odd bytes of the gathered vectors give out1 / out3 */ \
      work0_m = (v2i64)__msa_ilvod_b((v16i8)out3, (v16i8)out1); \
      work1_m = (v2i64)__msa_ilvod_b((v16i8)hold3_m, (v16i8)hold2_m); \
      out1 = (v16u8)__msa_ilvev_h((v8i16)work1_m, (v8i16)work0_m); \
      out3 = (v16u8)__msa_ilvod_h((v8i16)work1_m, (v8i16)work0_m); \
    }
  /* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
     Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                             in8, in9, in10, in11, in12, in13, in14, in15
                   Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                   Return Type - unsigned byte
     Details     : The output vectors are used as scratch storage during the
                   first stages (out7..out0 receive the doubleword
                   interleaves, out5/out7 then receive even-byte interleaves),
                   so the statement order below is significant and outputs
                   must not name the same objects as inputs.
  */
  #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                              in10, in11, in12, in13, in14, in15, out0, out1, \
                              out2, out3, out4, out5, out6, out7) \
    { \
      v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
      v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
      \
      /* pair input k with input k+8; results parked in out7..out0 */ \
      ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
      ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
      ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
      ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
      \
      /* split each parked pair into even-byte and odd-byte interleaves */ \
      tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
      tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
      tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
      tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
      out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
      tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
      out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
      tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
      \
      /* even bytes, even halfwords -> out0 / out4 */ \
      ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
      out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
      out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
      \
      /* even bytes, odd halfwords -> out2 / out6 */ \
      tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
      tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
      out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
      out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
      \
      /* odd bytes, even halfwords -> out1 / out5 */ \
      ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
      out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
      out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
      \
      /* odd bytes, odd halfwords -> out3 / out7 */ \
      tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
      tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
      out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
      out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
    }
  /* Description : Transpose 4x4 block with half word elements in vectors
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1, out2, out3
                   Return Type - signed halfword
     Details     : Rows (0,1) and (2,3) are halfword-interleaved, the two
                   results are word-interleaved into 'out0' and 'out2', and
                   'out1' / 'out3' are then formed from the left doublewords
                   of 'out0' / 'out2'.
                   NOTE(review): 'out3' is built from both 'out0' and 'out2';
                   presumably only the low four halfwords of each output are
                   meaningful to callers - confirm.
  */
  #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
    { \
      v8i16 ilv01_m, ilv23_m; \
      \
      ILVR_H2_SH(in1, in0, in3, in2, ilv01_m, ilv23_m); \
      ILVRL_W2_SH(ilv23_m, ilv01_m, out0, out2); \
      out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
      out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \
    }
  /* Description : Transpose 8x4 block with half word elements in vectors
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1, out2, out3
                   Return Type - signed halfword
     Details     : Rows (0,1) and (2,3) are halfword-interleaved, right and
                   left halves separately. The word-level right interleaves
                   of those results go to 'out0' / 'out2' and the word-level
                   left interleaves go to 'out1' / 'out3'.
  */
  #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
    { \
      v8i16 r01_m, r23_m, l01_m, l23_m; \
      \
      ILVR_H2_SH(in1, in0, in3, in2, r01_m, r23_m); \
      ILVL_H2_SH(in1, in0, in3, in2, l01_m, l23_m); \
      ILVR_W2_SH(r23_m, r01_m, l23_m, l01_m, out0, out2); \
      ILVL_W2_SH(r23_m, r01_m, l23_m, l01_m, out1, out3); \
    }
  /* Description : Transpose 4x4 block with word elements in vectors
     Arguments   : Inputs  - in0, in1, in2, in3
                   Outputs - out0, out1, out2, out3
                   Return Type - signed word
     Details     : Rows (0,1) and (2,3) are word-interleaved (right and left)
                   into four temporaries; each output is then a doubleword
                   interleave (right or left) of one temporary from each row
                   pair.
  */
  #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
    { \
      v4i32 r01_m, l01_m, r23_m, l23_m; \
      \
      ILVRL_W2_SW(in1, in0, r01_m, l01_m); \
      ILVRL_W2_SW(in3, in2, r23_m, l23_m); \
      \
      out0 = (v4i32)__msa_ilvr_d((v2i64)r23_m, (v2i64)r01_m); \
      out1 = (v4i32)__msa_ilvl_d((v2i64)r23_m, (v2i64)r01_m); \
      out2 = (v4i32)__msa_ilvr_d((v2i64)l23_m, (v2i64)l01_m); \
      out3 = (v4i32)__msa_ilvl_d((v2i64)l23_m, (v2i64)l01_m); \
    }
  /* Description : Dot product and addition of 3 signed halfword input vectors
     Arguments   : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
                   Output - out0_m
                   Return Type - signed halfword
     Details     : Dot product of 'in0' with 'coeff0'
                   Dot product of 'in1' with 'coeff1', accumulated onto the
                   first result
                   Dot product of 'in2' with 'coeff2', computed separately and
                   combined with the running sum via __msa_adds_s_h
                   out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
                   All inputs are reinterpreted as v16i8; each is parenthesized
                   so the cast applies to the whole argument expression.
  */
  #define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
    ({ \
      v8i16 tmp1_m; \
      v8i16 out0_m; \
      \
      out0_m = __msa_dotp_s_h((v16i8)(in0), (v16i8)(coeff0)); \
      out0_m = __msa_dpadd_s_h(out0_m, (v16i8)(in1), (v16i8)(coeff1)); \
      tmp1_m = __msa_dotp_s_h((v16i8)(in2), (v16i8)(coeff2)); \
      out0_m = __msa_adds_s_h(out0_m, tmp1_m); \
      \
      out0_m; \
    })
  /* Description : Pack even elements of input vectors & xor with 128
     Arguments   : Inputs - in0, in1
                   Output - out_m
                   Return Type - unsigned byte
     Details     : Signed byte even elements from 'in0' and 'in1' are packed
                   together in one vector and the resulting vector is xor'ed
                   with 128 to shift the range from signed to unsigned byte.
                   Inputs are parenthesized so the v16i8 cast applies to the
                   whole argument expression.
  */
  #define PCKEV_XORI128_UB(in0, in1) \
    ({ \
      v16u8 out_m; \
      out_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
      out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \
      out_m; \
    })
  /* Description : Pack even byte elements and store byte vector in destination
                   memory
     Arguments   : Inputs - in0, in1, pdst
     Details     : Even byte elements of 'in0' and 'in1' are packed into one
                   v16i8 vector, which is then stored through ST_SB at 'pdst'.
                   'in0' and 'in1' are parenthesized (as 'pdst' already is) so
                   the v16i8 cast applies to the whole argument expression.
  */
  #define PCKEV_ST_SB(in0, in1, pdst) \
    { \
      v16i8 tmp_m; \
      tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
      ST_SB(tmp_m, (pdst)); \
    }
  /* Description : Horizontal 2 tap filter kernel code
     Arguments   : Inputs - in0, in1, mask, coeff, shift
                   Output - tmp1_m
                   Return Type - unsigned halfword
     Details     : Bytes of 'in0' and 'in1' are shuffled according to 'mask',
                   the shuffled bytes are multiplied with 'coeff' using an
                   unsigned dot product, and the halfword sums are passed to
                   __msa_srari_h with 'shift' (rounding right shift per the
                   MSA ISA).
                   Vector inputs are parenthesized so each cast applies to the
                   whole argument expression.
  */
  #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
    ({ \
      v16i8 tmp0_m; \
      v8u16 tmp1_m; \
      \
      tmp0_m = __msa_vshf_b((v16i8)(mask), (v16i8)(in1), (v16i8)(in0)); \
      tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)(coeff)); \
      tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \
      \
      tmp1_m; \
    })
  1632. #endif /* VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ */