/*
 * Loongson MMI optimizations for libjpeg-turbo
 *
 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
 *                          All Rights Reserved.
 * Copyright (C) 2019, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#ifndef __LOONGSON_MMINTRIN_H__
#define __LOONGSON_MMINTRIN_H__

#include <stdint.h>

#define FUNCTION_ATTRIBS \
  __attribute__((__gnu_inline__, __always_inline__, __artificial__))

/* Vectors are stored in 64-bit floating-point registers. */
typedef double __m64;

/* Having a 32-bit datatype allows us to use 32-bit loads in places like
   load8888. */
typedef float __m32;
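
/* Editor's note (illustrative sketch, not part of the original header):
   because __m64 is only a bit pattern parked in a floating-point register,
   values move between the integer and vector views by reinterpreting bits,
   never by numeric conversion.  Assuming the to_m64()/to_uint64() helpers
   defined later in this file:

     __m64 v = to_m64(0x0001000200030004ULL);   four 16-bit lanes
     uint64_t bits = to_uint64(v);              same bits back, losslessly
*/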
/********** Set Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_setzero_si64(void)
{
  return 0.0;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
            uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
{
  __m64 ret;
  uint32_t lo = ((uint32_t)__b6 << 24) |
                ((uint32_t)__b4 << 16) |
                ((uint32_t)__b2 << 8) |
                (uint32_t)__b0;
  uint32_t hi = ((uint32_t)__b7 << 24) |
                ((uint32_t)__b5 << 16) |
                ((uint32_t)__b3 << 8) |
                (uint32_t)__b1;

  asm("mtc1 %1, %0\n\t"
      "mtc1 %2, $f0\n\t"
      "punpcklbh %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (lo), "r" (hi)
      : "$f0"
     );

  return ret;
}
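
/* Worked example (editor's illustration): _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0)
   packs the even-numbered bytes into lo (0x06040200) and the odd-numbered
   bytes into hi (0x07050301); punpcklbh then interleaves them byte by byte,
   yielding 0x0706050403020100 -- byte __b0 in the least significant
   position, matching the x86 MMX intrinsic of the same name. */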
extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
{
  __m64 ret;
  uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
  uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;

  asm("mtc1 %1, %0\n\t"
      "mtc1 %2, $f0\n\t"
      "punpcklhw %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (lo), "r" (hi)
      : "$f0"
     );

  return ret;
}

#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
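
/* Worked example (editor's illustration): each argument selects a source
   halfword, two bits per lane, lowest lane in the low bits:

     _MM_SHUFFLE(3, 2, 1, 0) == 0xE4   identity permutation
     _MM_SHUFFLE(0, 1, 2, 3) == 0x1B   reverse the four halfwords
     _MM_SHUFFLE(3, 3, 3, 3) == 0xFF   broadcast the top halfword
*/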
extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi32(uint32_t __i1, uint32_t __i0)
{
  if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
    uint64_t val = ((uint64_t)__i1 << 32) |
                   ((uint64_t)__i0 << 0);

    return *(__m64 *)&val;
  } else if (__i1 == __i0) {
    uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
    __m64 ret;

    asm("pshufh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
       );

    return ret;
  } else {
    uint64_t val = ((uint64_t)__i1 << 32) |
                   ((uint64_t)__i0 << 0);

    return *(__m64 *)&val;
  }
}
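
/* Editor's note: the three paths above trade off differently.  When both
   words are compile-time constants, the compiler can fold the 64-bit
   type-punned load into a single constant; when the two words happen to be
   equal at run time, one pshufh broadcast of the low 32 bits avoids building
   a 64-bit value in an integer register; otherwise the generic path applies.
   For example (illustrative only):

     __m64 c = _mm_set_pi32(0xDEADBEEF, 0xCAFEBABE);   constant-folded
     __m64 b = _mm_set_pi32(x, x);                     pshufh broadcast
*/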
extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi8(uint8_t __b0)
{
  __m64 ret;

  /* Replicate __b0 into the low 16 bits of the scratch register $8 rather
     than writing to the input operand, which inline asm must not modify. */
  asm("sll $8, %1, 8\n\t"
      "or $8, $8, %1\n\t"
      "mtc1 $8, %0\n\t"
      "mtc1 $0, $f0\n\t"
      "pshufh %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (__b0)
      : "$8", "$f0"
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi16(uint16_t __h0)
{
  __m64 ret;

  asm("mtc1 %1, %0\n\t"
      "mtc1 $0, $f0\n\t"
      "pshufh %0, %0, $f0\n\t"
      : "=f" (ret)
      : "r" (__h0)
      : "$f0"
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi32(unsigned __i0)
{
  return _mm_set_pi32(__i0, __i0);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
             uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
{
  return _mm_set_pi8(__h7, __h6, __h5, __h4,
                     __h3, __h2, __h1, __h0);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
{
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi32(uint32_t __i0, uint32_t __i1)
{
  return _mm_set_pi32(__i1, __i0);
}
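
/* Usage note (editor's illustration): the _mm_setr_* variants take elements
   in memory order, lowest lane first, so these two calls build the same
   vector:

     __m64 a = _mm_set_pi16(3, 2, 1, 0);    highest lane first
     __m64 b = _mm_setr_pi16(0, 1, 2, 3);   lowest lane first
*/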
/********** Arithmetic Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddsb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddusb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("paddush %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}
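
/* Worked example (editor's illustration): the "adds" forms clamp instead of
   wrapping.  Per 8-bit lane:

     _mm_add_pi8:   0xFF + 0x01 -> 0x00   wraps modulo 256
     _mm_adds_pu8:  0xFF + 0x01 -> 0xFF   unsigned saturation at 255
     _mm_adds_pi8:  0x7F + 0x01 -> 0x7F   signed saturation at +127
*/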
extern __inline __m64 FUNCTION_ATTRIBS
_mm_avg_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pavgb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_avg_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pavgh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_madd_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmaddhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_max_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmaxsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_max_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmaxub %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_min_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pminsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_min_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pminub %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline int FUNCTION_ATTRIBS
_mm_movemask_pi8(__m64 __m1)
{
  int ret;

  asm("pmovmskb %0, %1\n\t"
      : "=r" (ret)
      : "y" (__m1)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmulhh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_mulhi_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmulhuh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_mullo_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmullh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_mul_pu32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pmuluw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sad_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psadbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_asub_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pasubub %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_biadd_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("biadd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}
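
/* Editor's note (semantics as generally documented for Loongson MMI, stated
   here as an assumption): pasubub computes the per-byte absolute difference
   |a - b|, biadd reduces the unsigned bytes of a vector with a horizontal
   add, and psadbh combines both steps into a one-instruction sum of
   absolute differences -- the usual building block for block-matching and
   motion-estimation style comparisons. */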
extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubsb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubsh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pu8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubusb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("psubush %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/********** Logical Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_and_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("and %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_andnot_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("andn %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m32 FUNCTION_ATTRIBS
_mm_or_si32(__m32 __m1, __m32 __m2)
{
  __m32 ret;

  asm("or %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_or_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("or %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_xor_si64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("xor %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/********** Shift Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_pi16(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psllh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}
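
/* Editor's note: unlike the x86 "slli" intrinsics, the shift count here is
   not an instruction immediate; the int64_t argument is reinterpreted as an
   __m64 and passed through a floating-point register.  Usage is unchanged
   from the MMX-style API, e.g. shifting each halfword left by 4:

     __m64 shifted = _mm_slli_pi16(v, 4);
*/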
extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_pi32(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psllw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_si64(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("dsll %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_pi16(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psrlh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_pi32(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psrlw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_si64(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("dsrl %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_pi16(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psrah %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_pi32(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("psraw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_si64(__m64 __m, int64_t __count)
{
  __m64 ret;

  asm("dsra %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__count)
     );

  return ret;
}

/********** Conversion Intrinsics **********/

extern __inline __m64 FUNCTION_ATTRIBS
to_m64(uint64_t x)
{
  return *(__m64 *)&x;
}

extern __inline uint64_t FUNCTION_ATTRIBS
to_uint64(__m64 x)
{
  return *(uint64_t *)&x;
}
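
/* Usage example (editor's illustration): these helpers make the
   bit-reinterpretation explicit at API boundaries:

     uint64_t bits = 0x00FF00FF00FF00FFULL;   a per-halfword mask
     __m64 mask    = to_m64(bits);
     assert(to_uint64(mask) == bits);         lossless round trip
*/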
/********** Comparison Intrinsics **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpeqb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpeqh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpeqw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpgtb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpgth %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpgtw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpltb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmplth %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("pcmpltw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}
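
/* Usage note (editor's illustration): the comparison intrinsics return
   all-ones per lane for true and all-zeros for false, so they compose with
   the logical operations into a branch-free select.  A per-halfword maximum
   using only and/xor (so no assumption about the andn operand order):

     __m64 mask = _mm_cmpgt_pi16(a, b);                 all-ones where a > b
     __m64 diff = _mm_xor_si64(a, b);
     __m64 max  = _mm_xor_si64(b, _mm_and_si64(diff, mask));
*/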
/********** Miscellaneous Operations **********/

extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packsshb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packsswh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi32_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packsswh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pu16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("packushb %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}
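
/* Worked example (editor's illustration): the pack operations narrow each
   signed 16-bit lane with saturation:

     _mm_packs_pi16:  300 -> 127,  -200 -> -128   signed saturation
     _mm_packs_pu16:  300 -> 255,    -1 ->    0   unsigned saturation

   The eight result bytes are the four lanes of __m1 (low half) followed by
   the four lanes of __m2 (high half). */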
extern __inline __m64 FUNCTION_ATTRIBS
_mm_extract_pi16(__m64 __m, int64_t __pos)
{
  __m64 ret;

  asm("pextrh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__pos)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
{
  __m64 ret;

  switch (__pos) {
  case 0:
    asm("pinsrh_0 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );
    break;
  case 1:
    asm("pinsrh_1 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );
    break;
  case 2:
    asm("pinsrh_2 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );
    break;
  case 3:
    asm("pinsrh_3 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
       );
    break;
  }

  return ret;
}
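
/* Editor's note: the Loongson encoding bakes the insertion position into the
   opcode (pinsrh_0 .. pinsrh_3), so the switch above selects among four
   distinct instructions; when __pos is a compile-time constant, the dead
   branches fold away and a single pinsrh_N remains.  Positions outside 0..3
   leave ret unset, so callers must pass a valid lane index. */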
extern __inline __m64 FUNCTION_ATTRIBS
_mm_shuffle_pi16(__m64 __m, int64_t __n)
{
  __m64 ret;

  asm("pshufh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m), "f" (*(__m64 *)&__n)
     );

  return ret;
}
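
/* Usage example (editor's illustration): combined with _MM_SHUFFLE this
   gives arbitrary halfword permutations, e.g. broadcasting lane 3 -- the
   same pattern _mm_expand_alpha() uses at the end of this file:

     __m64 alpha = _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
*/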
extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpckhwd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Since punpcklbh cares about the high 32-bits, we use the __m64 datatype,
   which preserves the data. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32
   datatype, which allows load8888 to use 32-bit loads. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklbh %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklhw %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklwd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
{
  __m64 ret;

  asm("punpcklwd %0, %1, %2\n\t"
      : "=f" (ret)
      : "f" (__m1), "f" (__m2)
     );

  return ret;
}

extern __inline void FUNCTION_ATTRIBS
_mm_store_pi32(__m32 *dest, __m64 src)
{
  src = _mm_packs_pu16(src, _mm_setzero_si64());

  asm("swc1 %1, %0\n\t"
      : "=m" (*dest)
      : "f" (src)
      : "memory"
     );
}

extern __inline void FUNCTION_ATTRIBS
_mm_store_si64(__m64 *dest, __m64 src)
{
  asm("gssdlc1 %1, 7+%0\n\t"
      "gssdrc1 %1, %0\n\t"
      : "=m" (*dest)
      : "f" (src)
      : "memory"
     );
}

extern __inline __m32 FUNCTION_ATTRIBS
_mm_load_si32(const __m32 *src)
{
  __m32 ret;

  asm("lwc1 %0, %1\n\t"
      : "=f" (ret)
      : "m" (*src)
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_load_si64(const __m64 *src)
{
  __m64 ret;

  asm("ldc1 %0, %1\n\t"
      : "=f" (ret)
      : "m" (*src)
      : "memory"
     );

  return ret;
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadu_si64(const __m64 *src)
{
  __m64 ret;

  asm("gsldlc1 %0, 7(%1)\n\t"
      "gsldrc1 %0, 0(%1)\n\t"
      : "=f" (ret)
      : "r" (src)
      : "memory"
     );

  return ret;
}
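
/* Usage note (editor's illustration): ldc1 requires an 8-byte-aligned
   address, so _mm_load_si64() is for aligned buffers only, while
   _mm_loadu_si64() and _mm_store_si64() use the left/right instruction
   pairs (gsldlc1/gsldrc1, gssdlc1/gssdrc1) to assemble a doubleword from
   any address:

     __m64 v = _mm_loadu_si64((const __m64 *)(row + x));   x need not be
                                                           a multiple of 8
*/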
extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi8(const uint32_t *src)
{
  return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi8_f(__m64 src)
{
  return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi8_f(__m64 src)
{
  return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi16(__m64 src)
{
  return _mm_unpacklo_pi16(src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi16_f(__m64 src)
{
  return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi16(__m64 src)
{
  return _mm_unpackhi_pi16(src, _mm_setzero_si64());
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi16_f(__m64 src)
{
  return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_expand_alpha(__m64 pixel)
{
  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
}

extern __inline __m64 FUNCTION_ATTRIBS
_mm_expand_alpha_rev(__m64 pixel)
{
  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
}
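
/* Worked example (editor's illustration, assuming a pixel expanded to one
   16-bit channel per lane with alpha in the top lane): _mm_expand_alpha()
   broadcasts that alpha across all four channels, so lanes {A, R, G, B}
   become {A, A, A, A} -- the multiplier needed for premultiplied-alpha
   blending.  _mm_expand_alpha_rev() does the same for the reversed layout,
   where alpha sits in lane 0. */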
#endif  /* __LOONGSON_MMINTRIN_H__ */