jdsample-mmi.c

/*
 * Loongson MMI optimizations for libjpeg-turbo
 *
 * Copyright (C) 2015, 2018, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
 *                          All Rights Reserved.
 * Authors:  ZhuChen     <zhuchen@loongson.cn>
 *           CaiWanwei   <caiwanwei@loongson.cn>
 *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
 *
 * Based on the x86 SIMD extension for IJG JPEG library
 * Copyright (C) 1999-2006, MIYASAKA Masaru.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* CHROMA UPSAMPLING */

#include "jsimd_mmi.h"


enum const_index {
  index_PW_THREE,
  index_PW_SEVEN,
  index_PW_EIGHT,
};

static uint64_t const_value[] = {
  _uint64_set_pi16(3, 3, 3, 3),
  _uint64_set_pi16(7, 7, 7, 7),
  _uint64_set_pi16(8, 8, 8, 8),
};

#define PW_THREE  get_const_value(index_PW_THREE)
#define PW_SEVEN  get_const_value(index_PW_SEVEN)
#define PW_EIGHT  get_const_value(index_PW_EIGHT)
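
/*
 * Horizontal pass for one output row: outptr##r holds eight 16-bit
 * intermediates (Intr) from the vertical pass.  Each intermediate expands to
 * two output samples:
 *
 *   even column: (Intr[i - 1] + 3 * Intr[i] + 8) >> 4
 *   odd column:  (3 * Intr[i] + Intr[i + 1] + 7) >> 4
 *
 * wk[r] carries the rightmost intermediate of the previous column block (the
 * left-hand neighbor of element 0); wk[r + 2] carries the leftmost
 * intermediate of the next column block (the right-hand neighbor of
 * element 7).
 */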
#define PROCESS_ROW(r) { \
  mm7 = _mm_load_si64((__m64 *)outptr##r);      /* mm7=IntrL=( 0 1 2 3) */ \
  mm3 = _mm_load_si64((__m64 *)outptr##r + 1);  /* mm3=IntrH=( 4 5 6 7) */ \
  \
  mm0 = mm7; \
  mm4 = mm3; \
  mm0 = _mm_srli_si64(mm0, 2 * BYTE_BIT);                    /* mm0=( 1 2 3 -) */ \
  mm4 = _mm_slli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* mm4=( - - - 4) */ \
  mm5 = mm7; \
  mm6 = mm3; \
  mm5 = _mm_srli_si64(mm5, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* mm5=( 3 - - -) */ \
  mm6 = _mm_slli_si64(mm6, 2 * BYTE_BIT);                    /* mm6=( - 4 5 6) */ \
  \
  mm0 = _mm_or_si64(mm0, mm4);                  /* mm0=( 1 2 3 4) */ \
  mm5 = _mm_or_si64(mm5, mm6);                  /* mm5=( 3 4 5 6) */ \
  \
  mm1 = mm7; \
  mm2 = mm3; \
  mm1 = _mm_slli_si64(mm1, 2 * BYTE_BIT);       /* mm1=( - 0 1 2) */ \
  mm2 = _mm_srli_si64(mm2, 2 * BYTE_BIT);       /* mm2=( 5 6 7 -) */ \
  mm4 = mm3; \
  mm4 = _mm_srli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* mm4=( 7 - - -) */ \
  \
  mm1 = _mm_or_si64(mm1, wk[r]);                /* mm1=(-1 0 1 2) */ \
  mm2 = _mm_or_si64(mm2, wk[r + 2]);            /* mm2=( 5 6 7 8) */ \
  \
  wk[r] = mm4; \
  \
  mm7 = _mm_mullo_pi16(mm7, PW_THREE); \
  mm3 = _mm_mullo_pi16(mm3, PW_THREE); \
  mm1 = _mm_add_pi16(mm1, PW_EIGHT); \
  mm5 = _mm_add_pi16(mm5, PW_EIGHT); \
  mm0 = _mm_add_pi16(mm0, PW_SEVEN); \
  mm2 = _mm_add_pi16(mm2, PW_SEVEN); \
  \
  mm1 = _mm_add_pi16(mm1, mm7); \
  mm5 = _mm_add_pi16(mm5, mm3); \
  mm1 = _mm_srli_pi16(mm1, 4);          /* mm1=OutrLE=( 0  2  4  6) */ \
  mm5 = _mm_srli_pi16(mm5, 4);          /* mm5=OutrHE=( 8 10 12 14) */ \
  mm0 = _mm_add_pi16(mm0, mm7); \
  mm2 = _mm_add_pi16(mm2, mm3); \
  mm0 = _mm_srli_pi16(mm0, 4);          /* mm0=OutrLO=( 1  3  5  7) */ \
  mm2 = _mm_srli_pi16(mm2, 4);          /* mm2=OutrHO=( 9 11 13 15) */ \
  \
  mm0 = _mm_slli_pi16(mm0, BYTE_BIT); \
  mm2 = _mm_slli_pi16(mm2, BYTE_BIT); \
  mm1 = _mm_or_si64(mm1, mm0);          /* mm1=OutrL=( 0  1  2  3  4  5  6  7) */ \
  mm5 = _mm_or_si64(mm5, mm2);          /* mm5=OutrH=( 8  9 10 11 12 13 14 15) */ \
  \
  _mm_store_si64((__m64 *)outptr##r, mm1); \
  _mm_store_si64((__m64 *)outptr##r + 1, mm5); \
}
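
/*
 * Fancy (smooth) 2x2 chroma upsampling.  A first, vertical pass computes
 * 16-bit intermediates for the two output rows (3 * row[0] + row[-1] for the
 * upper row, 3 * row[0] + row[+1] for the lower row) and stores them in the
 * output buffer; PROCESS_ROW() then expands each intermediate horizontally,
 * so every output sample is the 9/3/3/1 weighted average of its four nearest
 * input samples, divided by 16.
 */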
void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
                                   JDIMENSION downsampled_width,
                                   JSAMPARRAY input_data,
                                   JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
  int inrow, outrow, incol, tmp, tmp1;
  __m64 mm0, mm1, mm2, mm3 = 0.0, mm4, mm5, mm6, mm7 = 0.0;
  __m64 wk[4], mm_tmp;

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    inptr_1 = input_data[inrow - 1];
    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow + 1];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];
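
    /* If downsampled_width is not a multiple of 8, duplicate the last sample
       of each of the three input rows into the element just past the end, so
       that the last valid output column sees its own value as its right-hand
       neighbor when the final 8-sample block is processed. */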
    if (downsampled_width & 7) {
      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
      tmp1 = downsampled_width * sizeof(JSAMPLE);
      asm("daddu  $8, %3, %6\r\n"
          "lb     $9, ($8)\r\n"
          "daddu  $8, %3, %7\r\n"
          "sb     $9, ($8)\r\n"
          "daddu  $8, %4, %6\r\n"
          "lb     $9, ($8)\r\n"
          "daddu  $8, %4, %7\r\n"
          "sb     $9, ($8)\r\n"
          "daddu  $8, %5, %6\r\n"
          "lb     $9, ($8)\r\n"
          "daddu  $8, %5, %7\r\n"
          "sb     $9, ($8)\r\n"
          : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
          : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
          : "$8", "$9"
         );
    }

    /* process the first column block */
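    /* vertical pass: Int0 = 3 * row[0] + row[-1] (upper output row) and
       Int1 = 3 * row[0] + row[+1] (lower output row), kept as 16-bit
       intermediates in the output rows until PROCESS_ROW() expands them */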
    mm0 = _mm_load_si64((__m64 *)inptr0);    /* mm0=row[ 0][0] */
    mm1 = _mm_load_si64((__m64 *)inptr_1);   /* mm1=row[-1][0] */
    mm2 = _mm_load_si64((__m64 *)inptr1);    /* mm2=row[+1][0] */

    mm3 = _mm_xor_si64(mm3, mm3);            /* mm3=(all 0's) */
    mm4 = mm0;
    mm0 = _mm_unpacklo_pi8(mm0, mm3);        /* mm0=row[ 0][0]( 0 1 2 3) */
    mm4 = _mm_unpackhi_pi8(mm4, mm3);        /* mm4=row[ 0][0]( 4 5 6 7) */
    mm5 = mm1;
    mm1 = _mm_unpacklo_pi8(mm1, mm3);        /* mm1=row[-1][0]( 0 1 2 3) */
    mm5 = _mm_unpackhi_pi8(mm5, mm3);        /* mm5=row[-1][0]( 4 5 6 7) */
    mm6 = mm2;
    mm2 = _mm_unpacklo_pi8(mm2, mm3);        /* mm2=row[+1][0]( 0 1 2 3) */
    mm6 = _mm_unpackhi_pi8(mm6, mm3);        /* mm6=row[+1][0]( 4 5 6 7) */

    mm0 = _mm_mullo_pi16(mm0, PW_THREE);
    mm4 = _mm_mullo_pi16(mm4, PW_THREE);

    mm7 = _mm_cmpeq_pi8(mm7, mm7);
    mm7 = _mm_srli_si64(mm7, (SIZEOF_MMWORD - 2) * BYTE_BIT);

    mm1 = _mm_add_pi16(mm1, mm0);            /* mm1=Int0L=( 0 1 2 3) */
    mm5 = _mm_add_pi16(mm5, mm4);            /* mm5=Int0H=( 4 5 6 7) */
    mm2 = _mm_add_pi16(mm2, mm0);            /* mm2=Int1L=( 0 1 2 3) */
    mm6 = _mm_add_pi16(mm6, mm4);            /* mm6=Int1H=( 4 5 6 7) */

    _mm_store_si64((__m64 *)outptr0, mm1);      /* temporarily save */
    _mm_store_si64((__m64 *)outptr0 + 1, mm5);  /* the intermediate data */
    _mm_store_si64((__m64 *)outptr1, mm2);
    _mm_store_si64((__m64 *)outptr1 + 1, mm6);

    mm1 = _mm_and_si64(mm1, mm7);            /* mm1=( 0 - - -) */
    mm2 = _mm_and_si64(mm2, mm7);            /* mm2=( 0 - - -) */

    wk[0] = mm1;
    wk[1] = mm2;
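
    /* wk[0..1] seed the first PROCESS_ROW() call below: the left-hand
       neighbor of column 0 is column 0 itself (left-edge replication) */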
    for (incol = downsampled_width; incol > 0;
         incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
         outptr0 += 16, outptr1 += 16) {

      if (incol > 8) {
        /* process the next column block */
        mm0 = _mm_load_si64((__m64 *)inptr0 + 1);   /* mm0=row[ 0][1] */
        mm1 = _mm_load_si64((__m64 *)inptr_1 + 1);  /* mm1=row[-1][1] */
        mm2 = _mm_load_si64((__m64 *)inptr1 + 1);   /* mm2=row[+1][1] */

        mm3 = _mm_setzero_si64();                   /* mm3=(all 0's) */
        mm4 = mm0;
        mm0 = _mm_unpacklo_pi8(mm0, mm3);           /* mm0=row[ 0][1]( 0 1 2 3) */
        mm4 = _mm_unpackhi_pi8(mm4, mm3);           /* mm4=row[ 0][1]( 4 5 6 7) */
        mm5 = mm1;
        mm1 = _mm_unpacklo_pi8(mm1, mm3);           /* mm1=row[-1][1]( 0 1 2 3) */
        mm5 = _mm_unpackhi_pi8(mm5, mm3);           /* mm5=row[-1][1]( 4 5 6 7) */
        mm6 = mm2;
        mm2 = _mm_unpacklo_pi8(mm2, mm3);           /* mm2=row[+1][1]( 0 1 2 3) */
        mm6 = _mm_unpackhi_pi8(mm6, mm3);           /* mm6=row[+1][1]( 4 5 6 7) */

        mm0 = _mm_mullo_pi16(mm0, PW_THREE);
        mm4 = _mm_mullo_pi16(mm4, PW_THREE);

        mm1 = _mm_add_pi16(mm1, mm0);               /* mm1=Int0L=( 0 1 2 3) */
        mm5 = _mm_add_pi16(mm5, mm4);               /* mm5=Int0H=( 4 5 6 7) */
        mm2 = _mm_add_pi16(mm2, mm0);               /* mm2=Int1L=( 0 1 2 3) */
        mm6 = _mm_add_pi16(mm6, mm4);               /* mm6=Int1H=( 4 5 6 7) */

        _mm_store_si64((__m64 *)outptr0 + 2, mm1);  /* temporarily save */
        _mm_store_si64((__m64 *)outptr0 + 3, mm5);  /* the intermediate data */
        _mm_store_si64((__m64 *)outptr1 + 2, mm2);
        _mm_store_si64((__m64 *)outptr1 + 3, mm6);

        mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* mm1=( - - - 0) */
        mm2 = _mm_slli_si64(mm2, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* mm2=( - - - 0) */

        wk[2] = mm1;
        wk[3] = mm2;
      } else {
        /* process the last column block */
        mm1 = _mm_cmpeq_pi8(mm1, mm1);
        mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT);
        mm2 = mm1;

        mm_tmp = _mm_load_si64((__m64 *)outptr0 + 1);
        mm1 = _mm_and_si64(mm1, mm_tmp);            /* mm1=( - - - 7) */
        mm_tmp = _mm_load_si64((__m64 *)outptr1 + 1);
        mm2 = _mm_and_si64(mm2, mm_tmp);            /* mm2=( - - - 7) */

        wk[2] = mm1;
        wk[3] = mm2;
      }

      /* process the upper row */
      PROCESS_ROW(0)

      /* process the lower row */
      PROCESS_ROW(1)
    }
  }
}