/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software.  If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jdmerge-altivec.c */


void jsimd_h2v1_merged_upsample_altivec(JDIMENSION output_width,
                                        JSAMPIMAGE input_buf,
                                        JDIMENSION in_row_group_ctr,
                                        JSAMPARRAY output_buf)
{
  JSAMPROW outptr, inptr0, inptr1, inptr2;
  int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
#if __BIG_ENDIAN__
  int offset;
#endif
  unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];

  __vector unsigned char rgb0, rgb1, rgb2, rgbx0, rgbx1, rgbx2, rgbx3,
    y, cb, cr;
#if __BIG_ENDIAN__
  __vector unsigned char edgel, edgeh, edges, out0, out1, out2, out3;
#if RGB_PIXELSIZE == 4
  __vector unsigned char out4;
#endif
#endif
#if RGB_PIXELSIZE == 4
  __vector unsigned char rgb3;
#endif
  __vector short rg0, rg1, rg2, rg3, bx0, bx1, bx2, bx3, ye, yo, cbl, cbh,
    crl, crh, r_yl, r_yh, g_yl, g_yh, b_yl, b_yh, g_y0w, g_y1w, g_y2w, g_y3w,
    rl, rh, gl, gh, bl, bh, re, ro, ge, go, be, bo;
  __vector int g_y0, g_y1, g_y2, g_y3;

  /* Constants
   * NOTE: The >> 1 is to compensate for the fact that vec_madds() returns 17
   * high-order bits, not 16.
   */
  __vector short pw_f0402 = { __8X(F_0_402 >> 1) },
    pw_mf0228 = { __8X(-F_0_228 >> 1) },
    pw_mf0344_f0285 = { __4X2(-F_0_344, F_0_285) },
    pw_one = { __8X(1) }, pw_255 = { __8X(255) },
    pw_cj = { __8X(CENTERJSAMPLE) };
  __vector int pd_onehalf = { __4X(ONE_HALF) };
  __vector unsigned char pb_zero = { __16X(0) },
#if __BIG_ENDIAN__
    shift_pack_index =
      { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
    even_index =
      { 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30 },
    odd_index =
      { 0, 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31 };
#else
    shift_pack_index =
      { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
    even_index =
      { 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26, 0, 28, 0, 30, 0 },
    odd_index =
      { 17, 0, 19, 0, 21, 0, 23, 0, 25, 0, 27, 0, 29, 0, 31, 0 };
#endif

  inptr0 = input_buf[0][in_row_group_ctr];
  inptr1 = input_buf[1][in_row_group_ctr];
  inptr2 = input_buf[2][in_row_group_ctr];
  outptr = output_buf[0];
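
  /* Main column loop:  each iteration loads 16 Cb and 16 Cr samples and
   * computes their contributions to R, G, and B once; the inner yloop below
   * then applies those terms to up to two 16-sample luma vectors.
   */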
  for (num_cols = pitch; num_cols > 0; inptr1 += 16, inptr2 += 16) {

    cb = vec_ld(0, inptr1);
    /* NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
     * support unsigned vectors.
     */
    cbl = (__vector signed short)VEC_UNPACKHU(cb);
    cbh = (__vector signed short)VEC_UNPACKLU(cb);
    cbl = vec_sub(cbl, pw_cj);
    cbh = vec_sub(cbh, pw_cj);

    cr = vec_ld(0, inptr2);
    crl = (__vector signed short)VEC_UNPACKHU(cr);
    crh = (__vector signed short)VEC_UNPACKLU(cr);
    crl = vec_sub(crl, pw_cj);
    crh = vec_sub(crh, pw_cj);

    /* (Original)
     * R = Y                + 1.40200 * Cr
     * G = Y - 0.34414 * Cb - 0.71414 * Cr
     * B = Y + 1.77200 * Cb
     *
     * (This implementation)
     * R = Y                + 0.40200 * Cr + Cr
     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
     * B = Y - 0.22800 * Cb + Cb + Cb
     */
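    /* This decomposition keeps every multiplier within the signed 16-bit
     * fixed-point range that vec_madds() requires:  1.40200 = 1 + 0.40200,
     * 1.77200 = 2 - 0.22800, and -0.71414 = 0.28586 - 1.  The whole-number
     * parts become plain additions or subtractions of Cr and Cb.
     */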
    b_yl = vec_add(cbl, cbl);
    b_yh = vec_add(cbh, cbh);
    b_yl = vec_madds(b_yl, pw_mf0228, pw_one);
    b_yh = vec_madds(b_yh, pw_mf0228, pw_one);
    b_yl = vec_sra(b_yl, (__vector unsigned short)pw_one);
    b_yh = vec_sra(b_yh, (__vector unsigned short)pw_one);
    b_yl = vec_add(b_yl, cbl);
    b_yh = vec_add(b_yh, cbh);
    b_yl = vec_add(b_yl, cbl);
    b_yh = vec_add(b_yh, cbh);

    r_yl = vec_add(crl, crl);
    r_yh = vec_add(crh, crh);
    r_yl = vec_madds(r_yl, pw_f0402, pw_one);
    r_yh = vec_madds(r_yh, pw_f0402, pw_one);
    r_yl = vec_sra(r_yl, (__vector unsigned short)pw_one);
    r_yh = vec_sra(r_yh, (__vector unsigned short)pw_one);
    r_yl = vec_add(r_yl, crl);
    r_yh = vec_add(r_yh, crh);

    g_y0w = vec_mergeh(cbl, crl);
    g_y1w = vec_mergel(cbl, crl);
    g_y0 = vec_msums(g_y0w, pw_mf0344_f0285, pd_onehalf);
    g_y1 = vec_msums(g_y1w, pw_mf0344_f0285, pd_onehalf);
    g_y2w = vec_mergeh(cbh, crh);
    g_y3w = vec_mergel(cbh, crh);
    g_y2 = vec_msums(g_y2w, pw_mf0344_f0285, pd_onehalf);
    g_y3 = vec_msums(g_y3w, pw_mf0344_f0285, pd_onehalf);

    /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
     * each dword into a new 16-bit vector, which is the equivalent of
     * descaling the 32-bit results (right-shifting by 16 bits) and then
     * packing them.
     */
    g_yl = vec_perm((__vector short)g_y0, (__vector short)g_y1,
                    shift_pack_index);
    g_yh = vec_perm((__vector short)g_y2, (__vector short)g_y3,
                    shift_pack_index);
    g_yl = vec_sub(g_yl, crl);
    g_yh = vec_sub(g_yh, crh);
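
    /* Pass 0 pairs the low chroma half (*_yl) with the first 16 luma samples;
     * pass 1 pairs the high chroma half (*_yh) with the next 16.  Within each
     * pass, chroma sample i is shared by luma samples 2i (even_index) and
     * 2i + 1 (odd_index).
     */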
    for (yloop = 0; yloop < 2 && num_cols > 0; yloop++,
         num_cols -= RGB_PIXELSIZE * 16,
         outptr += RGB_PIXELSIZE * 16, inptr0 += 16) {

      y = vec_ld(0, inptr0);
      ye = (__vector signed short)vec_perm(pb_zero, y, even_index);
      yo = (__vector signed short)vec_perm(pb_zero, y, odd_index);

      if (yloop == 0) {
        be = vec_add(b_yl, ye);
        bo = vec_add(b_yl, yo);
        re = vec_add(r_yl, ye);
        ro = vec_add(r_yl, yo);
        ge = vec_add(g_yl, ye);
        go = vec_add(g_yl, yo);
      } else {
        be = vec_add(b_yh, ye);
        bo = vec_add(b_yh, yo);
        re = vec_add(r_yh, ye);
        ro = vec_add(r_yh, yo);
        ge = vec_add(g_yh, ye);
        go = vec_add(g_yh, yo);
      }
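
      /* Re-interleave the even/odd results into natural pixel order, then
       * pair R with G and B with the 255 filler value so that the packed
       * vectors below have the R G / B X byte order that the final
       * permutations expect.
       */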
      rl = vec_mergeh(re, ro);
      rh = vec_mergel(re, ro);
      gl = vec_mergeh(ge, go);
      gh = vec_mergel(ge, go);
      bl = vec_mergeh(be, bo);
      bh = vec_mergel(be, bo);

      rg0 = vec_mergeh(rl, gl);
      bx0 = vec_mergeh(bl, pw_255);
      rg1 = vec_mergel(rl, gl);
      bx1 = vec_mergel(bl, pw_255);
      rg2 = vec_mergeh(rh, gh);
      bx2 = vec_mergeh(bh, pw_255);
      rg3 = vec_mergel(rh, gh);
      bx3 = vec_mergel(bh, pw_255);
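
      /* vec_packsu() saturates the signed 16-bit results to unsigned 8-bit,
       * which doubles as the range-limiting (clamping) step.
       */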
      rgbx0 = vec_packsu(rg0, bx0);
      rgbx1 = vec_packsu(rg1, bx1);
      rgbx2 = vec_packsu(rg2, bx2);
      rgbx3 = vec_packsu(rg3, bx3);

#if RGB_PIXELSIZE == 3
      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
       *
       * rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
       */
      rgb0 = vec_perm(rgbx0, rgbx1, (__vector unsigned char)RGB_INDEX0);
      rgb1 = vec_perm(rgbx1, rgbx2, (__vector unsigned char)RGB_INDEX1);
      rgb2 = vec_perm(rgbx2, rgbx3, (__vector unsigned char)RGB_INDEX2);
#else
      /* rgbx0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 X0 B1 X1 B2 X2 B3 X3
       * rgbx1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 X4 B5 X5 B6 X6 B7 X7
       * rgbx2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 X8 B9 X9 Ba Xa Bb Xb
       * rgbx3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Xc Bd Xd Be Xe Bf Xf
       *
       * rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
       */
      rgb0 = vec_perm(rgbx0, rgbx0, (__vector unsigned char)RGB_INDEX);
      rgb1 = vec_perm(rgbx1, rgbx1, (__vector unsigned char)RGB_INDEX);
      rgb2 = vec_perm(rgbx2, rgbx2, (__vector unsigned char)RGB_INDEX);
      rgb3 = vec_perm(rgbx3, rgbx3, (__vector unsigned char)RGB_INDEX);
#endif

#if __BIG_ENDIAN__
      offset = (size_t)outptr & 15;
      if (offset) {
        __vector unsigned char unaligned_shift_index;
        int bytes = num_cols + offset;

        if (bytes < (RGB_PIXELSIZE + 1) * 16 && (bytes & 15)) {
          /* Slow path to prevent buffer overwrite.  Since there is no way to
           * write a partial AltiVec register, overwrite would occur on the
           * last chunk of the last image row if the right edge is not on a
           * 16-byte boundary.  It could also occur on other rows if the bytes
           * per row is low enough.  Since we can't determine whether we're on
           * the last image row, we have to assume every row is the last.
           */
          vec_st(rgb0, 0, tmpbuf);
          vec_st(rgb1, 16, tmpbuf);
          vec_st(rgb2, 32, tmpbuf);
#if RGB_PIXELSIZE == 4
          vec_st(rgb3, 48, tmpbuf);
#endif
          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
        } else {
          /* Fast path */
          unaligned_shift_index = vec_lvsl(0, outptr);
          edgel = vec_ld(0, outptr);
          edgeh = vec_ld(min(num_cols - 1, RGB_PIXELSIZE * 16), outptr);
          edges = vec_perm(edgeh, edgel, unaligned_shift_index);
          unaligned_shift_index = vec_lvsr(0, outptr);
          out0 = vec_perm(edges, rgb0, unaligned_shift_index);
          out1 = vec_perm(rgb0, rgb1, unaligned_shift_index);
          out2 = vec_perm(rgb1, rgb2, unaligned_shift_index);
#if RGB_PIXELSIZE == 4
          out3 = vec_perm(rgb2, rgb3, unaligned_shift_index);
          out4 = vec_perm(rgb3, edges, unaligned_shift_index);
#else
          out3 = vec_perm(rgb2, edges, unaligned_shift_index);
#endif
          vec_st(out0, 0, outptr);
          if (bytes > 16)
            vec_st(out1, 16, outptr);
          if (bytes > 32)
            vec_st(out2, 32, outptr);
          if (bytes > 48)
            vec_st(out3, 48, outptr);
#if RGB_PIXELSIZE == 4
          if (bytes > 64)
            vec_st(out4, 64, outptr);
#endif
        }
      } else {
#endif /* __BIG_ENDIAN__ */
        if (num_cols < RGB_PIXELSIZE * 16 && (num_cols & 15)) {
          /* Slow path */
          VEC_ST(rgb0, 0, tmpbuf);
          VEC_ST(rgb1, 16, tmpbuf);
          VEC_ST(rgb2, 32, tmpbuf);
#if RGB_PIXELSIZE == 4
          VEC_ST(rgb3, 48, tmpbuf);
#endif
          memcpy(outptr, tmpbuf, min(num_cols, RGB_PIXELSIZE * 16));
        } else {
          /* Fast path */
          VEC_ST(rgb0, 0, outptr);
          if (num_cols > 16)
            VEC_ST(rgb1, 16, outptr);
          if (num_cols > 32)
            VEC_ST(rgb2, 32, outptr);
#if RGB_PIXELSIZE == 4
          if (num_cols > 48)
            VEC_ST(rgb3, 48, outptr);
#endif
        }
#if __BIG_ENDIAN__
      }
#endif
    }
  }
}
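

/* In the h2v2 case there are two luma rows per row group.  Each output row is
 * produced by temporarily substituting the corresponding luma row
 * (input_buf[0][in_row_group_ctr * 2] or [in_row_group_ctr * 2 + 1]) for
 * input_buf[0][in_row_group_ctr], pointing output_buf[0] at the desired
 * output row, and reusing the h2v1 routine above; the original pointers are
 * restored before returning.
 */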
void jsimd_h2v2_merged_upsample_altivec(JDIMENSION output_width,
                                        JSAMPIMAGE input_buf,
                                        JDIMENSION in_row_group_ctr,
                                        JSAMPARRAY output_buf)
{
  JSAMPROW inptr, outptr;

  inptr = input_buf[0][in_row_group_ctr];
  outptr = output_buf[0];

  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
  jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
                                     output_buf);

  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
  output_buf[0] = output_buf[1];
  jsimd_h2v1_merged_upsample_altivec(output_width, input_buf, in_row_group_ctr,
                                     output_buf);

  input_buf[0][in_row_group_ctr] = inptr;
  output_buf[0] = outptr;
}