/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software.  If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* CHROMA UPSAMPLING */

#include "jsimd_altivec.h"

/*
 * Fancy (triangle-filter) 2:1 horizontal chroma upsampling.
 *
 * For each input sample s[i], two output samples are produced:
 *   out[2*i]     = (3 * s[i] + s[i-1] + 1) / 4    (biased toward left)
 *   out[2*i + 1] = (3 * s[i] + s[i+1] + 2) / 4    (biased toward right)
 * At the row edges the missing neighbor is replaced by the edge sample
 * itself (see last_index_col0 / next_index_lastcol below.)
 *
 * max_v_samp_factor  = number of rows to process
 * downsampled_width  = width of the input rows, in samples
 * input_data         = input rows
 * *output_data_ptr   = output rows (each twice as wide as the input)
 *
 * 16 input samples are consumed per loop iteration, producing 32 output
 * samples.
 */

void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor,
                                       JDIMENSION downsampled_width,
                                       JSAMPARRAY input_data,
                                       JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow, incol;
  /* next0 is zero-initialized because the last loop iteration assigns
   * this0 = next0 without having loaded next0; the value is never used
   * after that, but reading it must not be undefined. */
  __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0,
    out;
  __vector short this0e, this0o, this0l, this0h, last0l, last0h,
    next0l, next0h, outle, outhe, outlo, outho;

  /* Constants */
  /* pb_zero is referenced by the VEC_UNPACKHU()/VEC_UNPACKLU() macros
   * (presumably defined in jsimd_altivec.h as zero-merge idioms --
   * TODO confirm against that header.) */
  __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
    /* Left-neighbor permutation for the first 16 columns of a row:
     * element 0 duplicates itself (no left neighbor at the row edge). */
    last_index_col0 =
      {  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14 },
    /* Left-neighbor permutation for interior columns: shift right by one
     * sample across the (last0, this0) vector pair. */
    last_index =
      { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 },
    /* Right-neighbor permutation for interior columns: shift left by one
     * sample across the (this0, next0) vector pair. */
    next_index =
      {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16 },
    /* Right-neighbor permutation for the last 16 columns: the final
     * element duplicates itself (no right neighbor at the row edge). */
    next_index_lastcol =
      {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15 },
#if __BIG_ENDIAN__
    /* Interleave the low bytes of the even-column (outle/outhe) and
     * odd-column (outlo/outho) 16-bit results into one packed byte
     * vector.  The low byte of a big-endian 16-bit lane is at the odd
     * byte offset. */
    merge_pack_index =
      {  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31 };
#else
    /* Little-endian variant: the low byte sits at the even byte offset. */
    merge_pack_index =
      {  0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30 };
#endif
  __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {

    inptr = input_data[inrow];
    outptr = output_data[inrow];

    if (downsampled_width & 15)
      /* The last load in the row will read up to 15 samples past the end;
       * replicate the last real sample so the edge math stays correct.
       * NOTE(review): this writes one byte past downsampled_width --
       * presumably the row buffers are padded; verify against the caller. */
      inptr[downsampled_width] = inptr[downsampled_width - 1];

    this0 = vec_ld(0, inptr);
    /* First iteration: the "left neighbor" vector duplicates the first
     * sample in lane 0. */
    p_last0 = vec_perm(this0, this0, last_index_col0);
    last0 = this0;

    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr += 16, outptr += 32) {

      if (downsampled_width - incol > 0) {
        /* Interior iterations: build the left-neighbor vector from the
         * previous and current input vectors. */
        p_last0 = vec_perm(last0, this0, last_index);
        last0 = this0;
      }

      if (incol <= 16)
        /* Last iteration of the row: duplicate the final sample as its
         * own right neighbor. */
        p_next0 = vec_perm(this0, this0, next_index_lastcol);
      else {
        next0 = vec_ld(16, inptr);
        p_next0 = vec_perm(this0, next0, next_index);
      }

      /* Compute 3 * s[i] as 16-bit products.  vec_mule/vec_mulo yield the
       * even- and odd-indexed products separately; the merges restore
       * natural sample order (lanes 0-7 in this0l, 8-15 in this0h). */
      this0e = (__vector short)vec_mule(this0, pb_three);
      this0o = (__vector short)vec_mulo(this0, pb_three);
      this0l = vec_mergeh(this0e, this0o);
      this0h = vec_mergel(this0e, this0o);

      /* Zero-extend the neighbor samples to 16 bits and fold in the
       * rounding bias: +1 for even output columns, +2 for odd. */
      last0l = (__vector short)VEC_UNPACKHU(p_last0);
      last0h = (__vector short)VEC_UNPACKLU(p_last0);
      last0l = vec_add(last0l, pw_one);

      next0l = (__vector short)VEC_UNPACKHU(p_next0);
      next0h = (__vector short)VEC_UNPACKLU(p_next0);
      next0l = vec_add(next0l, pw_two);

      /* outle = (3*s + s[-1] + 1) >> 2;  outlo = (3*s + s[+1] + 2) >> 2 */
      outle = vec_add(this0l, last0l);
      outlo = vec_add(this0l, next0l);
      outle = vec_sr(outle, (__vector unsigned short)pw_two);
      outlo = vec_sr(outlo, (__vector unsigned short)pw_two);

      /* Interleave even/odd results back into byte order and store the
       * first 16 output samples. */
      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr);

      if (incol > 8) {
        /* Same computation for input lanes 8-15 (next 16 outputs). */
        last0h = vec_add(last0h, pw_one);
        next0h = vec_add(next0h, pw_two);

        outhe = vec_add(this0h, last0h);
        outho = vec_add(this0h, next0h);
        outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
        outho = vec_sr(outho, (__vector unsigned short)pw_two);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr);
      }

      this0 = next0;
    }
  }
}
/*
 * Fancy (triangle-filter) 2:2 chroma upsampling: doubles both the
 * horizontal and vertical resolution.
 *
 * The filter is separable.  Vertically, each input row contributes a
 * column sum with its neighbor:
 *   colsum_up[i]   = 3 * this[i] + above[i]     (for the upper output row)
 *   colsum_down[i] = 3 * this[i] + below[i]     (for the lower output row)
 * Horizontally, each column sum is then combined with its left/right
 * neighbor column sum:
 *   out[2*i]     = (3 * colsum[i] + colsum[i-1] + 8) >> 4
 *   out[2*i + 1] = (3 * colsum[i] + colsum[i+1] + 7) >> 4
 * Missing neighbors at the row edges are replaced by the edge column sum
 * itself.
 *
 * Each input row produces two output rows; 16 input samples per loop
 * iteration produce 32 output samples in each of the two rows.
 *
 * NOTE(review): input_data[inrow - 1] is read on the first iteration --
 * presumably the caller supplies context rows above/below the image;
 * verify against the upsampler setup code.
 */

void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor,
                                       JDIMENSION downsampled_width,
                                       JSAMPARRAY input_data,
                                       JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
  int inrow, outrow, incol;
  __vector unsigned char this_1, this0, this1, out;
  /* Naming convention: the "_1" suffix refers to the row above (inrow-1),
   * "1" to the row below (inrow+1); "l"/"h" are the low/high 8 lanes of a
   * 16-sample group; "p_" marks a permuted (neighbor-aligned) vector.
   * The nextcolsum* vectors are zero-initialized because the last loop
   * iteration copies them into thiscolsum* without having computed them. */
  __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
    lastcolsum_1h, lastcolsum1h,
    p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
    thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
    nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 },
    nextcolsum1l = { 0 }, nextcolsum1h = { 0 },
    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
    tmpl, tmph, outle, outhe, outlo, outho;

  /* Constants */
  /* pb_zero is referenced by the VEC_UNPACKHU()/VEC_UNPACKLU() macros
   * (presumably defined in jsimd_altivec.h -- TODO confirm.) */
  __vector unsigned char pb_zero = { __16X(0) },
    /* These permutations operate on 16-bit column sums, so neighbor
     * shifts move in steps of two bytes (compare the byte-granularity
     * tables in the h2v1 routine above). */
    last_index_col0 =
      {  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13 },
    last_index =
      { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
    next_index =
      {  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17 },
    next_index_lastcol =
      {  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15 },
#if __BIG_ENDIAN__
    /* Interleave the low bytes of even-column and odd-column 16-bit
     * results into one packed byte vector (big-endian lane layout). */
    merge_pack_index =
      {  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31 };
#else
    merge_pack_index =
      {  0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30 };
#endif
  __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
    pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
  __vector unsigned short pw_four = { __8X(4) };

  /* Two output rows are emitted per input row. */
  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    inptr_1 = input_data[inrow - 1];
    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow + 1];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    if (downsampled_width & 15) {
      /* Replicate the last sample so the over-read at the end of the row
       * computes correct edge values (see the h2v1 note about padding). */
      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
    }

    /* Prime the pipeline: compute the vertical column sums for the first
     * 16 samples of this row and both neighbor rows. */
    this0 = vec_ld(0, inptr0);
    this0l = (__vector short)VEC_UNPACKHU(this0);
    this0h = (__vector short)VEC_UNPACKLU(this0);
    this0l = vec_mladd(this0l, pw_three, pw_zero);
    this0h = vec_mladd(this0h, pw_three, pw_zero);

    this_1 = vec_ld(0, inptr_1);
    this_1l = (__vector short)VEC_UNPACKHU(this_1);
    this_1h = (__vector short)VEC_UNPACKLU(this_1);
    thiscolsum_1l = vec_add(this0l, this_1l);
    thiscolsum_1h = vec_add(this0h, this_1h);
    lastcolsum_1h = thiscolsum_1h;
    /* First iteration: lane 0 duplicates itself as its own left neighbor. */
    p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
    p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);

    this1 = vec_ld(0, inptr1);
    this1l = (__vector short)VEC_UNPACKHU(this1);
    this1h = (__vector short)VEC_UNPACKLU(this1);
    thiscolsum1l = vec_add(this0l, this1l);
    thiscolsum1h = vec_add(this0h, this1h);
    lastcolsum1h = thiscolsum1h;
    p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
    p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);

    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
         outptr0 += 32, outptr1 += 32) {

      if (downsampled_width - incol > 0) {
        /* Interior iterations: carry the previous iteration's high column
         * sums across as the left neighbor of lane 0. */
        p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
        p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
        p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
        p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
        lastcolsum_1h = thiscolsum_1h;  lastcolsum1h = thiscolsum1h;
      }

      if (incol <= 16) {
        /* Last iteration: duplicate the final column sum as its own right
         * neighbor. */
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
                                   next_index_lastcol);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
                                  next_index_lastcol);
      } else {
        /* Compute the column sums for the next 16 input samples, so the
         * right-neighbor permutation has data to draw from. */
        this0 = vec_ld(16, inptr0);
        this0l = (__vector short)VEC_UNPACKHU(this0);
        this0h = (__vector short)VEC_UNPACKLU(this0);
        this0l = vec_mladd(this0l, pw_three, pw_zero);
        this0h = vec_mladd(this0h, pw_three, pw_zero);

        this_1 = vec_ld(16, inptr_1);
        this_1l = (__vector short)VEC_UNPACKHU(this_1);
        this_1h = (__vector short)VEC_UNPACKLU(this_1);
        nextcolsum_1l = vec_add(this0l, this_1l);
        nextcolsum_1h = vec_add(this0h, this_1h);
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);

        this1 = vec_ld(16, inptr1);
        this1l = (__vector short)VEC_UNPACKHU(this1);
        this1h = (__vector short)VEC_UNPACKLU(this1);
        nextcolsum1l = vec_add(this0l, this1l);
        nextcolsum1h = vec_add(this0h, this1h);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
      }

      /* Process the upper row:
       * even out = (3*colsum + left + 8) >> 4, odd = (3*colsum + right + 7) >> 4 */
      tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum_1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum_1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr0);

      if (incol > 8) {
        /* Second 16 output samples of the upper row (input lanes 8-15). */
        tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum_1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum_1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr0);
      }

      /* Process the lower row (same math, colsums built from the row
       * below instead of the row above). */
      tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr1);

      if (incol > 8) {
        tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr1);
      }

      /* Slide the window: next becomes current. */
      thiscolsum_1l = nextcolsum_1l;  thiscolsum_1h = nextcolsum_1h;
      thiscolsum1l = nextcolsum1l;  thiscolsum1h = nextcolsum1h;
    }
  }
}
  264. /* These are rarely used (mainly just for decompressing YCCK images) */
  265. void jsimd_h2v1_upsample_altivec(int max_v_samp_factor,
  266. JDIMENSION output_width,
  267. JSAMPARRAY input_data,
  268. JSAMPARRAY *output_data_ptr)
  269. {
  270. JSAMPARRAY output_data = *output_data_ptr;
  271. JSAMPROW inptr, outptr;
  272. int inrow, incol;
  273. __vector unsigned char in, inl, inh;
  274. for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
  275. inptr = input_data[inrow];
  276. outptr = output_data[inrow];
  277. for (incol = (output_width + 31) & (~31); incol > 0;
  278. incol -= 64, inptr += 32, outptr += 64) {
  279. in = vec_ld(0, inptr);
  280. inl = vec_mergeh(in, in);
  281. inh = vec_mergel(in, in);
  282. vec_st(inl, 0, outptr);
  283. vec_st(inh, 16, outptr);
  284. if (incol > 32) {
  285. in = vec_ld(16, inptr);
  286. inl = vec_mergeh(in, in);
  287. inh = vec_mergel(in, in);
  288. vec_st(inl, 32, outptr);
  289. vec_st(inh, 48, outptr);
  290. }
  291. }
  292. }
  293. }
  294. void jsimd_h2v2_upsample_altivec(int max_v_samp_factor,
  295. JDIMENSION output_width,
  296. JSAMPARRAY input_data,
  297. JSAMPARRAY *output_data_ptr)
  298. {
  299. JSAMPARRAY output_data = *output_data_ptr;
  300. JSAMPROW inptr, outptr0, outptr1;
  301. int inrow, outrow, incol;
  302. __vector unsigned char in, inl, inh;
  303. for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
  304. inptr = input_data[inrow];
  305. outptr0 = output_data[outrow++];
  306. outptr1 = output_data[outrow++];
  307. for (incol = (output_width + 31) & (~31); incol > 0;
  308. incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) {
  309. in = vec_ld(0, inptr);
  310. inl = vec_mergeh(in, in);
  311. inh = vec_mergel(in, in);
  312. vec_st(inl, 0, outptr0);
  313. vec_st(inl, 0, outptr1);
  314. vec_st(inh, 16, outptr0);
  315. vec_st(inh, 16, outptr1);
  316. if (incol > 32) {
  317. in = vec_ld(16, inptr);
  318. inl = vec_mergeh(in, in);
  319. inh = vec_mergel(in, in);
  320. vec_st(inl, 32, outptr0);
  321. vec_st(inl, 32, outptr1);
  322. vec_st(inh, 48, outptr0);
  323. vec_st(inh, 48, outptr1);
  324. }
  325. }
  326. }
  327. }