jcsample-altivec.c 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. /*
  2. * AltiVec optimizations for libjpeg-turbo
  3. *
  4. * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
  5. *
  6. * This software is provided 'as-is', without any express or implied
  7. * warranty. In no event will the authors be held liable for any damages
  8. * arising from the use of this software.
  9. *
  10. * Permission is granted to anyone to use this software for any purpose,
  11. * including commercial applications, and to alter it and redistribute it
  12. * freely, subject to the following restrictions:
  13. *
  14. * 1. The origin of this software must not be misrepresented; you must not
  15. * claim that you wrote the original software. If you use this software
  16. * in a product, an acknowledgment in the product documentation would be
  17. * appreciated but is not required.
  18. * 2. Altered source versions must be plainly marked as such, and must not be
  19. * misrepresented as being the original software.
  20. * 3. This notice may not be removed or altered from any source distribution.
  21. */
  22. /* CHROMA DOWNSAMPLING */
  23. #include "jsimd_altivec.h"
  24. #include "jcsample.h"
  25. void jsimd_h2v1_downsample_altivec(JDIMENSION image_width,
  26. int max_v_samp_factor,
  27. JDIMENSION v_samp_factor,
  28. JDIMENSION width_in_blocks,
  29. JSAMPARRAY input_data,
  30. JSAMPARRAY output_data)
  31. {
  32. int outrow, outcol;
  33. JDIMENSION output_cols = width_in_blocks * DCTSIZE;
  34. JSAMPROW inptr, outptr;
  35. __vector unsigned char this0, next0, out;
  36. __vector unsigned short this0e, this0o, next0e, next0o, outl, outh;
  37. /* Constants */
  38. __vector unsigned short pw_bias = { __4X2(0, 1) },
  39. pw_one = { __8X(1) };
  40. __vector unsigned char even_odd_index =
  41. { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
  42. pb_zero = { __16X(0) };
  43. expand_right_edge(input_data, max_v_samp_factor, image_width,
  44. output_cols * 2);
  45. for (outrow = 0; outrow < v_samp_factor; outrow++) {
  46. outptr = output_data[outrow];
  47. inptr = input_data[outrow];
  48. for (outcol = output_cols; outcol > 0;
  49. outcol -= 16, inptr += 32, outptr += 16) {
  50. this0 = vec_ld(0, inptr);
  51. this0 = vec_perm(this0, this0, even_odd_index);
  52. this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
  53. this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
  54. outl = vec_add(this0e, this0o);
  55. outl = vec_add(outl, pw_bias);
  56. outl = vec_sr(outl, pw_one);
  57. if (outcol > 8) {
  58. next0 = vec_ld(16, inptr);
  59. next0 = vec_perm(next0, next0, even_odd_index);
  60. next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
  61. next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
  62. outh = vec_add(next0e, next0o);
  63. outh = vec_add(outh, pw_bias);
  64. outh = vec_sr(outh, pw_one);
  65. } else
  66. outh = vec_splat_u16(0);
  67. out = vec_pack(outl, outh);
  68. vec_st(out, 0, outptr);
  69. }
  70. }
  71. }
  72. void
  73. jsimd_h2v2_downsample_altivec(JDIMENSION image_width, int max_v_samp_factor,
  74. JDIMENSION v_samp_factor,
  75. JDIMENSION width_in_blocks,
  76. JSAMPARRAY input_data, JSAMPARRAY output_data)
  77. {
  78. int inrow, outrow, outcol;
  79. JDIMENSION output_cols = width_in_blocks * DCTSIZE;
  80. JSAMPROW inptr0, inptr1, outptr;
  81. __vector unsigned char this0, next0, this1, next1, out;
  82. __vector unsigned short this0e, this0o, next0e, next0o, this1e, this1o,
  83. next1e, next1o, out0l, out0h, out1l, out1h, outl, outh;
  84. /* Constants */
  85. __vector unsigned short pw_bias = { __4X2(1, 2) },
  86. pw_two = { __8X(2) };
  87. __vector unsigned char even_odd_index =
  88. { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
  89. pb_zero = { __16X(0) };
  90. expand_right_edge(input_data, max_v_samp_factor, image_width,
  91. output_cols * 2);
  92. for (inrow = 0, outrow = 0; outrow < v_samp_factor;
  93. inrow += 2, outrow++) {
  94. inptr0 = input_data[inrow];
  95. inptr1 = input_data[inrow + 1];
  96. outptr = output_data[outrow];
  97. for (outcol = output_cols; outcol > 0;
  98. outcol -= 16, inptr0 += 32, inptr1 += 32, outptr += 16) {
  99. this0 = vec_ld(0, inptr0);
  100. this0 = vec_perm(this0, this0, even_odd_index);
  101. this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
  102. this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
  103. out0l = vec_add(this0e, this0o);
  104. this1 = vec_ld(0, inptr1);
  105. this1 = vec_perm(this1, this1, even_odd_index);
  106. this1e = (__vector unsigned short)VEC_UNPACKHU(this1);
  107. this1o = (__vector unsigned short)VEC_UNPACKLU(this1);
  108. out1l = vec_add(this1e, this1o);
  109. outl = vec_add(out0l, out1l);
  110. outl = vec_add(outl, pw_bias);
  111. outl = vec_sr(outl, pw_two);
  112. if (outcol > 8) {
  113. next0 = vec_ld(16, inptr0);
  114. next0 = vec_perm(next0, next0, even_odd_index);
  115. next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
  116. next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
  117. out0h = vec_add(next0e, next0o);
  118. next1 = vec_ld(16, inptr1);
  119. next1 = vec_perm(next1, next1, even_odd_index);
  120. next1e = (__vector unsigned short)VEC_UNPACKHU(next1);
  121. next1o = (__vector unsigned short)VEC_UNPACKLU(next1);
  122. out1h = vec_add(next1e, next1o);
  123. outh = vec_add(out0h, out1h);
  124. outh = vec_add(outh, pw_bias);
  125. outh = vec_sr(outh, pw_two);
  126. } else
  127. outh = vec_splat_u16(0);
  128. out = vec_pack(outl, outh);
  129. vec_st(out, 0, outptr);
  130. }
  131. }
  132. }