alpha_processing_sse2.c 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. // Copyright 2014 Google Inc. All Rights Reserved.
  2. //
  3. // Use of this source code is governed by a BSD-style license
  4. // that can be found in the COPYING file in the root of the source
  5. // tree. An additional intellectual property rights grant can be found
  6. // in the file PATENTS. All contributing project authors may
  7. // be found in the AUTHORS file in the root of the source tree.
  8. // -----------------------------------------------------------------------------
  9. //
  10. // Utilities for processing transparent channel.
  11. //
  12. // Author: Skal (pascal.massimino@gmail.com)
  13. #include "./dsp.h"
  14. #if defined(WEBP_USE_SSE2)
  15. #include <emmintrin.h>
  16. //------------------------------------------------------------------------------
  // Copies the alpha sample of every pixel of 'argb' into the 'alpha' plane and
  // reports whether the whole picture is opaque.
  //
  // The alpha sample of pixel 'i' in a row is the byte 'argb[4 * i]'.
  //   argb, argb_stride:   source samples and the byte distance between rows.
  //   width, height:       picture dimensions, in pixels.
  //   alpha, alpha_stride: destination plane and the byte distance between rows.
  // Returns 1 when every extracted value equals 0xff (also when there is nothing
  // to extract), 0 otherwise.
  static int ExtractAlpha(const uint8_t* argb, int argb_stride,
                          int width, int height,
                          uint8_t* alpha, int alpha_stride) {
    // 'and' of all the alpha values handled by the scalar tail loop. It is no
    // longer 0xff as soon as one of them differs from 0xff.
    uint32_t alpha_and = 0xff;
    int i, j;
    // Keeps only the low byte of each 32-bit lane, which is where the alpha
    // sample sits after a (little-endian) load.
    // The intrinsics take 'int' arguments, hence the signed constants: passing
    // '0xffu' / '~0u' relied on an unsigned-to-signed conversion.
    const __m128i a_mask = _mm_set1_epi32(0xff);
    // Lower eight bytes are 0xff, upper eight bytes are zero.
    const __m128i all_0xff = _mm_set_epi32(0, 0, -1, -1);
    // Running 'and' of the eight alpha bytes produced by each vector iteration.
    __m128i all_alphas = all_0xff;

    // The vector loop always leaves at least the last pixel of a row to the
    // scalar loop. That way no 16-byte load reaches past the last sample
    // 'argb[4 * width - 4]': the three bytes following it may not be readable,
    // because the caller's pointer can designate either the first or the last
    // byte of each quadruplet.
    const int limit = (width - 1) & ~7;

    for (j = 0; j < height; ++j) {
      const __m128i* src = (const __m128i*)argb;
      for (i = 0; i < limit; i += 8) {
        // Load eight pixels (32 bytes) and isolate their alpha samples.
        const __m128i a0 = _mm_loadu_si128(src + 0);
        const __m128i a1 = _mm_loadu_si128(src + 1);
        const __m128i b0 = _mm_and_si128(a0, a_mask);
        const __m128i b1 = _mm_and_si128(a1, a_mask);
        // Narrow 32b -> 16b -> 8b. Values are in [0, 255], so neither
        // saturating pack alters them. The eight bytes end up in both halves
        // of 'd0'.
        const __m128i c0 = _mm_packs_epi32(b0, b1);
        const __m128i d0 = _mm_packus_epi16(c0, c0);
        // Store the eight alpha bytes.
        _mm_storel_epi64((__m128i*)&alpha[i], d0);
        // Accumulate eight alpha 'and' in parallel.
        all_alphas = _mm_and_si128(all_alphas, d0);
        src += 2;
      }
      // Remaining pixels of the row (at least one whenever width > 0).
      for (; i < width; ++i) {
        const uint8_t alpha_value = argb[4 * i];
        alpha[i] = alpha_value;
        alpha_and &= alpha_value;
      }
      argb += argb_stride;
      alpha += alpha_stride;
    }
    // Byte-compare the accumulator against 0xff: the low eight bits of the mask
    // tell which of the eight lanes only saw 0xff. The upper eight bytes compare
    // equal (zero against zero) but are discarded since 'alpha_and' <= 0xff.
    alpha_and &=
        (uint32_t)_mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
    return (alpha_and == 0xff);
  }
  59. #endif // WEBP_USE_SSE2
  60. //------------------------------------------------------------------------------
  61. // Init function
  // Prototype for the initializer defined right below.
  extern void WebPInitAlphaProcessingSSE2(void);

  // Points WebPExtractAlpha at the ExtractAlpha implementation of this file.
  // Compiles to an empty function when WEBP_USE_SSE2 is not defined.
  void WebPInitAlphaProcessingSSE2(void) {
  #ifdef WEBP_USE_SSE2
    WebPExtractAlpha = ExtractAlpha;
  #endif  // WEBP_USE_SSE2
  }