neon.h 3.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. // Copyright 2014 Google Inc. All Rights Reserved.
  2. //
  3. // Use of this source code is governed by a BSD-style license
  4. // that can be found in the COPYING file in the root of the source
  5. // tree. An additional intellectual property rights grant can be found
  6. // in the file PATENTS. All contributing project authors may
  7. // be found in the AUTHORS file in the root of the source tree.
  8. // -----------------------------------------------------------------------------
  9. //
  10. // NEON common code.
  11. #ifndef WEBP_DSP_NEON_H_
  12. #define WEBP_DSP_NEON_H_
  13. #include <arm_neon.h>
  14. #include "./dsp.h"
  // Some intrinsic functions currently seem slower, so they are disabled
  // everywhere except on aarch64, where the inline assembly is incompatible.
  #ifdef __aarch64__
  #define USE_INTRINSICS   // use intrinsics when possible
  #endif
  // Assigns 'a' and 'b' to v.val[0] and v.val[1], where 'v' is an lvalue of a
  // type holding a 'val' array of at least two elements (e.g. int32x4x2_t).
  // All parameters are parenthesized so that expressions such as '*ptr' can be
  // passed safely. Note that 'v' is evaluated once per assignment, so it must
  // not have side effects.
  #define INIT_VECTOR2(v, a, b) do {  \
    (v).val[0] = (a);                 \
    (v).val[1] = (b);                 \
  } while (0)
  // Assigns 'a', 'b' and 'c' to v.val[0..2], where 'v' is an lvalue of a type
  // holding a 'val' array of at least three elements (e.g. int32x4x3_t).
  // All parameters are parenthesized so that expressions such as '*ptr' can be
  // passed safely. Note that 'v' is evaluated once per assignment, so it must
  // not have side effects.
  #define INIT_VECTOR3(v, a, b, c) do {  \
    (v).val[0] = (a);                    \
    (v).val[1] = (b);                    \
    (v).val[2] = (c);                    \
  } while (0)
  // Assigns 'a', 'b', 'c' and 'd' to v.val[0..3], where 'v' is an lvalue of a
  // type holding a 'val' array of at least four elements (e.g. int32x4x4_t).
  // All parameters are parenthesized so that expressions such as '*ptr' can be
  // passed safely. Note that 'v' is evaluated once per assignment, so it must
  // not have side effects.
  #define INIT_VECTOR4(v, a, b, c, d) do {  \
    (v).val[0] = (a);                       \
    (v).val[1] = (b);                       \
    (v).val[2] = (c);                       \
    (v).val[3] = (d);                       \
  } while (0)
  35. // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
  36. // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
  37. // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
  38. #if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
  39. #define WORK_AROUND_GCC
  40. #endif
  41. static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
  42. uint64x2x2_t row01, row23;
  43. row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
  44. row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
  45. row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
  46. row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
  47. // Transpose 64-bit values (there's no vswp equivalent)
  48. {
  49. const uint64x1_t row0h = vget_high_u64(row01.val[0]);
  50. const uint64x1_t row2l = vget_low_u64(row23.val[0]);
  51. const uint64x1_t row1h = vget_high_u64(row01.val[1]);
  52. const uint64x1_t row3l = vget_low_u64(row23.val[1]);
  53. row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
  54. row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
  55. row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
  56. row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
  57. }
  58. {
  59. const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
  60. vreinterpretq_s32_u64(row01.val[1]));
  61. const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
  62. vreinterpretq_s32_u64(row23.val[1]));
  63. int32x4x4_t out;
  64. out.val[0] = out01.val[0];
  65. out.val[1] = out01.val[1];
  66. out.val[2] = out23.val[0];
  67. out.val[3] = out23.val[1];
  68. return out;
  69. }
  70. }
  71. #endif // WEBP_DSP_NEON_H_