lossless_neon.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332
  1. // Copyright 2014 Google Inc. All Rights Reserved.
  2. //
  3. // Use of this source code is governed by a BSD-style license
  4. // that can be found in the COPYING file in the root of the source
  5. // tree. An additional intellectual property rights grant can be found
  6. // in the file PATENTS. All contributing project authors may
  7. // be found in the AUTHORS file in the root of the source tree.
  8. // -----------------------------------------------------------------------------
  9. //
  10. // NEON variant of methods for lossless decoder
  11. //
  12. // Author: Skal (pascal.massimino@gmail.com)
  13. #include "./dsp.h"
  14. #if defined(WEBP_USE_NEON)
  15. #include <arm_neon.h>
  16. #include "./lossless.h"
  17. #include "./neon.h"
  18. //------------------------------------------------------------------------------
  19. // Colorspace conversion functions
  20. #if !defined(WORK_AROUND_GCC)
  21. // gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
  22. // gcc-4.8.x at least.
  23. static void ConvertBGRAToRGBA(const uint32_t* src,
  24. int num_pixels, uint8_t* dst) {
  25. const uint32_t* const end = src + (num_pixels & ~15);
  26. for (; src < end; src += 16) {
  27. uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
  28. // swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
  29. const uint8x16_t tmp = pixel.val[0];
  30. pixel.val[0] = pixel.val[2];
  31. pixel.val[2] = tmp;
  32. vst4q_u8(dst, pixel);
  33. dst += 64;
  34. }
  35. VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst); // left-overs
  36. }
  37. static void ConvertBGRAToBGR(const uint32_t* src,
  38. int num_pixels, uint8_t* dst) {
  39. const uint32_t* const end = src + (num_pixels & ~15);
  40. for (; src < end; src += 16) {
  41. const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
  42. const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } };
  43. vst3q_u8(dst, tmp);
  44. dst += 48;
  45. }
  46. VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst); // left-overs
  47. }
  48. static void ConvertBGRAToRGB(const uint32_t* src,
  49. int num_pixels, uint8_t* dst) {
  50. const uint32_t* const end = src + (num_pixels & ~15);
  51. for (; src < end; src += 16) {
  52. const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
  53. const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
  54. vst3q_u8(dst, tmp);
  55. dst += 48;
  56. }
  57. VP8LConvertBGRAToRGB_C(src, num_pixels & 15, dst); // left-overs
  58. }
  59. #else // WORK_AROUND_GCC
  60. // gcc-4.6.0 fallback
  61. static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
  62. static void ConvertBGRAToRGBA(const uint32_t* src,
  63. int num_pixels, uint8_t* dst) {
  64. const uint32_t* const end = src + (num_pixels & ~1);
  65. const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
  66. for (; src < end; src += 2) {
  67. const uint8x8_t pixels = vld1_u8((uint8_t*)src);
  68. vst1_u8(dst, vtbl1_u8(pixels, shuffle));
  69. dst += 8;
  70. }
  71. VP8LConvertBGRAToRGBA_C(src, num_pixels & 1, dst); // left-overs
  72. }
// Byte indices into a 32-byte group of 8 BGRA pixels: pixel i occupies
// bytes 4*i..4*i+3 (B,G,R,A). Each row selects 8 output bytes, skipping
// every index that is 3 mod 4 (the alpha byte), yielding 24 BGR bytes.
static const uint8_t kBGRShuffle[3][8] = {
  {  0,  1,  2,  4,  5,  6,  8,  9 },
  { 10, 12, 13, 14, 16, 17, 18, 20 },
  { 21, 22, 24, 25, 26, 28, 29, 30 }
};

// gcc-4.6.0 fallback: converts BGRA to packed BGR, 8 pixels per iteration
// using vtbl4 lookups over the 32 loaded bytes; the (num_pixels & 7)
// remainder goes through the plain-C fallback.
static void ConvertBGRAToBGR(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~7);
  const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
  const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
  const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]);
  for (; src < end; src += 8) {
    uint8x8x4_t pixels;
    // Load 8 pixels (32 bytes) as four d-registers for vtbl4 indexing.
    INIT_VECTOR4(pixels,
                 vld1_u8((const uint8_t*)(src + 0)),
                 vld1_u8((const uint8_t*)(src + 2)),
                 vld1_u8((const uint8_t*)(src + 4)),
                 vld1_u8((const uint8_t*)(src + 6)));
    vst1_u8(dst +  0, vtbl4_u8(pixels, shuffle0));
    vst1_u8(dst +  8, vtbl4_u8(pixels, shuffle1));
    vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
    dst += 8 * 3;  // 8 pixels * 3 bytes
  }
  VP8LConvertBGRAToBGR_C(src, num_pixels & 7, dst);  // left-overs
}
// Byte indices into a 32-byte group of 8 BGRA pixels, like kBGRShuffle but
// with B and R swapped within each pixel (e.g. 2,1,0 instead of 0,1,2),
// producing 24 RGB bytes with the alpha bytes skipped.
static const uint8_t kRGBShuffle[3][8] = {
  {  2,  1,  0,  6,  5,  4, 10,  9 },
  {  8, 14, 13, 12, 18, 17, 16, 22 },
  { 21, 20, 26, 25, 24, 30, 29, 28 }
};

// gcc-4.6.0 fallback: converts BGRA to packed RGB, 8 pixels per iteration
// using vtbl4 lookups; the (num_pixels & 7) remainder goes through the
// plain-C fallback.
static void ConvertBGRAToRGB(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~7);
  const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
  const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
  const uint8x8_t shuffle2 = vld1_u8(kRGBShuffle[2]);
  for (; src < end; src += 8) {
    uint8x8x4_t pixels;
    // Load 8 pixels (32 bytes) as four d-registers for vtbl4 indexing.
    INIT_VECTOR4(pixels,
                 vld1_u8((const uint8_t*)(src + 0)),
                 vld1_u8((const uint8_t*)(src + 2)),
                 vld1_u8((const uint8_t*)(src + 4)),
                 vld1_u8((const uint8_t*)(src + 6)));
    vst1_u8(dst +  0, vtbl4_u8(pixels, shuffle0));
    vst1_u8(dst +  8, vtbl4_u8(pixels, shuffle1));
    vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
    dst += 8 * 3;  // 8 pixels * 3 bytes
  }
  VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst);  // left-overs
}
  123. #endif // !WORK_AROUND_GCC
  124. //------------------------------------------------------------------------------
  125. #ifdef USE_INTRINSICS
// Per-channel average of two packed 32-bit pixels: each byte lane becomes
// (a_i + b_i) >> 1 (vhadd_u8 is a truncating halving add).
static WEBP_INLINE uint32_t Average2(const uint32_t* const a,
                                     const uint32_t* const b) {
  // vcreate_u64(*a) zero-extends the 32-bit pixel into a 64-bit lane;
  // only the low 4 bytes of the 8-byte vector are meaningful.
  const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
  const uint8x8_t avg = vhadd_u8(a0, b0);
  // Extract the low 4 bytes back as a packed pixel.
  return vget_lane_u32(vreinterpret_u32_u8(avg), 0);
}
// Per-channel average of three packed pixels, computed as
// avg(avg(a, c), b) with truncating halving adds.
static WEBP_INLINE uint32_t Average3(const uint32_t* const a,
                                     const uint32_t* const b,
                                     const uint32_t* const c) {
  // Zero-extend each 32-bit pixel into the low half of an 8-byte vector.
  const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
  const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c));
  const uint8x8_t avg1 = vhadd_u8(a0, c0);   // (a + c) >> 1
  const uint8x8_t avg2 = vhadd_u8(avg1, b0); // (avg1 + b) >> 1
  return vget_lane_u32(vreinterpret_u32_u8(avg2), 0);
}
// Per-channel average of four packed pixels, computed pairwise:
// avg(avg(a, b), avg(c, d)) with truncating halving adds.
static WEBP_INLINE uint32_t Average4(const uint32_t* const a,
                                     const uint32_t* const b,
                                     const uint32_t* const c,
                                     const uint32_t* const d) {
  // Zero-extend each 32-bit pixel into the low half of an 8-byte vector.
  const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
  const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c));
  const uint8x8_t d0 = vreinterpret_u8_u64(vcreate_u64(*d));
  const uint8x8_t avg1 = vhadd_u8(a0, b0);     // (a + b) >> 1
  const uint8x8_t avg2 = vhadd_u8(c0, d0);     // (c + d) >> 1
  const uint8x8_t avg3 = vhadd_u8(avg1, avg2); // (avg1 + avg2) >> 1
  return vget_lane_u32(vreinterpret_u32_u8(avg3), 0);
}
// Predictor #5: Average3(left, top[0], top[1]).
static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
  return Average3(&left, top + 0, top + 1);
}
// Predictor #6: Average2(left, top[-1]).
static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
  return Average2(&left, top - 1);
}
// Predictor #7: Average2(left, top[0]).
static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
  return Average2(&left, top + 0);
}
// Predictor #8: Average2(top[-1], top[0]); 'left' is unused.
static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
  (void)left;
  return Average2(top - 1, top + 0);
}
// Predictor #9: Average2(top[0], top[1]); 'left' is unused.
static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
  (void)left;
  return Average2(top + 0, top + 1);
}
// Predictor #10: Average4(left, top[-1], top[0], top[1]).
static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
  return Average4(&left, top - 1, top + 0, top + 1);
}
  176. //------------------------------------------------------------------------------
// Returns *c0 when the per-channel sum of |c1 - c2| is <= the sum of
// |c0 - c2|, otherwise *c1 (i.e. picks the candidate whose channels are
// closer to c2).
static WEBP_INLINE uint32_t Select(const uint32_t* const c0,
                                   const uint32_t* const c1,
                                   const uint32_t* const c2) {
  // Zero-extend each 32-bit pixel; only the low 4 byte lanes are used.
  const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
  const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
  const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
  const uint8x8_t bc = vabd_u8(p1, p2);   // |b-c|
  const uint8x8_t ac = vabd_u8(p0, p2);   // |a-c|
  // Pairwise-add bytes to u16; max value is 2*255 so the reinterpret to
  // signed 16-bit cannot overflow.
  const int16x4_t sum_bc = vreinterpret_s16_u16(vpaddl_u8(bc));
  const int16x4_t sum_ac = vreinterpret_s16_u16(vpaddl_u8(ac));
  // Lane 0 of the pairwise-widened difference folds all 4 channels
  // (the upper lanes came from the zero-extended half and are zero).
  const int32x2_t diff = vpaddl_s16(vsub_s16(sum_bc, sum_ac));
  const int32_t pa_minus_pb = vget_lane_s32(diff, 0);
  return (pa_minus_pb <= 0) ? *c0 : *c1;
}
// Predictor #11: Select(top[0], left, top[-1]).
static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
  return Select(top + 0, &left, top - 1);
}
// Per channel, computes c0 + c1 - c2 clamped to [0, 255]:
// vqsubq_u16 saturates the low end at 0, vqmovn_u16 the high end at 255.
static WEBP_INLINE uint32_t ClampedAddSubtractFull(const uint32_t* const c0,
                                                   const uint32_t* const c1,
                                                   const uint32_t* const c2) {
  const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
  const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
  const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
  const uint16x8_t sum0 = vaddl_u8(p0, p1);                // add and widen
  const uint16x8_t sum1 = vqsubq_u16(sum0, vmovl_u8(p2));  // widen and subtract
  const uint8x8_t out = vqmovn_u16(sum1);                  // narrow and clamp
  return vget_lane_u32(vreinterpret_u32_u8(out), 0);
}
// Predictor #12: ClampedAddSubtractFull(left, top[0], top[-1]).
static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
  return ClampedAddSubtractFull(&left, top + 0, top - 1);
}
// Per channel, starts from avg = (c0 + c1) >> 1 and moves it away from c2
// by half the difference, with saturation. Since vqsub_u8 saturates at 0,
// at most one of 'ab'/'ba' is non-zero for a given lane, so the final
// expression adds (avg-c2)>>1 when avg > c2 and subtracts (c2-avg)>>1
// otherwise, clamped to [0, 255] by the saturating add/sub.
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(const uint32_t* const c0,
                                                   const uint32_t* const c1,
                                                   const uint32_t* const c2) {
  const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
  const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
  const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
  const uint8x8_t avg = vhadd_u8(p0, p1);                 // Average(c0,c1)
  const uint8x8_t ab = vshr_n_u8(vqsub_u8(avg, p2), 1);   // (a-b)>>1 saturated
  const uint8x8_t ba = vshr_n_u8(vqsub_u8(p2, avg), 1);   // (b-a)>>1 saturated
  const uint8x8_t out = vqsub_u8(vqadd_u8(avg, ab), ba);
  return vget_lane_u32(vreinterpret_u32_u8(out), 0);
}
// Predictor #13: ClampedAddSubtractHalf(left, top[0], top[-1]).
static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
  return ClampedAddSubtractHalf(&left, top + 0, top - 1);
}
  223. //------------------------------------------------------------------------------
  224. // Subtract-Green Transform
  225. // vtbl? are unavailable in iOS/arm64 builds.
  226. #if !defined(__aarch64__)
  227. // 255 = byte will be zero'd
  228. static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
  229. static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  230. const uint32_t* const end = argb_data + (num_pixels & ~3);
  231. const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
  232. for (; argb_data < end; argb_data += 4) {
  233. const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
  234. const uint8x16_t greens =
  235. vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
  236. vtbl1_u8(vget_high_u8(argb), shuffle));
  237. vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
  238. }
  239. // fallthrough and finish off with plain-C
  240. VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
  241. }
// Inverse of the subtract-green transform: adds each pixel's green channel
// back into its blue and red channels, 4 pixels per iteration. Uses the
// same kGreenShuffle table (green broadcast into B/R lanes, 0 elsewhere).
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const uint32_t* const end = argb_data + (num_pixels & ~3);
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
  for (; argb_data < end; argb_data += 4) {
    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
    const uint8x16_t greens =
        vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
                    vtbl1_u8(vget_high_u8(argb), shuffle));
    vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens));
  }
  // fallthrough and finish off with plain-C
  VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
}
  255. #endif // !__aarch64__
  256. #endif // USE_INTRINSICS
  257. #endif // WEBP_USE_NEON
  258. //------------------------------------------------------------------------------
extern void VP8LDspInitNEON(void);

// Installs the NEON implementations into the VP8L function pointers.
// Compiles to a no-op body when WEBP_USE_NEON is not defined.
void VP8LDspInitNEON(void) {
#if defined(WEBP_USE_NEON)
  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
#ifdef USE_INTRINSICS
  // Predictors 0-4 keep their default (C) implementations.
  VP8LPredictors[5] = Predictor5;
  VP8LPredictors[6] = Predictor6;
  VP8LPredictors[7] = Predictor7;
  VP8LPredictors[8] = Predictor8;
  VP8LPredictors[9] = Predictor9;
  VP8LPredictors[10] = Predictor10;
  VP8LPredictors[11] = Predictor11;
  VP8LPredictors[12] = Predictor12;
  VP8LPredictors[13] = Predictor13;
#if !defined(__aarch64__)
  // vtbl-based transforms are only built for 32-bit ARM (see above).
  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
#endif
#endif
#endif  // WEBP_USE_NEON
}
  282. //------------------------------------------------------------------------------