upsampling_neon.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON version of YUV to RGB upsampling functions.
//
// Author: mans@mansr.com (Mans Rullgard)
// Based on SSE code by: somnath@google.com (Somnath Banerjee)
  14. #include "./dsp.h"
  15. #if defined(WEBP_USE_NEON)
  16. #include <assert.h>
  17. #include <arm_neon.h>
  18. #include <string.h>
  19. #include "./neon.h"
  20. #include "./yuv.h"
  21. #ifdef FANCY_UPSAMPLING
  22. //-----------------------------------------------------------------------------
  23. // U/V upsampling
  24. // Loads 9 pixels each from rows r1 and r2 and generates 16 pixels.
  25. #define UPSAMPLE_16PIXELS(r1, r2, out) { \
  26. uint8x8_t a = vld1_u8(r1); \
  27. uint8x8_t b = vld1_u8(r1 + 1); \
  28. uint8x8_t c = vld1_u8(r2); \
  29. uint8x8_t d = vld1_u8(r2 + 1); \
  30. \
  31. uint16x8_t al = vshll_n_u8(a, 1); \
  32. uint16x8_t bl = vshll_n_u8(b, 1); \
  33. uint16x8_t cl = vshll_n_u8(c, 1); \
  34. uint16x8_t dl = vshll_n_u8(d, 1); \
  35. \
  36. uint8x8_t diag1, diag2; \
  37. uint16x8_t sl; \
  38. \
  39. /* a + b + c + d */ \
  40. sl = vaddl_u8(a, b); \
  41. sl = vaddw_u8(sl, c); \
  42. sl = vaddw_u8(sl, d); \
  43. \
  44. al = vaddq_u16(sl, al); /* 3a + b + c + d */ \
  45. bl = vaddq_u16(sl, bl); /* a + 3b + c + d */ \
  46. \
  47. al = vaddq_u16(al, dl); /* 3a + b + c + 3d */ \
  48. bl = vaddq_u16(bl, cl); /* a + 3b + 3c + d */ \
  49. \
  50. diag2 = vshrn_n_u16(al, 3); \
  51. diag1 = vshrn_n_u16(bl, 3); \
  52. \
  53. a = vrhadd_u8(a, diag1); \
  54. b = vrhadd_u8(b, diag2); \
  55. c = vrhadd_u8(c, diag2); \
  56. d = vrhadd_u8(d, diag1); \
  57. \
  58. { \
  59. uint8x8x2_t a_b, c_d; \
  60. INIT_VECTOR2(a_b, a, b); \
  61. INIT_VECTOR2(c_d, c, d); \
  62. vst2_u8(out, a_b); \
  63. vst2_u8(out + 32, c_d); \
  64. } \
  65. }
  66. // Turn the macro into a function for reducing code-size when non-critical
  67. static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
  68. uint8_t *out) {
  69. UPSAMPLE_16PIXELS(r1, r2, out);
  70. }
  71. #define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \
  72. uint8_t r1[9], r2[9]; \
  73. memcpy(r1, (tb), (num_pixels)); \
  74. memcpy(r2, (bb), (num_pixels)); \
  75. /* replicate last byte */ \
  76. memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels)); \
  77. memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels)); \
  78. Upsample16Pixels(r1, r2, out); \
  79. }
  80. //-----------------------------------------------------------------------------
  81. // YUV->RGB conversion
  82. static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
  83. #define v255 vdup_n_u8(255)
  84. #define STORE_Rgb(out, r, g, b) do { \
  85. uint8x8x3_t r_g_b; \
  86. INIT_VECTOR3(r_g_b, r, g, b); \
  87. vst3_u8(out, r_g_b); \
  88. } while (0)
  89. #define STORE_Bgr(out, r, g, b) do { \
  90. uint8x8x3_t b_g_r; \
  91. INIT_VECTOR3(b_g_r, b, g, r); \
  92. vst3_u8(out, b_g_r); \
  93. } while (0)
  94. #define STORE_Rgba(out, r, g, b) do { \
  95. uint8x8x4_t r_g_b_v255; \
  96. INIT_VECTOR4(r_g_b_v255, r, g, b, v255); \
  97. vst4_u8(out, r_g_b_v255); \
  98. } while (0)
  99. #define STORE_Bgra(out, r, g, b) do { \
  100. uint8x8x4_t b_g_r_v255; \
  101. INIT_VECTOR4(b_g_r_v255, b, g, r, v255); \
  102. vst4_u8(out, b_g_r_v255); \
  103. } while (0)
  104. #define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) { \
  105. int i; \
  106. for (i = 0; i < N; i += 8) { \
  107. const int off = ((cur_x) + i) * XSTEP; \
  108. uint8x8_t y = vld1_u8((src_y) + (cur_x) + i); \
  109. uint8x8_t u = vld1_u8((src_uv) + i); \
  110. uint8x8_t v = vld1_u8((src_uv) + i + 16); \
  111. const int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); \
  112. const int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); \
  113. const int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); \
  114. int32x4_t yl = vmull_lane_s16(vget_low_s16(yy), cf16, 0); \
  115. int32x4_t yh = vmull_lane_s16(vget_high_s16(yy), cf16, 0); \
  116. const int32x4_t rl = vmlal_lane_s16(yl, vget_low_s16(vv), cf16, 1);\
  117. const int32x4_t rh = vmlal_lane_s16(yh, vget_high_s16(vv), cf16, 1);\
  118. int32x4_t gl = vmlsl_lane_s16(yl, vget_low_s16(uu), cf16, 2); \
  119. int32x4_t gh = vmlsl_lane_s16(yh, vget_high_s16(uu), cf16, 2); \
  120. const int32x4_t bl = vmovl_s16(vget_low_s16(uu)); \
  121. const int32x4_t bh = vmovl_s16(vget_high_s16(uu)); \
  122. gl = vmlsl_lane_s16(gl, vget_low_s16(vv), cf16, 3); \
  123. gh = vmlsl_lane_s16(gh, vget_high_s16(vv), cf16, 3); \
  124. yl = vmlaq_lane_s32(yl, bl, cf32, 0); \
  125. yh = vmlaq_lane_s32(yh, bh, cf32, 0); \
  126. /* vrshrn_n_s32() already incorporates the rounding constant */ \
  127. y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, YUV_FIX2), \
  128. vrshrn_n_s32(rh, YUV_FIX2))); \
  129. u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, YUV_FIX2), \
  130. vrshrn_n_s32(gh, YUV_FIX2))); \
  131. v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(yl, YUV_FIX2), \
  132. vrshrn_n_s32(yh, YUV_FIX2))); \
  133. STORE_ ## FMT(out + off, y, u, v); \
  134. } \
  135. }
  136. #define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) { \
  137. int i; \
  138. for (i = 0; i < N; i++) { \
  139. const int off = ((cur_x) + i) * XSTEP; \
  140. const int y = src_y[(cur_x) + i]; \
  141. const int u = (src_uv)[i]; \
  142. const int v = (src_uv)[i + 16]; \
  143. FUNC(y, u, v, rgb + off); \
  144. } \
  145. }
  146. #define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv, \
  147. top_dst, bottom_dst, cur_x, len) { \
  148. CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x) \
  149. if (bottom_y != NULL) { \
  150. CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x) \
  151. } \
  152. }
  153. #define CONVERT2RGB_1(FUNC, XSTEP, top_y, bottom_y, uv, \
  154. top_dst, bottom_dst, cur_x, len) { \
  155. CONVERT1(FUNC, XSTEP, len, top_y, uv, top_dst, cur_x); \
  156. if (bottom_y != NULL) { \
  157. CONVERT1(FUNC, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \
  158. } \
  159. }
  160. #define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP) \
  161. static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \
  162. const uint8_t *top_u, const uint8_t *top_v, \
  163. const uint8_t *cur_u, const uint8_t *cur_v, \
  164. uint8_t *top_dst, uint8_t *bottom_dst, int len) { \
  165. int block; \
  166. /* 16 byte aligned array to cache reconstructed u and v */ \
  167. uint8_t uv_buf[2 * 32 + 15]; \
  168. uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
  169. const int uv_len = (len + 1) >> 1; \
  170. /* 9 pixels must be read-able for each block */ \
  171. const int num_blocks = (uv_len - 1) >> 3; \
  172. const int leftover = uv_len - num_blocks * 8; \
  173. const int last_pos = 1 + 16 * num_blocks; \
  174. \
  175. const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
  176. const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
  177. \
  178. const int16x4_t cf16 = vld1_s16(kCoeffs); \
  179. const int32x2_t cf32 = vdup_n_s32(kUToB); \
  180. const uint8x8_t u16 = vdup_n_u8(16); \
  181. const uint8x8_t u128 = vdup_n_u8(128); \
  182. \
  183. /* Treat the first pixel in regular way */ \
  184. assert(top_y != NULL); \
  185. { \
  186. const int u0 = (top_u[0] + u_diag) >> 1; \
  187. const int v0 = (top_v[0] + v_diag) >> 1; \
  188. VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst); \
  189. } \
  190. if (bottom_y != NULL) { \
  191. const int u0 = (cur_u[0] + u_diag) >> 1; \
  192. const int v0 = (cur_v[0] + v_diag) >> 1; \
  193. VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst); \
  194. } \
  195. \
  196. for (block = 0; block < num_blocks; ++block) { \
  197. UPSAMPLE_16PIXELS(top_u, cur_u, r_uv); \
  198. UPSAMPLE_16PIXELS(top_v, cur_v, r_uv + 16); \
  199. CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, r_uv, \
  200. top_dst, bottom_dst, 16 * block + 1, 16); \
  201. top_u += 8; \
  202. cur_u += 8; \
  203. top_v += 8; \
  204. cur_v += 8; \
  205. } \
  206. \
  207. UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv); \
  208. UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16); \
  209. CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv, \
  210. top_dst, bottom_dst, last_pos, len - last_pos); \
  211. }
  212. // NEON variants of the fancy upsampler.
  213. NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair, Rgb, 3)
  214. NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair, Bgr, 3)
  215. NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair, Rgba, 4)
  216. NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair, Bgra, 4)
  217. #endif // FANCY_UPSAMPLING
  218. #endif // WEBP_USE_NEON
  219. //------------------------------------------------------------------------------
  220. extern void WebPInitUpsamplersNEON(void);
  221. #ifdef FANCY_UPSAMPLING
  222. extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
  223. void WebPInitUpsamplersNEON(void) {
  224. #if defined(WEBP_USE_NEON)
  225. WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
  226. WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
  227. WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
  228. WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
  229. WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
  230. WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
  231. #endif // WEBP_USE_NEON
  232. }
  233. #else
  234. // this empty function is to avoid an empty .o
  235. void WebPInitUpsamplersNEON(void) {}
  236. #endif // FANCY_UPSAMPLING