lossless_sse2.c 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535
  1. // Copyright 2014 Google Inc. All Rights Reserved.
  2. //
  3. // Use of this source code is governed by a BSD-style license
  4. // that can be found in the COPYING file in the root of the source
  5. // tree. An additional intellectual property rights grant can be found
  6. // in the file PATENTS. All contributing project authors may
  7. // be found in the AUTHORS file in the root of the source tree.
  8. // -----------------------------------------------------------------------------
  9. //
  10. // SSE2 variant of methods for lossless decoder
  11. //
  12. // Author: Skal (pascal.massimino@gmail.com)
  13. #include "./dsp.h"
  14. #include <assert.h>
  15. #if defined(WEBP_USE_SSE2)
  16. #include <emmintrin.h>
  17. #include "./lossless.h"
  18. //------------------------------------------------------------------------------
  19. // Predictor Transform
  20. static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
  21. uint32_t c2) {
  22. const __m128i zero = _mm_setzero_si128();
  23. const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  24. const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  25. const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  26. const __m128i V1 = _mm_add_epi16(C0, C1);
  27. const __m128i V2 = _mm_sub_epi16(V1, C2);
  28. const __m128i b = _mm_packus_epi16(V2, V2);
  29. const uint32_t output = _mm_cvtsi128_si32(b);
  30. return output;
  31. }
  32. static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
  33. uint32_t c2) {
  34. const __m128i zero = _mm_setzero_si128();
  35. const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  36. const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  37. const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  38. const __m128i avg = _mm_add_epi16(C1, C0);
  39. const __m128i A0 = _mm_srli_epi16(avg, 1);
  40. const __m128i A1 = _mm_sub_epi16(A0, B0);
  41. const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
  42. const __m128i A2 = _mm_sub_epi16(A1, BgtA);
  43. const __m128i A3 = _mm_srai_epi16(A2, 1);
  44. const __m128i A4 = _mm_add_epi16(A0, A3);
  45. const __m128i A5 = _mm_packus_epi16(A4, A4);
  46. const uint32_t output = _mm_cvtsi128_si32(A5);
  47. return output;
  48. }
  49. static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
  50. int pa_minus_pb;
  51. const __m128i zero = _mm_setzero_si128();
  52. const __m128i A0 = _mm_cvtsi32_si128(a);
  53. const __m128i B0 = _mm_cvtsi32_si128(b);
  54. const __m128i C0 = _mm_cvtsi32_si128(c);
  55. const __m128i AC0 = _mm_subs_epu8(A0, C0);
  56. const __m128i CA0 = _mm_subs_epu8(C0, A0);
  57. const __m128i BC0 = _mm_subs_epu8(B0, C0);
  58. const __m128i CB0 = _mm_subs_epu8(C0, B0);
  59. const __m128i AC = _mm_or_si128(AC0, CA0);
  60. const __m128i BC = _mm_or_si128(BC0, CB0);
  61. const __m128i pa = _mm_unpacklo_epi8(AC, zero); // |a - c|
  62. const __m128i pb = _mm_unpacklo_epi8(BC, zero); // |b - c|
  63. const __m128i diff = _mm_sub_epi16(pb, pa);
  64. {
  65. int16_t out[8];
  66. _mm_storeu_si128((__m128i*)out, diff);
  67. pa_minus_pb = out[0] + out[1] + out[2] + out[3];
  68. }
  69. return (pa_minus_pb <= 0) ? a : b;
  70. }
  71. static WEBP_INLINE __m128i Average2_128i(uint32_t a0, uint32_t a1) {
  72. const __m128i zero = _mm_setzero_si128();
  73. const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
  74. const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
  75. const __m128i sum = _mm_add_epi16(A1, A0);
  76. const __m128i avg = _mm_srli_epi16(sum, 1);
  77. return avg;
  78. }
  79. static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
  80. const __m128i avg = Average2_128i(a0, a1);
  81. const __m128i A2 = _mm_packus_epi16(avg, avg);
  82. const uint32_t output = _mm_cvtsi128_si32(A2);
  83. return output;
  84. }
  85. static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
  86. const __m128i zero = _mm_setzero_si128();
  87. const __m128i avg1 = Average2_128i(a0, a2);
  88. const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
  89. const __m128i sum = _mm_add_epi16(avg1, A1);
  90. const __m128i avg2 = _mm_srli_epi16(sum, 1);
  91. const __m128i A2 = _mm_packus_epi16(avg2, avg2);
  92. const uint32_t output = _mm_cvtsi128_si32(A2);
  93. return output;
  94. }
  95. static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
  96. uint32_t a2, uint32_t a3) {
  97. const __m128i avg1 = Average2_128i(a0, a1);
  98. const __m128i avg2 = Average2_128i(a2, a3);
  99. const __m128i sum = _mm_add_epi16(avg2, avg1);
  100. const __m128i avg3 = _mm_srli_epi16(sum, 1);
  101. const __m128i A0 = _mm_packus_epi16(avg3, avg3);
  102. const uint32_t output = _mm_cvtsi128_si32(A0);
  103. return output;
  104. }
  105. static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
  106. const uint32_t pred = Average3(left, top[0], top[1]);
  107. return pred;
  108. }
  109. static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
  110. const uint32_t pred = Average2(left, top[-1]);
  111. return pred;
  112. }
  113. static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
  114. const uint32_t pred = Average2(left, top[0]);
  115. return pred;
  116. }
  117. static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
  118. const uint32_t pred = Average2(top[-1], top[0]);
  119. (void)left;
  120. return pred;
  121. }
  122. static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
  123. const uint32_t pred = Average2(top[0], top[1]);
  124. (void)left;
  125. return pred;
  126. }
  127. static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
  128. const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
  129. return pred;
  130. }
  131. static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
  132. const uint32_t pred = Select(top[0], left, top[-1]);
  133. return pred;
  134. }
  135. static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
  136. const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
  137. return pred;
  138. }
  139. static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
  140. const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
  141. return pred;
  142. }
  143. //------------------------------------------------------------------------------
  144. // Subtract-Green Transform
  145. static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  146. const __m128i mask = _mm_set1_epi32(0x0000ff00);
  147. int i;
  148. for (i = 0; i + 4 <= num_pixels; i += 4) {
  149. const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
  150. const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|...
  151. const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|...
  152. const __m128i in_000g = _mm_srli_epi32(in_00g0, 8); // 000g|000g|...
  153. const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
  154. const __m128i out = _mm_sub_epi8(in, in_0g0g);
  155. _mm_storeu_si128((__m128i*)&argb_data[i], out);
  156. }
  157. // fallthrough and finish off with plain-C
  158. VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
  159. }
  160. static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  161. const __m128i mask = _mm_set1_epi32(0x0000ff00);
  162. int i;
  163. for (i = 0; i + 4 <= num_pixels; i += 4) {
  164. const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
  165. const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|...
  166. const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|...
  167. const __m128i in_000g = _mm_srli_epi32(in_00g0, 8); // 000g|000g|...
  168. const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
  169. const __m128i out = _mm_add_epi8(in, in_0g0g);
  170. _mm_storeu_si128((__m128i*)&argb_data[i], out);
  171. }
  172. // fallthrough and finish off with plain-C
  173. VP8LAddGreenToBlueAndRed_C(argb_data + i, num_pixels - i);
  174. }
  175. //------------------------------------------------------------------------------
  176. // Color Transform
  177. static WEBP_INLINE __m128i ColorTransformDelta(__m128i color_pred,
  178. __m128i color) {
  179. // We simulate signed 8-bit multiplication as:
  180. // * Left shift the two (8-bit) numbers by 8 bits,
  181. // * Perform a 16-bit signed multiplication and retain the higher 16-bits.
  182. const __m128i color_pred_shifted = _mm_slli_epi32(color_pred, 8);
  183. const __m128i color_shifted = _mm_slli_epi32(color, 8);
  184. // Note: This performs multiplication on 8 packed 16-bit numbers, 4 of which
  185. // happen to be zeroes.
  186. const __m128i signed_mult =
  187. _mm_mulhi_epi16(color_pred_shifted, color_shifted);
  188. return _mm_srli_epi32(signed_mult, 5);
  189. }
  190. static WEBP_INLINE void TransformColor(const VP8LMultipliers* const m,
  191. uint32_t* argb_data,
  192. int num_pixels) {
  193. const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_); // multipliers
  194. const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
  195. const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
  196. int i;
  197. for (i = 0; i + 4 <= num_pixels; i += 4) {
  198. const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
  199. const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00); // masks
  200. const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
  201. const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
  202. const __m128i lower_8bit_mask = _mm_set1_epi32(0x000000ff);
  203. const __m128i ag = _mm_and_si128(in, alpha_green_mask); // alpha, green
  204. const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
  205. const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
  206. const __m128i b = in;
  207. const __m128i r_delta = ColorTransformDelta(g_to_r, g); // red
  208. const __m128i r_new =
  209. _mm_and_si128(_mm_sub_epi32(r, r_delta), lower_8bit_mask);
  210. const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);
  211. const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g); // blue
  212. const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r);
  213. const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
  214. const __m128i b_new =
  215. _mm_and_si128(_mm_sub_epi32(b, b_delta), lower_8bit_mask);
  216. const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
  217. _mm_storeu_si128((__m128i*)&argb_data[i], out);
  218. }
  219. // Fall-back to C-version for left-overs.
  220. VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
  221. }
  222. static WEBP_INLINE void TransformColorInverse(const VP8LMultipliers* const m,
  223. uint32_t* argb_data,
  224. int num_pixels) {
  225. const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_); // multipliers
  226. const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
  227. const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
  228. int i;
  229. for (i = 0; i + 4 <= num_pixels; i += 4) {
  230. const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
  231. const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00); // masks
  232. const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
  233. const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
  234. const __m128i lower_8bit_mask = _mm_set1_epi32(0x000000ff);
  235. const __m128i ag = _mm_and_si128(in, alpha_green_mask); // alpha, green
  236. const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
  237. const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
  238. const __m128i b = in;
  239. const __m128i r_delta = ColorTransformDelta(g_to_r, g); // red
  240. const __m128i r_new =
  241. _mm_and_si128(_mm_add_epi32(r, r_delta), lower_8bit_mask);
  242. const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);
  243. const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g); // blue
  244. const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r_new);
  245. const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
  246. const __m128i b_new =
  247. _mm_and_si128(_mm_add_epi32(b, b_delta), lower_8bit_mask);
  248. const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
  249. _mm_storeu_si128((__m128i*)&argb_data[i], out);
  250. }
  251. // Fall-back to C-version for left-overs.
  252. VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
  253. }
  254. //------------------------------------------------------------------------------
  255. // Color-space conversion functions
  256. static void ConvertBGRAToRGBA(const uint32_t* src,
  257. int num_pixels, uint8_t* dst) {
  258. const __m128i* in = (const __m128i*)src;
  259. __m128i* out = (__m128i*)dst;
  260. while (num_pixels >= 8) {
  261. const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
  262. const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
  263. const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...
  264. const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...
  265. const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...
  266. const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...
  267. const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7
  268. const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7
  269. const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7
  270. const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7
  271. const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0); // r0g0r1g1 ... r6g6r7g7
  272. const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0); // b0a0b1a1 ... b6a6b7a7
  273. const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0); // rgba0|rgba1...
  274. const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0); // rgba4|rgba5...
  275. _mm_storeu_si128(out++, rgba0);
  276. _mm_storeu_si128(out++, rgba4);
  277. num_pixels -= 8;
  278. }
  279. // left-overs
  280. VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  281. }
  282. static void ConvertBGRAToRGBA4444(const uint32_t* src,
  283. int num_pixels, uint8_t* dst) {
  284. const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
  285. const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
  286. const __m128i* in = (const __m128i*)src;
  287. __m128i* out = (__m128i*)dst;
  288. while (num_pixels >= 8) {
  289. const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
  290. const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
  291. const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...
  292. const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...
  293. const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...
  294. const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...
  295. const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7
  296. const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7
  297. const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7
  298. const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7
  299. const __m128i ga1 = _mm_srli_epi16(ga0, 4); // g0-|g1-|...|a6-|a7-
  300. const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0); // -r0|-r1|...|-b6|-a7
  301. const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f); // g0-|g1-|...|a6-|a7-
  302. const __m128i rgba0 = _mm_or_si128(ga2, rb1); // rg0..rg7 | ba0..ba7
  303. const __m128i rgba1 = _mm_srli_si128(rgba0, 8); // ba0..ba7 | 0
  304. #ifdef WEBP_SWAP_16BIT_CSP
  305. const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0); // barg0...barg7
  306. #else
  307. const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1); // rgba0...rgba7
  308. #endif
  309. _mm_storeu_si128(out++, rgba);
  310. num_pixels -= 8;
  311. }
  312. // left-overs
  313. VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  314. }
  315. static void ConvertBGRAToRGB565(const uint32_t* src,
  316. int num_pixels, uint8_t* dst) {
  317. const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
  318. const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
  319. const __m128i mask_0x07 = _mm_set1_epi8(0x07);
  320. const __m128i* in = (const __m128i*)src;
  321. __m128i* out = (__m128i*)dst;
  322. while (num_pixels >= 8) {
  323. const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
  324. const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
  325. const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...
  326. const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...
  327. const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...
  328. const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...
  329. const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7
  330. const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7
  331. const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7
  332. const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7
  333. const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8); // -r0..-r7|-b0..-b7
  334. const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
  335. const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07); // g0-...g7-|xx (3b)
  336. const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
  337. const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0); // -g0...-g7|xx (3b)
  338. const __m128i b0 = _mm_srli_si128(rb1, 8); // -b0...-b7|0
  339. const __m128i rg1 = _mm_or_si128(rb1, g_lo2); // gr0...gr7|xx
  340. const __m128i b1 = _mm_srli_epi16(b0, 3);
  341. const __m128i gb1 = _mm_or_si128(b1, g_hi2); // bg0...bg7|xx
  342. #ifdef WEBP_SWAP_16BIT_CSP
  343. const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1); // rggb0...rggb7
  344. #else
  345. const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1); // bgrb0...bgrb7
  346. #endif
  347. _mm_storeu_si128(out++, rgba);
  348. num_pixels -= 8;
  349. }
  350. // left-overs
  351. VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  352. }
  353. static void ConvertBGRAToBGR(const uint32_t* src,
  354. int num_pixels, uint8_t* dst) {
  355. const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
  356. const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
  357. const __m128i* in = (const __m128i*)src;
  358. const uint8_t* const end = dst + num_pixels * 3;
  359. // the last storel_epi64 below writes 8 bytes starting at offset 18
  360. while (dst + 26 <= end) {
  361. const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
  362. const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
  363. const __m128i a0l = _mm_and_si128(bgra0, mask_l); // bgr0|0|bgr0|0
  364. const __m128i a4l = _mm_and_si128(bgra4, mask_l); // bgr0|0|bgr0|0
  365. const __m128i a0h = _mm_and_si128(bgra0, mask_h); // 0|bgr0|0|bgr0
  366. const __m128i a4h = _mm_and_si128(bgra4, mask_h); // 0|bgr0|0|bgr0
  367. const __m128i b0h = _mm_srli_epi64(a0h, 8); // 000b|gr00|000b|gr00
  368. const __m128i b4h = _mm_srli_epi64(a4h, 8); // 000b|gr00|000b|gr00
  369. const __m128i c0 = _mm_or_si128(a0l, b0h); // rgbrgb00|rgbrgb00
  370. const __m128i c4 = _mm_or_si128(a4l, b4h); // rgbrgb00|rgbrgb00
  371. const __m128i c2 = _mm_srli_si128(c0, 8);
  372. const __m128i c6 = _mm_srli_si128(c4, 8);
  373. _mm_storel_epi64((__m128i*)(dst + 0), c0);
  374. _mm_storel_epi64((__m128i*)(dst + 6), c2);
  375. _mm_storel_epi64((__m128i*)(dst + 12), c4);
  376. _mm_storel_epi64((__m128i*)(dst + 18), c6);
  377. dst += 24;
  378. num_pixels -= 8;
  379. }
  380. // left-overs
  381. VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
  382. }
  383. //------------------------------------------------------------------------------
  384. #define LINE_SIZE 16 // 8 or 16
  385. static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
  386. int size) {
  387. int i;
  388. assert(size % LINE_SIZE == 0);
  389. for (i = 0; i < size; i += LINE_SIZE) {
  390. const __m128i a0 = _mm_loadu_si128((__m128i*)&a[i + 0]);
  391. const __m128i a1 = _mm_loadu_si128((__m128i*)&a[i + 4]);
  392. #if (LINE_SIZE == 16)
  393. const __m128i a2 = _mm_loadu_si128((__m128i*)&a[i + 8]);
  394. const __m128i a3 = _mm_loadu_si128((__m128i*)&a[i + 12]);
  395. #endif
  396. const __m128i b0 = _mm_loadu_si128((__m128i*)&b[i + 0]);
  397. const __m128i b1 = _mm_loadu_si128((__m128i*)&b[i + 4]);
  398. #if (LINE_SIZE == 16)
  399. const __m128i b2 = _mm_loadu_si128((__m128i*)&b[i + 8]);
  400. const __m128i b3 = _mm_loadu_si128((__m128i*)&b[i + 12]);
  401. #endif
  402. _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
  403. _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
  404. #if (LINE_SIZE == 16)
  405. _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
  406. _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
  407. #endif
  408. }
  409. }
  410. static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
  411. int i;
  412. assert(size % LINE_SIZE == 0);
  413. for (i = 0; i < size; i += LINE_SIZE) {
  414. const __m128i a0 = _mm_loadu_si128((__m128i*)&a[i + 0]);
  415. const __m128i a1 = _mm_loadu_si128((__m128i*)&a[i + 4]);
  416. #if (LINE_SIZE == 16)
  417. const __m128i a2 = _mm_loadu_si128((__m128i*)&a[i + 8]);
  418. const __m128i a3 = _mm_loadu_si128((__m128i*)&a[i + 12]);
  419. #endif
  420. const __m128i b0 = _mm_loadu_si128((__m128i*)&out[i + 0]);
  421. const __m128i b1 = _mm_loadu_si128((__m128i*)&out[i + 4]);
  422. #if (LINE_SIZE == 16)
  423. const __m128i b2 = _mm_loadu_si128((__m128i*)&out[i + 8]);
  424. const __m128i b3 = _mm_loadu_si128((__m128i*)&out[i + 12]);
  425. #endif
  426. _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
  427. _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
  428. #if (LINE_SIZE == 16)
  429. _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
  430. _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
  431. #endif
  432. }
  433. }
  434. #undef LINE_SIZE
  435. // Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
  436. // that's ok since the histogram values are less than 1<<28 (max picture size).
  437. static void HistogramAdd(const VP8LHistogram* const a,
  438. const VP8LHistogram* const b,
  439. VP8LHistogram* const out) {
  440. int i;
  441. const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
  442. assert(a->palette_code_bits_ == b->palette_code_bits_);
  443. if (b != out) {
  444. AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
  445. AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
  446. AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
  447. AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
  448. } else {
  449. AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
  450. AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
  451. AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
  452. AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
  453. }
  454. for (i = NUM_LITERAL_CODES; i < literal_size; ++i) {
  455. out->literal_[i] = a->literal_[i] + b->literal_[i];
  456. }
  457. for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
  458. out->distance_[i] = a->distance_[i] + b->distance_[i];
  459. }
  460. }
  461. #endif // WEBP_USE_SSE2
  462. //------------------------------------------------------------------------------
  463. extern void VP8LDspInitSSE2(void);
  464. void VP8LDspInitSSE2(void) {
  465. #if defined(WEBP_USE_SSE2)
  466. VP8LPredictors[5] = Predictor5;
  467. VP8LPredictors[6] = Predictor6;
  468. VP8LPredictors[7] = Predictor7;
  469. VP8LPredictors[8] = Predictor8;
  470. VP8LPredictors[9] = Predictor9;
  471. VP8LPredictors[10] = Predictor10;
  472. VP8LPredictors[11] = Predictor11;
  473. VP8LPredictors[12] = Predictor12;
  474. VP8LPredictors[13] = Predictor13;
  475. VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
  476. VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
  477. VP8LTransformColor = TransformColor;
  478. VP8LTransformColorInverse = TransformColorInverse;
  479. VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
  480. VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
  481. VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
  482. VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
  483. VP8LHistogramAdd = HistogramAdd;
  484. #endif // WEBP_USE_SSE2
  485. }
  486. //------------------------------------------------------------------------------