lossless_mips32.c

// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of lossless functions
//
// Author(s): Djordje Pesut    (djordje.pesut@imgtec.com)
//            Jovan Zelincevic (jovan.zelincevic@imgtec.com)

#include "./dsp.h"
#include "./lossless.h"

#if defined(WEBP_USE_MIPS32)

#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

#define APPROX_LOG_WITH_CORRECTION_MAX 65536
#define APPROX_LOG_MAX 4096
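// Reciprocal of ln(2), i.e. log2(e): multiplying a natural logarithm by this
// constant converts it to a base-2 logarithm.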
#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086

static float FastSLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y, correction;
    const int c24 = 24;
    const float v_f = (float)v;
    uint32_t temp;

    // Xf = 256 = 2^8
    // log_cnt is index of leading one in upper 24 bits
    __asm__ volatile(
      "clz %[log_cnt], %[v] \n\t"
      "addiu %[y], $zero, 1 \n\t"
      "subu %[log_cnt], %[c24], %[log_cnt] \n\t"
      "sllv %[y], %[y], %[log_cnt] \n\t"
      "srlv %[temp], %[v], %[log_cnt] \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
    // Xf = floor(Xf) * (1 + (v % y) / v)
    // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
    // The correction factor: log(1 + d) ~ d; for very small d values, so
    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
    // LOG_2_RECIPROCAL ~ 23/16
    // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
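    // Worked example (for illustration): v = 1001 gives log_cnt = 2, y = 4 and
    // temp = 1001 >> 2 = 250, so v & (y - 1) = 1 and correction = (23 * 1) >> 4.
    // The value returned below is then 1001 * (kLog2Table[250] + 2) + correction,
    // a close approximation of the exact 1001 * log2(1001).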
    correction = (23 * (v & (y - 1))) >> 4;
    return v_f * (kLog2Table[temp] + log_cnt) + correction;
  } else {
    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
  }
}

static float FastLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y;
    const int c24 = 24;
    double log_2;
    uint32_t temp;

    __asm__ volatile(
      "clz %[log_cnt], %[v] \n\t"
      "addiu %[y], $zero, 1 \n\t"
      "subu %[log_cnt], %[c24], %[log_cnt] \n\t"
      "sllv %[y], %[y], %[log_cnt] \n\t"
      "srlv %[temp], %[v], %[log_cnt] \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    log_2 = kLog2Table[temp] + log_cnt;
    if (v >= APPROX_LOG_MAX) {
      // Since the division is still expensive, add this correction factor only
      // for large values of 'v'.
      const uint32_t correction = (23 * (v & (y - 1))) >> 4;
      log_2 += (double)correction / v;
    }
    return (float)log_2;
  } else {
    return (float)(LOG_2_RECIPROCAL * log((double)v));
  }
}

// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pop = &population[4];
//   const uint32_t* LoopEnd = &population[length];
//   while (pop != LoopEnd) {
//     ++i;
//     cost += i * *pop;
//     cost += i * *(pop + 1);
//     pop += 2;
//   }
//   return (double)cost;
static double ExtraCost(const uint32_t* const population, int length) {
  int i, temp0, temp1;
  const uint32_t* pop = &population[4];
  const uint32_t* const LoopEnd = &population[length];

  __asm__ volatile(
    "mult $zero, $zero \n\t"
    "xor %[i], %[i], %[i] \n\t"
    "beq %[pop], %[LoopEnd], 2f \n\t"
  "1: \n\t"
    "lw %[temp0], 0(%[pop]) \n\t"
    "lw %[temp1], 4(%[pop]) \n\t"
    "addiu %[i], %[i], 1 \n\t"
    "addiu %[pop], %[pop], 8 \n\t"
    "madd %[i], %[temp0] \n\t"
    "madd %[i], %[temp1] \n\t"
    "bne %[pop], %[LoopEnd], 1b \n\t"
  "2: \n\t"
    "mfhi %[temp0] \n\t"
    "mflo %[temp1] \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [i]"=&r"(i), [pop]"+r"(pop)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );
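  // The running 64-bit sum lives in the hi/lo accumulator (updated by madd);
  // the mfhi/mflo pair above copies out the two 32-bit halves so they can be
  // reassembled into the final value below.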
  return (double)((int64_t)temp0 << 32 | temp1);
}

// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pX = &X[4];
//   const uint32_t* pY = &Y[4];
//   const uint32_t* LoopEnd = &X[length];
//   while (pX != LoopEnd) {
//     const uint32_t xy0 = *pX + *pY;
//     const uint32_t xy1 = *(pX + 1) + *(pY + 1);
//     ++i;
//     cost += i * xy0;
//     cost += i * xy1;
//     pX += 2;
//     pY += 2;
//   }
//   return (double)cost;
static double ExtraCostCombined(const uint32_t* const X,
                                const uint32_t* const Y, int length) {
  int i, temp0, temp1, temp2, temp3;
  const uint32_t* pX = &X[4];
  const uint32_t* pY = &Y[4];
  const uint32_t* const LoopEnd = &X[length];

  __asm__ volatile(
    "mult $zero, $zero \n\t"
    "xor %[i], %[i], %[i] \n\t"
    "beq %[pX], %[LoopEnd], 2f \n\t"
  "1: \n\t"
    "lw %[temp0], 0(%[pX]) \n\t"
    "lw %[temp1], 0(%[pY]) \n\t"
    "lw %[temp2], 4(%[pX]) \n\t"
    "lw %[temp3], 4(%[pY]) \n\t"
    "addiu %[i], %[i], 1 \n\t"
    "addu %[temp0], %[temp0], %[temp1] \n\t"
    "addu %[temp2], %[temp2], %[temp3] \n\t"
    "addiu %[pX], %[pX], 8 \n\t"
    "addiu %[pY], %[pY], 8 \n\t"
    "madd %[i], %[temp0] \n\t"
    "madd %[i], %[temp2] \n\t"
    "bne %[pX], %[LoopEnd], 1b \n\t"
  "2: \n\t"
    "mfhi %[temp0] \n\t"
    "mflo %[temp1] \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
      [i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );
  return (double)((int64_t)temp0 << 32 | temp1);
}
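
// Rough C equivalent of the HUFFMAN_COST_PASS macro below (a reference sketch;
// 'temp0' is 0 or 1 depending on whether the current population value is
// non-zero):
//   if (streak > 3) {
//     stats.counts[temp0] += 1;
//     stats.streaks[temp0][1] += streak;
//   } else {
//     stats.streaks[temp0][0] += streak;
//   }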
#define HUFFMAN_COST_PASS \
  __asm__ volatile( \
    "sll %[temp1], %[temp0], 3 \n\t" \
    "addiu %[temp3], %[streak], -3 \n\t" \
    "addu %[temp2], %[pstreaks], %[temp1] \n\t" \
    "blez %[temp3], 1f \n\t" \
    "srl %[temp1], %[temp1], 1 \n\t" \
    "addu %[temp3], %[pcnts], %[temp1] \n\t" \
    "lw %[temp0], 4(%[temp2]) \n\t" \
    "lw %[temp1], 0(%[temp3]) \n\t" \
    "addu %[temp0], %[temp0], %[streak] \n\t" \
    "addiu %[temp1], %[temp1], 1 \n\t" \
    "sw %[temp0], 4(%[temp2]) \n\t" \
    "sw %[temp1], 0(%[temp3]) \n\t" \
    "b 2f \n\t" \
  "1: \n\t" \
    "lw %[temp0], 0(%[temp2]) \n\t" \
    "addu %[temp0], %[temp0], %[streak] \n\t" \
    "sw %[temp0], 0(%[temp2]) \n\t" \
  "2: \n\t" \
    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
      [temp3]"=&r"(temp3), [temp0]"+r"(temp0) \
    : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts), \
      [streak]"r"(streak) \
    : "memory" \
  );

// Returns the various RLE counts
static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) {
  int i;
  int streak = 0;
  VP8LStreaks stats;
  int* const pstreaks = &stats.streaks[0][0];
  int* const pcnts = &stats.counts[0];
  int temp0, temp1, temp2, temp3;
  memset(&stats, 0, sizeof(stats));
  for (i = 0; i < length - 1; ++i) {
    ++streak;
    if (population[i] == population[i + 1]) {
      continue;
    }
    temp0 = (population[i] != 0);
    HUFFMAN_COST_PASS
    streak = 0;
  }
  ++streak;
  temp0 = (population[i] != 0);
  HUFFMAN_COST_PASS

  return stats;
}

static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X,
                                            const uint32_t* Y, int length) {
  int i;
  int streak = 0;
  VP8LStreaks stats;
  int* const pstreaks = &stats.streaks[0][0];
  int* const pcnts = &stats.counts[0];
  int temp0, temp1, temp2, temp3;
  memset(&stats, 0, sizeof(stats));
  for (i = 0; i < length - 1; ++i) {
    const uint32_t xy = X[i] + Y[i];
    const uint32_t xy_next = X[i + 1] + Y[i + 1];
    ++streak;
    if (xy == xy_next) {
      continue;
    }
    temp0 = (xy != 0);
    HUFFMAN_COST_PASS
    streak = 0;
  }
  {
    const uint32_t xy = X[i] + Y[i];
    ++streak;
    temp0 = (xy != 0);
    HUFFMAN_COST_PASS
  }

  return stats;
}

#define ASM_START \
  __asm__ volatile( \
    ".set push \n\t" \
    ".set at \n\t" \
    ".set macro \n\t" \
  "1: \n\t"

// P2 = P0 + P1
// A..D - offsets
// E - temp variable to tell macro
//     if pointer should be incremented
// literal_ and successive histograms could be unaligned
// so we must use ulw and usw
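// In C terms, one pass of this macro is roughly (illustration only; the callers
// in ADD_VECTOR / ADD_VECTOR_EQ always pass the offsets A..D as 0, 4, 8, 12,
// i.e. four consecutive uint32_t values per iteration):
//   P2[0] = P0[0] + P1[0];  P2[1] = P0[1] + P1[1];
//   P2[2] = P0[2] + P1[2];  P2[3] = P0[3] + P1[3];
//   P0 += 4;  P2 += 4;  if (E) P1 += 4;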
#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2) \
  "ulw %[temp0], "#A"(%["#P0"]) \n\t" \
  "ulw %[temp1], "#B"(%["#P0"]) \n\t" \
  "ulw %[temp2], "#C"(%["#P0"]) \n\t" \
  "ulw %[temp3], "#D"(%["#P0"]) \n\t" \
  "ulw %[temp4], "#A"(%["#P1"]) \n\t" \
  "ulw %[temp5], "#B"(%["#P1"]) \n\t" \
  "ulw %[temp6], "#C"(%["#P1"]) \n\t" \
  "ulw %[temp7], "#D"(%["#P1"]) \n\t" \
  "addu %[temp4], %[temp4], %[temp0] \n\t" \
  "addu %[temp5], %[temp5], %[temp1] \n\t" \
  "addu %[temp6], %[temp6], %[temp2] \n\t" \
  "addu %[temp7], %[temp7], %[temp3] \n\t" \
  "addiu %["#P0"], %["#P0"], 16 \n\t" \
  ".if "#E" == 1 \n\t" \
  "addiu %["#P1"], %["#P1"], 16 \n\t" \
  ".endif \n\t" \
  "usw %[temp4], "#A"(%["#P2"]) \n\t" \
  "usw %[temp5], "#B"(%["#P2"]) \n\t" \
  "usw %[temp6], "#C"(%["#P2"]) \n\t" \
  "usw %[temp7], "#D"(%["#P2"]) \n\t" \
  "addiu %["#P2"], %["#P2"], 16 \n\t" \
  "bne %["#P0"], %[LoopEnd], 1b \n\t" \
  ".set pop \n\t" \

#define ASM_END_COMMON_0 \
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), \
      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), \
      [pa]"+r"(pa), [pout]"+r"(pout)

#define ASM_END_COMMON_1 \
    : [LoopEnd]"r"(LoopEnd) \
    : "memory", "at" \
  );

#define ASM_END_0 \
  ASM_END_COMMON_0 \
  , [pb]"+r"(pb) \
  ASM_END_COMMON_1

#define ASM_END_1 \
  ASM_END_COMMON_0 \
  ASM_END_COMMON_1

#define ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE) do { \
  const uint32_t* pa = (const uint32_t*)(A); \
  const uint32_t* pb = (const uint32_t*)(B); \
  uint32_t* pout = (uint32_t*)(OUT); \
  const uint32_t* const LoopEnd = pa + (SIZE); \
  assert((SIZE) % 4 == 0); \
  ASM_START \
  ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout) \
  ASM_END_0 \
  if ((EXTRA_SIZE) > 0) { \
    const int last = (EXTRA_SIZE); \
    int i; \
    for (i = 0; i < last; ++i) pout[i] = pa[i] + pb[i]; \
  } \
} while (0)

#define ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE) do { \
  const uint32_t* pa = (const uint32_t*)(A); \
  uint32_t* pout = (uint32_t*)(OUT); \
  const uint32_t* const LoopEnd = pa + (SIZE); \
  assert((SIZE) % 4 == 0); \
  ASM_START \
  ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout) \
  ASM_END_1 \
  if ((EXTRA_SIZE) > 0) { \
    const int last = (EXTRA_SIZE); \
    int i; \
    for (i = 0; i < last; ++i) pout[i] += pa[i]; \
  } \
} while (0)
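
// Adds histograms 'a' and 'b' element-wise into 'out' (literal_, red_, blue_,
// alpha_ and distance_ arrays alike); when 'b' and 'out' are the same
// histogram, the ADD_VECTOR_EQ variant accumulates the sum in place.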
static void HistogramAdd(const VP8LHistogram* const a,
                         const VP8LHistogram* const b,
                         VP8LHistogram* const out) {
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
                             - (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
  assert(a->palette_code_bits_ == b->palette_code_bits_);

  if (b != out) {
    ADD_VECTOR(a->literal_, b->literal_, out->literal_,
               NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
    ADD_VECTOR(a->distance_, b->distance_, out->distance_,
               NUM_DISTANCE_CODES, 0);
    ADD_VECTOR(a->red_, b->red_, out->red_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
  } else {
    ADD_VECTOR_EQ(a->literal_, out->literal_,
                  NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
    ADD_VECTOR_EQ(a->distance_, out->distance_, NUM_DISTANCE_CODES, 0);
    ADD_VECTOR_EQ(a->red_, out->red_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR_EQ(a->blue_, out->blue_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR_EQ(a->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
  }
}

#undef ADD_VECTOR_EQ
#undef ADD_VECTOR
#undef ASM_END_1
#undef ASM_END_0
#undef ASM_END_COMMON_1
#undef ASM_END_COMMON_0
#undef ADD_TO_OUT
#undef ASM_START

#endif  // WEBP_USE_MIPS32

//------------------------------------------------------------------------------
// Entry point

extern void VP8LDspInitMIPS32(void);

void VP8LDspInitMIPS32(void) {
#if defined(WEBP_USE_MIPS32)
  VP8LFastSLog2Slow = FastSLog2Slow;
  VP8LFastLog2Slow = FastLog2Slow;
  VP8LExtraCost = ExtraCost;
  VP8LExtraCostCombined = ExtraCostCombined;
  VP8LHuffmanCostCount = HuffmanCostCount;
  VP8LHuffmanCostCombinedCount = HuffmanCostCombinedCount;
  VP8LHistogramAdd = HistogramAdd;
#endif  // WEBP_USE_MIPS32
}