
;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for
; progressive Huffman encoding.  See jcphuff.c for more details.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION SEG_TEXT
    BITS    32

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()

%macro LOAD16 0
    pxor    N0, N0
    pxor    N1, N1
    mov     T0, INT [LUT +  0*SIZEOF_INT]
    mov     T1, INT [LUT +  8*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 0
    pinsrw  X1, word [BLOCK + T1 * 2], 0
    mov     T0, INT [LUT +  1*SIZEOF_INT]
    mov     T1, INT [LUT +  9*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 1
    pinsrw  X1, word [BLOCK + T1 * 2], 1
    mov     T0, INT [LUT +  2*SIZEOF_INT]
    mov     T1, INT [LUT + 10*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 2
    pinsrw  X1, word [BLOCK + T1 * 2], 2
    mov     T0, INT [LUT +  3*SIZEOF_INT]
    mov     T1, INT [LUT + 11*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 3
    pinsrw  X1, word [BLOCK + T1 * 2], 3
    mov     T0, INT [LUT +  4*SIZEOF_INT]
    mov     T1, INT [LUT + 12*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 4
    pinsrw  X1, word [BLOCK + T1 * 2], 4
    mov     T0, INT [LUT +  5*SIZEOF_INT]
    mov     T1, INT [LUT + 13*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 5
    pinsrw  X1, word [BLOCK + T1 * 2], 5
    mov     T0, INT [LUT +  6*SIZEOF_INT]
    mov     T1, INT [LUT + 14*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 6
    pinsrw  X1, word [BLOCK + T1 * 2], 6
    mov     T0, INT [LUT +  7*SIZEOF_INT]
    mov     T1, INT [LUT + 15*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 7
    pinsrw  X1, word [BLOCK + T1 * 2], 7
%endmacro
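
; LOAD16 gathers 16 coefficients from BLOCK in natural scan order via the LUT
; (jpeg_natural_order_start); X0 receives lanes 0-7 and X1 lanes 8-15.
; A rough scalar sketch of the gather (the actual C code lives in jcphuff.c):
;
;   for (i = 0; i < 16; i++)
;     x[i] = block[jpeg_natural_order_start[i]];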

%macro LOAD15 0
    pxor    N0, N0
    pxor    N1, N1
    pxor    X1, X1
    mov     T0, INT [LUT + 0*SIZEOF_INT]
    mov     T1, INT [LUT + 8*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 0
    pinsrw  X1, word [BLOCK + T1 * 2], 0
    mov     T0, INT [LUT + 1*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 1
    mov     T0, INT [LUT + 2*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 2
    mov     T0, INT [LUT + 3*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 3
    mov     T0, INT [LUT + 4*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 4
    mov     T0, INT [LUT + 5*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 5
    mov     T0, INT [LUT + 6*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 6
    mov     T0, INT [LUT + 7*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 7
    cmp     LENEND, 2
    jl      %%.ELOAD15
    mov     T1, INT [LUT + 9*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 1
    cmp     LENEND, 3
    jl      %%.ELOAD15
    mov     T1, INT [LUT + 10*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 2
    cmp     LENEND, 4
    jl      %%.ELOAD15
    mov     T1, INT [LUT + 11*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 3
    cmp     LENEND, 5
    jl      %%.ELOAD15
    mov     T1, INT [LUT + 12*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 4
    cmp     LENEND, 6
    jl      %%.ELOAD15
    mov     T1, INT [LUT + 13*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 5
    cmp     LENEND, 7
    jl      %%.ELOAD15
    mov     T1, INT [LUT + 14*SIZEOF_INT]
    pinsrw  X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro

%macro LOAD8 0
    pxor    N0, N0
    mov     T0, INT [LUT + 0*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 0
    mov     T0, INT [LUT + 1*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 1
    mov     T0, INT [LUT + 2*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 2
    mov     T0, INT [LUT + 3*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 3
    mov     T0, INT [LUT + 4*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 4
    mov     T0, INT [LUT + 5*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 5
    mov     T0, INT [LUT + 6*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 6
    mov     T0, INT [LUT + 7*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T0 * 2], 7
%endmacro

%macro LOAD7 0
    pxor    N0, N0
    pxor    X0, X0
    mov     T1, INT [LUT + 0*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 0
    cmp     LENEND, 2
    jl      %%.ELOAD7
    mov     T1, INT [LUT + 1*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 1
    cmp     LENEND, 3
    jl      %%.ELOAD7
    mov     T1, INT [LUT + 2*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 2
    cmp     LENEND, 4
    jl      %%.ELOAD7
    mov     T1, INT [LUT + 3*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 3
    cmp     LENEND, 5
    jl      %%.ELOAD7
    mov     T1, INT [LUT + 4*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 4
    cmp     LENEND, 6
    jl      %%.ELOAD7
    mov     T1, INT [LUT + 5*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 5
    cmp     LENEND, 7
    jl      %%.ELOAD7
    mov     T1, INT [LUT + 6*SIZEOF_INT]
    pinsrw  X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro
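
; LOAD15, LOAD8, and LOAD7 load the tail of the coefficient list.  They only
; dereference LUT entries for coefficients that actually remain (guarded by
; LENEND), so no entry of jpeg_natural_order_start beyond Sl is ever read;
; unloaded lanes stay zero from the pxor at the top of each macro.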

%macro REDUCE0 0
    movdqa  xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa  xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa  xmm2, XMMWORD [VALUES + (16*2)]
    movdqa  xmm3, XMMWORD [VALUES + (24*2)]
    movdqa  xmm4, XMMWORD [VALUES + (32*2)]
    movdqa  xmm5, XMMWORD [VALUES + (40*2)]
    movdqa  xmm6, XMMWORD [VALUES + (48*2)]
    pcmpeqw xmm0, ZERO
    pcmpeqw xmm1, ZERO
    pcmpeqw xmm2, ZERO
    pcmpeqw xmm3, ZERO
    pcmpeqw xmm4, ZERO
    pcmpeqw xmm5, ZERO
    pcmpeqw xmm6, ZERO
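    ; xmm7 is ZERO; reusing it for this compare checks the last eight values
    ; against zero without needing another register (ZERO is not used again
    ; within the macro).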
    pcmpeqw xmm7, XMMWORD [VALUES + (56*2)]
    packsswb xmm0, xmm1
    packsswb xmm2, xmm3
    packsswb xmm4, xmm5
    packsswb xmm6, xmm7
    pmovmskb eax, xmm0
    pmovmskb ecx, xmm2
    pmovmskb edx, xmm4
    pmovmskb esi, xmm6
    shl     ecx, 16
    shl     esi, 16
    or      eax, ecx
    or      edx, esi
    not     eax
    not     edx
    mov     edi, ZEROBITS
    mov     INT [edi], eax
    mov     INT [edi+SIZEOF_INT], edx
%endmacro
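
; REDUCE0 derives the zerobits bitmap from the 64 prepared values: bit k of
; the two 32-bit words written through ZEROBITS is set iff values[k] is
; nonzero.  Roughly, in scalar C (a sketch, not the actual jcphuff.c code):
;
;   uint64_t zb = 0;
;   for (k = 0; k < DCTSIZE2; k++)
;     if (values[k] != 0)
;       zb |= (uint64_t)1 << k;
;   zerobits[0] = (uint32_t)zb;
;   zerobits[1] = (uint32_t)(zb >> 32);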

;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *zerobits

%define ZERO      xmm7
%define X0        xmm0
%define X1        xmm1
%define N0        xmm2
%define N1        xmm3
%define AL        xmm4
%define K         eax
%define LENEND    eax
%define LUT       ebx
%define T0        ecx
%define T1        edx
%define BLOCK     esi
%define VALUES    edi
%define LEN       ebp
%define ZEROBITS  INT [esp + 5 * 4]

    align   32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push    ebp
    mov     eax, esp                    ; eax = original ebp
    sub     esp, byte 4
    and     esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov     [esp], eax
    mov     ebp, esp                    ; ebp = aligned ebp
    sub     esp, 4
    push    ebx
    push    ecx
;   push    edx                         ; need not be preserved
    push    esi
    push    edi
    push    ebp

    mov     BLOCK, INT [eax + 8]
    mov     LUT, INT [eax + 12]
    mov     VALUES, INT [eax + 24]
    movd    AL, INT [eax + 20]
    mov     T0, INT [eax + 28]
    mov     ZEROBITS, T0
    mov     LEN, INT [eax + 16]
    pxor    ZERO, ZERO
    mov     K, LEN
    and     K, -16
    shr     K, 4
    jz      .ELOOP16
.BLOOP16:
    LOAD16
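    ; Branchless absolute value and point transform, per 16-bit lane
    ; (a rough scalar sketch of the instructions below):
    ;
    ;   neg = (x < 0) ? -1 : 0;
    ;   x = (x + neg) ^ neg;   /* x = abs(x) */
    ;   x >>= Al;              /* point transform */
    ;   n = x ^ neg;           /* n = x, or ~x if the coefficient was negative */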
    pcmpgtw N0, X0
    pcmpgtw N1, X1
    paddw   X0, N0
    paddw   X1, N1
    pxor    X0, N0
    pxor    X1, N1
    psrlw   X0, AL
    psrlw   X1, AL
    pxor    N0, X0
    pxor    N1, X1
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (8) * 2], X1
    movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa  XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add     VALUES, 16*2
    add     LUT, 16*SIZEOF_INT
    dec     K
    jnz     .BLOOP16

    test    LEN, 15
    je      .PADDING
.ELOOP16:
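    ; Dispatch on the number of coefficients that remain (Sl mod 16):
    ; 9-15 remaining fall through to LOAD15, exactly 8 takes .TRY8, and
    ; 1-7 takes .TRY7.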
    mov     LENEND, LEN
    and     LENEND, 7

    test    LEN, 8
    jz      .TRY7
    test    LEN, 7
    jz      .TRY8

    LOAD15
    pcmpgtw N0, X0
    pcmpgtw N1, X1
    paddw   X0, N0
    paddw   X1, N1
    pxor    X0, N0
    pxor    X1, N1
    psrlw   X0, AL
    psrlw   X1, AL
    pxor    N0, X0
    pxor    N1, X1
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (8) * 2], X1
    movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa  XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add     VALUES, 16*2
    jmp     .PADDING
.TRY8:
    LOAD8
    pcmpgtw N0, X0
    paddw   X0, N0
    pxor    X0, N0
    psrlw   X0, AL
    pxor    N0, X0
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add     VALUES, 8*2
    jmp     .PADDING
.TRY7:
    LOAD7
    pcmpgtw N0, X0
    paddw   X0, N0
    pxor    X0, N0
    psrlw   X0, AL
    pxor    N0, X0
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add     VALUES, 8*2
.PADDING:
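    ; Zero-pad VALUES out to DCTSIZE2 coefficients.  K becomes
    ; ceil(Sl/8) - DCTSIZE2/8, i.e. minus the number of 8-word groups still
    ; to clear, and the loop below counts it up to zero.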
    mov     K, LEN
    add     K, 7
    and     K, -8
    shr     K, 3
    sub     K, DCTSIZE2/8
    jz      .EPADDING
    align   16
.ZEROLOOP:
    movdqa  XMMWORD [VALUES + 0], ZERO
    add     VALUES, 8*2
    inc     K
    jnz     .ZEROLOOP
.EPADDING:
    sub     VALUES, DCTSIZE2*2

    REDUCE0

    pop     ebp
    pop     edi
    pop     esi
;   pop     edx                     ; need not be preserved
    pop     ecx
    pop     ebx
    mov     esp, ebp                ; esp <- aligned ebp
    pop     esp                     ; esp <- original ebp
    pop     ebp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN

;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *absvalues
; eax + 28 = size_t *bits

%define ZERO      xmm7
%define ONE       xmm5
%define X0        xmm0
%define X1        xmm1
%define N0        xmm2
%define N1        xmm3
%define AL        xmm4
%define K         eax
%define LENEND    eax
%define LUT       ebx
%define T0        ecx
%define T0w       cx
%define T1        edx
%define BLOCK     esi
%define VALUES    edi
%define KK        ebp
%define ZEROBITS  INT [esp + 5 * 4]
%define EOB       INT [esp + 5 * 4 + 4]
%define LEN       INT [esp + 5 * 4 + 8]

    align   32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push    ebp
    mov     eax, esp                    ; eax = original ebp
    sub     esp, byte 4
    and     esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov     [esp], eax
    mov     ebp, esp                    ; ebp = aligned ebp
    sub     esp, 16
    push    ebx
    push    ecx
;   push    edx                         ; need not be preserved
    push    esi
    push    edi
    push    ebp
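
    ; Build ONE = a vector of 16-bit words, each equal to 1 (all ones shifted
    ; right by 15); it is used below to detect absolute values of exactly 1
    ; for the EOB search.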
    pcmpeqw ONE, ONE
    psrlw   ONE, 15
    mov     BLOCK, INT [eax + 8]
    mov     LUT, INT [eax + 12]
    mov     VALUES, INT [eax + 24]
    movd    AL, INT [eax + 20]
    mov     T0, INT [eax + 28]
    mov     K, INT [eax + 16]
    mov     INT [T0 + 2 * SIZEOF_INT], -1
    mov     INT [T0 + 3 * SIZEOF_INT], -1
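    ; bits[2] and bits[3] will hold the packed (inverted) sign bits; they are
    ; preset to all ones so that any 16-bit slice not written below (when
    ; Sl < DCTSIZE2) keeps a well-defined value.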
    mov     ZEROBITS, T0
    mov     LEN, K
    pxor    ZERO, ZERO
    and     K, -16
    mov     EOB, 0
    xor     KK, KK
    shr     K, 4
    jz      .ELOOPR16
.BLOOPR16:
    LOAD16
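    ; As in the first-scan loop, compute abs(x) >> Al and store it.  Then,
    ; for refinement: pack this group's sign bits into the bits[] array
    ; (16 bits per group of 16 coefficients), and record the highest lane
    ; whose shifted absolute value is exactly 1; KK*8 plus that lane index
    ; becomes the EOB candidate for this group.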
    pcmpgtw N0, X0
    pcmpgtw N1, X1
    paddw   X0, N0
    paddw   X1, N1
    pxor    X0, N0
    pxor    X1, N1
    psrlw   X0, AL
    psrlw   X1, AL
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw X0, ONE
    pcmpeqw X1, ONE
    packsswb N0, N1
    packsswb X0, X1
    pmovmskb T0, N0               ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov     T1, ZEROBITS
    not     T0
    mov     word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb T1, X0               ; idx = _mm_movemask_epi8(x1);
    bsr     T1, T1                ;  idx = 16 - (__builtin_clz(idx) >> 1);
    jz      .CONTINUER16          ; if (idx) {
    lea     T1, [T1+KK*8]
    mov     EOB, T1               ;   EOB = k + idx;
.CONTINUER16:
    add     VALUES, 16*2
    add     LUT, 16*SIZEOF_INT
    add     KK, 2
    dec     K
    jnz     .BLOOPR16
.ELOOPR16:
    mov     LENEND, LEN

    test    LENEND, 8
    jz      .TRYR7
    test    LENEND, 7
    jz      .TRYR8

    and     LENEND, 7
    LOAD15
    pcmpgtw N0, X0
    pcmpgtw N1, X1
    paddw   X0, N0
    paddw   X1, N1
    pxor    X0, N0
    pxor    X1, N1
    psrlw   X0, AL
    psrlw   X1, AL
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    movdqa  XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw X0, ONE
    pcmpeqw X1, ONE
    packsswb N0, N1
    packsswb X0, X1
    pmovmskb T0, N0               ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov     T1, ZEROBITS
    not     T0
    mov     word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb T1, X0               ; idx = _mm_movemask_epi8(x1);
    bsr     T1, T1                ;  idx = 16 - (__builtin_clz(idx) >> 1);
    jz      .CONTINUER15          ; if (idx) {
    lea     T1, [T1+KK*8]
    mov     EOB, T1               ;   EOB = k + idx;
.CONTINUER15:
    add     VALUES, 16*2
    jmp     .PADDINGR
.TRYR8:
    LOAD8
    pcmpgtw N0, X0
    paddw   X0, N0
    pxor    X0, N0
    psrlw   X0, AL
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw X0, ONE
    packsswb N0, ZERO
    packsswb X0, ZERO
    pmovmskb T0, N0               ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov     T1, ZEROBITS
    not     T0
    mov     word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb T1, X0               ; idx = _mm_movemask_epi8(x1);
    bsr     T1, T1                ;  idx = 16 - (__builtin_clz(idx) >> 1);
    jz      .CONTINUER8           ; if (idx) {
    lea     T1, [T1+KK*8]
    mov     EOB, T1               ;   EOB = k + idx;
.CONTINUER8:
    add     VALUES, 8*2
    jmp     .PADDINGR
.TRYR7:
    and     LENEND, 7
    LOAD7
    pcmpgtw N0, X0
    paddw   X0, N0
    pxor    X0, N0
    psrlw   X0, AL
    movdqa  XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw X0, ONE
    packsswb N0, ZERO
    packsswb X0, ZERO
    pmovmskb T0, N0               ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov     T1, ZEROBITS
    not     T0
    mov     word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb T1, X0               ; idx = _mm_movemask_epi8(x1);
    bsr     T1, T1                ;  idx = 16 - (__builtin_clz(idx) >> 1);
    jz      .CONTINUER7           ; if (idx) {
    lea     T1, [T1+KK*8]
    mov     EOB, T1               ;   EOB = k + idx;
.CONTINUER7:
    add     VALUES, 8*2
.PADDINGR:
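    ; Zero-pad VALUES out to DCTSIZE2 coefficients, exactly as in .PADDING
    ; above.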
    mov     K, LEN
    add     K, 7
    and     K, -8
    shr     K, 3
    sub     K, DCTSIZE2/8
    jz      .EPADDINGR
    align   16
.ZEROLOOPR:
    movdqa  XMMWORD [VALUES + 0], ZERO
    add     VALUES, 8*2
    inc     K
    jnz     .ZEROLOOPR
.EPADDINGR:
    sub     VALUES, DCTSIZE2*2

    REDUCE0
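
    ; Return the EOB position as the function's int result.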
    mov     eax, EOB

    pop     ebp
    pop     edi
    pop     esi
;   pop     edx                     ; need not be preserved
    pop     ecx
    pop     ebx
    mov     esp, ebp                ; esp <- aligned ebp
    pop     esp                     ; esp <- original ebp
    pop     ebp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align   32