;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
; (64-bit SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for
; progressive Huffman encoding.  See jcphuff.c for more details.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
SECTION SEG_TEXT
BITS 64

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
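;
; These LOAD* macros gather coefficients from BLOCK in natural order through
; the LUT pointer.  Roughly equivalent scalar C, as a non-authoritative
; sketch (variable names are illustrative only):
;
;   for (k = 0; k < n; k++)
;     x[k] = block[jpeg_natural_order_start[k]];
;
; LOAD16 fills X0/X1 with 16 coefficients; the shorter variants handle the
; tail of the scan and leave any unused lanes zero.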
%macro LOAD16 0
    pxor      N0, N0
    pxor      N1, N1

    mov       T0d, INT [LUT + 0*SIZEOF_INT]
    mov       T1d, INT [LUT + 8*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 0
    pinsrw    X1, word [BLOCK + T1 * 2], 0

    mov       T0d, INT [LUT + 1*SIZEOF_INT]
    mov       T1d, INT [LUT + 9*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 1
    pinsrw    X1, word [BLOCK + T1 * 2], 1

    mov       T0d, INT [LUT + 2*SIZEOF_INT]
    mov       T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 2
    pinsrw    X1, word [BLOCK + T1 * 2], 2

    mov       T0d, INT [LUT + 3*SIZEOF_INT]
    mov       T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 3
    pinsrw    X1, word [BLOCK + T1 * 2], 3

    mov       T0d, INT [LUT + 4*SIZEOF_INT]
    mov       T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 4
    pinsrw    X1, word [BLOCK + T1 * 2], 4

    mov       T0d, INT [LUT + 5*SIZEOF_INT]
    mov       T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 5
    pinsrw    X1, word [BLOCK + T1 * 2], 5

    mov       T0d, INT [LUT + 6*SIZEOF_INT]
    mov       T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 6
    pinsrw    X1, word [BLOCK + T1 * 2], 6

    mov       T0d, INT [LUT + 7*SIZEOF_INT]
    mov       T1d, INT [LUT + 15*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 7
    pinsrw    X1, word [BLOCK + T1 * 2], 7
%endmacro
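
; Like LOAD16, but for a tail of 8 + (Sl & 7) coefficients (9..15): the first
; eight go into X0 unconditionally and the rest into X1 one lane at a time,
; guarded by LENEND, so lanes past the end of the scan stay zero.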
%macro LOAD15 0
    pxor      N0, N0
    pxor      N1, N1
    pxor      X1, X1

    mov       T0d, INT [LUT + 0*SIZEOF_INT]
    mov       T1d, INT [LUT + 8*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 0
    pinsrw    X1, word [BLOCK + T1 * 2], 0

    mov       T0d, INT [LUT + 1*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 1
    mov       T0d, INT [LUT + 2*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 2
    mov       T0d, INT [LUT + 3*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 3
    mov       T0d, INT [LUT + 4*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 4
    mov       T0d, INT [LUT + 5*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 5
    mov       T0d, INT [LUT + 6*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 6
    mov       T0d, INT [LUT + 7*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 7

    cmp       LENEND, 2
    jl        %%.ELOAD15
    mov       T1d, INT [LUT + 9*SIZEOF_INT]
    pinsrw    X1, word [BLOCK + T1 * 2], 1

    cmp       LENEND, 3
    jl        %%.ELOAD15
    mov       T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw    X1, word [BLOCK + T1 * 2], 2

    cmp       LENEND, 4
    jl        %%.ELOAD15
    mov       T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw    X1, word [BLOCK + T1 * 2], 3

    cmp       LENEND, 5
    jl        %%.ELOAD15
    mov       T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw    X1, word [BLOCK + T1 * 2], 4

    cmp       LENEND, 6
    jl        %%.ELOAD15
    mov       T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw    X1, word [BLOCK + T1 * 2], 5

    cmp       LENEND, 7
    jl        %%.ELOAD15
    mov       T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw    X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro
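
; Load exactly eight remaining coefficients into X0 (used when exactly eight
; coefficients are left to process).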
%macro LOAD8 0
    pxor      N0, N0

    mov       T0d, INT [LUT + 0*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 0
    mov       T0d, INT [LUT + 1*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 1
    mov       T0d, INT [LUT + 2*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 2
    mov       T0d, INT [LUT + 3*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 3
    mov       T0d, INT [LUT + 4*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 4
    mov       T0d, INT [LUT + 5*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 5
    mov       T0d, INT [LUT + 6*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 6
    mov       T0d, INT [LUT + 7*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T0 * 2], 7
%endmacro
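
; Load the remaining 1..7 coefficients (LENEND = Sl & 7) into X0, one lane at
; a time and guarded by LENEND; unused lanes of X0 stay zero.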
%macro LOAD7 0
    pxor      N0, N0
    pxor      X0, X0

    mov       T1d, INT [LUT + 0*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T1 * 2], 0

    cmp       LENEND, 2
    jl        %%.ELOAD7
    mov       T1d, INT [LUT + 1*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T1 * 2], 1

    cmp       LENEND, 3
    jl        %%.ELOAD7
    mov       T1d, INT [LUT + 2*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T1 * 2], 2

    cmp       LENEND, 4
    jl        %%.ELOAD7
    mov       T1d, INT [LUT + 3*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T1 * 2], 3

    cmp       LENEND, 5
    jl        %%.ELOAD7
    mov       T1d, INT [LUT + 4*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T1 * 2], 4

    cmp       LENEND, 6
    jl        %%.ELOAD7
    mov       T1d, INT [LUT + 5*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T1 * 2], 5

    cmp       LENEND, 7
    jl        %%.ELOAD7
    mov       T1d, INT [LUT + 6*SIZEOF_INT]
    pinsrw    X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro
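
; REDUCE0 builds the 64-bit "zero bits" bitmap from the 64 absolute values
; just stored at VALUES: each word is compared with zero, the results are
; packed to bytes, the four 16-bit movemasks are merged into one 64-bit word,
; and the word is inverted so that bit k is set iff values[k] != 0.  Roughly
; equivalent scalar C, as a non-authoritative sketch:
;
;   uint64_t zerobits = 0;
;   for (k = 0; k < DCTSIZE2; k++)
;     if (values[k] != 0)
;       zerobits |= (uint64_t)1 << k;
;   /* the result is stored through r15 (the zerobits/bits argument) */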
%macro REDUCE0 0
    movdqa    xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa    xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa    xmm2, XMMWORD [VALUES + (16*2)]
    movdqa    xmm3, XMMWORD [VALUES + (24*2)]
    movdqa    xmm4, XMMWORD [VALUES + (32*2)]
    movdqa    xmm5, XMMWORD [VALUES + (40*2)]
    movdqa    xmm6, XMMWORD [VALUES + (48*2)]
    movdqa    xmm7, XMMWORD [VALUES + (56*2)]

    pcmpeqw   xmm0, ZERO
    pcmpeqw   xmm1, ZERO
    pcmpeqw   xmm2, ZERO
    pcmpeqw   xmm3, ZERO
    pcmpeqw   xmm4, ZERO
    pcmpeqw   xmm5, ZERO
    pcmpeqw   xmm6, ZERO
    pcmpeqw   xmm7, ZERO

    packsswb  xmm0, xmm1
    packsswb  xmm2, xmm3
    packsswb  xmm4, xmm5
    packsswb  xmm6, xmm7

    pmovmskb  eax, xmm0
    pmovmskb  ecx, xmm2
    pmovmskb  edx, xmm4
    pmovmskb  esi, xmm6

    shl       rcx, 16
    shl       rdx, 32
    shl       rsi, 48
    or        rax, rcx
    or        rdx, rsi
    or        rax, rdx
    not       rax
    mov       MMWORD [r15], rax
%endmacro

;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *zerobits
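;
; For each of the Sl coefficients, the routine below computes, branchlessly
; and 16 at a time, roughly the following scalar C (a sketch only; see
; jcphuff.c for the reference implementation):
;
;   coef = block[jpeg_natural_order_start[k]];
;   temp = (coef < 0) ? -coef : coef;       /* absolute value */
;   temp >>= Al;                            /* point transform */
;   values[k]            = temp;
;   values[k + DCTSIZE2] = (coef < 0) ? ~temp : temp;
;
; then zero-pads values[] to DCTSIZE2 entries and stores the nonzero-
; coefficient bitmap to *zerobits via REDUCE0.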

%define ZERO    xmm9
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align     32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push      rbp
    mov       rax, rsp                     ; rax = original rbp
    sub       rsp, byte 4
    and       rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov       [rsp], rax
    mov       rbp, rsp                     ; rbp = aligned rbp
    lea       rsp, [rbp - 16]
    collect_args 6

    movdqa    XMMWORD [rbp - 16], ZERO
    movd      AL, r13d
    pxor      ZERO, ZERO
    mov       K, LEN
    mov       LENEND, LEN
    and       K, -16
    and       LENEND, 7
    shr       K, 4
    jz        .ELOOP16
.BLOOP16:
    LOAD16
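    ; Branchless absolute value and point transform: after pcmpgtw, N0/N1 are
    ; the sign masks (all ones for negative lanes); paddw/pxor turn X0/X1 into
    ; abs(coef), psrlw applies the >> Al point transform, and the final pxor
    ; turns N0/N1 into the value for positive lanes and its one's complement
    ; for negative lanes (the values[k + DCTSIZE2] entries).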
    pcmpgtw   N0, X0
    pcmpgtw   N1, X1
    paddw     X0, N0
    paddw     X1, N1
    pxor      X0, N0
    pxor      X1, N1
    psrlw     X0, AL
    psrlw     X1, AL
    pxor      N0, X0
    pxor      N1, X1
    movdqa    XMMWORD [VALUES + (0) * 2], X0
    movdqa    XMMWORD [VALUES + (8) * 2], X1
    movdqa    XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa    XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add       VALUES, 16*2
    add       LUT, 16*SIZEOF_INT
    dec       K
    jnz       .BLOOP16
    test      LEN, 15
    je        .PADDING
.ELOOP16:
    test      LEN, 8
    jz        .TRY7
    test      LEN, 7
    jz        .TRY8

    LOAD15
    pcmpgtw   N0, X0
    pcmpgtw   N1, X1
    paddw     X0, N0
    paddw     X1, N1
    pxor      X0, N0
    pxor      X1, N1
    psrlw     X0, AL
    psrlw     X1, AL
    pxor      N0, X0
    pxor      N1, X1
    movdqa    XMMWORD [VALUES + (0) * 2], X0
    movdqa    XMMWORD [VALUES + (8) * 2], X1
    movdqa    XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa    XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add       VALUES, 16*2
    jmp       .PADDING
.TRY8:
    LOAD8
    pcmpgtw   N0, X0
    paddw     X0, N0
    pxor      X0, N0
    psrlw     X0, AL
    pxor      N0, X0
    movdqa    XMMWORD [VALUES + (0) * 2], X0
    movdqa    XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add       VALUES, 8*2
    jmp       .PADDING
.TRY7:
    LOAD7
    pcmpgtw   N0, X0
    paddw     X0, N0
    pxor      X0, N0
    psrlw     X0, AL
    pxor      N0, X0
    movdqa    XMMWORD [VALUES + (0) * 2], X0
    movdqa    XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add       VALUES, 8*2
.PADDING:
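    ; Zero-pad values[] out to DCTSIZE2 entries.  K is set to
    ; ceil(Sl/8) - DCTSIZE2/8, a non-positive count of the 8-word groups that
    ; still need zeroing, and the loop below increments it back up to zero.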
    mov       K, LEN
    add       K, 7
    and       K, -8
    shr       K, 3
    sub       K, DCTSIZE2/8
    jz        .EPADDING
    align     16
.ZEROLOOP:
    movdqa    XMMWORD [VALUES + 0], ZERO
    add       VALUES, 8*2
    inc       K
    jnz       .ZEROLOOP
.EPADDING:
    sub       VALUES, DCTSIZE2*2

    REDUCE0

    movdqa    ZERO, XMMWORD [rbp - 16]
    uncollect_args 6
    mov       rsp, rbp                     ; rsp <- aligned rbp
    pop       rsp                          ; rsp <- original rbp
    pop       rbp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *absvalues
; r15 = size_t *bits
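;
; For each of the Sl coefficients, the routine below computes, 16 at a time,
; roughly the following scalar C (a sketch only; see jcphuff.c for the
; reference implementation):
;
;   coef = block[jpeg_natural_order_start[k]];
;   temp = (coef < 0) ? -coef : coef;       /* absolute value */
;   temp >>= Al;                            /* point transform */
;   absvalues[k] = temp;
;
; while also collecting the (complemented) coefficient sign bits into
; bits[1], the nonzero-coefficient bitmap into bits[0] (via REDUCE0), and
; tracking the position of the last shifted absolute value equal to 1, which
; is returned in eax as the EOB position.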

%define ZERO    xmm9
%define ONE     xmm5
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define KK      r9d
%define EOB     r8d
%define SIGN    rdi
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align     32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push      rbp
    mov       rax, rsp                     ; rax = original rbp
    sub       rsp, byte 4
    and       rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov       [rsp], rax
    mov       rbp, rsp                     ; rbp = aligned rbp
    lea       rsp, [rbp - 16]
    collect_args 6

    movdqa    XMMWORD [rbp - 16], ZERO
    xor       SIGN, SIGN
    xor       EOB, EOB
    xor       KK, KK
    movd      AL, r13d
    pxor      ZERO, ZERO
    pcmpeqw   ONE, ONE
    psrlw     ONE, 15
    mov       K, LEN
    mov       LENEND, LEN
    and       K, -16
    and       LENEND, 7
    shr       K, 4
    jz        .ELOOPR16
.BLOOPR16:
    LOAD16
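    ; Same branchless absolute value and point transform as in the first-pass
    ; routine, but here only the magnitudes are stored; the sign masks in
    ; N0/N1 are kept to collect the sign bits into SIGN, and the comparison
    ; with ONE marks candidates for the EOB position.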
    pcmpgtw   N0, X0
    pcmpgtw   N1, X1
    paddw     X0, N0
    paddw     X1, N1
    pxor      X0, N0
    pxor      X1, N1
    psrlw     X0, AL
    psrlw     X1, AL
    movdqa    XMMWORD [VALUES + (0) * 2], X0
    movdqa    XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw   X0, ONE
    pcmpeqw   X1, ONE
    packsswb  N0, N1
    packsswb  X0, X1
    pmovmskb  T0d, N0                      ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb  T1d, X0                      ; idx = _mm_movemask_epi8(x1);
    shr       SIGN, 16                     ; make room for sizebits
    shl       T0, 48
    or        SIGN, T0
    bsr       T1d, T1d                     ; idx = 16 - (__builtin_clz(idx)>>1);
    jz        .CONTINUER16                 ; if (idx) {
    mov       EOB, KK
    add       EOB, T1d                     ; EOB = k + idx;
.CONTINUER16:
    add       VALUES, 16*2
    add       LUT, 16*SIZEOF_INT
    add       KK, 16
    dec       K
    jnz       .BLOOPR16
.ELOOPR16:
    test      LEN, 8
    jz        .TRYR7
    test      LEN, 7
    jz        .TRYR8

    LOAD15
    pcmpgtw   N0, X0
    pcmpgtw   N1, X1
    paddw     X0, N0
    paddw     X1, N1
    pxor      X0, N0
    pxor      X1, N1
    psrlw     X0, AL
    psrlw     X1, AL
    movdqa    XMMWORD [VALUES + (0) * 2], X0
    movdqa    XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw   X0, ONE
    pcmpeqw   X1, ONE
    packsswb  N0, N1
    packsswb  X0, X1
    pmovmskb  T0d, N0                      ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb  T1d, X0                      ; idx = _mm_movemask_epi8(x1);
    shr       SIGN, 16                     ; make room for sizebits
    shl       T0, 48
    or        SIGN, T0
    bsr       T1d, T1d                     ; idx = 16 - (__builtin_clz(idx)>>1);
    jz        .CONTINUER15                 ; if (idx) {
    mov       EOB, KK
    add       EOB, T1d                     ; EOB = k + idx;
.CONTINUER15:
    add       VALUES, 16*2
    jmp       .PADDINGR
.TRYR8:
    LOAD8
    pcmpgtw   N0, X0
    paddw     X0, N0
    pxor      X0, N0
    psrlw     X0, AL
    movdqa    XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw   X0, ONE
    packsswb  N0, ZERO
    packsswb  X0, ZERO
    pmovmskb  T0d, N0                      ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb  T1d, X0                      ; idx = _mm_movemask_epi8(x1);
    shr       SIGN, 8                      ; make room for sizebits
    shl       T0, 56
    or        SIGN, T0
    bsr       T1d, T1d                     ; idx = 16 - (__builtin_clz(idx)>>1);
    jz        .CONTINUER8                  ; if (idx) {
    mov       EOB, KK
    add       EOB, T1d                     ; EOB = k + idx;
.CONTINUER8:
    add       VALUES, 8*2
    jmp       .PADDINGR
.TRYR7:
    LOAD7
    pcmpgtw   N0, X0
    paddw     X0, N0
    pxor      X0, N0
    psrlw     X0, AL
    movdqa    XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw   X0, ONE
    packsswb  N0, ZERO
    packsswb  X0, ZERO
    pmovmskb  T0d, N0                      ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb  T1d, X0                      ; idx = _mm_movemask_epi8(x1);
    shr       SIGN, 8                      ; make room for sizebits
    shl       T0, 56
    or        SIGN, T0
    bsr       T1d, T1d                     ; idx = 16 - (__builtin_clz(idx)>>1);
    jz        .CONTINUER7                  ; if (idx) {
    mov       EOB, KK
    add       EOB, T1d                     ; EOB = k + idx;
.CONTINUER7:
    add       VALUES, 8*2
.PADDINGR:
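    ; Same zero padding of the values array as in the first-pass routine; in
    ; addition SIGN keeps shifting right by 8 for every padded group so that
    ; the collected sign bits end up with coefficient 0 at bit 0.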
    mov       K, LEN
    add       K, 7
    and       K, -8
    shr       K, 3
    sub       K, DCTSIZE2/8
    jz        .EPADDINGR
    align     16
.ZEROLOOPR:
    movdqa    XMMWORD [VALUES + 0], ZERO
    shr       SIGN, 8
    add       VALUES, 8*2
    inc       K
    jnz       .ZEROLOOPR
.EPADDINGR:
    not       SIGN
    sub       VALUES, DCTSIZE2*2
    mov       MMWORD [r15+SIZEOF_MMWORD], SIGN

    REDUCE0

    mov       eax, EOB

    movdqa    ZERO, XMMWORD [rbp - 16]
    uncollect_args 6
    mov       rsp, rbp                     ; rsp <- aligned rbp
    pop       rsp                          ; rsp <- original rbp
    pop       rbp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align     32