jidctfst-mmx.asm 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499
  1. ;
  2. ; jidctfst.asm - fast integer IDCT (MMX)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2016, D. R. Commander.
  6. ;
  7. ; Based on the x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler); it
  12. ; can *not* be assembled with Microsoft's MASM or any compatible
  13. ; assembler (including Borland's Turbo Assembler).
  14. ; NASM is available from http://nasm.sourceforge.net/ or
  15. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16. ;
  17. ; This file contains a fast, less accurate integer implementation of
  18. ; the inverse DCT (Discrete Cosine Transform). The following code is
  19. ; based directly on the IJG's original jidctfst.c; see jidctfst.c
  20. ; for more details.
  21. %include "jsimdext.inc"
  22. %include "jdct.inc"
  23. ; --------------------------------------------------------------------------
  ; Fixed-point parameters and multiplier constants used by the IDCT below.
  ; CONST_BITS is the number of fractional bits in the F_* constants
  ; (with CONST_BITS == 8, 1.0 is represented as 256).
  24. %define CONST_BITS 8 ; 14 is also OK.
  ; PASS1_BITS takes part in the final descale of pass 2, which shifts its
  ; results right by (PASS1_BITS+3). It must match IFAST_SCALE_BITS, which is
  ; not defined in this file (presumably it comes from one of the includes).
  25. %define PASS1_BITS 2
  26. %if IFAST_SCALE_BITS != PASS1_BITS
  27. %error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
  28. %endif
  29. %if CONST_BITS == 8
  ; Precomputed values: round(c * 2^8).
  30. F_1_082 equ 277 ; FIX(1.082392200)
  31. F_1_414 equ 362 ; FIX(1.414213562)
  32. F_1_847 equ 473 ; FIX(1.847759065)
  33. F_2_613 equ 669 ; FIX(2.613125930)
  ; 2.613... minus 1.0: F_2_613 itself, once shifted left by CONST_SHIFT for
  ; the constant table, would not fit in a signed 16-bit word, so the
  ; multiplication is split into (1.613... * x) + x.
  34. F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
  35. %else
  36. ; NASM cannot do compile-time arithmetic on floating-point constants.
  ; The literals below are the constants scaled by 2^30; DESCALE(x, n) is a
  ; right shift by n bits with rounding (adds half of 2^n before shifting).
  37. %define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
  38. F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS) ; FIX(1.082392200)
  39. F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS) ; FIX(1.414213562)
  40. F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
  41. F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS) ; FIX(2.613125930)
  42. F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
  43. %endif
  44. ; --------------------------------------------------------------------------
  ; Read-only constant table referenced by the IDCT code through GOTOFF().
  45. SECTION SEG_CONST
  ; Scaling scheme for pmulhw (signed 16x16 multiply keeping the high 16 bits):
  ; the data operand is pre-shifted left by PRE_MULTIPLY_SCALE_BITS and the
  ; constant is stored shifted left by CONST_SHIFT, so that the discarded low
  ; 16 bits account for CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS and
  ; the result is (data * constant) >> CONST_BITS.
  46. ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
  47. ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
  48. %define PRE_MULTIPLY_SCALE_BITS 2
  49. %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
  50. alignz 32
  51. GLOBAL_DATA(jconst_idct_ifast_mmx)
  52. EXTN(jconst_idct_ifast_mmx):
  ; Each PW_* entry is one multiplier replicated into four 16-bit words.
  53. PW_F1414 times 4 dw F_1_414 << CONST_SHIFT
  54. PW_F1847 times 4 dw F_1_847 << CONST_SHIFT
  ; Negated (2.613... - 1.0); the remaining "- z10" term is applied separately
  ; in the code with psubw.
  55. PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT
  56. PW_F1082 times 4 dw F_1_082 << CONST_SHIFT
  ; Eight bytes of CENTERJSAMPLE, added (paddb) to the packed signed output
  ; bytes in pass 2 before they are stored.
  57. PB_CENTERJSAMP times 8 db CENTERJSAMPLE
  58. alignz 32
  59. ; --------------------------------------------------------------------------
  60. SECTION SEG_TEXT
  61. BITS 32
  62. ;
  63. ; Perform dequantization and inverse DCT on one block of coefficients.
  64. ;
  65. ; GLOBAL(void)
  66. ; jsimd_idct_ifast_mmx(void *dct_table, JCOEFPTR coef_block,
  67. ; JSAMPARRAY output_buf, JDIMENSION output_col)
  68. ;
  ; Argument accessors. (b) is the stack pointer value captured right after
  ; "push ebp" in the prologue, so [b+0] is the saved ebp, [b+4] the return
  ; address and the arguments start at [b+8].
  69. %define dct_table(b) (b) + 8 ; void *dct_table
  70. %define coef_block(b) (b) + 12 ; JCOEFPTR coef_block
  71. %define output_buf(b) (b) + 16 ; JSAMPARRAY output_buf
  72. %define output_col(b) (b) + 20 ; JDIMENSION output_col
  ; Local frame, addressed from the 8-byte-aligned ebp set up in the prologue:
  ;   [ebp + 0]            saved pre-alignment stack pointer
  ;   wk(0) .. wk(WK_NUM-1) mmword scratch slots directly below ebp
  ;   workspace            DCTSIZE2 JCOEFs directly below wk(0)
  73. %define original_ebp ebp + 0
  74. %define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
  75. ; mmword wk[WK_NUM]
  76. %define WK_NUM 2
  77. %define workspace wk(0) - DCTSIZE2 * SIZEOF_JCOEF
  78. ; JCOEF workspace[DCTSIZE2]
  ; jsimd_idct_ifast_mmx: dequantize one block of coefficients and apply a
  ; two-pass fast integer inverse DCT with MMX, storing the resulting samples
  ; through the row pointers in output_buf at column offset output_col.
  ;
  ; Arguments are read from the stack via dct_table()/coef_block()/
  ; output_buf()/output_col().
  ;
  ; Register roles:
  ;   ebx  GOT address for GOTOFF() (temporarily reused as a row pointer
  ;        in pass 2, saved/restored with pushpic/poppic)
  ;   edx  quantptr in pass 1; output row pointer in pass 2
  ;   esi  inptr in pass 1; wsptr in pass 2
  ;   edi  wsptr in pass 1; pointer into the output row-pointer array in pass 2
  ;   ecx  loop counter (DCTSIZE/4 iterations, four 16-bit lanes per iteration)
  ;   eax  argument base in pass 1 (also scratch in the zero test);
  ;        output_col in pass 2
  ; ebx, esi and edi are saved and restored; ecx and edx are not.
  ; emms is executed before returning.
  79. align 32
  80. GLOBAL_FUNCTION(jsimd_idct_ifast_mmx)
  81. EXTN(jsimd_idct_ifast_mmx):
  ; Prologue: build an 8-byte-aligned frame. The pre-alignment stack pointer
  ; is kept in eax (for argument access) and saved at [ebp] so the epilogue
  ; can restore it with "pop esp".
  82. push ebp
  83. mov eax, esp ; eax = original ebp
  84. sub esp, byte 4
  85. and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
  86. mov [esp], eax
  87. mov ebp, esp ; ebp = aligned ebp
  88. lea esp, [workspace]
  89. push ebx
  90. ; push ecx ; need not be preserved
  91. ; push edx ; need not be preserved
  92. push esi
  93. push edi
  94. get_GOT ebx ; get GOT address
  95. ; ---- Pass 1: process columns from input, store into work array.
  ; eax still holds the value stored at [original_ebp], so the reload below
  ; is left commented out.
  96. ; mov eax, [original_ebp]
  97. mov edx, POINTER [dct_table(eax)] ; quantptr
  98. mov esi, JCOEFPTR [coef_block(eax)] ; inptr
  99. lea edi, [workspace] ; JCOEF *wsptr
  100. mov ecx, DCTSIZE/4 ; ctr
  101. alignx 16, 7
  102. .columnloop:
  103. %ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
  ; Shortcut: if every AC term (rows 1..7) of the four columns handled in
  ; this iteration is zero, the output is just the dequantized DC term
  ; replicated. The first two dword loads are a cheap early-out on rows 1
  ; and 2 before the full OR over rows 1..7.
  104. mov eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
  105. or eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
  106. jnz short .columnDCT
  107. movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
  108. movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
  109. por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  110. por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
  111. por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  112. por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
  113. por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  114. por mm1, mm0
  ; Pack the four words to bytes with signed saturation (a nonzero word stays
  ; nonzero) so that a single 32-bit test covers all four lanes.
  115. packsswb mm1, mm1
  116. movd eax, mm1
  117. test eax, eax
  118. jnz short .columnDCT
  119. ; -- AC terms all zero
  ; Dequantize the DC terms and broadcast each column's value across one
  ; full workspace row (two mmword stores per row).
  120. movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
  121. pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  122. movq mm2, mm0 ; mm0=in0=(00 01 02 03)
  123. punpcklwd mm0, mm0 ; mm0=(00 00 01 01)
  124. punpckhwd mm2, mm2 ; mm2=(02 02 03 03)
  125. movq mm1, mm0
  126. punpckldq mm0, mm0 ; mm0=(00 00 00 00)
  127. punpckhdq mm1, mm1 ; mm1=(01 01 01 01)
  128. movq mm3, mm2
  129. punpckldq mm2, mm2 ; mm2=(02 02 02 02)
  130. punpckhdq mm3, mm3 ; mm3=(03 03 03 03)
  131. movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
  132. movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
  133. movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
  134. movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
  135. movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
  136. movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
  137. movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
  138. movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
  139. jmp near .nextcolumn
  140. alignx 16, 7
  141. %endif
  142. .columnDCT:
  143. ; -- Even part
  ; Load rows 0/2/4/6 and dequantize them (pmullw with the table at edx).
  144. movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
  145. movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
  146. pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  147. pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  148. movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
  149. movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
  150. pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  151. pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  152. movq mm4, mm0
  153. movq mm5, mm1
  154. psubw mm0, mm2 ; mm0=tmp11
  155. psubw mm1, mm3
  156. paddw mm4, mm2 ; mm4=tmp10
  157. paddw mm5, mm3 ; mm5=tmp13
  ; Pre-shift the data, then pmulhw by the pre-shifted constant: the net
  ; effect is a multiply by 1.414... in CONST_BITS fixed point.
  158. psllw mm1, PRE_MULTIPLY_SCALE_BITS
  159. pmulhw mm1, [GOTOFF(ebx,PW_F1414)]
  160. psubw mm1, mm5 ; mm1=tmp12
  161. movq mm6, mm4
  162. movq mm7, mm0
  163. psubw mm4, mm5 ; mm4=tmp3
  164. psubw mm0, mm1 ; mm0=tmp2
  165. paddw mm6, mm5 ; mm6=tmp0
  166. paddw mm7, mm1 ; mm7=tmp1
  ; tmp2/tmp3 are spilled to the wk slots; mm6/mm7 keep tmp0/tmp1 live
  ; through the odd part.
  167. movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
  168. movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
  169. ; -- Odd part
  ; Load rows 1/3/5/7 and dequantize them.
  170. movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
  171. movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  172. pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  173. pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  174. movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  175. movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  176. pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  177. pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
  178. movq mm4, mm2
  179. movq mm0, mm5
  180. psubw mm2, mm1 ; mm2=z12
  181. psubw mm5, mm3 ; mm5=z10
  182. paddw mm4, mm1 ; mm4=z11
  183. paddw mm0, mm3 ; mm0=z13
  184. movq mm1, mm5 ; mm1=z10(unscaled)
  185. psllw mm2, PRE_MULTIPLY_SCALE_BITS
  186. psllw mm5, PRE_MULTIPLY_SCALE_BITS
  187. movq mm3, mm4
  188. psubw mm4, mm0
  189. paddw mm3, mm0 ; mm3=tmp7
  190. psllw mm4, PRE_MULTIPLY_SCALE_BITS
  191. pmulhw mm4, [GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
  192. ; To avoid overflow...
  193. ;
  194. ; (Original)
  195. ; tmp12 = -2.613125930 * z10 + z5;
  196. ;
  197. ; (This implementation)
  198. ; tmp12 = (-1.613125930 - 1) * z10 + z5;
  199. ; = -1.613125930 * z10 - z10 + z5;
  200. movq mm0, mm5
  201. paddw mm5, mm2
  202. pmulhw mm5, [GOTOFF(ebx,PW_F1847)] ; mm5=z5
  203. pmulhw mm0, [GOTOFF(ebx,PW_MF1613)]
  204. pmulhw mm2, [GOTOFF(ebx,PW_F1082)]
  205. psubw mm0, mm1
  206. psubw mm2, mm5 ; mm2=tmp10
  207. paddw mm0, mm5 ; mm0=tmp12
  208. ; -- Final output stage
  ; Combine even and odd parts and transpose the 4x8 result on the fly so
  ; that pass 2 can read the workspace with the same access pattern.
  209. psubw mm0, mm3 ; mm0=tmp6
  210. movq mm1, mm6
  211. movq mm5, mm7
  212. paddw mm6, mm3 ; mm6=data0=(00 01 02 03)
  213. paddw mm7, mm0 ; mm7=data1=(10 11 12 13)
  214. psubw mm1, mm3 ; mm1=data7=(70 71 72 73)
  215. psubw mm5, mm0 ; mm5=data6=(60 61 62 63)
  216. psubw mm4, mm0 ; mm4=tmp5
  217. movq mm3, mm6 ; transpose coefficients(phase 1)
  218. punpcklwd mm6, mm7 ; mm6=(00 10 01 11)
  219. punpckhwd mm3, mm7 ; mm3=(02 12 03 13)
  220. movq mm0, mm5 ; transpose coefficients(phase 1)
  221. punpcklwd mm5, mm1 ; mm5=(60 70 61 71)
  222. punpckhwd mm0, mm1 ; mm0=(62 72 63 73)
  ; Swap through the wk slots: reload tmp2/tmp3 and park the partially
  ; transposed rows 6/7 there to free registers.
  223. movq mm7, MMWORD [wk(0)] ; mm7=tmp2
  224. movq mm1, MMWORD [wk(1)] ; mm1=tmp3
  225. movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71)
  226. movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73)
  227. paddw mm2, mm4 ; mm2=tmp4
  228. movq mm5, mm7
  229. movq mm0, mm1
  230. paddw mm7, mm4 ; mm7=data2=(20 21 22 23)
  231. paddw mm1, mm2 ; mm1=data4=(40 41 42 43)
  232. psubw mm5, mm4 ; mm5=data5=(50 51 52 53)
  233. psubw mm0, mm2 ; mm0=data3=(30 31 32 33)
  234. movq mm4, mm7 ; transpose coefficients(phase 1)
  235. punpcklwd mm7, mm0 ; mm7=(20 30 21 31)
  236. punpckhwd mm4, mm0 ; mm4=(22 32 23 33)
  237. movq mm2, mm1 ; transpose coefficients(phase 1)
  238. punpcklwd mm1, mm5 ; mm1=(40 50 41 51)
  239. punpckhwd mm2, mm5 ; mm2=(42 52 43 53)
  240. movq mm0, mm6 ; transpose coefficients(phase 2)
  241. punpckldq mm6, mm7 ; mm6=(00 10 20 30)
  242. punpckhdq mm0, mm7 ; mm0=(01 11 21 31)
  243. movq mm5, mm3 ; transpose coefficients(phase 2)
  244. punpckldq mm3, mm4 ; mm3=(02 12 22 32)
  245. punpckhdq mm5, mm4 ; mm5=(03 13 23 33)
  246. movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71)
  247. movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73)
  248. movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
  249. movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
  250. movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
  251. movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
  252. movq mm6, mm1 ; transpose coefficients(phase 2)
  253. punpckldq mm1, mm7 ; mm1=(40 50 60 70)
  254. punpckhdq mm6, mm7 ; mm6=(41 51 61 71)
  255. movq mm0, mm2 ; transpose coefficients(phase 2)
  256. punpckldq mm2, mm4 ; mm2=(42 52 62 72)
  257. punpckhdq mm0, mm4 ; mm0=(43 53 63 73)
  258. movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
  259. movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
  260. movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
  261. movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
  262. .nextcolumn:
  ; Advance to the next four input columns / quantization entries, and to
  ; the next four workspace rows.
  263. add esi, byte 4*SIZEOF_JCOEF ; coef_block
  264. add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr
  265. add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
  266. dec ecx ; ctr
  267. jnz near .columnloop
  268. ; ---- Pass 2: process rows from work array, store into output array.
  ; eax was used as scratch in pass 1, so reload the argument base first.
  ; After the next four instructions eax holds output_col, not the base.
  269. mov eax, [original_ebp]
  270. lea esi, [workspace] ; JCOEF *wsptr
  271. mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
  272. mov eax, JDIMENSION [output_col(eax)]
  273. mov ecx, DCTSIZE/4 ; ctr
  274. alignx 16, 7
  275. .rowloop:
  276. ; -- Even part
  ; Same butterfly as pass 1, but the workspace values are already
  ; dequantized, so there are no pmullw steps here.
  277. movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
  278. movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
  279. movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
  280. movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
  281. movq mm4, mm0
  282. movq mm5, mm1
  283. psubw mm0, mm2 ; mm0=tmp11
  284. psubw mm1, mm3
  285. paddw mm4, mm2 ; mm4=tmp10
  286. paddw mm5, mm3 ; mm5=tmp13
  287. psllw mm1, PRE_MULTIPLY_SCALE_BITS
  288. pmulhw mm1, [GOTOFF(ebx,PW_F1414)]
  289. psubw mm1, mm5 ; mm1=tmp12
  290. movq mm6, mm4
  291. movq mm7, mm0
  292. psubw mm4, mm5 ; mm4=tmp3
  293. psubw mm0, mm1 ; mm0=tmp2
  294. paddw mm6, mm5 ; mm6=tmp0
  295. paddw mm7, mm1 ; mm7=tmp1
  296. movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
  297. movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
  298. ; -- Odd part
  299. movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
  300. movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  301. movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  302. movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  303. movq mm4, mm2
  304. movq mm0, mm5
  305. psubw mm2, mm1 ; mm2=z12
  306. psubw mm5, mm3 ; mm5=z10
  307. paddw mm4, mm1 ; mm4=z11
  308. paddw mm0, mm3 ; mm0=z13
  309. movq mm1, mm5 ; mm1=z10(unscaled)
  310. psllw mm2, PRE_MULTIPLY_SCALE_BITS
  311. psllw mm5, PRE_MULTIPLY_SCALE_BITS
  312. movq mm3, mm4
  313. psubw mm4, mm0
  314. paddw mm3, mm0 ; mm3=tmp7
  315. psllw mm4, PRE_MULTIPLY_SCALE_BITS
  316. pmulhw mm4, [GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
  317. ; To avoid overflow...
  318. ;
  319. ; (Original)
  320. ; tmp12 = -2.613125930 * z10 + z5;
  321. ;
  322. ; (This implementation)
  323. ; tmp12 = (-1.613125930 - 1) * z10 + z5;
  324. ; = -1.613125930 * z10 - z10 + z5;
  325. movq mm0, mm5
  326. paddw mm5, mm2
  327. pmulhw mm5, [GOTOFF(ebx,PW_F1847)] ; mm5=z5
  328. pmulhw mm0, [GOTOFF(ebx,PW_MF1613)]
  329. pmulhw mm2, [GOTOFF(ebx,PW_F1082)]
  330. psubw mm0, mm1
  331. psubw mm2, mm5 ; mm2=tmp10
  332. paddw mm0, mm5 ; mm0=tmp12
  333. ; -- Final output stage
  ; Combine even and odd parts, arithmetic-shift right by (PASS1_BITS+3) to
  ; descale, and pack word pairs into signed bytes with saturation.
  334. psubw mm0, mm3 ; mm0=tmp6
  335. movq mm1, mm6
  336. movq mm5, mm7
  337. paddw mm6, mm3 ; mm6=data0=(00 10 20 30)
  338. paddw mm7, mm0 ; mm7=data1=(01 11 21 31)
  339. psraw mm6, (PASS1_BITS+3) ; descale
  340. psraw mm7, (PASS1_BITS+3) ; descale
  341. psubw mm1, mm3 ; mm1=data7=(07 17 27 37)
  342. psubw mm5, mm0 ; mm5=data6=(06 16 26 36)
  343. psraw mm1, (PASS1_BITS+3) ; descale
  344. psraw mm5, (PASS1_BITS+3) ; descale
  345. psubw mm4, mm0 ; mm4=tmp5
  346. packsswb mm6, mm5 ; mm6=(00 10 20 30 06 16 26 36)
  347. packsswb mm7, mm1 ; mm7=(01 11 21 31 07 17 27 37)
  348. movq mm3, MMWORD [wk(0)] ; mm3=tmp2
  349. movq mm0, MMWORD [wk(1)] ; mm0=tmp3
  350. paddw mm2, mm4 ; mm2=tmp4
  351. movq mm5, mm3
  352. movq mm1, mm0
  353. paddw mm3, mm4 ; mm3=data2=(02 12 22 32)
  354. paddw mm0, mm2 ; mm0=data4=(04 14 24 34)
  355. psraw mm3, (PASS1_BITS+3) ; descale
  356. psraw mm0, (PASS1_BITS+3) ; descale
  357. psubw mm5, mm4 ; mm5=data5=(05 15 25 35)
  358. psubw mm1, mm2 ; mm1=data3=(03 13 23 33)
  359. psraw mm5, (PASS1_BITS+3) ; descale
  360. psraw mm1, (PASS1_BITS+3) ; descale
  361. movq mm4, [GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP]
  362. packsswb mm3, mm0 ; mm3=(02 12 22 32 04 14 24 34)
  363. packsswb mm1, mm5 ; mm1=(03 13 23 33 05 15 25 35)
  ; Add CENTERJSAMPLE to every byte (wrapping byte add) to recentre the
  ; saturated signed values.
  364. paddb mm6, mm4
  365. paddb mm7, mm4
  366. paddb mm3, mm4
  367. paddb mm1, mm4
  ; Byte-level transpose in three interleave phases so that each register
  ; ends up holding the eight samples of one output row.
  368. movq mm2, mm6 ; transpose coefficients(phase 1)
  369. punpcklbw mm6, mm7 ; mm6=(00 01 10 11 20 21 30 31)
  370. punpckhbw mm2, mm7 ; mm2=(06 07 16 17 26 27 36 37)
  371. movq mm0, mm3 ; transpose coefficients(phase 1)
  372. punpcklbw mm3, mm1 ; mm3=(02 03 12 13 22 23 32 33)
  373. punpckhbw mm0, mm1 ; mm0=(04 05 14 15 24 25 34 35)
  374. movq mm5, mm6 ; transpose coefficients(phase 2)
  375. punpcklwd mm6, mm3 ; mm6=(00 01 02 03 10 11 12 13)
  376. punpckhwd mm5, mm3 ; mm5=(20 21 22 23 30 31 32 33)
  377. movq mm4, mm0 ; transpose coefficients(phase 2)
  378. punpcklwd mm0, mm2 ; mm0=(04 05 06 07 14 15 16 17)
  379. punpckhwd mm4, mm2 ; mm4=(24 25 26 27 34 35 36 37)
  380. movq mm7, mm6 ; transpose coefficients(phase 3)
  381. punpckldq mm6, mm0 ; mm6=(00 01 02 03 04 05 06 07)
  382. punpckhdq mm7, mm0 ; mm7=(10 11 12 13 14 15 16 17)
  383. movq mm1, mm5 ; transpose coefficients(phase 3)
  384. punpckldq mm5, mm4 ; mm5=(20 21 22 23 24 25 26 27)
  385. punpckhdq mm1, mm4 ; mm1=(30 31 32 33 34 35 36 37)
  ; ebx is borrowed as a second row pointer for the stores, so the GOT
  ; address is saved and restored around them. Four output rows are written
  ; per iteration, each at column offset eax (output_col).
  386. pushpic ebx ; save GOT address
  387. mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
  388. mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
  389. movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
  390. movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
  391. mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
  392. mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
  393. movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
  394. movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
  395. poppic ebx ; restore GOT address
  396. add esi, byte 4*SIZEOF_JCOEF ; wsptr
  397. add edi, byte 4*SIZEOF_JSAMPROW
  398. dec ecx ; ctr
  399. jnz near .rowloop
  400. emms ; empty MMX state
  ; Epilogue: restore the saved registers, then unwind the aligned frame.
  ; "pop esp" reloads the pre-alignment stack pointer saved at [ebp] by the
  ; prologue, after which the caller's ebp is popped.
  401. pop edi
  402. pop esi
  403. ; pop edx ; need not be preserved
  404. ; pop ecx ; need not be preserved
  405. pop ebx
  406. mov esp, ebp ; esp <- aligned ebp
  407. pop esp ; esp <- original ebp
  408. pop ebp
  409. ret
  410. ; For some reason, the OS X linker does not honor the request to align the
  411. ; segment unless we do this.
  412. align 32