jidctflt-sse2.asm 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497
  1. ;
  2. ; jidctflt.asm - floating-point IDCT (SSE & SSE2)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2016, D. R. Commander.
  6. ;
  7. ; Based on the x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler),
  12. ; can *not* be assembled with Microsoft's MASM or any compatible
  13. ; assembler (including Borland's Turbo Assembler).
  14. ; NASM is available from http://nasm.sourceforge.net/ or
  15. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16. ;
  17. ; This file contains a floating-point implementation of the inverse DCT
  18. ; (Discrete Cosine Transform). The following code is based directly on
  19. ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
  20. %include "jsimdext.inc"
  21. %include "jdct.inc"
  22. ; --------------------------------------------------------------------------
  ; unpcklps2 dst, src
  ;   in : dst=(0 1 2 3)  src=(4 5 6 7)
  ;   out: dst=(0 1 4 5)
  ; Selector 0x44 = 01_00_01_00b: result lanes 0,1 come from dst lanes 0,1 and
  ; result lanes 2,3 come from src lanes 0,1 (i.e. low halves are concatenated).
  %macro unpcklps2 2
      shufps      %1, %2, 0x44
  %endmacro
  ; unpckhps2 dst, src
  ;   in : dst=(0 1 2 3)  src=(4 5 6 7)
  ;   out: dst=(2 3 6 7)
  ; Selector 0xEE = 11_10_11_10b: result lanes 0,1 come from dst lanes 2,3 and
  ; result lanes 2,3 come from src lanes 2,3 (i.e. high halves are concatenated).
  %macro unpckhps2 2
      shufps      %1, %2, 0xEE
  %endmacro
  29. ; --------------------------------------------------------------------------
  SECTION     SEG_CONST

  ; Constant table used by jsimd_idct_float_sse2.  Every PD_* entry is one
  ; single-precision value replicated four times so it can be used directly as
  ; a packed-single memory operand; PB_CENTERJSAMP is one byte replicated 16x.
  ; The table is reached through GOTOFF(ebx, <label>) in the code.

      alignz      32
      GLOBAL_DATA(jconst_idct_float_sse2)

  EXTN(jconst_idct_float_sse2):

  PD_1_414        times 4  dd  1.414213562373095048801689    ; scales (in2-in6) and (z11-z13)
  PD_1_847        times 4  dd  1.847759065022573512256366    ; scales (z10+z12) to form z5
  PD_1_082        times 4  dd  1.082392200292393968799446    ; scales z12
  PD_M2_613       times 4  dd  -2.613125929752753055713286   ; scales z10 (note the sign)
  PD_RNDINT_MAGIC times 4  dd  100663296.0                   ; (float)(0x00C00000 << 3)
  PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE                 ; added to every output byte

      alignz      32
  41. ; --------------------------------------------------------------------------
  SECTION     SEG_TEXT
  BITS        32

  ;
  ; Perform dequantization and inverse DCT on one block of coefficients.
  ;
  ; GLOBAL(void)
  ; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
  ;                       JSAMPARRAY output_buf, JDIMENSION output_col)
  ;

  ; -- Incoming arguments.
  ; (b) is the address of the ebp value pushed at function entry, so (b)+4 is
  ; the return address and the four stack arguments start at (b)+8.

  %define dct_table(b)    (b) + 8         ; void *dct_table
  %define coef_block(b)   (b) + 12        ; JCOEFPTR coef_block
  %define output_buf(b)   (b) + 16        ; JSAMPARRAY output_buf
  %define output_col(b)   (b) + 20        ; JDIMENSION output_col

  ; -- Locals, all addressed relative to the xmmword-aligned ebp.
  ;   [original_ebp]  slot holding the (b) pointer described above
  ;   wk(0..WK_NUM-1) xmmword scratch slots directly below ebp
  ;   workspace       FAST_FLOAT workspace[DCTSIZE2], directly below wk(0)

  %define WK_NUM          2               ; number of xmmword scratch slots
  %define original_ebp    ebp + 0
  %define wk(i)           ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                          ; xmmword wk[WK_NUM]
  %define workspace       wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
                                          ; FAST_FLOAT workspace[DCTSIZE2]
  ; --------------------------------------------------------------------------
  ; jsimd_idct_float_sse2
  ;
  ; Stack arguments (see the dct_table()/coef_block()/output_buf()/output_col()
  ; accessors): dequantization multiplier table, coefficient block, array of
  ; output row pointers, and the column offset applied to every output row.
  ;
  ; Register roles
  ;   ebx  GOT address for GOTOFF() constant loads (borrowed as a row pointer
  ;        inside the row loop between pushpic/poppic)
  ;   ecx  loop counter, DCTSIZE/4 iterations per pass
  ;   Pass 1: esi = coefficient pointer, edx = multiplier pointer,
  ;           edi = workspace write pointer, eax = scratch
  ;   Pass 2: esi = workspace read pointer, edi = output row-pointer cursor,
  ;           eax = output_col, edx = row pointer
  ;
  ; ebx, esi, edi and ebp are saved and restored; eax, ecx, edx and
  ; xmm0-xmm7 are overwritten.
  ; --------------------------------------------------------------------------

      align       32
      GLOBAL_FUNCTION(jsimd_idct_float_sse2)

  EXTN(jsimd_idct_float_sse2):
      push        ebp
      mov         eax, esp                        ; eax -> slot holding the caller's ebp
      sub         esp, byte 4                     ; make room for the back-link below
      and         esp, byte (-SIZEOF_XMMWORD)     ; round esp down to an xmmword boundary
      mov         [esp], eax                      ; [original_ebp] = eax
      mov         ebp, esp                        ; ebp = aligned frame base
      lea         esp, [workspace]                ; reserve wk[] and workspace[]
      push        ebx
  ;   push        ecx                             ; need not be preserved
  ;   push        edx                             ; need not be preserved
      push        esi
      push        edi

      get_GOT     ebx                             ; ebx = GOT address

      ; ---- Pass 1: process columns from input, store into work array.
      ; Each iteration dequantizes and transforms four columns and writes
      ; them to the workspace already transposed.

  ;   mov         eax, [original_ebp]             ; (not needed: eax still holds it)
      mov         edx, POINTER [dct_table(eax)]   ; quantptr
      mov         esi, JCOEFPTR [coef_block(eax)] ; inptr
      lea         edi, [workspace]                ; FAST_FLOAT *wsptr
      mov         ecx, DCTSIZE/4                  ; ctr
      alignx      16, 7
  .col_loop:
  %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
      ; Cheap pre-test on one dword from each of rows 1 and 2; any set bit
      ; means the full transform is required.
      mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
      or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
      jnz         near .col_full_idct

      ; Full test: OR together rows 1..7 of these four columns.
      movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
      movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
      movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
      movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
      movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
      movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
      movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
      por         xmm1, xmm2
      por         xmm3, xmm4
      por         xmm5, xmm6
      por         xmm1, xmm3
      por         xmm5, xmm7
      por         xmm1, xmm5
      packsswb    xmm1, xmm1                      ; squeeze the four words into one dword
      movd        eax, xmm1
      test        eax, eax
      jnz         short .col_full_idct

      ; -- AC terms all zero
      ; The dequantized DC term of each column is the whole column's result,
      ; so broadcast it across the corresponding workspace row.

      movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]

      punpcklwd   xmm0, xmm0                      ; xmm0=(00 00 01 01 02 02 03 03)
      psrad       xmm0, (DWORD_BIT-WORD_BIT)      ; sign-extend: xmm0=in0=(00 01 02 03)
      cvtdq2ps    xmm0, xmm0                      ; int32 -> float

      mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

      movaps      xmm1, xmm0
      movaps      xmm2, xmm0
      movaps      xmm3, xmm0

      shufps      xmm0, xmm0, 0x00                ; xmm0=(00 00 00 00)
      shufps      xmm1, xmm1, 0x55                ; xmm1=(01 01 01 01)
      shufps      xmm2, xmm2, 0xAA                ; xmm2=(02 02 02 02)
      shufps      xmm3, xmm3, 0xFF                ; xmm3=(03 03 03 03)

      movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
      movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
      movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
      movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
      movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
      movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
      movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
      movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
      jmp         near .col_next
      alignx      16, 7
  %endif
  .col_full_idct:

      ; -- Even part
      ; Load rows 0,2,4,6, widen int16 -> int32 -> float, then dequantize.

      movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
      movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
      movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
      movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

      punpcklwd   xmm0, xmm0                      ; xmm0=(00 00 01 01 02 02 03 03)
      punpcklwd   xmm1, xmm1                      ; xmm1=(20 20 21 21 22 22 23 23)
      psrad       xmm0, (DWORD_BIT-WORD_BIT)      ; xmm0=in0=(00 01 02 03)
      psrad       xmm1, (DWORD_BIT-WORD_BIT)      ; xmm1=in2=(20 21 22 23)
      cvtdq2ps    xmm0, xmm0                      ; xmm0=in0 as float
      cvtdq2ps    xmm1, xmm1                      ; xmm1=in2 as float

      punpcklwd   xmm2, xmm2                      ; xmm2=(40 40 41 41 42 42 43 43)
      punpcklwd   xmm3, xmm3                      ; xmm3=(60 60 61 61 62 62 63 63)
      psrad       xmm2, (DWORD_BIT-WORD_BIT)      ; xmm2=in4=(40 41 42 43)
      psrad       xmm3, (DWORD_BIT-WORD_BIT)      ; xmm3=in6=(60 61 62 63)
      cvtdq2ps    xmm2, xmm2                      ; xmm2=in4 as float
      cvtdq2ps    xmm3, xmm3                      ; xmm3=in6 as float

      mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
      mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
      mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
      mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

      movaps      xmm4, xmm0
      movaps      xmm5, xmm1
      subps       xmm0, xmm2                      ; xmm0=tmp11 (in0-in4)
      subps       xmm1, xmm3                      ; xmm1=in2-in6
      addps       xmm4, xmm2                      ; xmm4=tmp10 (in0+in4)
      addps       xmm5, xmm3                      ; xmm5=tmp13 (in2+in6)

      mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
      subps       xmm1, xmm5                      ; xmm1=tmp12

      movaps      xmm6, xmm4
      movaps      xmm7, xmm0
      subps       xmm4, xmm5                      ; xmm4=tmp3
      subps       xmm0, xmm1                      ; xmm0=tmp2
      addps       xmm6, xmm5                      ; xmm6=tmp0
      addps       xmm7, xmm1                      ; xmm7=tmp1

      ; tmp2/tmp3 are parked in the scratch slots while the odd part runs.
      movaps      XMMWORD [wk(1)], xmm4           ; tmp3
      movaps      XMMWORD [wk(0)], xmm0           ; tmp2

      ; -- Odd part
      ; Same widen/convert/dequantize sequence for rows 1,3,5,7.

      movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
      movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
      movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
      movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

      punpcklwd   xmm2, xmm2                      ; xmm2=(10 10 11 11 12 12 13 13)
      punpcklwd   xmm3, xmm3                      ; xmm3=(30 30 31 31 32 32 33 33)
      psrad       xmm2, (DWORD_BIT-WORD_BIT)      ; xmm2=in1=(10 11 12 13)
      psrad       xmm3, (DWORD_BIT-WORD_BIT)      ; xmm3=in3=(30 31 32 33)
      cvtdq2ps    xmm2, xmm2                      ; xmm2=in1 as float
      cvtdq2ps    xmm3, xmm3                      ; xmm3=in3 as float

      punpcklwd   xmm5, xmm5                      ; xmm5=(50 50 51 51 52 52 53 53)
      punpcklwd   xmm1, xmm1                      ; xmm1=(70 70 71 71 72 72 73 73)
      psrad       xmm5, (DWORD_BIT-WORD_BIT)      ; xmm5=in5=(50 51 52 53)
      psrad       xmm1, (DWORD_BIT-WORD_BIT)      ; xmm1=in7=(70 71 72 73)
      cvtdq2ps    xmm5, xmm5                      ; xmm5=in5 as float
      cvtdq2ps    xmm1, xmm1                      ; xmm1=in7 as float

      mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
      mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
      mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
      mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

      movaps      xmm4, xmm2
      movaps      xmm0, xmm5
      addps       xmm2, xmm1                      ; xmm2=z11 (in1+in7)
      addps       xmm5, xmm3                      ; xmm5=z13 (in5+in3)
      subps       xmm4, xmm1                      ; xmm4=z12 (in1-in7)
      subps       xmm0, xmm3                      ; xmm0=z10 (in5-in3)

      movaps      xmm1, xmm2
      subps       xmm2, xmm5                      ; xmm2=z11-z13
      addps       xmm1, xmm5                      ; xmm1=tmp7

      mulps       xmm2, [GOTOFF(ebx,PD_1_414)]    ; xmm2=tmp11

      movaps      xmm3, xmm0
      addps       xmm0, xmm4                      ; xmm0=z10+z12
      mulps       xmm0, [GOTOFF(ebx,PD_1_847)]    ; xmm0=z5
      mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]   ; xmm3=(z10 * -2.613125930)
      mulps       xmm4, [GOTOFF(ebx,PD_1_082)]    ; xmm4=(z12 * 1.082392200)
      addps       xmm3, xmm0                      ; xmm3=tmp12
      subps       xmm4, xmm0                      ; xmm4=tmp10

      ; -- Final output stage

      subps       xmm3, xmm1                      ; xmm3=tmp6
      movaps      xmm5, xmm6
      movaps      xmm0, xmm7
      addps       xmm6, xmm1                      ; xmm6=data0=(00 01 02 03)
      addps       xmm7, xmm3                      ; xmm7=data1=(10 11 12 13)
      subps       xmm5, xmm1                      ; xmm5=data7=(70 71 72 73)
      subps       xmm0, xmm3                      ; xmm0=data6=(60 61 62 63)
      subps       xmm2, xmm3                      ; xmm2=tmp5

      movaps      xmm1, xmm6                      ; transpose coefficients(phase 1)
      unpcklps    xmm6, xmm7                      ; xmm6=(00 10 01 11)
      unpckhps    xmm1, xmm7                      ; xmm1=(02 12 03 13)
      movaps      xmm3, xmm0                      ; transpose coefficients(phase 1)
      unpcklps    xmm0, xmm5                      ; xmm0=(60 70 61 71)
      unpckhps    xmm3, xmm5                      ; xmm3=(62 72 63 73)

      ; Swap: fetch tmp2/tmp3 and reuse the scratch slots for the half-
      ; transposed rows 6/7, since all eight xmm registers are in use.
      movaps      xmm7, XMMWORD [wk(0)]           ; xmm7=tmp2
      movaps      xmm5, XMMWORD [wk(1)]           ; xmm5=tmp3

      movaps      XMMWORD [wk(0)], xmm0           ; wk(0)=(60 70 61 71)
      movaps      XMMWORD [wk(1)], xmm3           ; wk(1)=(62 72 63 73)

      addps       xmm4, xmm2                      ; xmm4=tmp4
      movaps      xmm0, xmm7
      movaps      xmm3, xmm5
      addps       xmm7, xmm2                      ; xmm7=data2=(20 21 22 23)
      addps       xmm5, xmm4                      ; xmm5=data4=(40 41 42 43)
      subps       xmm0, xmm2                      ; xmm0=data5=(50 51 52 53)
      subps       xmm3, xmm4                      ; xmm3=data3=(30 31 32 33)

      movaps      xmm2, xmm7                      ; transpose coefficients(phase 1)
      unpcklps    xmm7, xmm3                      ; xmm7=(20 30 21 31)
      unpckhps    xmm2, xmm3                      ; xmm2=(22 32 23 33)
      movaps      xmm4, xmm5                      ; transpose coefficients(phase 1)
      unpcklps    xmm5, xmm0                      ; xmm5=(40 50 41 51)
      unpckhps    xmm4, xmm0                      ; xmm4=(42 52 43 53)

      movaps      xmm3, xmm6                      ; transpose coefficients(phase 2)
      unpcklps2   xmm6, xmm7                      ; xmm6=(00 10 20 30)
      unpckhps2   xmm3, xmm7                      ; xmm3=(01 11 21 31)
      movaps      xmm0, xmm1                      ; transpose coefficients(phase 2)
      unpcklps2   xmm1, xmm2                      ; xmm1=(02 12 22 32)
      unpckhps2   xmm0, xmm2                      ; xmm0=(03 13 23 33)

      movaps      xmm7, XMMWORD [wk(0)]           ; xmm7=(60 70 61 71)
      movaps      xmm2, XMMWORD [wk(1)]           ; xmm2=(62 72 63 73)

      movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
      movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
      movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
      movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0

      movaps      xmm6, xmm5                      ; transpose coefficients(phase 2)
      unpcklps2   xmm5, xmm7                      ; xmm5=(40 50 60 70)
      unpckhps2   xmm6, xmm7                      ; xmm6=(41 51 61 71)
      movaps      xmm3, xmm4                      ; transpose coefficients(phase 2)
      unpcklps2   xmm4, xmm2                      ; xmm4=(42 52 62 72)
      unpckhps2   xmm3, xmm2                      ; xmm3=(43 53 63 73)

      movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
      movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
      movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
      movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3

  .col_next:
      add         esi, byte 4*SIZEOF_JCOEF              ; coef_block: next 4 columns
      add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE    ; quantptr:   next 4 columns
      add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr:      next 4 workspace rows
      dec         ecx                                   ; ctr
      jnz         near .col_loop

      ; -- Prefetch the next coefficient block
      ; esi has advanced by 8 coefficients in total, hence the (DCTSIZE2-8)
      ; offset; four hints 32 bytes apart.

      prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
      prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
      prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
      prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

      ; ---- Pass 2: process rows from work array, store into output array.
      ; Each iteration produces four complete 8-sample output rows.

      mov         eax, [original_ebp]             ; eax was clobbered in pass 1; reload
      lea         esi, [workspace]                ; FAST_FLOAT *wsptr
      mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
      mov         eax, JDIMENSION [output_col(eax)]  ; eax = output_col from here on
      mov         ecx, DCTSIZE/4                  ; ctr
      alignx      16, 7
  .row_loop:

      ; -- Even part

      movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
      movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
      movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
      movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

      movaps      xmm4, xmm0
      movaps      xmm5, xmm1
      subps       xmm0, xmm2                      ; xmm0=tmp11
      subps       xmm1, xmm3
      addps       xmm4, xmm2                      ; xmm4=tmp10
      addps       xmm5, xmm3                      ; xmm5=tmp13

      mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
      subps       xmm1, xmm5                      ; xmm1=tmp12

      movaps      xmm6, xmm4
      movaps      xmm7, xmm0
      subps       xmm4, xmm5                      ; xmm4=tmp3
      subps       xmm0, xmm1                      ; xmm0=tmp2
      addps       xmm6, xmm5                      ; xmm6=tmp0
      addps       xmm7, xmm1                      ; xmm7=tmp1

      movaps      XMMWORD [wk(1)], xmm4           ; tmp3
      movaps      XMMWORD [wk(0)], xmm0           ; tmp2

      ; -- Odd part

      movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
      movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
      movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
      movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

      movaps      xmm4, xmm2
      movaps      xmm0, xmm5
      addps       xmm2, xmm1                      ; xmm2=z11
      addps       xmm5, xmm3                      ; xmm5=z13
      subps       xmm4, xmm1                      ; xmm4=z12
      subps       xmm0, xmm3                      ; xmm0=z10

      movaps      xmm1, xmm2
      subps       xmm2, xmm5
      addps       xmm1, xmm5                      ; xmm1=tmp7

      mulps       xmm2, [GOTOFF(ebx,PD_1_414)]    ; xmm2=tmp11

      movaps      xmm3, xmm0
      addps       xmm0, xmm4
      mulps       xmm0, [GOTOFF(ebx,PD_1_847)]    ; xmm0=z5
      mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]   ; xmm3=(z10 * -2.613125930)
      mulps       xmm4, [GOTOFF(ebx,PD_1_082)]    ; xmm4=(z12 * 1.082392200)
      addps       xmm3, xmm0                      ; xmm3=tmp12
      subps       xmm4, xmm0                      ; xmm4=tmp10

      ; -- Final output stage

      subps       xmm3, xmm1                      ; xmm3=tmp6
      movaps      xmm5, xmm6
      movaps      xmm0, xmm7
      addps       xmm6, xmm1                      ; xmm6=data0=(00 10 20 30)
      addps       xmm7, xmm3                      ; xmm7=data1=(01 11 21 31)
      subps       xmm5, xmm1                      ; xmm5=data7=(07 17 27 37)
      subps       xmm0, xmm3                      ; xmm0=data6=(06 16 26 36)
      subps       xmm2, xmm3                      ; xmm2=tmp5

      ; Float -> integer without a convert instruction: adding
      ; PD_RNDINT_MAGIC (1.5 * 2^26, where one float ulp is 8) leaves
      ; round(x/8) in the low bits of each lane's bit pattern.  Only the low
      ; word of each lane is kept (mask) or moved to the high word (shift).
      ; NOTE(review): rounding follows the current MXCSR mode - presumably
      ; round-to-nearest; confirm against callers.

      movaps      xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm1=[PD_RNDINT_MAGIC]
      pcmpeqd     xmm3, xmm3
      psrld       xmm3, WORD_BIT                  ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

      addps       xmm6, xmm1                      ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
      addps       xmm7, xmm1                      ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
      addps       xmm0, xmm1                      ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
      addps       xmm5, xmm1                      ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

      pand        xmm6, xmm3                      ; xmm6=(00 -- 10 -- 20 -- 30 --)
      pslld       xmm7, WORD_BIT                  ; xmm7=(-- 01 -- 11 -- 21 -- 31)
      pand        xmm0, xmm3                      ; xmm0=(06 -- 16 -- 26 -- 36 --)
      pslld       xmm5, WORD_BIT                  ; xmm5=(-- 07 -- 17 -- 27 -- 37)
      por         xmm6, xmm7                      ; xmm6=(00 01 10 11 20 21 30 31)
      por         xmm0, xmm5                      ; xmm0=(06 07 16 17 26 27 36 37)

      movaps      xmm1, XMMWORD [wk(0)]           ; xmm1=tmp2
      movaps      xmm3, XMMWORD [wk(1)]           ; xmm3=tmp3

      addps       xmm4, xmm2                      ; xmm4=tmp4
      movaps      xmm7, xmm1
      movaps      xmm5, xmm3
      addps       xmm1, xmm2                      ; xmm1=data2=(02 12 22 32)
      addps       xmm3, xmm4                      ; xmm3=data4=(04 14 24 34)
      subps       xmm7, xmm2                      ; xmm7=data5=(05 15 25 35)
      subps       xmm5, xmm4                      ; xmm5=data3=(03 13 23 33)

      ; Magic constant and word mask are rebuilt here because their
      ; registers were reused above.
      movaps      xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm2=[PD_RNDINT_MAGIC]
      pcmpeqd     xmm4, xmm4
      psrld       xmm4, WORD_BIT                  ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

      addps       xmm3, xmm2                      ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
      addps       xmm7, xmm2                      ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
      addps       xmm1, xmm2                      ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
      addps       xmm5, xmm2                      ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

      pand        xmm3, xmm4                      ; xmm3=(04 -- 14 -- 24 -- 34 --)
      pslld       xmm7, WORD_BIT                  ; xmm7=(-- 05 -- 15 -- 25 -- 35)
      pand        xmm1, xmm4                      ; xmm1=(02 -- 12 -- 22 -- 32 --)
      pslld       xmm5, WORD_BIT                  ; xmm5=(-- 03 -- 13 -- 23 -- 33)
      por         xmm3, xmm7                      ; xmm3=(04 05 14 15 24 25 34 35)
      por         xmm1, xmm5                      ; xmm1=(02 03 12 13 22 23 32 33)

      ; Saturate words to signed bytes, then add CENTERJSAMPLE to each byte.
      movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]

      packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
      packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
      paddb       xmm6, xmm2
      paddb       xmm1, xmm2

      movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
      punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
      punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

      movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
      punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
      punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

      ; Swap qword halves so rows 1 and 3 sit in the low 8 bytes for movq.
      pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
      pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

      pushpic     ebx                             ; save GOT address (ebx reused below)

      mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]     ; output row 0
      mov         ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]     ; output row 2
      movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
      movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
      mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]     ; output row 1
      mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]     ; output row 3
      movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
      movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3

      poppic      ebx                             ; restore GOT address

      add         esi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr: next 4 lanes of every workspace row
      add         edi, byte 4*SIZEOF_JSAMPROW     ; next 4 output row pointers
      dec         ecx                             ; ctr
      jnz         near .row_loop

      ; ---- Epilogue: undo the prologue in reverse order.

      pop         edi
      pop         esi
  ;   pop         edx                             ; need not be preserved
  ;   pop         ecx                             ; need not be preserved
      pop         ebx
      mov         esp, ebp                        ; esp <- aligned ebp
      pop         esp                             ; esp <- [original_ebp] (-> saved ebp slot)
      pop         ebp
      ret

  ; For some reason, the OS X linker does not honor the request to align the
  ; segment unless we do this.
      align       32