; jidctflt-sse2.asm (20 KB)
  1. ;
  2. ; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2009, 2016, D. R. Commander.
  6. ;
  7. ; Based on the x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  ; This file should be assembled with NASM (Netwide Assembler); it
  ; can *not* be assembled with Microsoft's MASM or any compatible
  ; assembler (including Borland's Turbo Assembler).
  ; NASM is available from http://nasm.sourceforge.net/ or
  ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16. ;
  17. ; This file contains a floating-point implementation of the inverse DCT
  18. ; (Discrete Cosine Transform). The following code is based directly on
  19. ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
  20. %include "jsimdext.inc"
  21. %include "jdct.inc"
  22. ; --------------------------------------------------------------------------
  ; unpcklps2 dst, src
  ;   In : dst = (d0 d1 d2 d3), src = (s0 s1 s2 s3)
  ;   Out: dst = (d0 d1 s0 s1)
  ; shufps selector, two bits per output lane, highest lane first:
  ;   lane3 = src[1], lane2 = src[0], lane1 = dst[1], lane0 = dst[0]
  %macro unpcklps2 2
          shufps  %1, %2, 01000100b
  %endmacro
  ; unpckhps2 dst, src
  ;   In : dst = (d0 d1 d2 d3), src = (s0 s1 s2 s3)
  ;   Out: dst = (d2 d3 s2 s3)
  ; shufps selector, two bits per output lane, highest lane first:
  ;   lane3 = src[3], lane2 = src[2], lane1 = dst[3], lane0 = dst[2]
  %macro unpckhps2 2
          shufps  %1, %2, 11101110b
  %endmacro
  29. ; --------------------------------------------------------------------------
  SECTION SEG_CONST

  ; Read-only constants for jsimd_idct_float_sse2.  Every entry is one
  ; 16-byte vector; the table starts and ends on a 32-byte boundary.
  alignz 32
  GLOBAL_DATA(jconst_idct_float_sse2)

  EXTN(jconst_idct_float_sse2):

  ; Butterfly multipliers, each broadcast to four single-precision lanes.
  PD_1_414:         times 4  dd   1.414213562373095048801689   ; 2*cos(4*pi/16) = sqrt(2)
  PD_1_847:         times 4  dd   1.847759065022573512256366   ; 2*cos(2*pi/16)
  PD_1_082:         times 4  dd   1.082392200292393968799446   ; 2*(cos(2*pi/16) - cos(6*pi/16))
  PD_M2_613:        times 4  dd  -2.613125929752753055713286   ; -2*(cos(2*pi/16) + cos(6*pi/16))

  ; Float-to-integer rounding bias, added to each result before the low
  ; 16 bits of the float's bit pattern are extracted.
  PD_RNDINT_MAGIC:  times 4  dd  100663296.0                   ; (float)(0x00C00000 << 3)

  ; Level shift added (paddb) to the packed signed output bytes.
  PB_CENTERJSAMP:   times 16 db  CENTERJSAMPLE

  alignz 32
  41. ; --------------------------------------------------------------------------
  SECTION SEG_TEXT
  BITS 64
  ;
  ; Perform dequantization and inverse DCT on one block of coefficients.
  ;
  ; GLOBAL(void)
  ; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
  ;                       JSAMPARRAY output_buf, JDIMENSION output_col)
  ;
  ; Arguments as this code reads them (after collect_args):
  ;   r10  = void *dct_table         (multiplier table, read with mulps)
  ;   r11  = JCOEFPTR coef_block     (input coefficients)
  ;   r12  = JSAMPARRAY output_buf   (array of output row pointers)
  ;   r13d = JDIMENSION output_col   (offset added to every row pointer)
  ;
  ; Stack frame (rbp is 16-byte aligned):
  ;   [rbp + 0]    rsp as it was right after "push rbp"
  ;   [wk(0..1)]   two xmmword scratch slots just below rbp
  ;   [workspace]  DCTSIZE2 FAST_FLOATs below wk(0): the pass-1 output
  ;
  ; Pass 1 handles four input columns per iteration and writes them
  ; transposed into the workspace; pass 2 handles four output rows per
  ; iteration, rounds to integers with descaling, packs to bytes, adds
  ; CENTERJSAMPLE and stores 8 samples to each of the four rows.
  ;
  ; Scratch registers used here: rax, rcx, rdx, rsi, rdi, rbx (saved and
  ; restored below), xmm0-xmm7.  collect_args / uncollect_args come from
  ; an include file and are not visible here.
  ; NOTE(review): confirm they preserve every register the target ABI
  ; treats as callee-saved.

  %define original_rbp  rbp + 0
  %define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                          ; xmmword wk[WK_NUM]
  %define WK_NUM        2
  %define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
                                          ; FAST_FLOAT workspace[DCTSIZE2]

  align 32
  GLOBAL_FUNCTION(jsimd_idct_float_sse2)

  EXTN(jsimd_idct_float_sse2):
          push      rbp
          mov       rax, rsp                      ; rax = rsp after the push (points at saved rbp)
          sub       rsp, byte 4                   ; guarantee the AND below moves rsp to a fresh slot
          and       rsp, byte (-SIZEOF_XMMWORD)   ; align to 128 bits
          mov       [rsp], rax                    ; [original_rbp] = pre-alignment rsp
          mov       rbp, rsp                      ; rbp = aligned frame base
          lea       rsp, [workspace]              ; reserve wk[] and workspace[]
          collect_args 4
          push      rbx

          ; ---- Pass 1: process columns from input, store into work array.

          mov       rdx, r10                      ; quantptr
          mov       rsi, r11                      ; inptr
          lea       rdi, [workspace]              ; FAST_FLOAT *wsptr
          mov       rcx, DCTSIZE/4                ; ctr

  .columnloop:
  %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
          ; Quick reject: look at the first two coefficients of rows 1 and 2
          ; before doing the full test of rows 1..7 for these four columns.
          mov       eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
          or        eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
          jnz       near .columnDCT

          movq      xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
          movq      xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
          movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
          movq      xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
          movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
          movq      xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
          movq      xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
          por       xmm1, xmm2
          por       xmm3, xmm4
          por       xmm5, xmm6
          por       xmm1, xmm3
          por       xmm5, xmm7
          por       xmm1, xmm5                    ; xmm1 = OR of rows 1..7 (4 words)
          packsswb  xmm1, xmm1                    ; saturating pack: nonzero word -> nonzero byte
          movd      eax, xmm1                     ; zero-extends into rax
          test      eax, eax
          jnz       short .columnDCT

          ; -- AC terms all zero: each column's output is its dequantized DC
          ;    term replicated down the column.

          movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]

          punpcklwd xmm0, xmm0                    ; xmm0=(00 00 01 01 02 02 03 03)
          psrad     xmm0, (DWORD_BIT-WORD_BIT)    ; xmm0=in0=(00 01 02 03)
          cvtdq2ps  xmm0, xmm0                    ; xmm0=in0=(00 01 02 03)

          mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

          movaps    xmm1, xmm0
          movaps    xmm2, xmm0
          movaps    xmm3, xmm0

          shufps    xmm0, xmm0, 0x00              ; xmm0=(00 00 00 00)
          shufps    xmm1, xmm1, 0x55              ; xmm1=(01 01 01 01)
          shufps    xmm2, xmm2, 0xAA              ; xmm2=(02 02 02 02)
          shufps    xmm3, xmm3, 0xFF              ; xmm3=(03 03 03 03)

          movaps    XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
          movaps    XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
          movaps    XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
          movaps    XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
          movaps    XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
          movaps    XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
          movaps    XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
          movaps    XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
          jmp       near .nextcolumn
  %endif

  .columnDCT:

          ; -- Even part
          ;    Words are sign-extended to dwords (duplicate, then arithmetic
          ;    shift right), converted to float and multiplied by the table.

          movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
          movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
          movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
          movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]

          punpcklwd xmm0, xmm0                    ; xmm0=(00 00 01 01 02 02 03 03)
          punpcklwd xmm1, xmm1                    ; xmm1=(20 20 21 21 22 22 23 23)
          psrad     xmm0, (DWORD_BIT-WORD_BIT)    ; xmm0=in0=(00 01 02 03)
          psrad     xmm1, (DWORD_BIT-WORD_BIT)    ; xmm1=in2=(20 21 22 23)
          cvtdq2ps  xmm0, xmm0                    ; xmm0=in0=(00 01 02 03)
          cvtdq2ps  xmm1, xmm1                    ; xmm1=in2=(20 21 22 23)

          punpcklwd xmm2, xmm2                    ; xmm2=(40 40 41 41 42 42 43 43)
          punpcklwd xmm3, xmm3                    ; xmm3=(60 60 61 61 62 62 63 63)
          psrad     xmm2, (DWORD_BIT-WORD_BIT)    ; xmm2=in4=(40 41 42 43)
          psrad     xmm3, (DWORD_BIT-WORD_BIT)    ; xmm3=in6=(60 61 62 63)
          cvtdq2ps  xmm2, xmm2                    ; xmm2=in4=(40 41 42 43)
          cvtdq2ps  xmm3, xmm3                    ; xmm3=in6=(60 61 62 63)

          mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
          mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
          mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
          mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

          movaps    xmm4, xmm0
          movaps    xmm5, xmm1
          subps     xmm0, xmm2                    ; xmm0=tmp11
          subps     xmm1, xmm3
          addps     xmm4, xmm2                    ; xmm4=tmp10
          addps     xmm5, xmm3                    ; xmm5=tmp13

          mulps     xmm1, [rel PD_1_414]
          subps     xmm1, xmm5                    ; xmm1=tmp12

          movaps    xmm6, xmm4
          movaps    xmm7, xmm0
          subps     xmm4, xmm5                    ; xmm4=tmp3
          subps     xmm0, xmm1                    ; xmm0=tmp2
          addps     xmm6, xmm5                    ; xmm6=tmp0
          addps     xmm7, xmm1                    ; xmm7=tmp1

          movaps    XMMWORD [wk(1)], xmm4         ; tmp3
          movaps    XMMWORD [wk(0)], xmm0         ; tmp2

          ; -- Odd part

          movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
          movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
          movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
          movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]

          punpcklwd xmm2, xmm2                    ; xmm2=(10 10 11 11 12 12 13 13)
          punpcklwd xmm3, xmm3                    ; xmm3=(30 30 31 31 32 32 33 33)
          psrad     xmm2, (DWORD_BIT-WORD_BIT)    ; xmm2=in1=(10 11 12 13)
          psrad     xmm3, (DWORD_BIT-WORD_BIT)    ; xmm3=in3=(30 31 32 33)
          cvtdq2ps  xmm2, xmm2                    ; xmm2=in1=(10 11 12 13)
          cvtdq2ps  xmm3, xmm3                    ; xmm3=in3=(30 31 32 33)

          punpcklwd xmm5, xmm5                    ; xmm5=(50 50 51 51 52 52 53 53)
          punpcklwd xmm1, xmm1                    ; xmm1=(70 70 71 71 72 72 73 73)
          psrad     xmm5, (DWORD_BIT-WORD_BIT)    ; xmm5=in5=(50 51 52 53)
          psrad     xmm1, (DWORD_BIT-WORD_BIT)    ; xmm1=in7=(70 71 72 73)
          cvtdq2ps  xmm5, xmm5                    ; xmm5=in5=(50 51 52 53)
          cvtdq2ps  xmm1, xmm1                    ; xmm1=in7=(70 71 72 73)

          mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
          mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
          mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
          mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

          movaps    xmm4, xmm2
          movaps    xmm0, xmm5
          addps     xmm2, xmm1                    ; xmm2=z11
          addps     xmm5, xmm3                    ; xmm5=z13
          subps     xmm4, xmm1                    ; xmm4=z12
          subps     xmm0, xmm3                    ; xmm0=z10

          movaps    xmm1, xmm2
          subps     xmm2, xmm5
          addps     xmm1, xmm5                    ; xmm1=tmp7

          mulps     xmm2, [rel PD_1_414]          ; xmm2=tmp11

          movaps    xmm3, xmm0
          addps     xmm0, xmm4
          mulps     xmm0, [rel PD_1_847]          ; xmm0=z5
          mulps     xmm3, [rel PD_M2_613]         ; xmm3=(z10 * -2.613125930)
          mulps     xmm4, [rel PD_1_082]          ; xmm4=(z12 * 1.082392200)
          addps     xmm3, xmm0                    ; xmm3=tmp12
          subps     xmm4, xmm0                    ; xmm4=tmp10

          ; -- Final output stage

          subps     xmm3, xmm1                    ; xmm3=tmp6
          movaps    xmm5, xmm6
          movaps    xmm0, xmm7
          addps     xmm6, xmm1                    ; xmm6=data0=(00 01 02 03)
          addps     xmm7, xmm3                    ; xmm7=data1=(10 11 12 13)
          subps     xmm5, xmm1                    ; xmm5=data7=(70 71 72 73)
          subps     xmm0, xmm3                    ; xmm0=data6=(60 61 62 63)
          subps     xmm2, xmm3                    ; xmm2=tmp5

          movaps    xmm1, xmm6                    ; transpose coefficients(phase 1)
          unpcklps  xmm6, xmm7                    ; xmm6=(00 10 01 11)
          unpckhps  xmm1, xmm7                    ; xmm1=(02 12 03 13)
          movaps    xmm3, xmm0                    ; transpose coefficients(phase 1)
          unpcklps  xmm0, xmm5                    ; xmm0=(60 70 61 71)
          unpckhps  xmm3, xmm5                    ; xmm3=(62 72 63 73)

          movaps    xmm7, XMMWORD [wk(0)]         ; xmm7=tmp2
          movaps    xmm5, XMMWORD [wk(1)]         ; xmm5=tmp3

          ; The scratch slots are free again; park the rows 6/7 halves there.
          movaps    XMMWORD [wk(0)], xmm0         ; wk(0)=(60 70 61 71)
          movaps    XMMWORD [wk(1)], xmm3         ; wk(1)=(62 72 63 73)

          addps     xmm4, xmm2                    ; xmm4=tmp4
          movaps    xmm0, xmm7
          movaps    xmm3, xmm5
          addps     xmm7, xmm2                    ; xmm7=data2=(20 21 22 23)
          addps     xmm5, xmm4                    ; xmm5=data4=(40 41 42 43)
          subps     xmm0, xmm2                    ; xmm0=data5=(50 51 52 53)
          subps     xmm3, xmm4                    ; xmm3=data3=(30 31 32 33)

          movaps    xmm2, xmm7                    ; transpose coefficients(phase 1)
          unpcklps  xmm7, xmm3                    ; xmm7=(20 30 21 31)
          unpckhps  xmm2, xmm3                    ; xmm2=(22 32 23 33)
          movaps    xmm4, xmm5                    ; transpose coefficients(phase 1)
          unpcklps  xmm5, xmm0                    ; xmm5=(40 50 41 51)
          unpckhps  xmm4, xmm0                    ; xmm4=(42 52 43 53)

          movaps    xmm3, xmm6                    ; transpose coefficients(phase 2)
          unpcklps2 xmm6, xmm7                    ; xmm6=(00 10 20 30)
          unpckhps2 xmm3, xmm7                    ; xmm3=(01 11 21 31)
          movaps    xmm0, xmm1                    ; transpose coefficients(phase 2)
          unpcklps2 xmm1, xmm2                    ; xmm1=(02 12 22 32)
          unpckhps2 xmm0, xmm2                    ; xmm0=(03 13 23 33)

          movaps    xmm7, XMMWORD [wk(0)]         ; xmm7=(60 70 61 71)
          movaps    xmm2, XMMWORD [wk(1)]         ; xmm2=(62 72 63 73)

          movaps    XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
          movaps    XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
          movaps    XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
          movaps    XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0

          movaps    xmm6, xmm5                    ; transpose coefficients(phase 2)
          unpcklps2 xmm5, xmm7                    ; xmm5=(40 50 60 70)
          unpckhps2 xmm6, xmm7                    ; xmm6=(41 51 61 71)
          movaps    xmm3, xmm4                    ; transpose coefficients(phase 2)
          unpcklps2 xmm4, xmm2                    ; xmm4=(42 52 62 72)
          unpckhps2 xmm3, xmm2                    ; xmm3=(43 53 63 73)

          movaps    XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
          movaps    XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
          movaps    XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
          movaps    XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3

  .nextcolumn:
          add       rsi, byte 4*SIZEOF_JCOEF              ; coef_block
          add       rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE    ; quantptr
          add       rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT      ; wsptr
          dec       rcx                                   ; ctr
          jnz       near .columnloop

          ; -- Prefetch the next coefficient block
          ;    rsi has advanced by DCTSIZE coefficients during pass 1, so the
          ;    base below is coef_block + DCTSIZE2 coefficients.

          prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
          prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
          prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
          prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

          ; ---- Pass 2: process rows from work array, store into output array.

          lea       rsi, [workspace]              ; FAST_FLOAT *wsptr
          mov       rdi, r12                      ; (JSAMPROW *)
          mov       eax, r13d                     ; rax = output_col (zero-extended)
          mov       rcx, DCTSIZE/4                ; ctr

  .rowloop:

          ; -- Even part

          movaps    xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
          movaps    xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
          movaps    xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
          movaps    xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]

          movaps    xmm4, xmm0
          movaps    xmm5, xmm1
          subps     xmm0, xmm2                    ; xmm0=tmp11
          subps     xmm1, xmm3
          addps     xmm4, xmm2                    ; xmm4=tmp10
          addps     xmm5, xmm3                    ; xmm5=tmp13

          mulps     xmm1, [rel PD_1_414]
          subps     xmm1, xmm5                    ; xmm1=tmp12

          movaps    xmm6, xmm4
          movaps    xmm7, xmm0
          subps     xmm4, xmm5                    ; xmm4=tmp3
          subps     xmm0, xmm1                    ; xmm0=tmp2
          addps     xmm6, xmm5                    ; xmm6=tmp0
          addps     xmm7, xmm1                    ; xmm7=tmp1

          movaps    XMMWORD [wk(1)], xmm4         ; tmp3
          movaps    XMMWORD [wk(0)], xmm0         ; tmp2

          ; -- Odd part

          movaps    xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
          movaps    xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
          movaps    xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
          movaps    xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]

          movaps    xmm4, xmm2
          movaps    xmm0, xmm5
          addps     xmm2, xmm1                    ; xmm2=z11
          addps     xmm5, xmm3                    ; xmm5=z13
          subps     xmm4, xmm1                    ; xmm4=z12
          subps     xmm0, xmm3                    ; xmm0=z10

          movaps    xmm1, xmm2
          subps     xmm2, xmm5
          addps     xmm1, xmm5                    ; xmm1=tmp7

          mulps     xmm2, [rel PD_1_414]          ; xmm2=tmp11

          movaps    xmm3, xmm0
          addps     xmm0, xmm4
          mulps     xmm0, [rel PD_1_847]          ; xmm0=z5
          mulps     xmm3, [rel PD_M2_613]         ; xmm3=(z10 * -2.613125930)
          mulps     xmm4, [rel PD_1_082]          ; xmm4=(z12 * 1.082392200)
          addps     xmm3, xmm0                    ; xmm3=tmp12
          subps     xmm4, xmm0                    ; xmm4=tmp10

          ; -- Final output stage

          subps     xmm3, xmm1                    ; xmm3=tmp6
          movaps    xmm5, xmm6
          movaps    xmm0, xmm7
          addps     xmm6, xmm1                    ; xmm6=data0=(00 10 20 30)
          addps     xmm7, xmm3                    ; xmm7=data1=(01 11 21 31)
          subps     xmm5, xmm1                    ; xmm5=data7=(07 17 27 37)
          subps     xmm0, xmm3                    ; xmm0=data6=(06 16 26 36)
          subps     xmm2, xmm3                    ; xmm2=tmp5

          ; Adding the magic constant leaves round(x/8) in the low 16 bits
          ; of each float's bit pattern ("**" = don't-care high word).
          movaps    xmm1, [rel PD_RNDINT_MAGIC]   ; xmm1=[rel PD_RNDINT_MAGIC]
          pcmpeqd   xmm3, xmm3
          psrld     xmm3, WORD_BIT                ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

          addps     xmm6, xmm1                    ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
          addps     xmm7, xmm1                    ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
          addps     xmm0, xmm1                    ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
          addps     xmm5, xmm1                    ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

          pand      xmm6, xmm3                    ; xmm6=(00 -- 10 -- 20 -- 30 --)
          pslld     xmm7, WORD_BIT                ; xmm7=(-- 01 -- 11 -- 21 -- 31)
          pand      xmm0, xmm3                    ; xmm0=(06 -- 16 -- 26 -- 36 --)
          pslld     xmm5, WORD_BIT                ; xmm5=(-- 07 -- 17 -- 27 -- 37)
          por       xmm6, xmm7                    ; xmm6=(00 01 10 11 20 21 30 31)
          por       xmm0, xmm5                    ; xmm0=(06 07 16 17 26 27 36 37)

          movaps    xmm1, XMMWORD [wk(0)]         ; xmm1=tmp2
          movaps    xmm3, XMMWORD [wk(1)]         ; xmm3=tmp3

          addps     xmm4, xmm2                    ; xmm4=tmp4
          movaps    xmm7, xmm1
          movaps    xmm5, xmm3
          addps     xmm1, xmm2                    ; xmm1=data2=(02 12 22 32)
          addps     xmm3, xmm4                    ; xmm3=data4=(04 14 24 34)
          subps     xmm7, xmm2                    ; xmm7=data5=(05 15 25 35)
          subps     xmm5, xmm4                    ; xmm5=data3=(03 13 23 33)

          movaps    xmm2, [rel PD_RNDINT_MAGIC]   ; xmm2=[rel PD_RNDINT_MAGIC]
          pcmpeqd   xmm4, xmm4
          psrld     xmm4, WORD_BIT                ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

          addps     xmm3, xmm2                    ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
          addps     xmm7, xmm2                    ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
          addps     xmm1, xmm2                    ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
          addps     xmm5, xmm2                    ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

          pand      xmm3, xmm4                    ; xmm3=(04 -- 14 -- 24 -- 34 --)
          pslld     xmm7, WORD_BIT                ; xmm7=(-- 05 -- 15 -- 25 -- 35)
          pand      xmm1, xmm4                    ; xmm1=(02 -- 12 -- 22 -- 32 --)
          pslld     xmm5, WORD_BIT                ; xmm5=(-- 03 -- 13 -- 23 -- 33)
          por       xmm3, xmm7                    ; xmm3=(04 05 14 15 24 25 34 35)
          por       xmm1, xmm5                    ; xmm1=(02 03 12 13 22 23 32 33)

          movdqa    xmm2, [rel PB_CENTERJSAMP]    ; xmm2=[rel PB_CENTERJSAMP]

          ; Saturate to signed bytes, then add the level shift.
          packsswb  xmm6, xmm3                    ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
          packsswb  xmm1, xmm0                    ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
          paddb     xmm6, xmm2
          paddb     xmm1, xmm2

          movdqa    xmm4, xmm6                    ; transpose coefficients(phase 2)
          punpcklwd xmm6, xmm1                    ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
          punpckhwd xmm4, xmm1                    ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

          movdqa    xmm7, xmm6                    ; transpose coefficients(phase 3)
          punpckldq xmm6, xmm4                    ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
          punpckhdq xmm7, xmm4                    ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

          ; Swap 64-bit halves so rows 1 and 3 sit in the low qword for movq.
          pshufd    xmm5, xmm6, 0x4E              ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
          pshufd    xmm3, xmm7, 0x4E              ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

          mov       rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
          mov       rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
          movq      XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
          movq      XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
          mov       rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
          mov       rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
          movq      XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
          movq      XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3

          add       rsi, byte 4*SIZEOF_FAST_FLOAT         ; wsptr
          add       rdi, byte 4*SIZEOF_JSAMPROW
          dec       rcx                                   ; ctr
          jnz       near .rowloop

          pop       rbx
          uncollect_args 4
          mov       rsp, rbp                      ; rsp <- aligned rbp
          pop       rsp                           ; rsp <- saved pre-alignment rsp (points at saved rbp)
          pop       rbp
          ret

  ; For some reason, the OS X linker does not honor the request to align the
  ; segment unless we do this.
  align 32