jquant-sse.asm 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. ;
  2. ; jquant.asm - sample data conversion and quantization (SSE & MMX)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2016, D. R. Commander.
  6. ;
  7. ; Based on the x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler); it
  12. ; can *not* be assembled with Microsoft's MASM or any compatible
  13. ; assembler (including Borland's Turbo Assembler).
  14. ; NASM is available from http://nasm.sourceforge.net/ or
  15. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16. %include "jsimdext.inc"
  17. %include "jdct.inc"
  18. ; --------------------------------------------------------------------------
  ; Everything below is emitted into the section named by SEG_TEXT
  ; (SEG_TEXT is not defined in this file; presumably it comes from
  ; jsimdext.inc) and is assembled as 32-bit code.
  19. SECTION SEG_TEXT
  20. BITS 32
  21. ;
  22. ; Load data into workspace, applying unsigned->signed conversion
  23. ;
  24. ; GLOBAL(void)
  25. ; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col,
  26. ; FAST_FLOAT *workspace);
  27. ;
  ; Arguments are read from the stack relative to ebp (ebp + 8, + 12, + 16)
  ; after the prologue below has set up the frame.
  ;
  ; Each pass of .convloop handles two sample rows of 8 samples each:
  ;   1. load 8 bytes from each of the two rows at byte offset start_col,
  ;   2. subtract 0x80 from every byte (unsigned -> signed byte),
  ;   3. sign-extend every byte to a dword and convert it to a float,
  ;   4. write the 16 floats to the workspace with four movaps stores.
  ; The loop runs DCTSIZE/2 times.
  ;
  ; Registers: ebx, esi and edi are saved and restored; eax, ecx, edx,
  ; mm0-mm7 and xmm0-xmm7 are overwritten. emms is executed before return.
  ;
  ; movaps faults on an address that is not 16-byte aligned, so the store
  ; addresses produced by XMMBLOCK must be 16-byte aligned (presumably this
  ; means workspace itself must be 16-byte aligned; XMMBLOCK is defined
  ; outside this file - confirm there).
  28. %define sample_data ebp + 8 ; JSAMPARRAY sample_data
  29. %define start_col ebp + 12 ; JDIMENSION start_col
  30. %define workspace ebp + 16 ; FAST_FLOAT *workspace
  31. align 32
  32. GLOBAL_FUNCTION(jsimd_convsamp_float_sse)
  33. EXTN(jsimd_convsamp_float_sse):
  ; --- prologue: set up the frame used by the argument %defines above ---
  34. push ebp
  35. mov ebp, esp
  36. push ebx ; ebx is used as a row pointer in the loop
  37. ; push ecx ; need not be preserved
  38. ; push edx ; need not be preserved
  39. push esi
  40. push edi
  ; Build the constant 0x80 in every byte of mm7 without a memory load.
  41. pcmpeqw mm7, mm7 ; every word = 0xFFFF
  42. psllw mm7, 7 ; every word = 0xFF80 (-128 as a signed word)
  43. packsswb mm7, mm7 ; signed-saturating pack: mm7 = PB_CENTERJSAMPLE (0x808080..)
  44. mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
  45. mov eax, JDIMENSION [start_col] ; eax = column offset added to each row pointer
  46. mov edi, POINTER [workspace] ; (FAST_FLOAT *)
  47. mov ecx, DCTSIZE/2 ; loop counter: two rows are consumed per pass
  ; alignx is a macro defined outside this file; presumably it aligns the
  ; loop entry to a 16-byte boundary.
  48. alignx 16, 7
  49. .convloop:
  50. mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) first row of the pair
  51. mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) second row of the pair
  52. movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; 8 samples of the first row
  53. movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; 8 samples of the second row
  ; Subtracting 0x80 per byte (wrap-around) maps 0..255 onto -128..127.
  54. psubb mm0, mm7 ; mm0=(01234567)
  55. psubb mm1, mm7 ; mm1=(89ABCDEF)
  ; Widen bytes to dwords. The unpacks put each sample into the HIGH byte of
  ; its word, then the HIGH word of its dword. The positions marked '*' hold
  ; whatever the destination register contained before; they are never
  ; cleared because the psrad instructions below shift them out.
  56. punpcklbw mm2, mm0 ; mm2=(*0*1*2*3)
  57. punpckhbw mm0, mm0 ; mm0=(*4*5*6*7)
  58. punpcklbw mm3, mm1 ; mm3=(*8*9*A*B)
  59. punpckhbw mm1, mm1 ; mm1=(*C*D*E*F)
  60. punpcklwd mm4, mm2 ; mm4=(***0***1)
  61. punpckhwd mm2, mm2 ; mm2=(***2***3)
  62. punpcklwd mm5, mm0 ; mm5=(***4***5)
  63. punpckhwd mm0, mm0 ; mm0=(***6***7)
  ; Arithmetic right shift by (DWORD_BIT-BYTE_BIT) moves the sample from the
  ; top byte down to the bottom of the dword with sign extension (assuming
  ; DWORD_BIT-BYTE_BIT is 24; both constants are defined outside this file).
  ; cvtpi2ps then converts the two dwords to two floats in the low half of
  ; the xmm register; the upper half ('**') is left unchanged.
  64. psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(01)
  65. psrad mm2, (DWORD_BIT-BYTE_BIT) ; mm2=(23)
  66. cvtpi2ps xmm0, mm4 ; xmm0=(01**)
  67. cvtpi2ps xmm1, mm2 ; xmm1=(23**)
  68. psrad mm5, (DWORD_BIT-BYTE_BIT) ; mm5=(45)
  69. psrad mm0, (DWORD_BIT-BYTE_BIT) ; mm0=(67)
  70. cvtpi2ps xmm2, mm5 ; xmm2=(45**)
  71. cvtpi2ps xmm3, mm0 ; xmm3=(67**)
  ; Same widening for the second row. mm4 is reused here; its earlier value
  ; (samples 0,1) has already been converted into xmm0 above.
  72. punpcklwd mm6, mm3 ; mm6=(***8***9)
  73. punpckhwd mm3, mm3 ; mm3=(***A***B)
  74. punpcklwd mm4, mm1 ; mm4=(***C***D)
  75. punpckhwd mm1, mm1 ; mm1=(***E***F)
  76. psrad mm6, (DWORD_BIT-BYTE_BIT) ; mm6=(89)
  77. psrad mm3, (DWORD_BIT-BYTE_BIT) ; mm3=(AB)
  78. cvtpi2ps xmm4, mm6 ; xmm4=(89**)
  79. cvtpi2ps xmm5, mm3 ; xmm5=(AB**)
  80. psrad mm4, (DWORD_BIT-BYTE_BIT) ; mm4=(CD)
  81. psrad mm1, (DWORD_BIT-BYTE_BIT) ; mm1=(EF)
  82. cvtpi2ps xmm6, mm4 ; xmm6=(CD**)
  83. cvtpi2ps xmm7, mm1 ; xmm7=(EF**)
  ; movlhps copies the low two floats of the source into the high half of
  ; the destination, joining two float pairs into one 4-float vector.
  84. movlhps xmm0, xmm1 ; xmm0=(0123)
  85. movlhps xmm2, xmm3 ; xmm2=(4567)
  86. movlhps xmm4, xmm5 ; xmm4=(89AB)
  87. movlhps xmm6, xmm7 ; xmm6=(CDEF)
  ; Aligned stores of the 16 converted samples. XMMBLOCK is defined outside
  ; this file; its first two arguments look like (row, 4-float column group).
  88. movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
  89. movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
  90. movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
  91. movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
  92. add esi, byte 2*SIZEOF_JSAMPROW ; advance past the two row pointers just used
  93. add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; advance the output by 2*DCTSIZE floats
  94. dec ecx
  95. jnz near .convloop ; repeat until all DCTSIZE/2 row pairs are done
  96. emms ; empty MMX state
  ; --- epilogue: restore saved registers in reverse order of the pushes ---
  97. pop edi
  98. pop esi
  99. ; pop edx ; need not be preserved
  100. ; pop ecx ; need not be preserved
  101. pop ebx
  102. pop ebp
  103. ret
  104. ; --------------------------------------------------------------------------
  105. ;
  106. ; Quantize/descale the coefficients, and store into coef_block
  107. ;
  108. ; GLOBAL(void)
  109. ; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors,
  110. ; FAST_FLOAT *workspace);
  111. ;
  ; Arguments are read from the stack relative to ebp (ebp + 8, + 12, + 16)
  ; after the prologue below has set up the frame.
  ;
  ; Each pass of .quantloop handles 16 coefficients:
  ;   1. load 16 floats from workspace and MULTIPLY them element-wise by the
  ;      16 floats at the same position in divisors (mulps, not divps - so
  ;      the divisors table presumably holds reciprocals; confirm at the
  ;      code that fills it),
  ;   2. convert each product to a 32-bit integer with cvtps2pi, which
  ;      rounds according to the current MXCSR rounding mode,
  ;   3. pack the integers to 16 bits with signed saturation (packssdw),
  ;   4. store the 16 results to coef_block with four 8-byte movq stores.
  ; The loop runs DCTSIZE2/16 times.
  ;
  ; Registers: esi and edi are saved and restored; eax, edx, mm0-mm7 and
  ; xmm0-xmm7 are overwritten. emms is executed before return.
  ;
  ; movaps loads and mulps memory operands fault on addresses that are not
  ; 16-byte aligned, so the addresses produced by XMMBLOCK for workspace and
  ; divisors must be 16-byte aligned (XMMBLOCK is defined outside this file).
  112. %define coef_block ebp + 8 ; JCOEFPTR coef_block
  113. %define divisors ebp + 12 ; FAST_FLOAT *divisors
  114. %define workspace ebp + 16 ; FAST_FLOAT *workspace
  115. align 32
  116. GLOBAL_FUNCTION(jsimd_quantize_float_sse)
  117. EXTN(jsimd_quantize_float_sse):
  ; --- prologue: set up the frame used by the argument %defines above ---
  118. push ebp
  119. mov ebp, esp
  120. ; push ebx ; unused
  121. ; push ecx ; unused
  122. ; push edx ; need not be preserved
  123. push esi
  124. push edi
  125. mov esi, POINTER [workspace] ; esi = input floats
  126. mov edx, POINTER [divisors] ; edx = per-coefficient multipliers
  127. mov edi, JCOEFPTR [coef_block] ; edi = output coefficients
  128. mov eax, DCTSIZE2/16 ; loop counter: 16 coefficients per pass
  ; alignx is a macro defined outside this file; presumably it aligns the
  ; loop entry to a 16-byte boundary.
  129. alignx 16, 7
  130. .quantloop:
  ; Load four 4-float vectors and scale each by the matching divisors entry.
  131. movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
  132. movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
  133. mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
  134. mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
  135. movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
  136. movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
  137. mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
  138. mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
  ; cvtps2pi converts only the low two floats of its source, so movhlps first
  ; copies the high two floats of each vector into the low half of a scratch
  ; register. The upper half of the scratch register is never read.
  139. movhlps xmm4, xmm0 ; xmm4 low = elements 2,3 of xmm0
  140. movhlps xmm5, xmm1 ; xmm5 low = elements 2,3 of xmm1
  141. cvtps2pi mm0, xmm0 ; mm0 = int32 of xmm0 elements 0,1
  142. cvtps2pi mm1, xmm1 ; mm1 = int32 of xmm1 elements 0,1
  143. cvtps2pi mm4, xmm4 ; mm4 = int32 of xmm0 elements 2,3
  144. cvtps2pi mm5, xmm5 ; mm5 = int32 of xmm1 elements 2,3
  145. movhlps xmm6, xmm2 ; xmm6 low = elements 2,3 of xmm2
  146. movhlps xmm7, xmm3 ; xmm7 low = elements 2,3 of xmm3
  147. cvtps2pi mm2, xmm2 ; mm2 = int32 of xmm2 elements 0,1
  148. cvtps2pi mm3, xmm3 ; mm3 = int32 of xmm3 elements 0,1
  149. cvtps2pi mm6, xmm6 ; mm6 = int32 of xmm2 elements 2,3
  150. cvtps2pi mm7, xmm7 ; mm7 = int32 of xmm3 elements 2,3
  ; packssdw narrows dwords to words with signed saturation; destination
  ; dwords become the low two words and source dwords the high two, so the
  ; four results stay in their original element order.
  151. packssdw mm0, mm4
  152. packssdw mm1, mm5
  153. packssdw mm2, mm6
  154. packssdw mm3, mm7
  ; Store four groups of four 16-bit coefficients. MMBLOCK is defined outside
  ; this file; its first two arguments look like (row, 4-element column group).
  155. movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
  156. movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
  157. movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
  158. movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
  ; Advance all three pointers by the 16 elements processed in this pass.
  159. add esi, byte 16*SIZEOF_FAST_FLOAT
  160. add edx, byte 16*SIZEOF_FAST_FLOAT
  161. add edi, byte 16*SIZEOF_JCOEF
  162. dec eax
  163. jnz short .quantloop ; repeat until all DCTSIZE2/16 groups are done
  164. emms ; empty MMX state
  ; --- epilogue: restore saved registers in reverse order of the pushes ---
  165. pop edi
  166. pop esi
  167. ; pop edx ; need not be preserved
  168. ; pop ecx ; unused
  169. ; pop ebx ; unused
  170. pop ebp
  171. ret
  172. ; For some reason, the OS X linker does not honor the request to align the
  173. ; segment unless we do this.
  ; The directive below pads the end of the section out to the next 32-byte
  ; boundary; no code follows it in this file.
  174. align 32