  1. ;
  2. ; jquanti.asm - sample data conversion and quantization (SSE2)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2016, D. R. Commander.
  6. ;
  7. ; Based on the x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler),
  12. ; can *not* be assembled with Microsoft's MASM or any compatible
  13. ; assembler (including Borland's Turbo Assembler).
; NASM is available from https://www.nasm.us/ or
; https://nasm.sourceforge.net/
  16. %include "jsimdext.inc"
  17. %include "jdct.inc"
  18. ; --------------------------------------------------------------------------
  19. SECTION SEG_TEXT
  20. BITS 32
  21. ;
  22. ; Load data into workspace, applying unsigned->signed conversion
  23. ;
  24. ; GLOBAL(void)
  25. ; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
  26. ; DCTELEM *workspace);
  27. ;
; Stack-argument offsets (cdecl: args start at [ebp+8] after push ebp/mov ebp,esp).
%define sample_data  ebp + 8            ; JSAMPARRAY sample_data (array of row pointers)
%define start_col    ebp + 12           ; JDIMENSION start_col (first column of the block)
%define workspace    ebp + 16           ; DCTELEM *workspace (8x8 output block, 16-byte aligned)

        align   32
        GLOBAL_FUNCTION(jsimd_convsamp_sse2)

; Load one 8x8 block of samples into the DCT workspace, converting each
; unsigned 8-bit JSAMPLE to a signed DCTELEM by subtracting the center
; value 128 (level shift).  Processes 4 rows (32 samples) per iteration.
; Clobbers: eax, ecx, edx (not preserved per cdecl); ebx/esi/edi saved.
EXTN(jsimd_convsamp_sse2):
        push    ebp
        mov     ebp, esp
        push    ebx
;       push    ecx                     ; need not be preserved
;       push    edx                     ; need not be preserved
        push    esi
        push    edi

        pxor    xmm6, xmm6              ; xmm6=(all 0's) -- zero source for byte->word widening
        pcmpeqw xmm7, xmm7
        psllw   xmm7, 7                 ; xmm7={0xFF80 x8} = -128 in each signed word (level shift)

        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
        mov     eax, JDIMENSION [start_col]
        mov     edi, POINTER [workspace]        ; (DCTELEM *)
        mov     ecx, DCTSIZE/4                  ; 4 rows per iteration -> 2 iterations for 8x8
        alignx  16, 7
.convloop:
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *) row 0 of this group
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *) row 1

        movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; xmm0=(01234567)   8 bytes
        movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; xmm1=(89ABCDEF)

        mov     ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *) row 2
        mov     edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *) row 3

        movq    xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; xmm2=(GHIJKLMN)
        movq    xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; xmm3=(OPQRSTUV)

        punpcklbw xmm0, xmm6            ; zero-extend bytes to words: xmm0=(01234567)
        punpcklbw xmm1, xmm6            ; xmm1=(89ABCDEF)
        paddw   xmm0, xmm7              ; add -128: unsigned sample -> signed DCTELEM
        paddw   xmm1, xmm7
        punpcklbw xmm2, xmm6            ; xmm2=(GHIJKLMN)
        punpcklbw xmm3, xmm6            ; xmm3=(OPQRSTUV)
        paddw   xmm2, xmm7
        paddw   xmm3, xmm7

        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0    ; store 4 converted rows
        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

        add     esi, byte 4*SIZEOF_JSAMPROW             ; advance to next group of 4 rows
        add     edi, byte 4*DCTSIZE*SIZEOF_DCTELEM      ; advance output by 4 rows
        dec     ecx
        jnz     short .convloop

        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
;       pop     ecx                     ; need not be preserved
        pop     ebx
        pop     ebp
        ret
  81. ; --------------------------------------------------------------------------
  82. ;
  83. ; Quantize/descale the coefficients, and store into coef_block
  84. ;
  85. ; This implementation is based on an algorithm described in
  86. ; "How to optimize for the Pentium family of microprocessors"
  87. ; (http://www.agner.org/assem/).
  88. ;
  89. ; GLOBAL(void)
  90. ; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
  91. ; DCTELEM *workspace);
  92. ;
; The divisors block holds three consecutive 64-DCTELEM sub-tables;
; these macros address row m, column-group n of each sub-table:
%define RECIPROCAL(m, n, b) \
  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

; Stack-argument offsets (cdecl):
%define coef_block  ebp + 8             ; JCOEFPTR coef_block (quantized output)
%define divisors    ebp + 12            ; DCTELEM *divisors (reciprocal/correction/scale tables)
%define workspace   ebp + 16            ; DCTELEM *workspace (DCT coefficients, input)

        align   32
        GLOBAL_FUNCTION(jsimd_quantize_sse2)

; Quantize/descale the 64 DCT coefficients in workspace and store them into
; coef_block.  Division by the quantization step is replaced by multiplication
; with a precomputed reciprocal (Agner Fog's technique): for each coefficient
;   t = |x| + correction                  (correction includes the round factor)
;   t = (t * reciprocal) >> 16            (pmulhuw: high word of unsigned product)
;   t = (t * scale)      >> 16            (second pmulhuw)
;   result = t with the original sign of x restored
; Processes 32 coefficients (4 XMM registers) per iteration.
; Clobbers: eax, edx; esi/edi saved; ebx/ecx untouched.
EXTN(jsimd_quantize_sse2):
        push    ebp
        mov     ebp, esp
;       push    ebx                     ; unused
;       push    ecx                     ; unused
;       push    edx                     ; need not be preserved
        push    esi
        push    edi

        mov     esi, POINTER [workspace]
        mov     edx, POINTER [divisors]
        mov     edi, JCOEFPTR [coef_block]
        mov     eax, DCTSIZE2/32        ; 32 coefficients per iteration -> 2 iterations
        alignx  16, 7
.quantloop:
        ; Load 32 coefficients; keep copies in xmm0-3, sign masks in xmm4-7.
        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm0, xmm4
        movdqa  xmm1, xmm5
        movdqa  xmm2, xmm6
        movdqa  xmm3, xmm7
        psraw   xmm4, (WORD_BIT-1)      ; xmm4-7 = per-word sign masks (0xFFFF if negative)
        psraw   xmm5, (WORD_BIT-1)
        psraw   xmm6, (WORD_BIT-1)
        psraw   xmm7, (WORD_BIT-1)
        pxor    xmm0, xmm4              ; |x| = (x XOR mask) - mask
        pxor    xmm1, xmm5              ; (branchless two's-complement negate
        pxor    xmm2, xmm6              ;  of only the negative words)
        pxor    xmm3, xmm7
        psubw   xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
        psubw   xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
        psubw   xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
        psubw   xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;

        paddw   xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
        paddw   xmm1, XMMWORD [CORRECTION(1,0,edx)]
        paddw   xmm2, XMMWORD [CORRECTION(2,0,edx)]
        paddw   xmm3, XMMWORD [CORRECTION(3,0,edx)]
        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; * reciprocal, keep high 16 bits
        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
        pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)]       ; * scale, keep high 16 bits
        pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
        pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
        pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]

        pxor    xmm0, xmm4              ; restore the original sign:
        pxor    xmm1, xmm5              ; (t XOR mask) - mask negates the words
        pxor    xmm2, xmm6              ; whose input was negative
        pxor    xmm3, xmm7
        psubw   xmm0, xmm4
        psubw   xmm1, xmm5
        psubw   xmm2, xmm6
        psubw   xmm3, xmm7

        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0    ; store 32 quantized coefs
        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

        add     esi, byte 32*SIZEOF_DCTELEM     ; advance input
        add     edx, byte 32*SIZEOF_DCTELEM     ; advance all three divisor sub-tables
        add     edi, byte 32*SIZEOF_JCOEF       ; advance output
        dec     eax
        jnz     near .quantloop

        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
;       pop     ecx                     ; unused
;       pop     ebx                     ; unused
        pop     ebp
        ret
  174. ; For some reason, the OS X linker does not honor the request to align the
  175. ; segment unless we do this.
  176. align 32