;
; jquant.asm - sample data conversion and quantization (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
  16. %include "jsimdext.inc"
  17. %include "jdct.inc"
  18. ; --------------------------------------------------------------------------
  19. SECTION SEG_TEXT
  20. BITS 32
  21. ;
  22. ; Load data into workspace, applying unsigned->signed conversion
  23. ;
  24. ; GLOBAL(void)
  25. ; jsimd_convsamp_mmx(JSAMPARRAY sample_data, JDIMENSION start_col,
  26. ; DCTELEM *workspace);
  27. ;
  28. %define sample_data ebp + 8 ; JSAMPARRAY sample_data
  29. %define start_col ebp + 12 ; JDIMENSION start_col
  30. %define workspace ebp + 16 ; DCTELEM *workspace
  31. align 32
  32. GLOBAL_FUNCTION(jsimd_convsamp_mmx)
  33. EXTN(jsimd_convsamp_mmx):
  34. push ebp
  35. mov ebp, esp
  36. push ebx
  37. ; push ecx ; need not be preserved
  38. ; push edx ; need not be preserved
  39. push esi
  40. push edi
  41. pxor mm6, mm6 ; mm6=(all 0's)
  42. pcmpeqw mm7, mm7
  43. psllw mm7, 7 ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
  44. mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
  45. mov eax, JDIMENSION [start_col]
  46. mov edi, POINTER [workspace] ; (DCTELEM *)
  47. mov ecx, DCTSIZE/4
  48. alignx 16, 7
  49. .convloop:
  50. mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
  51. mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
  52. movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm0=(01234567)
  53. movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm1=(89ABCDEF)
  54. mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
  55. mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
  56. movq mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; mm2=(GHIJKLMN)
  57. movq mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; mm3=(OPQRSTUV)
  58. movq mm4, mm0
  59. punpcklbw mm0, mm6 ; mm0=(0123)
  60. punpckhbw mm4, mm6 ; mm4=(4567)
  61. movq mm5, mm1
  62. punpcklbw mm1, mm6 ; mm1=(89AB)
  63. punpckhbw mm5, mm6 ; mm5=(CDEF)
  64. paddw mm0, mm7
  65. paddw mm4, mm7
  66. paddw mm1, mm7
  67. paddw mm5, mm7
  68. movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
  69. movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
  70. movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
  71. movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
  72. movq mm0, mm2
  73. punpcklbw mm2, mm6 ; mm2=(GHIJ)
  74. punpckhbw mm0, mm6 ; mm0=(KLMN)
  75. movq mm4, mm3
  76. punpcklbw mm3, mm6 ; mm3=(OPQR)
  77. punpckhbw mm4, mm6 ; mm4=(STUV)
  78. paddw mm2, mm7
  79. paddw mm0, mm7
  80. paddw mm3, mm7
  81. paddw mm4, mm7
  82. movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
  83. movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
  84. movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
  85. movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
  86. add esi, byte 4*SIZEOF_JSAMPROW
  87. add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
  88. dec ecx
  89. jnz short .convloop
  90. emms ; empty MMX state
  91. pop edi
  92. pop esi
  93. ; pop edx ; need not be preserved
  94. ; pop ecx ; need not be preserved
  95. pop ebx
  96. pop ebp
  97. ret
  98. ; --------------------------------------------------------------------------
  99. ;
  100. ; Quantize/descale the coefficients, and store into coef_block
  101. ;
  102. ; This implementation is based on an algorithm described in
  103. ; "How to optimize for the Pentium family of microprocessors"
  104. ; (http://www.agner.org/assem/).
  105. ;
  106. ; GLOBAL(void)
  107. ; jsimd_quantize_mmx(JCOEFPTR coef_block, DCTELEM *divisors,
  108. ; DCTELEM *workspace);
  109. ;
  110. %define RECIPROCAL(m, n, b) \
  111. MMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
  112. %define CORRECTION(m, n, b) \
  113. MMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
  114. %define SCALE(m, n, b) \
  115. MMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
  116. %define SHIFT(m, n, b) \
  117. MMBLOCK(DCTSIZE * 3 + (m), (n), (b), SIZEOF_DCTELEM)
  118. %define coef_block ebp + 8 ; JCOEFPTR coef_block
  119. %define divisors ebp + 12 ; DCTELEM *divisors
  120. %define workspace ebp + 16 ; DCTELEM *workspace
  121. align 32
  122. GLOBAL_FUNCTION(jsimd_quantize_mmx)
  123. EXTN(jsimd_quantize_mmx):
  124. push ebp
  125. mov ebp, esp
  126. ; push ebx ; unused
  127. ; push ecx ; unused
  128. ; push edx ; need not be preserved
  129. push esi
  130. push edi
  131. mov esi, POINTER [workspace]
  132. mov edx, POINTER [divisors]
  133. mov edi, JCOEFPTR [coef_block]
  134. mov ah, 2
  135. alignx 16, 7
  136. .quantloop1:
  137. mov al, DCTSIZE2/8/2
  138. alignx 16, 7
  139. .quantloop2:
  140. movq mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
  141. movq mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
  142. movq mm0, mm2
  143. movq mm1, mm3
  144. psraw mm2, (WORD_BIT-1) ; -1 if value < 0, 0 otherwise
  145. psraw mm3, (WORD_BIT-1)
  146. pxor mm0, mm2 ; val = -val
  147. pxor mm1, mm3
  148. psubw mm0, mm2
  149. psubw mm1, mm3
  150. ;
  151. ; MMX is an annoyingly crappy instruction set. It has two
  152. ; misfeatures that are causing problems here:
  153. ;
  154. ; - All multiplications are signed.
  155. ;
  156. ; - The second operand for the shifts is not treated as packed.
  157. ;
  158. ;
  159. ; We work around the first problem by implementing this algorithm:
  160. ;
  161. ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
  162. ; {
  163. ; enum { SHORT_BIT = 16 };
  164. ; signed short sx = (signed short)x;
  165. ; signed short sy = (signed short)y;
  166. ; signed long sz;
  167. ;
  168. ; sz = (long)sx * (long)sy; /* signed multiply */
  169. ;
  170. ; if (sx < 0) sz += (long)sy << SHORT_BIT;
  171. ; if (sy < 0) sz += (long)sx << SHORT_BIT;
  172. ;
  173. ; return (unsigned long)sz;
  174. ; }
  175. ;
  176. ; (note that a negative sx adds _sy_ and vice versa)
  177. ;
  178. ; For the second problem, we replace the shift by a multiplication.
  179. ; Unfortunately that means we have to deal with the signed issue again.
  180. ;
  181. paddw mm0, MMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
  182. paddw mm1, MMWORD [CORRECTION(0,1,edx)]
  183. movq mm4, mm0 ; store current value for later
  184. movq mm5, mm1
  185. pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
  186. pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)]
  187. paddw mm0, mm4 ; reciprocal is always negative (MSB=1),
  188. paddw mm1, mm5 ; so we always need to add the initial value
  189. ; (input value is never negative as we
  190. ; inverted it at the start of this routine)
  191. ; here it gets a bit tricky as both scale
  192. ; and mm0/mm1 can be negative
  193. movq mm6, MMWORD [SCALE(0,0,edx)] ; scale
  194. movq mm7, MMWORD [SCALE(0,1,edx)]
  195. movq mm4, mm0
  196. movq mm5, mm1
  197. pmulhw mm0, mm6
  198. pmulhw mm1, mm7
  199. psraw mm6, (WORD_BIT-1) ; determine if scale is negative
  200. psraw mm7, (WORD_BIT-1)
  201. pand mm6, mm4 ; and add input if it is
  202. pand mm7, mm5
  203. paddw mm0, mm6
  204. paddw mm1, mm7
  205. psraw mm4, (WORD_BIT-1) ; then check if negative input
  206. psraw mm5, (WORD_BIT-1)
  207. pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is
  208. pand mm5, MMWORD [SCALE(0,1,edx)]
  209. paddw mm0, mm4
  210. paddw mm1, mm5
  211. pxor mm0, mm2 ; val = -val
  212. pxor mm1, mm3
  213. psubw mm0, mm2
  214. psubw mm1, mm3
  215. movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
  216. movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
  217. add esi, byte 8*SIZEOF_DCTELEM
  218. add edx, byte 8*SIZEOF_DCTELEM
  219. add edi, byte 8*SIZEOF_JCOEF
  220. dec al
  221. jnz near .quantloop2
  222. dec ah
  223. jnz near .quantloop1 ; to avoid branch misprediction
  224. emms ; empty MMX state
  225. pop edi
  226. pop esi
  227. ; pop edx ; need not be preserved
  228. ; pop ecx ; unused
  229. ; pop ebx ; unused
  230. pop ebp
  231. ret
  232. ; For some reason, the OS X linker does not honor the request to align the
  233. ; segment unless we do this.
  234. align 32