; jquant-3dn.asm
  ;
  ; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
  ;
  ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  ; Copyright (C) 2016, D. R. Commander.
  ;
  ; Based on the x86 SIMD extension for the IJG JPEG library
  ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  ;
  ; This file should be assembled with NASM (Netwide Assembler); it
  ; can *not* be assembled with Microsoft's MASM or any compatible
  ; assembler (including Borland's Turbo Assembler).
  ; NASM is available from http://nasm.sourceforge.net/ or
  ; http://sourceforge.net/project/showfiles.php?group_id=6208
  ; Shared macro/constant definitions.  The routines in this file rely on
  ; names that are not defined here (EXTN, GLOBAL_FUNCTION, alignx, MMBLOCK,
  ; SEG_TEXT, DCTSIZE, SIZEOF_*, ...); presumably they come from these two
  ; include files -- confirm against the includes.
  %include "jsimdext.inc"
  %include "jdct.inc"

  ; --------------------------------------------------------------------------
      SECTION     SEG_TEXT
      BITS        32
  ;
  ; Load data into workspace, applying unsigned->signed conversion
  ;
  ; GLOBAL(void)
  ; jsimd_convsamp_float_3dnow(JSAMPARRAY sample_data, JDIMENSION start_col,
  ;                            FAST_FLOAT *workspace);
  ;
  ; Arguments are read from the stack relative to ebp (see the %defines below)
  ; and the routine returns with a plain `ret`.
  ;
  ; Each loop iteration handles two sample rows: 8 bytes are loaded from each
  ; row at column offset start_col, re-centred by subtracting 0x80 from every
  ; byte, sign-extended to 32 bits, converted to float with pi2fd and stored
  ; to the workspace.  The loop runs DCTSIZE/2 times.
  ;
  ; Register roles inside the loop:
  ;   esi = pointer into the row-pointer array (advances by two rows)
  ;   eax = start_col (constant)
  ;   edi = output pointer into workspace
  ;   ecx = remaining iterations
  ;   ebx, edx = the two row pointers of the current iteration
  ;   mm7 = 0x80 in every byte (constant)
  ;
  ; Preserved: ebx, esi, edi, ebp.  Clobbered: eax, ecx, edx, mm0-mm7, flags.
  ; The MMX/3DNow! state is emptied with femms before returning.
  ;

  %define sample_data  ebp + 8            ; JSAMPARRAY sample_data
  %define start_col    ebp + 12           ; JDIMENSION start_col
  %define workspace    ebp + 16           ; FAST_FLOAT *workspace

      align       32
      GLOBAL_FUNCTION(jsimd_convsamp_float_3dnow)

  EXTN(jsimd_convsamp_float_3dnow):
      push        ebp
      mov         ebp, esp
      push        ebx
  ;   push        ecx                     ; need not be preserved
  ;   push        edx                     ; need not be preserved
      push        esi
      push        edi

      ; Build the bias constant without a memory load:
      ; all-ones words -> 0xFF80 words -> signed-saturating pack -> 0x80 bytes.
      pcmpeqw     mm7, mm7
      psllw       mm7, 7
      packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)

      mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
      mov         eax, JDIMENSION [start_col]
      mov         edi, POINTER [workspace]       ; (FAST_FLOAT *)
      mov         ecx, DCTSIZE/2                 ; two rows per iteration
      alignx      16, 7
  .convloop:
      mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
      mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

      movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
      movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

      psubb       mm0, mm7                ; mm0=(01234567)
      psubb       mm1, mm7                ; mm1=(89ABCDEF)

      ; In the unpacks below the destination's previous contents are "don't
      ; care" (shown as '*'): each sample byte ends up in the most significant
      ; byte of its dword, and the arithmetic right shift that follows both
      ; discards the junk bits and sign-extends the sample.
      punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3)
      punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
      punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
      punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)

      ; First row: widen to dwords, sign-extend, convert to float, store.
      punpcklwd   mm4, mm2                ; mm4=(***0***1)
      punpckhwd   mm2, mm2                ; mm2=(***2***3)
      punpcklwd   mm5, mm0                ; mm5=(***4***5)
      punpckhwd   mm0, mm0                ; mm0=(***6***7)

      psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(01)
      psrad       mm2, (DWORD_BIT-BYTE_BIT)  ; mm2=(23)
      pi2fd       mm4, mm4
      pi2fd       mm2, mm2
      psrad       mm5, (DWORD_BIT-BYTE_BIT)  ; mm5=(45)
      psrad       mm0, (DWORD_BIT-BYTE_BIT)  ; mm0=(67)
      pi2fd       mm5, mm5
      pi2fd       mm0, mm0

      movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
      movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
      movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
      movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0

      ; Second row: same sequence (mm4 is reused now that it has been stored).
      punpcklwd   mm6, mm3                ; mm6=(***8***9)
      punpckhwd   mm3, mm3                ; mm3=(***A***B)
      punpcklwd   mm4, mm1                ; mm4=(***C***D)
      punpckhwd   mm1, mm1                ; mm1=(***E***F)

      psrad       mm6, (DWORD_BIT-BYTE_BIT)  ; mm6=(89)
      psrad       mm3, (DWORD_BIT-BYTE_BIT)  ; mm3=(AB)
      pi2fd       mm6, mm6
      pi2fd       mm3, mm3
      psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(CD)
      psrad       mm1, (DWORD_BIT-BYTE_BIT)  ; mm1=(EF)
      pi2fd       mm4, mm4
      pi2fd       mm1, mm1

      movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
      movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
      movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
      movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1

      add         esi, byte 2*SIZEOF_JSAMPROW            ; next pair of rows
      add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; two output rows
      dec         ecx
      jnz         near .convloop

      femms                               ; empty MMX/3DNow! state

      pop         edi
      pop         esi
  ;   pop         edx                     ; need not be preserved
  ;   pop         ecx                     ; need not be preserved
      pop         ebx
      pop         ebp
      ret
  ; --------------------------------------------------------------------------
  ;
  ; Quantize/descale the coefficients, and store into coef_block
  ;
  ; GLOBAL(void)
  ; jsimd_quantize_float_3dnow(JCOEFPTR coef_block, FAST_FLOAT *divisors,
  ;                            FAST_FLOAT *workspace);
  ;
  ; Arguments are read from the stack relative to ebp (see the %defines below)
  ; and the routine returns with a plain `ret`.
  ;
  ; Each loop iteration handles 16 coefficients: workspace[i] is multiplied by
  ; divisors[i] (pfmul), the constant 12582912.0 (0x00C00000 = 1.5 * 2^23) is
  ; added, and the low 16-bit word of each resulting dword is stored to
  ; coef_block.  Adding that constant shifts the integer part of the product
  ; into the low mantissa bits, so the low word of the float's bit pattern is
  ; the quantized value; this only holds while the product is small enough to
  ; fit -- presumably guaranteed by the caller, confirm.
  ; The loop runs DCTSIZE2/16 times.
  ;
  ; Register roles inside the loop:
  ;   esi = workspace read pointer
  ;   edx = divisors read pointer
  ;   edi = coef_block write pointer
  ;   eax = remaining iterations
  ;   mm7 = {12582912.0F 12582912.0F} (constant)
  ;
  ; Preserved: esi, edi, ebp (ebx and ecx are never touched).
  ; Clobbered: eax, edx, mm0-mm7, flags.
  ; The MMX/3DNow! state is emptied with femms before returning.
  ;

  %define coef_block  ebp + 8             ; JCOEFPTR coef_block
  %define divisors    ebp + 12            ; FAST_FLOAT *divisors
  %define workspace   ebp + 16            ; FAST_FLOAT *workspace

      align       32
      GLOBAL_FUNCTION(jsimd_quantize_float_3dnow)

  EXTN(jsimd_quantize_float_3dnow):
      push        ebp
      mov         ebp, esp
  ;   push        ebx                     ; unused
  ;   push        ecx                     ; unused
  ;   push        edx                     ; need not be preserved
      push        esi
      push        edi

      mov         eax, 0x4B400000         ; (float)0x00C00000 (rndint_magic)
      movd        mm7, eax
      punpckldq   mm7, mm7                ; mm7={12582912.0F 12582912.0F}

      mov         esi, POINTER [workspace]
      mov         edx, POINTER [divisors]
      mov         edi, JCOEFPTR [coef_block]
      mov         eax, DCTSIZE2/16        ; 16 coefficients per iteration
      alignx      16, 7
  .quantloop:
      ; ---- coefficients 00..07 ----
      movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
      movq        mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
      pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
      pfmul       mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
      movq        mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
      movq        mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
      pfmul       mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
      pfmul       mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]

      ; After the add, viewed as four words, each register holds
      ; (value junk value junk); '**' marks the junk (upper) words.
      pfadd       mm0, mm7                ; mm0=(00 ** 01 **)
      pfadd       mm1, mm7                ; mm1=(02 ** 03 **)
      pfadd       mm2, mm7                ; mm2=(04 ** 05 **)
      pfadd       mm3, mm7                ; mm3=(06 ** 07 **)

      ; Gather the four useful low words of each register pair into one qword.
      movq        mm4, mm0
      punpcklwd   mm0, mm1                ; mm0=(00 02 ** **)
      punpckhwd   mm4, mm1                ; mm4=(01 03 ** **)
      movq        mm5, mm2
      punpcklwd   mm2, mm3                ; mm2=(04 06 ** **)
      punpckhwd   mm5, mm3                ; mm5=(05 07 ** **)

      punpcklwd   mm0, mm4                ; mm0=(00 01 02 03)
      punpcklwd   mm2, mm5                ; mm2=(04 05 06 07)

      ; ---- coefficients 10..17 ----
      movq        mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
      movq        mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
      pfmul       mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
      pfmul       mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
      movq        mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
      movq        mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
      pfmul       mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
      pfmul       mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]

      pfadd       mm6, mm7                ; mm6=(10 ** 11 **)
      pfadd       mm1, mm7                ; mm1=(12 ** 13 **)
      pfadd       mm3, mm7                ; mm3=(14 ** 15 **)
      pfadd       mm4, mm7                ; mm4=(16 ** 17 **)

      movq        mm5, mm6
      punpcklwd   mm6, mm1                ; mm6=(10 12 ** **)
      punpckhwd   mm5, mm1                ; mm5=(11 13 ** **)
      movq        mm1, mm3
      punpcklwd   mm3, mm4                ; mm3=(14 16 ** **)
      punpckhwd   mm1, mm4                ; mm1=(15 17 ** **)

      punpcklwd   mm6, mm5                ; mm6=(10 11 12 13)
      punpcklwd   mm3, mm1                ; mm3=(14 15 16 17)

      ; Store the 16 quantized coefficients.
      movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
      movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
      movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
      movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

      add         esi, byte 16*SIZEOF_FAST_FLOAT
      add         edx, byte 16*SIZEOF_FAST_FLOAT
      add         edi, byte 16*SIZEOF_JCOEF
      dec         eax
      jnz         near .quantloop

      femms                               ; empty MMX/3DNow! state

      pop         edi
      pop         esi
  ;   pop         edx                     ; need not be preserved
  ;   pop         ecx                     ; unused
  ;   pop         ebx                     ; unused
      pop         ebp
      ret
  ; For some reason, the OS X linker does not honor the request to align the
  ; segment unless we do this.
      align       32