;
; jquanti.asm - sample data conversion and quantization (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2018, D. R. Commander.
; Copyright (C) 2016, Matthieu Darbois.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
  22. ;
  23. ; Load data into workspace, applying unsigned->signed conversion
  24. ;
  25. ; GLOBAL(void)
  26. ; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
  27. ; DCTELEM *workspace);
  28. ;
  29. %define sample_data ebp + 8 ; JSAMPARRAY sample_data
  30. %define start_col ebp + 12 ; JDIMENSION start_col
  31. %define workspace ebp + 16 ; DCTELEM *workspace
  32. align 32
  33. GLOBAL_FUNCTION(jsimd_convsamp_avx2)
  34. EXTN(jsimd_convsamp_avx2):
  35. push ebp
  36. mov ebp, esp
  37. push ebx
  38. ; push ecx ; need not be preserved
  39. ; push edx ; need not be preserved
  40. push esi
  41. push edi
  42. mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)
  43. mov eax, JDIMENSION [start_col]
  44. mov edi, POINTER [workspace] ; (DCTELEM *)
  45. mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
  46. mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
  47. movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
  48. movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
  49. mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
  50. mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
  51. movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
  52. movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
  53. mov ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW] ; (JSAMPLE *)
  54. mov edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW] ; (JSAMPLE *)
  55. movq xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
  56. movq xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
  57. mov ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW] ; (JSAMPLE *)
  58. mov edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW] ; (JSAMPLE *)
  59. movq xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
  60. movq xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
  61. vinserti128 ymm0, ymm0, xmm1, 1
  62. vinserti128 ymm2, ymm2, xmm3, 1
  63. vinserti128 ymm4, ymm4, xmm5, 1
  64. vinserti128 ymm6, ymm6, xmm7, 1
  65. vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
  66. vpunpcklbw ymm0, ymm0, ymm1
  67. vpunpcklbw ymm2, ymm2, ymm1
  68. vpunpcklbw ymm4, ymm4, ymm1
  69. vpunpcklbw ymm6, ymm6, ymm1
  70. vpcmpeqw ymm7, ymm7, ymm7
  71. vpsllw ymm7, ymm7, 7 ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
  72. vpaddw ymm0, ymm0, ymm7
  73. vpaddw ymm2, ymm2, ymm7
  74. vpaddw ymm4, ymm4, ymm7
  75. vpaddw ymm6, ymm6, ymm7
  76. vmovdqu YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
  77. vmovdqu YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
  78. vmovdqu YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
  79. vmovdqu YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6
  80. vzeroupper
  81. pop edi
  82. pop esi
  83. ; pop edx ; need not be preserved
  84. ; pop ecx ; need not be preserved
  85. pop ebx
  86. pop ebp
  87. ret
  88. ; --------------------------------------------------------------------------
  89. ;
  90. ; Quantize/descale the coefficients, and store into coef_block
  91. ;
  92. ; This implementation is based on an algorithm described in
  93. ; "How to optimize for the Pentium family of microprocessors"
  94. ; (http://www.agner.org/assem/).
  95. ;
  96. ; GLOBAL(void)
  97. ; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
  98. ; DCTELEM *workspace);
  99. ;
  100. %define RECIPROCAL(m, n, b) \
  101. YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
  102. %define CORRECTION(m, n, b) \
  103. YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
  104. %define SCALE(m, n, b) \
  105. YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
  106. %define coef_block ebp + 8 ; JCOEFPTR coef_block
  107. %define divisors ebp + 12 ; DCTELEM *divisors
  108. %define workspace ebp + 16 ; DCTELEM *workspace
  109. align 32
  110. GLOBAL_FUNCTION(jsimd_quantize_avx2)
  111. EXTN(jsimd_quantize_avx2):
  112. push ebp
  113. mov ebp, esp
  114. ; push ebx ; unused
  115. ; push ecx ; unused
  116. ; push edx ; need not be preserved
  117. push esi
  118. push edi
  119. mov esi, POINTER [workspace]
  120. mov edx, POINTER [divisors]
  121. mov edi, JCOEFPTR [coef_block]
  122. vmovdqu ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
  123. vmovdqu ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
  124. vmovdqu ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
  125. vmovdqu ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
  126. vpabsw ymm0, ymm4
  127. vpabsw ymm1, ymm5
  128. vpabsw ymm2, ymm6
  129. vpabsw ymm3, ymm7
  130. vpaddw ymm0, YMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor
  131. vpaddw ymm1, YMMWORD [CORRECTION(2,0,edx)]
  132. vpaddw ymm2, YMMWORD [CORRECTION(4,0,edx)]
  133. vpaddw ymm3, YMMWORD [CORRECTION(6,0,edx)]
  134. vpmulhuw ymm0, YMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal
  135. vpmulhuw ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
  136. vpmulhuw ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
  137. vpmulhuw ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
  138. vpmulhuw ymm0, YMMWORD [SCALE(0,0,edx)] ; scale
  139. vpmulhuw ymm1, YMMWORD [SCALE(2,0,edx)]
  140. vpmulhuw ymm2, YMMWORD [SCALE(4,0,edx)]
  141. vpmulhuw ymm3, YMMWORD [SCALE(6,0,edx)]
  142. vpsignw ymm0, ymm0, ymm4
  143. vpsignw ymm1, ymm1, ymm5
  144. vpsignw ymm2, ymm2, ymm6
  145. vpsignw ymm3, ymm3, ymm7
  146. vmovdqu [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
  147. vmovdqu [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
  148. vmovdqu [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
  149. vmovdqu [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3
  150. vzeroupper
  151. pop edi
  152. pop esi
  153. ; pop edx ; need not be preserved
  154. ; pop ecx ; unused
  155. ; pop ebx ; unused
  156. pop ebp
  157. ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32