; jquanti-sse2.asm
  1. ;
  2. ; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2009, 2016, D. R. Commander.
  6. ;
  7. ; Based on the x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler),
  12. ; can *not* be assembled with Microsoft's MASM or any compatible
  13. ; assembler (including Borland's Turbo Assembler).
  14. ; NASM is available from http://nasm.sourceforge.net/ or
  15. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16. %include "jsimdext.inc"
  17. %include "jdct.inc"
  18. ; --------------------------------------------------------------------------
  19. SECTION SEG_TEXT
  20. BITS 64
  ;
  ; jsimd_convsamp_sse2
  ;
  ; Copies a block of samples into the DCT workspace.  Each sample byte is
  ; widened to a 16-bit word and 128 is subtracted from it, turning the
  ; unsigned sample into a signed value.
  ;
  ; GLOBAL(void)
  ; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
  ;                     DCTELEM *workspace);
  ;
  ; Argument registers after collect_args (macro from jsimdext.inc, not
  ; visible in this file -- register assignment taken from the original
  ; header comment):
  ;   r10  = JSAMPARRAY sample_data
  ;   r11d = JDIMENSION start_col
  ;   r12  = DCTELEM *workspace
  ;
  ; Register roles inside the loop:
  ;   rsi      = cursor into the array of row pointers
  ;   rax      = start_col, zero-extended to 64 bits
  ;   rdi      = write cursor into workspace
  ;   rcx      = number of four-row groups still to process
  ;   rbx, rdx = scratch row pointers (rbx is saved/restored here)
  ;   xmm6     = all zero bits
  ;   xmm7     = 0xFF80 in every word
  ;
  ; The workspace is written with movdqa, so every store address must be
  ; 16-byte aligned.
  ;
          align       32
          GLOBAL_FUNCTION(jsimd_convsamp_sse2)

  EXTN(jsimd_convsamp_sse2):
          push        rbp
          mov         rax, rsp                ; part of the common prologue; rax is
                                              ; reloaded below before this function
                                              ; reads it (collect_args may use it --
                                              ; not visible here)
          mov         rbp, rsp
          collect_args 3
          push        rbx                     ; rbx serves as a scratch row pointer

          ; Loop-invariant constants.
          pcmpeqw     xmm7, xmm7              ; every bit set
          psllw       xmm7, 7                 ; every word = 0xFF80, i.e. -128
          pxor        xmm6, xmm6              ; zero, used to widen bytes to words

          ; Working copies of the arguments.
          mov         rdi, r12                ; rdi = workspace
          mov         rsi, r10                ; rsi = sample_data
          mov         eax, r11d               ; rax = start_col (upper half cleared)
          mov         rcx, DCTSIZE/4          ; four rows are handled per pass

  .rowgroup:
          ; Fetch 8 bytes from each of the next four rows, starting at
          ; start_col.  Every load is issued before the first store.
          mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]
          movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
          mov         rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]
          movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
          mov         rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]
          movq        xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
          mov         rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]
          movq        xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]

          ; Per row: widen bytes to words, subtract 128, store one XMM word.
          punpcklbw   xmm0, xmm6
          paddw       xmm0, xmm7
          movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0

          punpcklbw   xmm1, xmm6
          paddw       xmm1, xmm7
          movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1

          punpcklbw   xmm2, xmm6
          paddw       xmm2, xmm7
          movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2

          punpcklbw   xmm3, xmm6
          paddw       xmm3, xmm7
          movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

          ; Step both cursors past the four rows just handled.
          add         rsi, byte 4*SIZEOF_JSAMPROW
          add         rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
          dec         rcx
          jnz         short .rowgroup

          pop         rbx
          uncollect_args 3
          pop         rbp
          ret
  75. ; --------------------------------------------------------------------------
  ;
  ; jsimd_quantize_sse2
  ;
  ; Quantize/descale the coefficients held in workspace and store the
  ; results into coef_block.
  ;
  ; For every 16-bit coefficient the code takes the absolute value, adds
  ; the matching "correction" word, keeps the high 16 bits of the unsigned
  ; product with the "reciprocal" word, keeps the high 16 bits of the
  ; unsigned product with the "scale" word, and finally puts the original
  ; sign back.
  ;
  ; This implementation is based on an algorithm described in
  ;   "How to optimize for the Pentium family of microprocessors"
  ;   (http://www.agner.org/assem/).
  ;
  ; GLOBAL(void)
  ; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
  ;                     DCTELEM *workspace);
  ;
  ; Argument registers after collect_args (macro from jsimdext.inc, not
  ; visible in this file -- register assignment taken from the original
  ; header comment):
  ;   r10 = JCOEFPTR coef_block
  ;   r11 = DCTELEM *divisors
  ;   r12 = DCTELEM *workspace
  ;
  ; Register roles inside the loop:
  ;   rsi       = read cursor into workspace
  ;   rdx       = read cursor into divisors
  ;   rdi       = write cursor into coef_block
  ;   rax       = number of 32-coefficient groups still to process
  ;   xmm0-xmm3 = values being quantized
  ;   xmm4-xmm7 = per-word sign masks for xmm0-xmm3
  ;
  ; Every memory operand below is accessed as an aligned XMM word (movdqa,
  ; or a legacy-SSE arithmetic memory operand), so workspace, divisors and
  ; coef_block must all be 16-byte aligned.
  ;

  ; The divisors buffer holds three tables addressed through XMMBLOCK (defined
  ; in an include file): reciprocals at row offset DCTSIZE*0, corrections at
  ; DCTSIZE*1 and scales at DCTSIZE*2.
  %define RECIPROCAL(m, n, b) \
    XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
  %define CORRECTION(m, n, b) \
    XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
  %define SCALE(m, n, b) \
    XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

  ; QUANTIZE_ROW row, val, sign
  ;   Loads workspace row `row` (relative to rsi), quantizes it against the
  ;   divisor tables at rdx and leaves the signed result in `val`.
  ;   Reads:    rsi, rdx
  ;   Clobbers: `val`, `sign`
  ;   Performs no stores.
  %macro QUANTIZE_ROW 3
          movdqa      %3, XMMWORD [XMMBLOCK(%1,0,rsi,SIZEOF_DCTELEM)]
          movdqa      %2, %3
          psraw       %3, (WORD_BIT-1)        ; sign mask: all ones where negative
                                              ; (assuming WORD_BIT is 16)
          pxor        %2, %3
          psubw       %2, %3                  ; val = |val|
          paddw       %2, XMMWORD [CORRECTION(%1,0,rdx)]  ; correction + roundfactor
          pmulhuw     %2, XMMWORD [RECIPROCAL(%1,0,rdx)]  ; reciprocal
          pmulhuw     %2, XMMWORD [SCALE(%1,0,rdx)]       ; scale
          pxor        %2, %3
          psubw       %2, %3                  ; negate again where the input was negative
  %endmacro

          align       32
          GLOBAL_FUNCTION(jsimd_quantize_sse2)

  EXTN(jsimd_quantize_sse2):
          push        rbp
          mov         rax, rsp                ; part of the common prologue; rax is
                                              ; reused as the loop counter below
          mov         rbp, rsp
          collect_args 3

          ; Working copies of the arguments.
          mov         rdi, r10                ; rdi = coef_block
          mov         rdx, r11                ; rdx = divisors
          mov         rsi, r12                ; rsi = workspace
          mov         rax, DCTSIZE2/32        ; 4 XMM words = 32 coefficients per pass

  .group32:
          ; Quantize four rows; nothing is stored until all four are done.
          QUANTIZE_ROW 0, xmm0, xmm4
          QUANTIZE_ROW 1, xmm1, xmm5
          QUANTIZE_ROW 2, xmm2, xmm6
          QUANTIZE_ROW 3, xmm3, xmm7

          movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
          movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
          movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
          movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

          ; Step all three cursors by 32 elements; the three divisor tables
          ; share rdx, so they advance together.
          add         rsi, byte 32*SIZEOF_DCTELEM
          add         rdx, byte 32*SIZEOF_DCTELEM
          add         rdi, byte 32*SIZEOF_JCOEF
          dec         rax
          jnz         near .group32           ; loop body is too long for a short branch

          uncollect_args 3
          pop         rbp
          ret
  160. ; For some reason, the OS X linker does not honor the request to align the
  161. ; segment unless we do this.
  162. align 32