jcsample-mmx.asm 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324
  1. ;
  2. ; jcsample.asm - downsampling (MMX)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2016, D. R. Commander.
  6. ;
  7. ; Based on the x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler),
  12. ; can *not* be assembled with Microsoft's MASM or any compatible
  13. ; assembler (including Borland's Turbo Assembler).
  14. ; NASM is available from http://nasm.sourceforge.net/ or
  15. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16. %include "jsimdext.inc"
  17. ; --------------------------------------------------------------------------
  18. SECTION SEG_TEXT
  19. BITS 32
  ;
  ; jsimd_h2v1_downsample_mmx -- MMX downsampler, 2:1 horizontal, 1:1 vertical,
  ; without smoothing.
  ;
  ; C prototype:
  ;   GLOBAL(void)
  ;   jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
  ;                             JDIMENSION v_samp_factor,
  ;                             JDIMENSION width_in_blocks,
  ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
  ;
  ; Steps:
  ;   1. output_cols = width_in_blocks * 8.  Return at once if that is zero.
  ;   2. If output_cols * 2 exceeds image_width, extend each of the first
  ;      max_v_samp_factor input rows in place by repeating the row's last
  ;      sample (row[image_width - 1]) over the missing bytes.
  ;      NOTE(review): this writes into the input rows, so they are assumed
  ;      to have room for output_cols * 2 samples -- confirm against caller.
  ;   3. For each of v_samp_factor rows, turn every 16 input bytes into
  ;      8 output bytes: out = (even + odd + bias) >> 1, where the bias
  ;      alternates 0, 1, 0, 1 across consecutive output samples.
  ;
  ; Registers: esi and edi are saved and restored.  eax, ecx, edx and
  ; mm0-mm3, mm6, mm7 are overwritten.  ebx is not touched.  emms is issued
  ; after the MMX loop; the early exits happen before any MMX register is
  ; written.
  ;

  ; Stack offsets of the arguments relative to the frame pointer passed as b.
  %define img_width(b)    (b) + 8         ; JDIMENSION image_width
  %define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
  %define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
  %define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
  %define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
  %define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

          align   32
          GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx)

  EXTN(jsimd_h2v1_downsample_mmx):
          push    ebp
          mov     ebp, esp
          push    esi
          push    edi

          mov     ecx, JDIMENSION [width_blks(ebp)]
          shl     ecx, 3                  ; ecx = output_cols = width_in_blocks * 8
          jz      near .done              ; nothing to produce

          mov     edx, JDIMENSION [img_width(ebp)]

          ; ---- step 2: pad the right edge of the input rows

          push    ecx                     ; keep output_cols for step 3
          shl     ecx, 1                  ; ecx = output_cols * 2
          sub     ecx, edx                ; ecx = bytes missing at the end of a row
          jle     short .pad_done         ; rows are already wide enough
          mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
          test    eax, eax
          jle     short .pad_done

          mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> current row slot
          cld                             ; rep stosb must advance edi upward
          alignx  16, 7
  .pad_row:
          push    eax                     ; al is borrowed for the fill byte
          push    ecx                     ; rep stosb runs ecx down to zero
          mov     edi, JSAMPROW [esi]
          add     edi, edx                ; edi -> first byte past image_width
          mov     al, JSAMPLE [edi-1]     ; al = last real sample of this row
          rep stosb
          pop     ecx
          pop     eax
          add     esi, byte SIZEOF_JSAMPROW
          dec     eax
          jg      short .pad_row

  .pad_done:
          pop     ecx                     ; ecx = output_cols again

          ; ---- step 3: average horizontal pairs

          mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = rows left to produce
          test    eax, eax
          jle     near .done

          pcmpeqw mm6, mm6
          psrlw   mm6, BYTE_BIT           ; mm6 = {0xFF 0x00 0xFF 0x00 ..}
          mov     edx, 0x00010000         ; one {0, 1} pair of bias words
          movd    mm7, edx
          punpckldq mm7, mm7              ; mm7 = {0, 1, 0, 1}

          mov     edi, JSAMPARRAY [output_data(ebp)]      ; edi -> output row slot
          mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input row slot
          alignx  16, 7
  .next_row:
          push    ecx                     ; the column loop consumes ecx
          push    edi
          push    esi

          mov     esi, JSAMPROW [esi]     ; esi = input sample pointer
          mov     edi, JSAMPROW [edi]     ; edi = output sample pointer
          alignx  16, 7
  .next_group:
          ; 16 input bytes -> 8 output bytes
          movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
          movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]
          movq    mm2, mm0
          movq    mm3, mm1

          pand    mm0, mm6                ; mm0 = even-indexed samples as words
          psrlw   mm2, BYTE_BIT           ; mm2 = odd-indexed samples as words
          pand    mm1, mm6                ; same split for the second 8 bytes
          psrlw   mm3, BYTE_BIT

          paddw   mm0, mm2                ; even + odd
          paddw   mm1, mm3
          paddw   mm0, mm7                ; + alternating bias
          paddw   mm1, mm7
          psrlw   mm0, 1                  ; / 2
          psrlw   mm1, 1

          packuswb mm0, mm1               ; 8 words -> 8 bytes
          movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

          add     edi, byte 1*SIZEOF_MMWORD       ; advance output pointer
          add     esi, byte 2*SIZEOF_MMWORD       ; advance input pointer
          sub     ecx, byte SIZEOF_MMWORD         ; output columns remaining
          jnz     short .next_group

          pop     esi
          pop     edi
          pop     ecx

          add     esi, byte SIZEOF_JSAMPROW       ; next input row slot
          add     edi, byte SIZEOF_JSAMPROW       ; next output row slot
          dec     eax
          jg      short .next_row

          emms                            ; empty MMX state

  .done:
          pop     edi
          pop     esi
          pop     ebp
          ret
  132. ; --------------------------------------------------------------------------
  ;
  ; jsimd_h2v2_downsample_mmx -- MMX downsampler, 2:1 horizontal, 2:1 vertical,
  ; without smoothing.
  ;
  ; C prototype:
  ;   GLOBAL(void)
  ;   jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
  ;                             JDIMENSION v_samp_factor,
  ;                             JDIMENSION width_in_blocks,
  ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
  ;
  ; Steps:
  ;   1. output_cols = width_in_blocks * 8.  Return at once if that is zero.
  ;   2. If output_cols * 2 exceeds image_width, extend each of the first
  ;      max_v_samp_factor input rows in place by repeating the row's last
  ;      sample (row[image_width - 1]) over the missing bytes.
  ;      NOTE(review): this writes into the input rows, so they are assumed
  ;      to have room for output_cols * 2 samples -- confirm against caller.
  ;   3. For each of v_samp_factor output rows, read two consecutive input
  ;      rows and turn every 2 x 16 input bytes into 8 output bytes:
  ;      out = (sum of the 2x2 neighbourhood + bias) >> 2, where the bias
  ;      alternates 1, 2, 1, 2 across consecutive output samples.
  ;
  ; Registers: esi and edi are saved and restored.  eax, ecx, edx and
  ; mm0-mm7 are overwritten.  ebx is not touched.  emms is issued after the
  ; MMX loop; the early exits happen before any MMX register is written.
  ;

  ; Stack offsets of the arguments relative to the frame pointer passed as b.
  %define img_width(b)    (b) + 8         ; JDIMENSION image_width
  %define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
  %define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
  %define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
  %define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
  %define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

          align   32
          GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx)

  EXTN(jsimd_h2v2_downsample_mmx):
          push    ebp
          mov     ebp, esp
          push    esi
          push    edi

          mov     ecx, JDIMENSION [width_blks(ebp)]
          shl     ecx, 3                  ; ecx = output_cols = width_in_blocks * 8
          jz      near .done              ; nothing to produce

          mov     edx, JDIMENSION [img_width(ebp)]

          ; ---- step 2: pad the right edge of the input rows

          push    ecx                     ; keep output_cols for step 3
          shl     ecx, 1                  ; ecx = output_cols * 2
          sub     ecx, edx                ; ecx = bytes missing at the end of a row
          jle     short .pad_done         ; rows are already wide enough
          mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
          test    eax, eax
          jle     short .pad_done

          mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> current row slot
          cld                             ; rep stosb must advance edi upward
          alignx  16, 7
  .pad_row:
          push    eax                     ; al is borrowed for the fill byte
          push    ecx                     ; rep stosb runs ecx down to zero
          mov     edi, JSAMPROW [esi]
          add     edi, edx                ; edi -> first byte past image_width
          mov     al, JSAMPLE [edi-1]     ; al = last real sample of this row
          rep stosb
          pop     ecx
          pop     eax
          add     esi, byte SIZEOF_JSAMPROW
          dec     eax
          jg      short .pad_row

  .pad_done:
          pop     ecx                     ; ecx = output_cols again

          ; ---- step 3: average 2x2 neighbourhoods

          mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = output rows left
          test    eax, eax
          jle     near .done

          pcmpeqw mm6, mm6
          psrlw   mm6, BYTE_BIT           ; mm6 = {0xFF 0x00 0xFF 0x00 ..}
          mov     edx, 0x00020001         ; one {1, 2} pair of bias words
          movd    mm7, edx
          punpckldq mm7, mm7              ; mm7 = {1, 2, 1, 2}

          mov     edi, JSAMPARRAY [output_data(ebp)]      ; edi -> output row slot
          mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input row slot
          alignx  16, 7
  .next_row:
          push    ecx                     ; the column loop consumes ecx
          push    edi
          push    esi

          mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; edx = upper input row
          mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = lower input row
          mov     edi, JSAMPROW [edi]                     ; edi = output row
          alignx  16, 7
  .next_group:
          ; 2 rows x 16 input bytes -> 8 output bytes
          movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]
          movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
          movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]
          movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]

          ; first 8 bytes of each row: even + odd samples, as words
          movq    mm4, mm0
          movq    mm5, mm1
          pand    mm0, mm6
          psrlw   mm4, BYTE_BIT
          pand    mm1, mm6
          psrlw   mm5, BYTE_BIT
          paddw   mm0, mm4                ; upper row pair sums
          paddw   mm1, mm5                ; lower row pair sums

          ; second 8 bytes of each row, same treatment
          movq    mm4, mm2
          movq    mm5, mm3
          pand    mm2, mm6
          psrlw   mm4, BYTE_BIT
          pand    mm3, mm6
          psrlw   mm5, BYTE_BIT
          paddw   mm2, mm4
          paddw   mm3, mm5

          paddw   mm0, mm1                ; upper + lower
          paddw   mm2, mm3
          paddw   mm0, mm7                ; + alternating bias
          paddw   mm2, mm7
          psrlw   mm0, 2                  ; / 4
          psrlw   mm2, 2

          packuswb mm0, mm2               ; 8 words -> 8 bytes
          movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

          add     edx, byte 2*SIZEOF_MMWORD       ; advance upper input pointer
          add     esi, byte 2*SIZEOF_MMWORD       ; advance lower input pointer
          add     edi, byte 1*SIZEOF_MMWORD       ; advance output pointer
          sub     ecx, byte SIZEOF_MMWORD         ; output columns remaining
          jnz     near .next_group

          pop     esi
          pop     edi
          pop     ecx

          add     esi, byte 2*SIZEOF_JSAMPROW     ; two input rows per output row
          add     edi, byte 1*SIZEOF_JSAMPROW     ; next output row slot
          dec     eax
          jg      near .next_row

          emms                            ; empty MMX state

  .done:
          pop     edi
          pop     esi
          pop     ebp
          ret
  259. ; For some reason, the OS X linker does not honor the request to align the
  260. ; segment unless we do this.
  261. align 32