  ;
  ; jcsample.asm - downsampling (AVX2)
  ;
  ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  ; Copyright (C) 2015, Intel Corporation.
  ; Copyright (C) 2016, D. R. Commander.
  ;
  ; Based on the x86 SIMD extension for IJG JPEG library
  ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  ;
  ; This file should be assembled with NASM (Netwide Assembler),
  ; can *not* be assembled with Microsoft's MASM or any compatible
  ; assembler (including Borland's Turbo Assembler).
  ; NASM is available from http://nasm.sourceforge.net/ or
  ; http://sourceforge.net/project/showfiles.php?group_id=6208
  %include "jsimdext.inc"

  ; --------------------------------------------------------------------------
      SECTION     SEG_TEXT
      BITS        32

  ;
  ; Downsample pixel values of a single component.
  ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
  ; without smoothing.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
  ;                            JDIMENSION v_samp_factor,
  ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
  ;                            JSAMPARRAY output_data);
  ;
  ; All arguments are read from the stack relative to ebp (see the %defines
  ; below).
  ;
  ; Outline:
  ;   1. output_cols = width_in_blocks << 3; return at once if that is zero.
  ;   2. Right-edge padding: when output_cols * 2 > image_width, each of the
  ;      max_v_samp_factor input rows gets its last real sample (column
  ;      image_width - 1) replicated up to column output_cols * 2 - 1.
  ;   3. For each of v_samp_factor rows, every horizontal pair of input bytes
  ;      (a, b) yields one output byte (a + b + bias) >> 1, where the bias
  ;      alternates 0, 1, 0, 1, ... along the output row.
  ;
  ; Register roles:
  ;   eax = row counter            ecx = output columns still to produce
  ;   edx = image_width, later scratch for the bias constant
  ;   esi = input row array / inptr     edi = output row array / outptr
  ;   ymm6 = low-byte-of-each-word mask  ymm7 = bias words
  ;   ymm0-ymm3 = working data
  ; esi, edi and ebp are saved and restored; ebx is never touched.
  ;
  ; NOTE(review): every store in the column loop is a full YMMWORD, including
  ; the 8/16/24-column tails, so output rows presumably must be padded to a
  ; multiple of SIZEOF_YMMWORD by whoever allocates them -- confirm.
  ;

  %define img_width(b)    (b) + 8         ; JDIMENSION image_width
  %define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
  %define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
  %define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
  %define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
  %define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

      align       32
      GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)

  EXTN(jsimd_h2v1_downsample_avx2):
      push        ebp
      mov         ebp, esp
      ; ebx is unused; ecx and edx need not be preserved.
      push        esi
      push        edi

      mov         ecx, JDIMENSION [width_blks(ebp)]
      shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
      jz          near .done              ; zero width: nothing to do

      mov         edx, JDIMENSION [img_width(ebp)]

      ; -- expand_right_edge

      push        ecx                     ; keep output_cols for the main loop
      shl         ecx, 1                  ; output_cols * 2
      sub         ecx, edx                ; ecx = samples to append per row
      jle         short .pad_done         ; rows are already wide enough

      mov         eax, INT [max_v_samp(ebp)]
      test        eax, eax
      jle         short .pad_done         ; no rows to pad

      cld                                 ; rep stosb must walk upwards
      mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
      alignx      16, 7
  .pad_row:
      push        eax                     ; al is about to hold the fill sample
      push        ecx                     ; rep stosb runs ecx down to zero
      mov         edi, JSAMPROW [esi]
      add         edi, edx                ; edi -> first column past the image
      mov         al, JSAMPLE [edi-1]     ; last real sample of this row
      rep stosb                           ; store it ecx times from edi onwards
      pop         ecx
      pop         eax
      add         esi, byte SIZEOF_JSAMPROW
      dec         eax
      jg          short .pad_row

  .pad_done:
      pop         ecx                     ; output_cols

      ; -- h2v1_downsample

      mov         eax, JDIMENSION [v_samp(ebp)]       ; rowctr
      test        eax, eax
      jle         near .done

      mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
      mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data

      vpcmpeqw    ymm6, ymm6, ymm6        ; all ones
      vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

      mov         edx, 0x00010000         ; bias pattern
      vmovd       xmm7, edx
      vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
      vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
      alignx      16, 7
  .row:
      push        ecx                     ; per-row copies of the column count
      push        edi                     ; and of both row-array cursors
      push        esi

      mov         esi, JSAMPROW [esi]     ; inptr
      mov         edi, JSAMPROW [edi]     ; outptr

      cmp         ecx, byte SIZEOF_YMMWORD
      jae         short .full_chunk
      alignx      16, 7
  .tail:
      ; Fewer than SIZEOF_YMMWORD output columns remain.
      ; ecx can possibly be 8, 16, 24
      ; Only the input bytes that exist are loaded; the rest of ymm0/ymm1 is
      ; zero (a VEX-encoded xmm load clears the upper half of the ymm
      ; register).  ecx is then forced to SIZEOF_YMMWORD so that the
      ; subtraction after the store leaves zero and the row finishes.
      cmp         ecx, 24
      jne         .tail16
      mov         ecx, SIZEOF_YMMWORD
      vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
      vmovdqu     xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]
      jmp         short .reduce

  .tail16:
      cmp         ecx, 16
      jne         .tail8
      mov         ecx, SIZEOF_YMMWORD
      vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
      vpxor       ymm1, ymm1, ymm1
      jmp         short .reduce

  .tail8:
      mov         ecx, SIZEOF_YMMWORD
      vmovdqu     xmm0, XMMWORD[esi+0*SIZEOF_YMMWORD]
      vpxor       ymm1, ymm1, ymm1
      jmp         short .reduce
      alignx      16, 7
  .full_chunk:
      vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
      vmovdqu     ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]

  .reduce:
      vpsrlw      ymm2, ymm0, BYTE_BIT    ; high byte of each word (2nd of pair)
      vpsrlw      ymm3, ymm1, BYTE_BIT
      vpand       ymm0, ymm0, ymm6        ; low byte of each word (1st of pair)
      vpand       ymm1, ymm1, ymm6

      vpaddw      ymm0, ymm0, ymm2        ; pair sums as 16-bit words
      vpaddw      ymm1, ymm1, ymm3
      vpaddw      ymm0, ymm0, ymm7        ; + alternating 0/1 bias
      vpaddw      ymm1, ymm1, ymm7
      vpsrlw      ymm0, ymm0, 1           ; / 2
      vpsrlw      ymm1, ymm1, 1

      vpackuswb   ymm0, ymm0, ymm1        ; words -> bytes, per 128-bit lane
      vpermq      ymm0, ymm0, 0xd8        ; put the qwords back in column order

      vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0

      sub         ecx, byte SIZEOF_YMMWORD    ; outcol
      add         esi, byte 2*SIZEOF_YMMWORD  ; inptr
      add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
      cmp         ecx, byte SIZEOF_YMMWORD
      jae         short .full_chunk
      test        ecx, ecx
      jnz         near .tail              ; 8, 16 or 24 columns left over

      pop         esi
      pop         edi
      pop         ecx

      add         esi, byte SIZEOF_JSAMPROW   ; input_data
      add         edi, byte SIZEOF_JSAMPROW   ; output_data
      dec         eax                         ; rowctr
      jg          near .row

  .done:
      vzeroupper                          ; executed on every exit path
      pop         edi
      pop         esi
      ; edx and ecx need not be restored; ebx was never saved.
      pop         ebp
      ret
  ; --------------------------------------------------------------------------
  ;
  ; Downsample pixel values of a single component.
  ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
  ; without smoothing.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
  ;                            JDIMENSION v_samp_factor,
  ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
  ;                            JSAMPARRAY output_data);
  ;
  ; All arguments are read from the stack relative to ebp (see the %defines
  ; below).
  ;
  ; Outline:
  ;   1. output_cols = width_in_blocks << 3; return at once if that is zero.
  ;   2. Right-edge padding: when output_cols * 2 > image_width, each of the
  ;      max_v_samp_factor input rows gets its last real sample (column
  ;      image_width - 1) replicated up to column output_cols * 2 - 1.
  ;   3. For each of v_samp_factor output rows, two consecutive input rows are
  ;      consumed; every 2x2 group of input bytes (a, b / c, d) yields one
  ;      output byte (a + b + c + d + bias) >> 2, where the bias alternates
  ;      1, 2, 1, 2, ... along the output row.
  ;
  ; Register roles:
  ;   eax = row counter            ecx = output columns still to produce
  ;   edx = image_width, then bias scratch, then inptr0 inside the row loop
  ;   esi = input row array / inptr1    edi = output row array / outptr
  ;   ymm6 = low-byte-of-each-word mask  ymm7 = bias words
  ;   ymm0-ymm5 = working data
  ; esi, edi and ebp are saved and restored; ebx is never touched.
  ;
  ; NOTE(review): every store in the column loop is a full YMMWORD, including
  ; the 8/16/24-column tails, so output rows presumably must be padded to a
  ; multiple of SIZEOF_YMMWORD by whoever allocates them -- confirm.
  ;

  %define img_width(b)    (b) + 8         ; JDIMENSION image_width
  %define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
  %define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
  %define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
  %define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
  %define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

      align       32
      GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)

  EXTN(jsimd_h2v2_downsample_avx2):
      push        ebp
      mov         ebp, esp
      ; ebx is unused; ecx and edx need not be preserved.
      push        esi
      push        edi

      mov         ecx, JDIMENSION [width_blks(ebp)]
      shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
      jz          near .done              ; zero width: nothing to do

      mov         edx, JDIMENSION [img_width(ebp)]

      ; -- expand_right_edge

      push        ecx                     ; keep output_cols for the main loop
      shl         ecx, 1                  ; output_cols * 2
      sub         ecx, edx                ; ecx = samples to append per row
      jle         short .pad_done         ; rows are already wide enough

      mov         eax, INT [max_v_samp(ebp)]
      test        eax, eax
      jle         short .pad_done         ; no rows to pad

      cld                                 ; rep stosb must walk upwards
      mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
      alignx      16, 7
  .pad_row:
      push        eax                     ; al is about to hold the fill sample
      push        ecx                     ; rep stosb runs ecx down to zero
      mov         edi, JSAMPROW [esi]
      add         edi, edx                ; edi -> first column past the image
      mov         al, JSAMPLE [edi-1]     ; last real sample of this row
      rep stosb                           ; store it ecx times from edi onwards
      pop         ecx
      pop         eax
      add         esi, byte SIZEOF_JSAMPROW
      dec         eax
      jg          short .pad_row

  .pad_done:
      pop         ecx                     ; output_cols

      ; -- h2v2_downsample

      mov         eax, JDIMENSION [v_samp(ebp)]       ; rowctr
      test        eax, eax
      jle         near .done

      mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
      mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data

      vpcmpeqw    ymm6, ymm6, ymm6        ; all ones
      vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

      mov         edx, 0x00020001         ; bias pattern
      vmovd       xmm7, edx
      vpshufd     xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
      vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
      alignx      16, 7
  .row:
      push        ecx                     ; per-row copies of the column count
      push        edi                     ; and of both row-array cursors
      push        esi

      ; inptr0 must be fetched first: the second load overwrites esi.
      mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
      mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1
      mov         edi, JSAMPROW [edi]                     ; outptr

      cmp         ecx, byte SIZEOF_YMMWORD
      jae         short .full_chunk
      alignx      16, 7
  .tail:
      ; Fewer than SIZEOF_YMMWORD output columns remain (the cases handled
      ; below are 24, 16 and, by elimination, 8).  Only the input bytes that
      ; exist are loaded; the rest of ymm0-ymm3 is zero (a VEX-encoded xmm
      ; load clears the upper half of the ymm register).  ecx is then forced
      ; to SIZEOF_YMMWORD so that the subtraction after the store leaves zero
      ; and the row finishes.
      cmp         ecx, 24
      jne         .tail16
      mov         ecx, SIZEOF_YMMWORD
      vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
      vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
      vmovdqu     xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]
      vmovdqu     xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]
      jmp         short .reduce

  .tail16:
      cmp         ecx, 16
      jne         .tail8
      mov         ecx, SIZEOF_YMMWORD
      vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
      vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
      vpxor       ymm2, ymm2, ymm2
      vpxor       ymm3, ymm3, ymm3
      jmp         short .reduce

  .tail8:
      mov         ecx, SIZEOF_YMMWORD
      vmovdqu     xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
      vmovdqu     xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
      vpxor       ymm2, ymm2, ymm2
      vpxor       ymm3, ymm3, ymm3
      jmp         short .reduce
      alignx      16, 7
  .full_chunk:
      vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
      vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
      vmovdqu     ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
      vmovdqu     ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]

  .reduce:
      ; First half of the chunk: ymm0 (row 0) and ymm1 (row 1).
      vpand       ymm4, ymm0, ymm6        ; low byte of each word (1st of pair)
      vpsrlw      ymm0, ymm0, BYTE_BIT    ; high byte of each word (2nd of pair)
      vpaddw      ymm0, ymm0, ymm4        ; row 0 pair sums
      vpand       ymm5, ymm1, ymm6
      vpsrlw      ymm1, ymm1, BYTE_BIT
      vpaddw      ymm1, ymm1, ymm5        ; row 1 pair sums
      vpaddw      ymm0, ymm0, ymm1        ; 2x2 sums as 16-bit words

      ; Second half of the chunk: ymm2 (row 0) and ymm3 (row 1).
      vpand       ymm4, ymm2, ymm6
      vpsrlw      ymm2, ymm2, BYTE_BIT
      vpaddw      ymm2, ymm2, ymm4
      vpand       ymm5, ymm3, ymm6
      vpsrlw      ymm3, ymm3, BYTE_BIT
      vpaddw      ymm3, ymm3, ymm5
      vpaddw      ymm2, ymm2, ymm3

      vpaddw      ymm0, ymm0, ymm7        ; + alternating 1/2 bias
      vpaddw      ymm2, ymm2, ymm7
      vpsrlw      ymm0, ymm0, 2           ; / 4
      vpsrlw      ymm2, ymm2, 2

      vpackuswb   ymm0, ymm0, ymm2        ; words -> bytes, per 128-bit lane
      vpermq      ymm0, ymm0, 0xd8        ; put the qwords back in column order

      vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0

      sub         ecx, byte SIZEOF_YMMWORD    ; outcol
      add         edx, byte 2*SIZEOF_YMMWORD  ; inptr0
      add         esi, byte 2*SIZEOF_YMMWORD  ; inptr1
      add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
      cmp         ecx, byte SIZEOF_YMMWORD
      jae         near .full_chunk
      test        ecx, ecx
      jnz         near .tail              ; 8, 16 or 24 columns left over

      pop         esi
      pop         edi
      pop         ecx

      add         esi, byte 2*SIZEOF_JSAMPROW ; input_data
      add         edi, byte 1*SIZEOF_JSAMPROW ; output_data
      dec         eax                         ; rowctr
      jg          near .row

  .done:
      vzeroupper                          ; executed on every exit path
      pop         edi
      pop         esi
      ; edx and ecx need not be restored; ebx was never saved.
      pop         ebp
      ret

  ; For some reason, the OS X linker does not honor the request to align the
  ; segment unless we do this.
      align       32