jcsample-avx2.asm 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366
  1. ;
  2. ; jcsample.asm - downsampling (64-bit AVX2)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2009, 2016, D. R. Commander.
  6. ; Copyright (C) 2015, Intel Corporation.
  7. ;
  8. ; Based on the x86 SIMD extension for IJG JPEG library
  9. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  10. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  11. ;
  12. ; This file should be assembled with NASM (Netwide Assembler),
  13. ; can *not* be assembled with Microsoft's MASM or any compatible
  14. ; assembler (including Borland's Turbo Assembler).
  15. ; NASM is available from http://nasm.sourceforge.net/ or
  16. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  17. %include "jsimdext.inc"
  18. ; --------------------------------------------------------------------------
  19. SECTION SEG_TEXT
  20. BITS 64
  21. ;
  22. ; Downsample pixel values of a single component.
  23. ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
  24. ; without smoothing.
  25. ;
  26. ; GLOBAL(void)
  27. ; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
  28. ; JDIMENSION v_samp_factor,
  29. ; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
  30. ; JSAMPARRAY output_data);
  31. ;
  32. ; r10d = JDIMENSION image_width
  33. ; r11 = int max_v_samp_factor
  34. ; r12d = JDIMENSION v_samp_factor
  35. ; r13d = JDIMENSION width_in_blocks
  36. ; r14 = JSAMPARRAY input_data
  37. ; r15 = JSAMPARRAY output_data
  38. align 32
  39. GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
  40. EXTN(jsimd_h2v1_downsample_avx2):
  41. push rbp
  42. mov rax, rsp
  43. mov rbp, rsp
  44. collect_args 6
  45. mov ecx, r13d
  46. shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
  47. jz near .return
  48. mov edx, r10d
  49. ; -- expand_right_edge
  50. push rcx
  51. shl rcx, 1 ; output_cols * 2
  52. sub rcx, rdx
  53. jle short .expand_end
  54. mov rax, r11
  55. test rax, rax
  56. jle short .expand_end
  57. cld
  58. mov rsi, r14 ; input_data
  59. .expandloop:
  60. push rax
  61. push rcx
  62. mov rdi, JSAMPROW [rsi]
  63. add rdi, rdx
  64. mov al, JSAMPLE [rdi-1]
  65. rep stosb
  66. pop rcx
  67. pop rax
  68. add rsi, byte SIZEOF_JSAMPROW
  69. dec rax
  70. jg short .expandloop
  71. .expand_end:
  72. pop rcx ; output_cols
  73. ; -- h2v1_downsample
  74. mov eax, r12d ; rowctr
  75. test eax, eax
  76. jle near .return
  77. mov rdx, 0x00010000 ; bias pattern
  78. vmovd xmm7, edx
  79. vpshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
  80. vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7}
  81. vpcmpeqw ymm6, ymm6, ymm6
  82. vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
  83. mov rsi, r14 ; input_data
  84. mov rdi, r15 ; output_data
  85. .rowloop:
  86. push rcx
  87. push rdi
  88. push rsi
  89. mov rsi, JSAMPROW [rsi] ; inptr
  90. mov rdi, JSAMPROW [rdi] ; outptr
  91. cmp rcx, byte SIZEOF_YMMWORD
  92. jae short .columnloop
  93. .columnloop_r24:
  94. ; rcx can possibly be 8, 16, 24
  95. cmp rcx, 24
  96. jne .columnloop_r16
  97. vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
  98. vmovdqu xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]
  99. mov rcx, SIZEOF_YMMWORD
  100. jmp short .downsample
  101. .columnloop_r16:
  102. cmp rcx, 16
  103. jne .columnloop_r8
  104. vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
  105. vpxor ymm1, ymm1, ymm1
  106. mov rcx, SIZEOF_YMMWORD
  107. jmp short .downsample
  108. .columnloop_r8:
  109. vmovdqu xmm0, XMMWORD[rsi+0*SIZEOF_YMMWORD]
  110. vpxor ymm1, ymm1, ymm1
  111. mov rcx, SIZEOF_YMMWORD
  112. jmp short .downsample
  113. .columnloop:
  114. vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
  115. vmovdqu ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]
  116. .downsample:
  117. vpsrlw ymm2, ymm0, BYTE_BIT
  118. vpand ymm0, ymm0, ymm6
  119. vpsrlw ymm3, ymm1, BYTE_BIT
  120. vpand ymm1, ymm1, ymm6
  121. vpaddw ymm0, ymm0, ymm2
  122. vpaddw ymm1, ymm1, ymm3
  123. vpaddw ymm0, ymm0, ymm7
  124. vpaddw ymm1, ymm1, ymm7
  125. vpsrlw ymm0, ymm0, 1
  126. vpsrlw ymm1, ymm1, 1
  127. vpackuswb ymm0, ymm0, ymm1
  128. vpermq ymm0, ymm0, 0xd8
  129. vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
  130. sub rcx, byte SIZEOF_YMMWORD ; outcol
  131. add rsi, byte 2*SIZEOF_YMMWORD ; inptr
  132. add rdi, byte 1*SIZEOF_YMMWORD ; outptr
  133. cmp rcx, byte SIZEOF_YMMWORD
  134. jae short .columnloop
  135. test rcx, rcx
  136. jnz near .columnloop_r24
  137. pop rsi
  138. pop rdi
  139. pop rcx
  140. add rsi, byte SIZEOF_JSAMPROW ; input_data
  141. add rdi, byte SIZEOF_JSAMPROW ; output_data
  142. dec rax ; rowctr
  143. jg near .rowloop
  144. .return:
  145. vzeroupper
  146. uncollect_args 6
  147. pop rbp
  148. ret
  149. ; --------------------------------------------------------------------------
  ;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Outline:
;   1. expand_right_edge: for each of the first max_v_samp_factor input rows,
;      replicate the sample at column image_width-1 into columns
;      image_width .. 2*output_cols-1 (output_cols = width_in_blocks * 8).
;   2. h2v2_downsample: each output row is produced from two consecutive
;      input rows.  Every output sample is the sum of a 2x2 block of input
;      samples plus a bias, shifted right by 2; the bias alternates 1, 2
;      from one output column to the next.
;
; Registers after collect_args (as documented by the original author):

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)

EXTN(jsimd_h2v2_downsample_avx2):
    push        rbp
    ; NOTE(review): rax is loaded with the entry stack pointer right before
    ; collect_args.  Presumably the macro (jsimdext.inc) uses it to locate
    ; stack-passed arguments on some ABIs -- confirm before removing.
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; width_in_blocks == 0: nothing to do

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; save output_cols
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding samples
    jle         short .expand_end       ; row is already wide enough

    ; max_v_samp_factor is a C int.  The calling convention does not define
    ; the upper 32 bits of a register that carries an int, so sign-extend
    ; from the low dword instead of trusting all 64 bits of r11.
    movsxd      rax, r11d               ; rax = rows left to pad
    test        rax, rax
    jle         short .expand_end

    cld                                 ; rep stosb must store forward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is about to be overwritten
    push        rcx                     ; rep stosb consumes rcx

    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi -> first sample past the image
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, r12d               ; rowctr
    test        rax, rax
    jle         near .return

    mov         edx, 0x00020001         ; bias pattern (only edx is consumed)
    vmovd       xmm7, edx
    vpcmpeqw    ymm6, ymm6, ymm6
    vpshufd     xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols, reloaded for every row
    push        rdi
    push        rsi

    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdi, JSAMPROW [rdi]                    ; outptr

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop

    ; Tail: fewer than SIZEOF_YMMWORD output columns remain (8, 16 or 24,
    ; since output_cols is a multiple of 8).  Only the input bytes that
    ; belong to those columns are loaded; the rest of ymm0..ymm3 is zero.
    ; rcx is then forced to SIZEOF_YMMWORD so that the subtraction after
    ; the store leaves exactly 0 and the column loop terminates.
.columnloop_r24:
    cmp         rcx, 24
    jne         .columnloop_r16
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]  ; upper half of ymm2 = 0
    vmovdqu     xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]  ; upper half of ymm3 = 0
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r16:
    cmp         rcx, 16
    jne         .columnloop_r8
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r8:
    vmovdqu     xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]  ; upper half of ymm0 = 0
    vmovdqu     xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; upper half of ymm1 = 0
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop:
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
    vmovdqu     ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]

.downsample:
    ; Each 16-bit word holds one horizontal pair of samples:
    ; low byte = even column, high byte = odd column.
    ; ymm0/ymm2 come from the upper input row, ymm1/ymm3 from the lower one.
    vpand       ymm4, ymm0, ymm6        ; even-column samples, row 0
    vpsrlw      ymm0, ymm0, BYTE_BIT    ; odd-column samples, row 0
    vpand       ymm5, ymm1, ymm6        ; even-column samples, row 1
    vpsrlw      ymm1, ymm1, BYTE_BIT    ; odd-column samples, row 1
    vpaddw      ymm0, ymm0, ymm4        ; horizontal sums, row 0
    vpaddw      ymm1, ymm1, ymm5        ; horizontal sums, row 1

    vpand       ymm4, ymm2, ymm6        ; same for the second YMMWORD
    vpsrlw      ymm2, ymm2, BYTE_BIT
    vpand       ymm5, ymm3, ymm6
    vpsrlw      ymm3, ymm3, BYTE_BIT
    vpaddw      ymm2, ymm2, ymm4
    vpaddw      ymm3, ymm3, ymm5

    vpaddw      ymm0, ymm0, ymm1        ; row 0 + row 1 (2x2 block sums)
    vpaddw      ymm2, ymm2, ymm3
    vpaddw      ymm0, ymm0, ymm7        ; + alternating bias
    vpaddw      ymm2, ymm2, ymm7
    vpsrlw      ymm0, ymm0, 2           ; / 4
    vpsrlw      ymm2, ymm2, 2

    ; vpackuswb packs within each 128-bit lane, leaving the quadwords in
    ; the order {ymm0.lo, ymm2.lo, ymm0.hi, ymm2.hi}; vpermq 0xd8 swaps the
    ; two middle quadwords to restore column order.
    vpackuswb   ymm0, ymm0, ymm2
    vpermq      ymm0, ymm0, 0xd8

    ; NOTE(review): a full YMMWORD is stored even on the 8/16/24-column
    ; tail.  This presumably relies on the output row having room past
    ; output_cols -- confirm against the caller's buffer allocation.
    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
    add         rdx, byte 2*SIZEOF_YMMWORD  ; inptr0
    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr1
    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r24

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data (two rows consumed)
    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         rax                          ; rowctr
    jg          near .rowloop

.return:
    vzeroupper                          ; leave no dirty upper YMM state
    uncollect_args 6
    pop         rbp
    ret
  295. ; For some reason, the OS X linker does not honor the request to align the
  296. ; segment unless we do this.
  297. align 32