; NOTE(review): removed web-scrape residue that preceded the source proper
; (page header "jdsample-avx2.asm 31 KB" and a concatenated line-number index).
  1. ;
  2. ; jdsample.asm - upsampling (AVX2)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2015, Intel Corporation.
  6. ; Copyright (C) 2016, D. R. Commander.
  7. ;
  8. ; Based on the x86 SIMD extension for IJG JPEG library
  9. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  10. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  11. ;
  12. ; This file should be assembled with NASM (Netwide Assembler),
  13. ; can *not* be assembled with Microsoft's MASM or any compatible
  14. ; assembler (including Borland's Turbo Assembler).
  15. ; NASM is available from http://nasm.sourceforge.net/ or
  16. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  17. %include "jsimdext.inc"
  18. ; --------------------------------------------------------------------------
  19. SECTION SEG_CONST
  20. alignz 32
  21. GLOBAL_DATA(jconst_fancy_upsample_avx2)
  22. EXTN(jconst_fancy_upsample_avx2):
  23. PW_ONE times 16 dw 1
  24. PW_TWO times 16 dw 2
  25. PW_THREE times 16 dw 3
  26. PW_SEVEN times 16 dw 7
  27. PW_EIGHT times 16 dw 8
  28. alignz 32
  29. ; --------------------------------------------------------------------------
  30. SECTION SEG_TEXT
  31. BITS 32
  32. ;
  33. ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
  34. ;
  35. ; The upsampling algorithm is linear interpolation between pixel centers,
  36. ; also known as a "triangle filter". This is a good compromise between
  37. ; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
  38. ; of the way between input pixel centers.
  39. ;
  40. ; GLOBAL(void)
  41. ; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
  42. ; JDIMENSION downsampled_width,
  43. ; JSAMPARRAY input_data,
  44. ; JSAMPARRAY *output_data_ptr);
  45. ;
  46. %define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
  47. %define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
  48. %define input_data(b) (b) + 16 ; JSAMPARRAY input_data
  49. %define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
  50. align 32
  51. GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
  52. EXTN(jsimd_h2v1_fancy_upsample_avx2):
  53. push ebp
  54. mov ebp, esp
  55. pushpic ebx
  56. ; push ecx ; need not be preserved
  57. ; push edx ; need not be preserved
  58. push esi
  59. push edi
  60. get_GOT ebx ; get GOT address
  61. mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
  62. test eax, eax
  63. jz near .return
  64. mov ecx, INT [max_v_samp(ebp)] ; rowctr
  65. test ecx, ecx
  66. jz near .return
  67. mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
  68. mov edi, POINTER [output_data_ptr(ebp)]
  69. mov edi, JSAMPARRAY [edi] ; output_data
  70. alignx 16, 7
  71. .rowloop:
  72. push eax ; colctr
  73. push edi
  74. push esi
  75. mov esi, JSAMPROW [esi] ; inptr
  76. mov edi, JSAMPROW [edi] ; outptr
  77. test eax, SIZEOF_YMMWORD-1
  78. jz short .skip
  79. mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
  80. mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
  81. .skip:
  82. vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
  83. vpcmpeqb xmm7, xmm7, xmm7
  84. vpsrldq xmm7, xmm7, (SIZEOF_XMMWORD-1) ; (ff -- -- -- ... -- --) LSB is ff
  85. vpand ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD]
  86. add eax, byte SIZEOF_YMMWORD-1
  87. and eax, byte -SIZEOF_YMMWORD
  88. cmp eax, byte SIZEOF_YMMWORD
  89. ja short .columnloop
  90. alignx 16, 7
  91. .columnloop_last:
  92. vpcmpeqb xmm6, xmm6, xmm6
  93. vpslldq xmm6, xmm6, (SIZEOF_XMMWORD-1)
  94. vperm2i128 ymm6, ymm6, ymm6, 1 ; (---- ---- ... ---- ---- ff) MSB is ff
  95. vpand ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD]
  96. jmp short .upsample
  97. alignx 16, 7
  98. .columnloop:
  99. vmovdqu ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD]
  100. vperm2i128 ymm6, ymm0, ymm6, 0x20
  101. vpslldq ymm6, ymm6, 15
  102. .upsample:
  103. vmovdqu ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm1=( 0 1 2 ... 29 30 31)
  104. vperm2i128 ymm2, ymm0, ymm1, 0x20
  105. vpalignr ymm2, ymm1, ymm2, 15 ; ymm2=(-- 0 1 ... 28 29 30)
  106. vperm2i128 ymm4, ymm0, ymm1, 0x03
  107. vpalignr ymm3, ymm4, ymm1, 1 ; ymm3=( 1 2 3 ... 30 31 --)
  108. vpor ymm2, ymm2, ymm7 ; ymm2=(-1 0 1 ... 28 29 30)
  109. vpor ymm3, ymm3, ymm6 ; ymm3=( 1 2 3 ... 30 31 32)
  110. vpsrldq ymm7, ymm4, (SIZEOF_XMMWORD-1) ; ymm7=(31 -- -- ... -- -- --)
  111. vpunpckhbw ymm4, ymm1, ymm0 ; ymm4=( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
  112. vpunpcklbw ymm5, ymm1, ymm0 ; ymm5=( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
  113. vperm2i128 ymm1, ymm5, ymm4, 0x20 ; ymm1=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
  114. vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
  115. vpunpckhbw ymm5, ymm2, ymm0 ; ymm5=( 7 8 9 10 11 12 13 14 23 24 25 26 27 28 29 30)
  116. vpunpcklbw ymm6, ymm2, ymm0 ; ymm6=(-1 0 1 2 3 4 5 6 15 16 17 18 19 20 21 22)
  117. vperm2i128 ymm2, ymm6, ymm5, 0x20 ; ymm2=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
  118. vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
  119. vpunpckhbw ymm6, ymm3, ymm0 ; ymm6=( 1 2 3 4 5 6 7 8 17 18 19 20 21 22 23 24)
  120. vpunpcklbw ymm0, ymm3, ymm0 ; ymm0=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
  121. vperm2i128 ymm3, ymm0, ymm6, 0x20 ; ymm3=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
  122. vperm2i128 ymm6, ymm0, ymm6, 0x31 ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
  123. vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
  124. vpmullw ymm1, ymm1, [GOTOFF(ebx,PW_THREE)]
  125. vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
  126. vpaddw ymm2, ymm2, [GOTOFF(ebx,PW_ONE)]
  127. vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_ONE)]
  128. vpaddw ymm3, ymm3, [GOTOFF(ebx,PW_TWO)]
  129. vpaddw ymm6, ymm6, [GOTOFF(ebx,PW_TWO)]
  130. vpaddw ymm2, ymm2, ymm1
  131. vpaddw ymm5, ymm5, ymm4
  132. vpsrlw ymm2, ymm2, 2 ; ymm2=OutLE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
  133. vpsrlw ymm5, ymm5, 2 ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
  134. vpaddw ymm3, ymm3, ymm1
  135. vpaddw ymm6, ymm6, ymm4
  136. vpsrlw ymm3, ymm3, 2 ; ymm3=OutLO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
  137. vpsrlw ymm6, ymm6, 2 ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
  138. vpsllw ymm3, ymm3, BYTE_BIT
  139. vpsllw ymm6, ymm6, BYTE_BIT
  140. vpor ymm2, ymm2, ymm3 ; ymm2=OutL=( 0 1 2 ... 29 30 31)
  141. vpor ymm5, ymm5, ymm6 ; ymm5=OutH=(32 33 34 ... 61 62 63)
  142. vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
  143. vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5
  144. sub eax, byte SIZEOF_YMMWORD
  145. add esi, byte 1*SIZEOF_YMMWORD ; inptr
  146. add edi, byte 2*SIZEOF_YMMWORD ; outptr
  147. cmp eax, byte SIZEOF_YMMWORD
  148. ja near .columnloop
  149. test eax, eax
  150. jnz near .columnloop_last
  151. pop esi
  152. pop edi
  153. pop eax
  154. add esi, byte SIZEOF_JSAMPROW ; input_data
  155. add edi, byte SIZEOF_JSAMPROW ; output_data
  156. dec ecx ; rowctr
  157. jg near .rowloop
  158. .return:
  159. vzeroupper
  160. pop edi
  161. pop esi
  162. ; pop edx ; need not be preserved
  163. ; pop ecx ; need not be preserved
  164. poppic ebx
  165. pop ebp
  166. ret
  167. ; --------------------------------------------------------------------------
  168. ;
  169. ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
  170. ; Again a triangle filter; see comments for h2v1 case, above.
  171. ;
  172. ; GLOBAL(void)
  173. ; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
  174. ; JDIMENSION downsampled_width,
  175. ; JSAMPARRAY input_data,
  176. ; JSAMPARRAY *output_data_ptr);
  177. ;
  178. %define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
  179. %define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
  180. %define input_data(b) (b) + 16 ; JSAMPARRAY input_data
  181. %define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
  182. %define original_ebp ebp + 0
  183. %define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
  184. ; ymmword wk[WK_NUM]
  185. %define WK_NUM 4
  186. %define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
  187. align 32
  188. GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
  189. EXTN(jsimd_h2v2_fancy_upsample_avx2):
  190. push ebp
  191. mov eax, esp ; eax = original ebp
  192. sub esp, byte 4
  193. and esp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
  194. mov [esp], eax
  195. mov ebp, esp ; ebp = aligned ebp
  196. lea esp, [wk(0)]
  197. pushpic eax ; make a room for GOT address
  198. push ebx
  199. ; push ecx ; need not be preserved
  200. ; push edx ; need not be preserved
  201. push esi
  202. push edi
  203. get_GOT ebx ; get GOT address
  204. movpic POINTER [gotptr], ebx ; save GOT address
  205. mov edx, eax ; edx = original ebp
  206. mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
  207. test eax, eax
  208. jz near .return
  209. mov ecx, INT [max_v_samp(edx)] ; rowctr
  210. test ecx, ecx
  211. jz near .return
  212. mov esi, JSAMPARRAY [input_data(edx)] ; input_data
  213. mov edi, POINTER [output_data_ptr(edx)]
  214. mov edi, JSAMPARRAY [edi] ; output_data
  215. alignx 16, 7
  216. .rowloop:
  217. push eax ; colctr
  218. push ecx
  219. push edi
  220. push esi
  221. mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
  222. mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
  223. mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
  224. mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
  225. mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
  226. test eax, SIZEOF_YMMWORD-1
  227. jz short .skip
  228. push edx
  229. mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
  230. mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
  231. mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
  232. mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
  233. mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
  234. mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
  235. pop edx
  236. .skip:
  237. ; -- process the first column block
  238. vmovdqu ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD] ; ymm0=row[ 0][0]
  239. vmovdqu ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0]
  240. vmovdqu ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0]
  241. pushpic ebx
  242. movpic ebx, POINTER [gotptr] ; load GOT address
  243. vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's)
  244. vpunpckhbw ymm4, ymm0, ymm3 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
  245. vpunpcklbw ymm5, ymm0, ymm3 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
  246. vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
  247. vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
  248. vpunpckhbw ymm5, ymm1, ymm3 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
  249. vpunpcklbw ymm6, ymm1, ymm3 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
  250. vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
  251. vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
  252. vpunpckhbw ymm6, ymm2, ymm3 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
  253. vpunpcklbw ymm3, ymm2, ymm3 ; ymm3=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
  254. vperm2i128 ymm2, ymm3, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
  255. vperm2i128 ymm6, ymm3, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
  256. vpmullw ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
  257. vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
  258. vpcmpeqb xmm7, xmm7, xmm7
  259. vpsrldq xmm7, xmm7, (SIZEOF_XMMWORD-2) ; (ffff ---- ---- ... ---- ----) LSB is ffff
  260. vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
  261. vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
  262. vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
  263. vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
  264. vmovdqu YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1 ; temporarily save
  265. vmovdqu YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5 ; the intermediate data
  266. vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
  267. vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6
  268. vpand ymm1, ymm1, ymm7 ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
  269. vpand ymm2, ymm2, ymm7 ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
  270. vmovdqa YMMWORD [wk(0)], ymm1
  271. vmovdqa YMMWORD [wk(1)], ymm2
  272. poppic ebx
  273. add eax, byte SIZEOF_YMMWORD-1
  274. and eax, byte -SIZEOF_YMMWORD
  275. cmp eax, byte SIZEOF_YMMWORD
  276. ja short .columnloop
  277. alignx 16, 7
  278. .columnloop_last:
  279. ; -- process the last column block
  280. pushpic ebx
  281. movpic ebx, POINTER [gotptr] ; load GOT address
  282. vpcmpeqb xmm1, xmm1, xmm1
  283. vpslldq xmm1, xmm1, (SIZEOF_XMMWORD-2)
  284. vperm2i128 ymm1, ymm1, ymm1, 1 ; (---- ---- ... ---- ---- ffff) MSB is ffff
  285. vpand ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD]
  286. vpand ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD]
  287. vmovdqa YMMWORD [wk(2)], ymm1 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
  288. vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
  289. jmp near .upsample
  290. alignx 16, 7
  291. .columnloop:
  292. ; -- process the next column block
  293. vmovdqu ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD] ; ymm0=row[ 0][1]
  294. vmovdqu ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1]
  295. vmovdqu ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1]
  296. pushpic ebx
  297. movpic ebx, POINTER [gotptr] ; load GOT address
  298. vpxor ymm3, ymm3, ymm3 ; ymm3=(all 0's)
  299. vpunpckhbw ymm4, ymm0, ymm3 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
  300. vpunpcklbw ymm5, ymm0, ymm3 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
  301. vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
  302. vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
  303. vpunpckhbw ymm5, ymm1, ymm3 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
  304. vpunpcklbw ymm6, ymm1, ymm3 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
  305. vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
  306. vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
  307. vpunpckhbw ymm6, ymm2, ymm3 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
  308. vpunpcklbw ymm7, ymm2, ymm3 ; ymm7=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
  309. vperm2i128 ymm2, ymm7, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
  310. vperm2i128 ymm6, ymm7, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
  311. vpmullw ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
  312. vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
  313. vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
  314. vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
  315. vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
  316. vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
  317. vmovdqu YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1 ; temporarily save
  318. vmovdqu YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5 ; the intermediate data
  319. vmovdqu YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2
  320. vmovdqu YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6
  321. vperm2i128 ymm1, ymm3, ymm1, 0x20
  322. vpslldq ymm1, ymm1, 14 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
  323. vperm2i128 ymm2, ymm3, ymm2, 0x20
  324. vpslldq ymm2, ymm2, 14 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
  325. vmovdqa YMMWORD [wk(2)], ymm1
  326. vmovdqa YMMWORD [wk(3)], ymm2
  327. .upsample:
  328. ; -- process the upper row
  329. vmovdqu ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD] ; ymm7=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
  330. vmovdqu ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD] ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
  331. vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
  332. vperm2i128 ymm0, ymm1, ymm7, 0x03
  333. vpalignr ymm0, ymm0, ymm7, 2 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
  334. vperm2i128 ymm4, ymm1, ymm3, 0x20
  335. vpslldq ymm4, ymm4, 14 ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
  336. vperm2i128 ymm5, ymm1, ymm7, 0x03
  337. vpsrldq ymm5, ymm5, 14 ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
  338. vperm2i128 ymm6, ymm1, ymm3, 0x20
  339. vpalignr ymm6, ymm3, ymm6, 14 ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
  340. vpor ymm0, ymm0, ymm4 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
  341. vpor ymm5, ymm5, ymm6 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
  342. vperm2i128 ymm2, ymm1, ymm3, 0x03
  343. vpalignr ymm2, ymm2, ymm3, 2 ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
  344. vperm2i128 ymm4, ymm1, ymm3, 0x03
  345. vpsrldq ymm4, ymm4, 14 ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
  346. vperm2i128 ymm1, ymm1, ymm7, 0x20
  347. vpalignr ymm1, ymm7, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
  348. vpor ymm1, ymm1, YMMWORD [wk(0)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
  349. vpor ymm2, ymm2, YMMWORD [wk(2)] ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
  350. vmovdqa YMMWORD [wk(0)], ymm4
  351. vpmullw ymm7, ymm7, [GOTOFF(ebx,PW_THREE)]
  352. vpmullw ymm3, ymm3, [GOTOFF(ebx,PW_THREE)]
  353. vpaddw ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
  354. vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)]
  355. vpaddw ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)]
  356. vpaddw ymm2, [GOTOFF(ebx,PW_SEVEN)]
  357. vpaddw ymm1, ymm1, ymm7
  358. vpaddw ymm5, ymm5, ymm3
  359. vpsrlw ymm1, ymm1, 4 ; ymm1=Out0LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
  360. vpsrlw ymm5, ymm5, 4 ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
  361. vpaddw ymm0, ymm0, ymm7
  362. vpaddw ymm2, ymm2, ymm3
  363. vpsrlw ymm0, ymm0, 4 ; ymm0=Out0LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
  364. vpsrlw ymm2, ymm2, 4 ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
  365. vpsllw ymm0, ymm0, BYTE_BIT
  366. vpsllw ymm2, ymm2, BYTE_BIT
  367. vpor ymm1, ymm1, ymm0 ; ymm1=Out0L=( 0 1 2 ... 29 30 31)
  368. vpor ymm5, ymm5, ymm2 ; ymm5=Out0H=(32 33 34 ... 61 62 63)
  369. vmovdqu YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1
  370. vmovdqu YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5
  371. ; -- process the lower row
  372. vmovdqu ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD] ; ymm6=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
  373. vmovdqu ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD] ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
  374. vpxor ymm1, ymm1, ymm1 ; ymm1=(all 0's)
  375. vperm2i128 ymm7, ymm1, ymm6, 0x03
  376. vpalignr ymm7, ymm7, ymm6, 2 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
  377. vperm2i128 ymm3, ymm1, ymm4, 0x20
  378. vpslldq ymm3, ymm3, 14 ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
  379. vperm2i128 ymm0, ymm1, ymm6, 0x03
  380. vpsrldq ymm0, ymm0, 14 ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
  381. vperm2i128 ymm2, ymm1, ymm4, 0x20
  382. vpalignr ymm2, ymm4, ymm2, 14 ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
  383. vpor ymm7, ymm7, ymm3 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
  384. vpor ymm0, ymm0, ymm2 ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
  385. vperm2i128 ymm5, ymm1, ymm4, 0x03
  386. vpalignr ymm5, ymm5, ymm4, 2 ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
  387. vperm2i128 ymm3, ymm1, ymm4, 0x03
  388. vpsrldq ymm3, ymm3, 14 ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
  389. vperm2i128 ymm1, ymm1, ymm6, 0x20
  390. vpalignr ymm1, ymm6, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
  391. vpor ymm1, ymm1, YMMWORD [wk(1)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
  392. vpor ymm5, ymm5, YMMWORD [wk(3)] ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
  393. vmovdqa YMMWORD [wk(1)], ymm3
  394. vpmullw ymm6, ymm6, [GOTOFF(ebx,PW_THREE)]
  395. vpmullw ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
  396. vpaddw ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
  397. vpaddw ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)]
  398. vpaddw ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)]
  399. vpaddw ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)]
  400. vpaddw ymm1, ymm1, ymm6
  401. vpaddw ymm0, ymm0, ymm4
  402. vpsrlw ymm1, ymm1, 4 ; ymm1=Out1LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
  403. vpsrlw ymm0, ymm0, 4 ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
  404. vpaddw ymm7, ymm7, ymm6
  405. vpaddw ymm5, ymm5, ymm4
  406. vpsrlw ymm7, ymm7, 4 ; ymm7=Out1LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
  407. vpsrlw ymm5, ymm5, 4 ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
  408. vpsllw ymm7, ymm7, BYTE_BIT
  409. vpsllw ymm5, ymm5, BYTE_BIT
  410. vpor ymm1, ymm1, ymm7 ; ymm1=Out1L=( 0 1 2 ... 29 30 31)
  411. vpor ymm0, ymm0, ymm5 ; ymm0=Out1H=(32 33 34 ... 61 62 63)
  412. vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1
  413. vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0
  414. poppic ebx
  415. sub eax, byte SIZEOF_YMMWORD
  416. add ecx, byte 1*SIZEOF_YMMWORD ; inptr1(above)
  417. add ebx, byte 1*SIZEOF_YMMWORD ; inptr0
  418. add esi, byte 1*SIZEOF_YMMWORD ; inptr1(below)
  419. add edx, byte 2*SIZEOF_YMMWORD ; outptr0
  420. add edi, byte 2*SIZEOF_YMMWORD ; outptr1
  421. cmp eax, byte SIZEOF_YMMWORD
  422. ja near .columnloop
  423. test eax, eax
  424. jnz near .columnloop_last
  425. pop esi
  426. pop edi
  427. pop ecx
  428. pop eax
  429. add esi, byte 1*SIZEOF_JSAMPROW ; input_data
  430. add edi, byte 2*SIZEOF_JSAMPROW ; output_data
  431. sub ecx, byte 2 ; rowctr
  432. jg near .rowloop
  433. .return:
  434. vzeroupper
  435. pop edi
  436. pop esi
  437. ; pop edx ; need not be preserved
  438. ; pop ecx ; need not be preserved
  439. pop ebx
  440. mov esp, ebp ; esp <- aligned ebp
  441. pop esp ; esp <- original ebp
  442. pop ebp
  443. ret
  444. ; --------------------------------------------------------------------------
  445. ;
  446. ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
  447. ; It's still a box filter.
  448. ;
  449. ; GLOBAL(void)
  450. ; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
  451. ; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
  452. ;
  453. %define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
  454. %define output_width(b) (b) + 12 ; JDIMENSION output_width
  455. %define input_data(b) (b) + 16 ; JSAMPARRAY input_data
  456. %define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
  457. align 32
  458. GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
  459. EXTN(jsimd_h2v1_upsample_avx2):
  460. push ebp
  461. mov ebp, esp
  462. ; push ebx ; unused
  463. ; push ecx ; need not be preserved
  464. ; push edx ; need not be preserved
  465. push esi
  466. push edi
  467. mov edx, JDIMENSION [output_width(ebp)]
  468. add edx, byte (SIZEOF_YMMWORD-1)
  469. and edx, -SIZEOF_YMMWORD
  470. jz short .return
  471. mov ecx, INT [max_v_samp(ebp)] ; rowctr
  472. test ecx, ecx
  473. jz short .return
  474. mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
  475. mov edi, POINTER [output_data_ptr(ebp)]
  476. mov edi, JSAMPARRAY [edi] ; output_data
  477. alignx 16, 7
  478. .rowloop:
  479. push edi
  480. push esi
  481. mov esi, JSAMPROW [esi] ; inptr
  482. mov edi, JSAMPROW [edi] ; outptr
  483. mov eax, edx ; colctr
  484. alignx 16, 7
  485. .columnloop:
  486. cmp eax, byte SIZEOF_YMMWORD
  487. ja near .above_16
  488. vmovdqu xmm0, XMMWORD [esi+0*SIZEOF_YMMWORD]
  489. vpunpckhbw xmm1, xmm0, xmm0
  490. vpunpcklbw xmm0, xmm0, xmm0
  491. vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
  492. vmovdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
  493. jmp short .nextrow
  494. .above_16:
  495. vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
  496. vpermq ymm0, ymm0, 0xd8
  497. vpunpckhbw ymm1, ymm0, ymm0
  498. vpunpcklbw ymm0, ymm0, ymm0
  499. vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
  500. vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
  501. sub eax, byte 2*SIZEOF_YMMWORD
  502. jz short .nextrow
  503. add esi, byte SIZEOF_YMMWORD ; inptr
  504. add edi, byte 2*SIZEOF_YMMWORD ; outptr
  505. jmp short .columnloop
  506. alignx 16, 7
  507. .nextrow:
  508. pop esi
  509. pop edi
  510. add esi, byte SIZEOF_JSAMPROW ; input_data
  511. add edi, byte SIZEOF_JSAMPROW ; output_data
  512. dec ecx ; rowctr
  513. jg short .rowloop
  514. .return:
  515. vzeroupper
  516. pop edi
  517. pop esi
  518. ; pop edx ; need not be preserved
  519. ; pop ecx ; need not be preserved
  520. ; pop ebx ; unused
  521. pop ebp
  522. ret
  523. ; --------------------------------------------------------------------------
  524. ;
  525. ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
  526. ; It's still a box filter.
  527. ;
  528. ; GLOBAL(void)
  529. ; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
  530. ; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
  531. ;
  532. %define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
  533. %define output_width(b) (b) + 12 ; JDIMENSION output_width
  534. %define input_data(b) (b) + 16 ; JSAMPARRAY input_data
  535. %define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
  536. align 32
  537. GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
  538. EXTN(jsimd_h2v2_upsample_avx2):
  539. push ebp
  540. mov ebp, esp
  541. push ebx
  542. ; push ecx ; need not be preserved
  543. ; push edx ; need not be preserved
  544. push esi
  545. push edi
  546. mov edx, JDIMENSION [output_width(ebp)]
  547. add edx, byte (SIZEOF_YMMWORD-1)
  548. and edx, -SIZEOF_YMMWORD
  549. jz near .return
  550. mov ecx, INT [max_v_samp(ebp)] ; rowctr
  551. test ecx, ecx
  552. jz near .return
  553. mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
  554. mov edi, POINTER [output_data_ptr(ebp)]
  555. mov edi, JSAMPARRAY [edi] ; output_data
  556. alignx 16, 7
  557. .rowloop:
  558. push edi
  559. push esi
  560. mov esi, JSAMPROW [esi] ; inptr
  561. mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
  562. mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
  563. mov eax, edx ; colctr
  564. alignx 16, 7
  565. .columnloop:
  566. cmp eax, byte SIZEOF_YMMWORD
  567. ja short .above_16
  568. vmovdqu xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
  569. vpunpckhbw xmm1, xmm0, xmm0
  570. vpunpcklbw xmm0, xmm0, xmm0
  571. vmovdqu XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
  572. vmovdqu XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
  573. vmovdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
  574. vmovdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
  575. jmp near .nextrow
  576. .above_16:
  577. vmovdqu ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
  578. vpermq ymm0, ymm0, 0xd8
  579. vpunpckhbw ymm1, ymm0, ymm0
  580. vpunpcklbw ymm0, ymm0, ymm0
  581. vmovdqu YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0
  582. vmovdqu YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1
  583. vmovdqu YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
  584. vmovdqu YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
  585. sub eax, byte 2*SIZEOF_YMMWORD
  586. jz short .nextrow
  587. add esi, byte SIZEOF_YMMWORD ; inptr
  588. add ebx, 2*SIZEOF_YMMWORD ; outptr0
  589. add edi, 2*SIZEOF_YMMWORD ; outptr1
  590. jmp short .columnloop
  591. alignx 16, 7
  592. .nextrow:
  593. pop esi
  594. pop edi
  595. add esi, byte 1*SIZEOF_JSAMPROW ; input_data
  596. add edi, byte 2*SIZEOF_JSAMPROW ; output_data
  597. sub ecx, byte 2 ; rowctr
  598. jg near .rowloop
  599. .return:
  600. vzeroupper
  601. pop edi
  602. pop esi
  603. ; pop edx ; need not be preserved
  604. ; pop ecx ; need not be preserved
  605. pop ebx
  606. pop ebp
  607. ret
  608. ; For some reason, the OS X linker does not honor the request to align the
  609. ; segment unless we do this.
  610. align 32