; File: jdsample-sse2.asm
  ;
  ; jdsample.asm - upsampling (SSE2)
  ;
  ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  ; Copyright (C) 2016, D. R. Commander.
  ;
  ; Based on the x86 SIMD extension for IJG JPEG library
  ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  ;
  ; This file should be assembled with NASM (Netwide Assembler),
  ; can *not* be assembled with Microsoft's MASM or any compatible
  ; assembler (including Borland's Turbo Assembler).
  ; NASM is available from http://nasm.sourceforge.net/ or
  ; http://sourceforge.net/project/showfiles.php?group_id=6208
  %include "jsimdext.inc"

  ; --------------------------------------------------------------------------
  ; Constant table for the "fancy" (triangle filter) upsamplers in this file.
  ; Every entry is a vector of eight identical 16-bit words:
  ;   PW_THREE            weight multiplied into the nearer sample
  ;   PW_ONE,   PW_TWO    biases added before the ">> 2" in the h2v1 routine
  ;   PW_EIGHT, PW_SEVEN  biases added before the ">> 4" in the h2v2 routine
  ; alignz (a macro from jsimdext.inc) is issued before and after the table.
      SECTION     SEG_CONST

      alignz      32
      GLOBAL_DATA(jconst_fancy_upsample_sse2)

  EXTN(jconst_fancy_upsample_sse2):

  PW_ONE      times 8 dw 1
  PW_TWO      times 8 dw 2
  PW_THREE    times 8 dw 3
  PW_SEVEN    times 8 dw 7
  PW_EIGHT    times 8 dw 8

      alignz      32

  ; --------------------------------------------------------------------------
      SECTION     SEG_TEXT
      BITS        32
  ;
  ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
  ;
  ; The upsampling algorithm is linear interpolation between pixel centers,
  ; also known as a "triangle filter".  This is a good compromise between
  ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
  ; of the way between input pixel centers.
  ;
  ; For every input sample s[i] two output samples are produced:
  ;     out[2i]   = (3 * s[i] + s[i-1] + 1) >> 2
  ;     out[2i+1] = (3 * s[i] + s[i+1] + 2) >> 2
  ; At the left edge s[-1] is taken to be s[0]; at the right edge the missing
  ; neighbour is the last byte of the final 16-byte block (which is the dummy
  ; copy of the last sample when the width is not a multiple of 16).
  ;
  ; GLOBAL(void)
  ; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
  ;                                JDIMENSION downsampled_width,
  ;                                JSAMPARRAY input_data,
  ;                                JSAMPARRAY *output_data_ptr);
  ;
  ; In:     arguments on the stack, addressed through ebp (offsets below).
  ; Out:    nothing is returned; the output rows are filled in.
  ; Saved:  ebp, esi, edi, and ebx through pushpic/poppic (jsimdext.inc macros).
  ; Clobb:  eax, ecx, edx, xmm0-xmm7, flags.
  ; Memory: rows are read 16 bytes and written 32 bytes at a time with movdqa,
  ;         so row buffers must be 16-byte aligned and hold whole blocks.  When
  ;         the width is not a multiple of 16, one byte past the last input
  ;         sample is overwritten with a copy of that sample.
  ;

  %define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
  %define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
  %define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
  %define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

      align       32
      GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)

  EXTN(jsimd_h2v1_fancy_upsample_sse2):
      push        ebp
      mov         ebp, esp
      pushpic     ebx
  ;   push        ecx                     ; need not be preserved
  ;   push        edx                     ; need not be preserved
      push        esi
      push        edi

      get_GOT     ebx                     ; get GOT address

      ; Nothing to do when either the width or the row count is zero.
      mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
      test        eax, eax
      jz          near .return

      mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
      test        ecx, ecx
      jz          near .return

      mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
      mov         edi, POINTER [output_data_ptr(ebp)]
      mov         edi, JSAMPARRAY [edi]   ; output_data

      alignx      16, 7
  .rowloop:
      ; Per row: eax = width in samples, esi/edi = current row-pointer slots.
      push        eax                     ; colctr
      push        edi
      push        esi

      mov         esi, JSAMPROW [esi]     ; inptr
      mov         edi, JSAMPROW [edi]     ; outptr

      ; If the width is not a whole number of 16-sample blocks, replicate the
      ; last sample once so the final block has a valid right-hand neighbour.
      test        eax, SIZEOF_XMMWORD-1
      jz          short .skip
      mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
      mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
  .skip:
      pxor        xmm0, xmm0              ; xmm0=(all 0's)

      ; xmm7 = left neighbour for the first block: sample 0 in byte 0.
      pcmpeqb     xmm7, xmm7
      psrldq      xmm7, (SIZEOF_XMMWORD-1)
      pand        xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]

      ; Round colctr up to a multiple of 16; from here on it counts the
      ; samples still to be processed, one block per pass.
      add         eax, byte SIZEOF_XMMWORD-1
      and         eax, byte -SIZEOF_XMMWORD
      cmp         eax, byte SIZEOF_XMMWORD
      ja          short .columnloop

      alignx      16, 7
  .columnloop_last:
      ; Last block of the row: the right neighbour of byte 15 is byte 15.
      pcmpeqb     xmm6, xmm6
      pslldq      xmm6, (SIZEOF_XMMWORD-1)
      pand        xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
      jmp         short .upsample

      alignx      16, 7
  .columnloop:
      ; Not the last block: the right neighbour of byte 15 is byte 0 of the
      ; next block, moved into byte 15 of xmm6.
      movdqa      xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
      pslldq      xmm6, (SIZEOF_XMMWORD-1)

  .upsample:
      movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
      movdqa      xmm2, xmm1
      movdqa      xmm3, xmm1              ; xmm1=( 0  1  2 ... 13 14 15)
      pslldq      xmm2, 1                 ; xmm2=(--  0  1 ... 12 13 14)
      psrldq      xmm3, 1                 ; xmm3=( 1  2  3 ... 14 15 --)

      por         xmm2, xmm7              ; xmm2=(-1  0  1 ... 12 13 14)
      por         xmm3, xmm6              ; xmm3=( 1  2  3 ... 14 15 16)

      ; Keep byte 15 of this block as the left neighbour for the next block.
      movdqa      xmm7, xmm1
      psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)

      ; Widen all three byte vectors to 16-bit words (xmm0 is zero).
      movdqa      xmm4, xmm1
      punpcklbw   xmm1, xmm0              ; xmm1=( 0  1  2  3  4  5  6  7)
      punpckhbw   xmm4, xmm0              ; xmm4=( 8  9 10 11 12 13 14 15)
      movdqa      xmm5, xmm2
      punpcklbw   xmm2, xmm0              ; xmm2=(-1  0  1  2  3  4  5  6)
      punpckhbw   xmm5, xmm0              ; xmm5=( 7  8  9 10 11 12 13 14)
      movdqa      xmm6, xmm3
      punpcklbw   xmm3, xmm0              ; xmm3=( 1  2  3  4  5  6  7  8)
      punpckhbw   xmm6, xmm0              ; xmm6=( 9 10 11 12 13 14 15 16)

      ; 3 * centre sample, neighbours plus their biases.
      pmullw      xmm1, [GOTOFF(ebx,PW_THREE)]
      pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
      paddw       xmm2, [GOTOFF(ebx,PW_ONE)]
      paddw       xmm5, [GOTOFF(ebx,PW_ONE)]
      paddw       xmm3, [GOTOFF(ebx,PW_TWO)]
      paddw       xmm6, [GOTOFF(ebx,PW_TWO)]

      paddw       xmm2, xmm1
      paddw       xmm5, xmm4
      psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
      psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
      paddw       xmm3, xmm1
      paddw       xmm6, xmm4
      psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
      psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

      ; Interleave: odd outputs go to the high byte of each word.
      psllw       xmm3, BYTE_BIT
      psllw       xmm6, BYTE_BIT
      por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
      por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)

      movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
      movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5

      sub         eax, byte SIZEOF_XMMWORD
      add         esi, byte 1*SIZEOF_XMMWORD  ; inptr
      add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
      cmp         eax, byte SIZEOF_XMMWORD
      ja          near .columnloop        ; more than one block left
      test        eax, eax
      jnz         near .columnloop_last   ; exactly one block left

      pop         esi
      pop         edi
      pop         eax

      add         esi, byte SIZEOF_JSAMPROW  ; input_data
      add         edi, byte SIZEOF_JSAMPROW  ; output_data
      dec         ecx                     ; rowctr
      jg          near .rowloop

  .return:
      pop         edi
      pop         esi
  ;   pop         edx                     ; need not be preserved
  ;   pop         ecx                     ; need not be preserved
      poppic      ebx
      pop         ebp
      ret
  ; --------------------------------------------------------------------------
  ;
  ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
  ; Again a triangle filter; see comments for h2v1 case, above.
  ;
  ; Each pass of .rowloop reads one input row plus the rows directly above and
  ; below it, and writes two output rows.  The work is done in two steps:
  ;   vertical:    Int0[i] = 3 * row[0][i] + row[-1][i]      (upper output row)
  ;                Int1[i] = 3 * row[0][i] + row[+1][i]      (lower output row)
  ;   horizontal:  out[2i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
  ;                out[2i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
  ; The 16-bit intermediate values are parked in the output rows themselves
  ; and are overwritten by the final bytes.  At the row edges the missing
  ; neighbour is the edge value itself.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
  ;                                JDIMENSION downsampled_width,
  ;                                JSAMPARRAY input_data,
  ;                                JSAMPARRAY *output_data_ptr);
  ;
  ; In:     arguments on the stack, addressed through the pre-alignment frame
  ;         value (kept in eax/edx during setup).
  ; Out:    nothing is returned; the output rows are filled in.
  ; Saved:  ebp, ebx, esi, edi.
  ; Clobb:  eax, ecx, edx, xmm0-xmm7, flags.
  ; Memory: all row accesses use movdqa in whole 16-byte blocks, so row buffers
  ;         must be 16-byte aligned and hold whole blocks; input_data[-1] and
  ;         input_data[+1] relative to each processed row are dereferenced.
  ;         When the width is not a multiple of 16, one byte past the last
  ;         sample of each of the three input rows is overwritten with a copy
  ;         of that sample.
  ;
  ; Stack frame: ebp is rounded down to a 16-byte boundary and [ebp] holds the
  ; unaligned stack pointer to restore.  wk(0)..wk(3) sit just below ebp:
  ;   wk(0) / wk(1)  left-hand neighbour word for the upper / lower row
  ;   wk(2) / wk(3)  right-hand neighbour word for the upper / lower row
  ; gotptr is the pointer-sized slot directly below wk(0).
  ;

  %define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
  %define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
  %define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
  %define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

  %define original_ebp  ebp + 0
  %define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                          ; xmmword wk[WK_NUM]
  %define WK_NUM        4
  %define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr

      align       32
      GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)

  EXTN(jsimd_h2v2_fancy_upsample_sse2):
      push        ebp
      mov         eax, esp                ; eax = original ebp
      sub         esp, byte 4
      and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
      mov         [esp], eax
      mov         ebp, esp                ; ebp = aligned ebp
      lea         esp, [wk(0)]
      pushpic     eax                     ; make a room for GOT address
      push        ebx
  ;   push        ecx                     ; need not be preserved
  ;   push        edx                     ; need not be preserved
      push        esi
      push        edi

      get_GOT     ebx                     ; get GOT address
      movpic      POINTER [gotptr], ebx   ; save GOT address

      mov         edx, eax                ; edx = original ebp

      ; Nothing to do when either the width or the row count is zero.
      mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
      test        eax, eax
      jz          near .return

      mov         ecx, INT [max_v_samp(edx)]  ; rowctr
      test        ecx, ecx
      jz          near .return

      mov         esi, JSAMPARRAY [input_data(edx)]  ; input_data
      mov         edi, POINTER [output_data_ptr(edx)]
      mov         edi, JSAMPARRAY [edi]   ; output_data

      alignx      16, 7
  .rowloop:
      push        eax                     ; colctr
      push        ecx
      push        edi
      push        esi

      mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
      mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
      mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
      mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
      mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1

      ; If the width is not a whole number of 16-sample blocks, replicate the
      ; last sample of all three input rows once.  edx is borrowed as a byte
      ; scratch register, hence the push/pop.
      test        eax, SIZEOF_XMMWORD-1
      jz          short .skip
      push        edx
      mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
      mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
      mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
      mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
      mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
      mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
      pop         edx
  .skip:
      ; -- process the first column block

      movdqa      xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
      movdqa      xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
      movdqa      xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]

      ; ebx doubles as inptr0 and as the GOT base, so it is swapped around
      ; every stretch of code that references the PW_* constants.
      pushpic     ebx
      movpic      ebx, POINTER [gotptr]   ; load GOT address

      pxor        xmm3, xmm3              ; xmm3=(all 0's)
      movdqa      xmm4, xmm0
      punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
      punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
      movdqa      xmm5, xmm1
      punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
      punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
      movdqa      xmm6, xmm2
      punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
      punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

      pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]
      pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]

      ; xmm7 = mask selecting word 0 only.
      pcmpeqb     xmm7, xmm7
      psrldq      xmm7, (SIZEOF_XMMWORD-2)

      paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
      paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
      paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
      paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

      movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
      movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
      movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
      movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6

      ; Left neighbour of the first column is the first column itself.
      pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
      pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)

      movdqa      XMMWORD [wk(0)], xmm1
      movdqa      XMMWORD [wk(1)], xmm2

      poppic      ebx

      ; Round colctr up to a multiple of 16; from here on it counts the
      ; samples still to be processed, one block per pass.
      add         eax, byte SIZEOF_XMMWORD-1
      and         eax, byte -SIZEOF_XMMWORD
      cmp         eax, byte SIZEOF_XMMWORD
      ja          short .columnloop

      alignx      16, 7
  .columnloop_last:
      ; -- process the last column block

      pushpic     ebx
      movpic      ebx, POINTER [gotptr]   ; load GOT address

      ; Right neighbour of the last column is the last column itself:
      ; keep only word 7 of the saved high halves.
      pcmpeqb     xmm1, xmm1
      pslldq      xmm1, (SIZEOF_XMMWORD-2)
      movdqa      xmm2, xmm1

      pand        xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
      pand        xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]

      movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
      movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

      jmp         near .upsample

      alignx      16, 7
  .columnloop:
      ; -- process the next column block
      ; The vertical step is run one block ahead so that the current block
      ; has its right-hand neighbour available.

      movdqa      xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
      movdqa      xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
      movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]

      pushpic     ebx
      movpic      ebx, POINTER [gotptr]   ; load GOT address

      pxor        xmm3, xmm3              ; xmm3=(all 0's)
      movdqa      xmm4, xmm0
      punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
      punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
      movdqa      xmm5, xmm1
      punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
      punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
      movdqa      xmm6, xmm2
      punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
      punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

      pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]
      pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]

      paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
      paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
      paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
      paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

      movdqa      XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
      movdqa      XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
      movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
      movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6

      ; Word 0 of the next block becomes the right neighbour (word 7 slot)
      ; of the current block.
      pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
      pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)

      movdqa      XMMWORD [wk(2)], xmm1
      movdqa      XMMWORD [wk(3)], xmm2

  .upsample:
      ; -- process the upper row

      movdqa      xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
      movdqa      xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]

      movdqa      xmm0, xmm7              ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
      movdqa      xmm4, xmm3              ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
      psrldq      xmm0, 2                 ; xmm0=( 1  2  3  4  5  6  7 --)
      pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
      movdqa      xmm5, xmm7
      movdqa      xmm6, xmm3
      psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
      pslldq      xmm6, 2                 ; xmm6=(--  8  9 10 11 12 13 14)

      por         xmm0, xmm4              ; xmm0=( 1  2  3  4  5  6  7  8)
      por         xmm5, xmm6              ; xmm5=( 7  8  9 10 11 12 13 14)

      movdqa      xmm1, xmm7
      movdqa      xmm2, xmm3
      pslldq      xmm1, 2                 ; xmm1=(--  0  1  2  3  4  5  6)
      psrldq      xmm2, 2                 ; xmm2=( 9 10 11 12 13 14 15 --)
      movdqa      xmm4, xmm3
      psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)

      por         xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
      por         xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)

      ; Word 15 of this block is the left neighbour for the next block.
      movdqa      XMMWORD [wk(0)], xmm4

      pmullw      xmm7, [GOTOFF(ebx,PW_THREE)]
      pmullw      xmm3, [GOTOFF(ebx,PW_THREE)]
      paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]
      paddw       xmm5, [GOTOFF(ebx,PW_EIGHT)]
      paddw       xmm0, [GOTOFF(ebx,PW_SEVEN)]
      paddw       xmm2, [GOTOFF(ebx,PW_SEVEN)]

      paddw       xmm1, xmm7
      paddw       xmm5, xmm3
      psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
      psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
      paddw       xmm0, xmm7
      paddw       xmm2, xmm3
      psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
      psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

      ; Interleave: odd outputs go to the high byte of each word.
      psllw       xmm0, BYTE_BIT
      psllw       xmm2, BYTE_BIT
      por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
      por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)

      movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
      movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5

      ; -- process the lower row

      movdqa      xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
      movdqa      xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]

      movdqa      xmm7, xmm6              ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
      movdqa      xmm3, xmm4              ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
      psrldq      xmm7, 2                 ; xmm7=( 1  2  3  4  5  6  7 --)
      pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
      movdqa      xmm0, xmm6
      movdqa      xmm2, xmm4
      psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
      pslldq      xmm2, 2                 ; xmm2=(--  8  9 10 11 12 13 14)

      por         xmm7, xmm3              ; xmm7=( 1  2  3  4  5  6  7  8)
      por         xmm0, xmm2              ; xmm0=( 7  8  9 10 11 12 13 14)

      movdqa      xmm1, xmm6
      movdqa      xmm5, xmm4
      pslldq      xmm1, 2                 ; xmm1=(--  0  1  2  3  4  5  6)
      psrldq      xmm5, 2                 ; xmm5=( 9 10 11 12 13 14 15 --)
      movdqa      xmm3, xmm4
      psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)

      por         xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
      por         xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)

      ; Word 15 of this block is the left neighbour for the next block.
      movdqa      XMMWORD [wk(1)], xmm3

      pmullw      xmm6, [GOTOFF(ebx,PW_THREE)]
      pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
      paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]
      paddw       xmm0, [GOTOFF(ebx,PW_EIGHT)]
      paddw       xmm7, [GOTOFF(ebx,PW_SEVEN)]
      paddw       xmm5, [GOTOFF(ebx,PW_SEVEN)]

      paddw       xmm1, xmm6
      paddw       xmm0, xmm4
      psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
      psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
      paddw       xmm7, xmm6
      paddw       xmm5, xmm4
      psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
      psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

      psllw       xmm7, BYTE_BIT
      psllw       xmm5, BYTE_BIT
      por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
      por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)

      movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
      movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0

      poppic      ebx                     ; ebx is inptr0 again

      sub         eax, byte SIZEOF_XMMWORD
      add         ecx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
      add         ebx, byte 1*SIZEOF_XMMWORD  ; inptr0
      add         esi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
      add         edx, byte 2*SIZEOF_XMMWORD  ; outptr0
      add         edi, byte 2*SIZEOF_XMMWORD  ; outptr1
      cmp         eax, byte SIZEOF_XMMWORD
      ja          near .columnloop        ; more than one block left
      test        eax, eax
      jnz         near .columnloop_last   ; exactly one block left

      pop         esi
      pop         edi
      pop         ecx
      pop         eax

      ; One input row consumed, two output rows produced.
      add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
      add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
      sub         ecx, byte 2             ; rowctr
      jg          near .rowloop

  .return:
      pop         edi
      pop         esi
  ;   pop         edx                     ; need not be preserved
  ;   pop         ecx                     ; need not be preserved
      pop         ebx
      mov         esp, ebp                ; esp <- aligned ebp
      pop         esp                     ; esp <- original ebp
      pop         ebp
      ret
  ; --------------------------------------------------------------------------
  ;
  ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
  ; It's still a box filter.
  ;
  ; Every input byte is simply written twice (punpcklbw/punpckhbw of a
  ; register with itself).  output_width is rounded up to a multiple of 32
  ; bytes, so each row is processed in whole 16-byte input / 32-byte output
  ; blocks; the column loop is unrolled to handle two such blocks per pass.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
  ;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
  ;
  ; In:     arguments on the stack, addressed through ebp (offsets below).
  ; Out:    nothing is returned; the output rows are filled in.
  ; Saved:  ebp, esi, edi.
  ; Clobb:  eax, ecx, edx, xmm0-xmm3, flags.
  ; Memory: rows are accessed with movdqa, so row buffers must be 16-byte
  ;         aligned and large enough for the rounded-up width.
  ;

  %define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
  %define output_width(b)     (b) + 12    ; JDIMENSION output_width
  %define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
  %define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

      align       32
      GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)

  EXTN(jsimd_h2v1_upsample_sse2):
      push        ebp
      mov         ebp, esp
  ;   push        ebx                     ; unused
  ;   push        ecx                     ; need not be preserved
  ;   push        edx                     ; need not be preserved
      push        esi
      push        edi

      ; edx = output width rounded up to a multiple of 32; zero means no work.
      mov         edx, JDIMENSION [output_width(ebp)]
      add         edx, byte (2*SIZEOF_XMMWORD)-1
      and         edx, byte -(2*SIZEOF_XMMWORD)
      jz          short .return

      mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
      test        ecx, ecx
      jz          short .return

      mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
      mov         edi, POINTER [output_data_ptr(ebp)]
      mov         edi, JSAMPARRAY [edi]   ; output_data

      alignx      16, 7
  .rowloop:
      push        edi
      push        esi

      mov         esi, JSAMPROW [esi]     ; inptr
      mov         edi, JSAMPROW [edi]     ; outptr
      mov         eax, edx                ; colctr (output bytes remaining)

      alignx      16, 7
  .columnloop:
      ; First half: 16 input bytes -> 32 output bytes.
      movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]

      movdqa      xmm1, xmm0
      punpcklbw   xmm0, xmm0
      punpckhbw   xmm1, xmm1

      movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
      movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

      sub         eax, byte 2*SIZEOF_XMMWORD
      jz          short .nextrow

      ; Second half: the next 16 input bytes.
      movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]

      movdqa      xmm3, xmm2
      punpcklbw   xmm2, xmm2
      punpckhbw   xmm3, xmm3

      movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
      movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3

      sub         eax, byte 2*SIZEOF_XMMWORD
      jz          short .nextrow

      add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
      add         edi, byte 4*SIZEOF_XMMWORD  ; outptr
      jmp         short .columnloop

      alignx      16, 7
  .nextrow:
      pop         esi
      pop         edi

      add         esi, byte SIZEOF_JSAMPROW  ; input_data
      add         edi, byte SIZEOF_JSAMPROW  ; output_data
      dec         ecx                     ; rowctr
      jg          short .rowloop

  .return:
      pop         edi
      pop         esi
  ;   pop         edx                     ; need not be preserved
  ;   pop         ecx                     ; need not be preserved
  ;   pop         ebx                     ; unused
      pop         ebp
      ret
  ; --------------------------------------------------------------------------
  ;
  ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
  ; It's still a box filter.
  ;
  ; Every input byte is written twice horizontally, and the resulting block is
  ; stored to two consecutive output rows.  output_width is rounded up to a
  ; multiple of 32 bytes; the column loop is unrolled to handle two
  ; 16-byte input blocks per pass.  Each pass of .rowloop consumes one input
  ; row and produces two output rows, so the row counter drops by 2.
  ;
  ; GLOBAL(void)
  ; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
  ;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
  ;
  ; In:     arguments on the stack, addressed through ebp (offsets below).
  ; Out:    nothing is returned; the output rows are filled in.
  ; Saved:  ebp, ebx, esi, edi.
  ; Clobb:  eax, ecx, edx, xmm0-xmm3, flags.
  ; Memory: rows are accessed with movdqa, so row buffers must be 16-byte
  ;         aligned and large enough for the rounded-up width.
  ;

  %define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
  %define output_width(b)     (b) + 12    ; JDIMENSION output_width
  %define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
  %define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

      align       32
      GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)

  EXTN(jsimd_h2v2_upsample_sse2):
      push        ebp
      mov         ebp, esp
      push        ebx
  ;   push        ecx                     ; need not be preserved
  ;   push        edx                     ; need not be preserved
      push        esi
      push        edi

      ; edx = output width rounded up to a multiple of 32; zero means no work.
      mov         edx, JDIMENSION [output_width(ebp)]
      add         edx, byte (2*SIZEOF_XMMWORD)-1
      and         edx, byte -(2*SIZEOF_XMMWORD)
      jz          near .return

      mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
      test        ecx, ecx
      jz          near .return

      mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
      mov         edi, POINTER [output_data_ptr(ebp)]
      mov         edi, JSAMPARRAY [edi]   ; output_data

      alignx      16, 7
  .rowloop:
      push        edi
      push        esi

      mov         esi, JSAMPROW [esi]                    ; inptr
      mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
      mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
      mov         eax, edx                ; colctr (output bytes remaining)

      alignx      16, 7
  .columnloop:
      ; First half: 16 input bytes -> 32 output bytes in both output rows.
      movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]

      movdqa      xmm1, xmm0
      punpcklbw   xmm0, xmm0
      punpckhbw   xmm1, xmm1

      movdqa      XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
      movdqa      XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
      movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
      movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

      sub         eax, byte 2*SIZEOF_XMMWORD
      jz          short .nextrow

      ; Second half: the next 16 input bytes.
      movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]

      movdqa      xmm3, xmm2
      punpcklbw   xmm2, xmm2
      punpckhbw   xmm3, xmm3

      movdqa      XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
      movdqa      XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
      movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
      movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3

      sub         eax, byte 2*SIZEOF_XMMWORD
      jz          short .nextrow

      add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
      add         ebx, byte 4*SIZEOF_XMMWORD  ; outptr0
      add         edi, byte 4*SIZEOF_XMMWORD  ; outptr1
      jmp         short .columnloop

      alignx      16, 7
  .nextrow:
      pop         esi
      pop         edi

      add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
      add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
      sub         ecx, byte 2             ; rowctr
      jg          short .rowloop

  .return:
      pop         edi
      pop         esi
  ;   pop         edx                     ; need not be preserved
  ;   pop         ecx                     ; need not be preserved
      pop         ebx
      pop         ebp
      ret
  ; For some reason, the OS X linker does not honor the request to align the
  ; segment unless we do this.
      align       32