;
; jdsample.asm - upsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION SEG_CONST

    alignz 32
    GLOBAL_DATA(jconst_fancy_upsample_sse2)

EXTN(jconst_fancy_upsample_sse2):

PW_ONE   times 8 dw 1
PW_TWO   times 8 dw 2
PW_THREE times 8 dw 3
PW_SEVEN times 8 dw 7
PW_EIGHT times 8 dw 8

    alignz 32

; --------------------------------------------------------------------------
    SECTION SEG_TEXT
    BITS 64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter". This is a good compromise between
; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
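; As a scalar sketch (illustrative only; the array names are not part of
; this file), each input sample in[i] produces the output pair
;
;   out[2*i]   = (3 * in[i] + in[i-1] + 1) >> 2
;   out[2*i+1] = (3 * in[i] + in[i+1] + 2) >> 2
;
; which is what the SIMD code below computes sixteen samples at a time,
; using PW_ONE/PW_TWO as rounding terms and PW_THREE as the 3:1 weight.
; At the row ends, the missing neighbor is replaced by the edge sample
; itself, which makes the first and last outputs plain copies of the edge.
;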
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; r10 = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align 32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
    push rbp
    mov rax, rsp
    mov rbp, rsp
    collect_args 4

    mov eax, r11d  ; colctr
    test rax, rax
    jz near .return

    mov rcx, r10  ; rowctr
    test rcx, rcx
    jz near .return

    mov rsi, r12  ; input_data
    mov rdi, r13
    mov rdi, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    push rax  ; colctr
    push rdi
    push rsi

    mov rsi, JSAMPROW [rsi]  ; inptr
    mov rdi, JSAMPROW [rdi]  ; outptr

    test rax, SIZEOF_XMMWORD-1
    jz short .skip
    mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
.skip:
    pxor xmm0, xmm0  ; xmm0=(all 0's)
    pcmpeqb xmm7, xmm7
    psrldq xmm7, (SIZEOF_XMMWORD-1)
    pand xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    add rax, byte SIZEOF_XMMWORD-1
    and rax, byte -SIZEOF_XMMWORD
    cmp rax, byte SIZEOF_XMMWORD
    ja short .columnloop

.columnloop_last:
    pcmpeqb xmm6, xmm6
    pslldq xmm6, (SIZEOF_XMMWORD-1)
    pand xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    jmp short .upsample

.columnloop:
    movdqa xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    pslldq xmm6, (SIZEOF_XMMWORD-1)

.upsample:
    movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa xmm2, xmm1
    movdqa xmm3, xmm1  ; xmm1=( 0 1 2 ... 13 14 15)
    pslldq xmm2, 1     ; xmm2=(-- 0 1 ... 12 13 14)
    psrldq xmm3, 1     ; xmm3=( 1 2 3 ... 14 15 --)

    por xmm2, xmm7  ; xmm2=(-1 0 1 ... 12 13 14)
    por xmm3, xmm6  ; xmm3=( 1 2 3 ... 14 15 16)

    movdqa xmm7, xmm1
    psrldq xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)

    movdqa xmm4, xmm1
    punpcklbw xmm1, xmm0  ; xmm1=( 0 1 2 3 4 5 6 7)
    punpckhbw xmm4, xmm0  ; xmm4=( 8 9 10 11 12 13 14 15)
    movdqa xmm5, xmm2
    punpcklbw xmm2, xmm0  ; xmm2=(-1 0 1 2 3 4 5 6)
    punpckhbw xmm5, xmm0  ; xmm5=( 7 8 9 10 11 12 13 14)
    movdqa xmm6, xmm3
    punpcklbw xmm3, xmm0  ; xmm3=( 1 2 3 4 5 6 7 8)
    punpckhbw xmm6, xmm0  ; xmm6=( 9 10 11 12 13 14 15 16)

    pmullw xmm1, [rel PW_THREE]
    pmullw xmm4, [rel PW_THREE]
    paddw xmm2, [rel PW_ONE]
    paddw xmm5, [rel PW_ONE]
    paddw xmm3, [rel PW_TWO]
    paddw xmm6, [rel PW_TWO]

    paddw xmm2, xmm1
    paddw xmm5, xmm4
    psrlw xmm2, 2  ; xmm2=OutLE=( 0 2 4 6 8 10 12 14)
    psrlw xmm5, 2  ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
    paddw xmm3, xmm1
    paddw xmm6, xmm4
    psrlw xmm3, 2  ; xmm3=OutLO=( 1 3 5 7 9 11 13 15)
    psrlw xmm6, 2  ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

    psllw xmm3, BYTE_BIT
    psllw xmm6, BYTE_BIT
    por xmm2, xmm3  ; xmm2=OutL=( 0 1 2 ... 13 14 15)
    por xmm5, xmm6  ; xmm5=OutH=(16 17 18 ... 29 30 31)

    movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

    sub rax, byte SIZEOF_XMMWORD
    add rsi, byte 1*SIZEOF_XMMWORD  ; inptr
    add rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    cmp rax, byte SIZEOF_XMMWORD
    ja near .columnloop
    test eax, eax
    jnz near .columnloop_last

    pop rsi
    pop rdi
    pop rax

    add rsi, byte SIZEOF_JSAMPROW  ; input_data
    add rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec rcx                        ; rowctr
    jg near .rowloop

.return:
    uncollect_args 4
    pop rbp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
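; As a scalar sketch (illustrative only; the variable names are not part of
; this file), each output row is built from intermediate vertical sums
;
;   Int[i] = 3 * row0[i] + row_adjacent[i]
;
; where row_adjacent is the row above for the upper output row and the row
; below for the lower output row. Those sums are then interpolated
; horizontally just like the h2v1 case, with a wider rounding shift:
;
;   out[2*i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;   out[2*i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
;
; PW_SEVEN and PW_EIGHT above supply the rounding terms before the shift.
;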
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; r10 = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

%define wk(i)  rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM 4
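; (Illustrative note: the wk[0..3] scratch slots carry boundary words
; between column blocks.  wk(0)/wk(1) hold the word that becomes the left
; neighbor of the next block's first element, and wk(2)/wk(3) hold the word
; used as the right neighbor of the current block's last element, for the
; upper and lower intermediate rows respectively.)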

    align 32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
    push rbp
    mov rax, rsp                     ; rax = original rbp
    sub rsp, byte 4
    and rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov [rsp], rax
    mov rbp, rsp                     ; rbp = aligned rbp
    lea rsp, [wk(0)]
    collect_args 4
    push rbx

    mov eax, r11d  ; colctr
    test rax, rax
    jz near .return

    mov rcx, r10  ; rowctr
    test rcx, rcx
    jz near .return

    mov rsi, r12  ; input_data
    mov rdi, r13
    mov rdi, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    push rax  ; colctr
    push rcx
    push rdi
    push rsi

    mov rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1

    test rax, SIZEOF_XMMWORD-1
    jz short .skip
    push rdx
    mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
    mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
    mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
    mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
    mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
    pop rdx
.skip:
    ; -- process the first column block
    movdqa xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
    movdqa xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
    movdqa xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]

    pxor xmm3, xmm3  ; xmm3=(all 0's)
    movdqa xmm4, xmm0
    punpcklbw xmm0, xmm3  ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
    punpckhbw xmm4, xmm3  ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
    movdqa xmm5, xmm1
    punpcklbw xmm1, xmm3  ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
    punpckhbw xmm5, xmm3  ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
    movdqa xmm6, xmm2
    punpcklbw xmm2, xmm3  ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
    punpckhbw xmm6, xmm3  ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)

    pmullw xmm0, [rel PW_THREE]
    pmullw xmm4, [rel PW_THREE]
    pcmpeqb xmm7, xmm7
    psrldq xmm7, (SIZEOF_XMMWORD-2)

    paddw xmm1, xmm0  ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
    paddw xmm5, xmm4  ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
    paddw xmm2, xmm0  ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
    paddw xmm6, xmm4  ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)

    movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
    movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6

    pand xmm1, xmm7  ; xmm1=( 0 -- -- -- -- -- -- --)
    pand xmm2, xmm7  ; xmm2=( 0 -- -- -- -- -- -- --)

    movdqa XMMWORD [wk(0)], xmm1
    movdqa XMMWORD [wk(1)], xmm2

    add rax, byte SIZEOF_XMMWORD-1
    and rax, byte -SIZEOF_XMMWORD
    cmp rax, byte SIZEOF_XMMWORD
    ja short .columnloop

.columnloop_last:
    ; -- process the last column block
    pcmpeqb xmm1, xmm1
    pslldq xmm1, (SIZEOF_XMMWORD-2)
    movdqa xmm2, xmm1

    pand xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    pand xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]

    movdqa XMMWORD [wk(2)], xmm1  ; xmm1=(-- -- -- -- -- -- -- 15)
    movdqa XMMWORD [wk(3)], xmm2  ; xmm2=(-- -- -- -- -- -- -- 15)

    jmp near .upsample
.columnloop:
    ; -- process the next column block
    movdqa xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
    movdqa xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
    movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]

    pxor xmm3, xmm3  ; xmm3=(all 0's)
    movdqa xmm4, xmm0
    punpcklbw xmm0, xmm3  ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7)
    punpckhbw xmm4, xmm3  ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15)
    movdqa xmm5, xmm1
    punpcklbw xmm1, xmm3  ; xmm1=row[-1]( 0 1 2 3 4 5 6 7)
    punpckhbw xmm5, xmm3  ; xmm5=row[-1]( 8 9 10 11 12 13 14 15)
    movdqa xmm6, xmm2
    punpcklbw xmm2, xmm3  ; xmm2=row[+1]( 0 1 2 3 4 5 6 7)
    punpckhbw xmm6, xmm3  ; xmm6=row[+1]( 8 9 10 11 12 13 14 15)

    pmullw xmm0, [rel PW_THREE]
    pmullw xmm4, [rel PW_THREE]

    paddw xmm1, xmm0  ; xmm1=Int0L=( 0 1 2 3 4 5 6 7)
    paddw xmm5, xmm4  ; xmm5=Int0H=( 8 9 10 11 12 13 14 15)
    paddw xmm2, xmm0  ; xmm2=Int1L=( 0 1 2 3 4 5 6 7)
    paddw xmm6, xmm4  ; xmm6=Int1H=( 8 9 10 11 12 13 14 15)

    movdqa XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
    movdqa XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6

    pslldq xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- -- 0)
    pslldq xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- -- 0)

    movdqa XMMWORD [wk(2)], xmm1
    movdqa XMMWORD [wk(3)], xmm2
.upsample:
    ; -- process the upper row
    movdqa xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]

    movdqa xmm0, xmm7                ; xmm7=Int0L=( 0 1 2 3 4 5 6 7)
    movdqa xmm4, xmm3                ; xmm3=Int0H=( 8 9 10 11 12 13 14 15)
    psrldq xmm0, 2                   ; xmm0=( 1 2 3 4 5 6 7 --)
    pslldq xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- -- 8)
    movdqa xmm5, xmm7
    movdqa xmm6, xmm3
    psrldq xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
    pslldq xmm6, 2                   ; xmm6=(-- 8 9 10 11 12 13 14)

    por xmm0, xmm4  ; xmm0=( 1 2 3 4 5 6 7 8)
    por xmm5, xmm6  ; xmm5=( 7 8 9 10 11 12 13 14)

    movdqa xmm1, xmm7
    movdqa xmm2, xmm3
    pslldq xmm1, 2                   ; xmm1=(-- 0 1 2 3 4 5 6)
    psrldq xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
    movdqa xmm4, xmm3
    psrldq xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)

    por xmm1, XMMWORD [wk(0)]  ; xmm1=(-1 0 1 2 3 4 5 6)
    por xmm2, XMMWORD [wk(2)]  ; xmm2=( 9 10 11 12 13 14 15 16)

    movdqa XMMWORD [wk(0)], xmm4

    pmullw xmm7, [rel PW_THREE]
    pmullw xmm3, [rel PW_THREE]
    paddw xmm1, [rel PW_EIGHT]
    paddw xmm5, [rel PW_EIGHT]
    paddw xmm0, [rel PW_SEVEN]
    paddw xmm2, [rel PW_SEVEN]

    paddw xmm1, xmm7
    paddw xmm5, xmm3
    psrlw xmm1, 4  ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14)
    psrlw xmm5, 4  ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
    paddw xmm0, xmm7
    paddw xmm2, xmm3
    psrlw xmm0, 4  ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15)
    psrlw xmm2, 4  ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

    psllw xmm0, BYTE_BIT
    psllw xmm2, BYTE_BIT
    por xmm1, xmm0  ; xmm1=Out0L=( 0 1 2 ... 13 14 15)
    por xmm5, xmm2  ; xmm5=Out0H=(16 17 18 ... 29 30 31)

    movdqa XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
    movdqa XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
    ; -- process the lower row
    movdqa xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
    movdqa xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]

    movdqa xmm7, xmm6                ; xmm6=Int1L=( 0 1 2 3 4 5 6 7)
    movdqa xmm3, xmm4                ; xmm4=Int1H=( 8 9 10 11 12 13 14 15)
    psrldq xmm7, 2                   ; xmm7=( 1 2 3 4 5 6 7 --)
    pslldq xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- -- 8)
    movdqa xmm0, xmm6
    movdqa xmm2, xmm4
    psrldq xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
    pslldq xmm2, 2                   ; xmm2=(-- 8 9 10 11 12 13 14)

    por xmm7, xmm3  ; xmm7=( 1 2 3 4 5 6 7 8)
    por xmm0, xmm2  ; xmm0=( 7 8 9 10 11 12 13 14)

    movdqa xmm1, xmm6
    movdqa xmm5, xmm4
    pslldq xmm1, 2                   ; xmm1=(-- 0 1 2 3 4 5 6)
    psrldq xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
    movdqa xmm3, xmm4
    psrldq xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)

    por xmm1, XMMWORD [wk(1)]  ; xmm1=(-1 0 1 2 3 4 5 6)
    por xmm5, XMMWORD [wk(3)]  ; xmm5=( 9 10 11 12 13 14 15 16)

    movdqa XMMWORD [wk(1)], xmm3

    pmullw xmm6, [rel PW_THREE]
    pmullw xmm4, [rel PW_THREE]
    paddw xmm1, [rel PW_EIGHT]
    paddw xmm0, [rel PW_EIGHT]
    paddw xmm7, [rel PW_SEVEN]
    paddw xmm5, [rel PW_SEVEN]

    paddw xmm1, xmm6
    paddw xmm0, xmm4
    psrlw xmm1, 4  ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14)
    psrlw xmm0, 4  ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
    paddw xmm7, xmm6
    paddw xmm5, xmm4
    psrlw xmm7, 4  ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15)
    psrlw xmm5, 4  ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

    psllw xmm7, BYTE_BIT
    psllw xmm5, BYTE_BIT
    por xmm1, xmm7  ; xmm1=Out1L=( 0 1 2 ... 13 14 15)
    por xmm0, xmm5  ; xmm0=Out1H=(16 17 18 ... 29 30 31)

    movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
    movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0

    sub rax, byte SIZEOF_XMMWORD
    add rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
    add rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
    add rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
    add rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
    add rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
    cmp rax, byte SIZEOF_XMMWORD
    ja near .columnloop
    test rax, rax
    jnz near .columnloop_last

    pop rsi
    pop rdi
    pop rcx
    pop rax

    add rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub rcx, byte 2                  ; rowctr
    jg near .rowloop

.return:
    pop rbx
    uncollect_args 4
    mov rsp, rbp  ; rsp <- aligned rbp
    pop rsp       ; rsp <- original rbp
    pop rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
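; In scalar terms (illustrative only), this routine simply replicates each
; input sample horizontally:
;
;   out[2*i] = out[2*i+1] = in[i]
;
; The SIMD code below does this sixteen samples at a time by unpacking a
; register with itself (punpcklbw/punpckhbw), which pairs each byte with a
; copy of itself.
;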
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; r10 = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align 32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
    push rbp
    mov rax, rsp
    mov rbp, rsp
    collect_args 4

    mov edx, r11d
    add rdx, byte (2*SIZEOF_XMMWORD)-1
    and rdx, byte -(2*SIZEOF_XMMWORD)
    jz near .return

    mov rcx, r10  ; rowctr
    test rcx, rcx
    jz short .return

    mov rsi, r12  ; input_data
    mov rdi, r13
    mov rdi, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    push rdi
    push rsi

    mov rsi, JSAMPROW [rsi]  ; inptr
    mov rdi, JSAMPROW [rdi]  ; outptr
    mov rax, rdx             ; colctr
.columnloop:
    movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0
    punpckhbw xmm1, xmm1

    movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    sub rax, byte 2*SIZEOF_XMMWORD
    jz short .nextrow

    movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

    movdqa xmm3, xmm2
    punpcklbw xmm2, xmm2
    punpckhbw xmm3, xmm3

    movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

    sub rax, byte 2*SIZEOF_XMMWORD
    jz short .nextrow

    add rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add rdi, byte 4*SIZEOF_XMMWORD  ; outptr
    jmp short .columnloop

.nextrow:
    pop rsi
    pop rdi

    add rsi, byte SIZEOF_JSAMPROW  ; input_data
    add rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec rcx                        ; rowctr
    jg short .rowloop

.return:
    uncollect_args 4
    pop rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
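; In scalar terms (illustrative only), each input sample is replicated into
; a 2x2 block of output samples:
;
;   out0[2*i] = out0[2*i+1] = out1[2*i] = out1[2*i+1] = in[i]
;
; The row is doubled horizontally as in the h2v1 case and then stored to
; both output rows (outptr0 and outptr1).
;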
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; r10 = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align 32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
    push rbp
    mov rax, rsp
    mov rbp, rsp
    collect_args 4
    push rbx

    mov edx, r11d
    add rdx, byte (2*SIZEOF_XMMWORD)-1
    and rdx, byte -(2*SIZEOF_XMMWORD)
    jz near .return

    mov rcx, r10  ; rowctr
    test rcx, rcx
    jz near .return

    mov rsi, r12  ; input_data
    mov rdi, r13
    mov rdi, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    push rdi
    push rsi

    mov rsi, JSAMPROW [rsi]                    ; inptr
    mov rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov rax, rdx                               ; colctr
.columnloop:
    movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0
    punpckhbw xmm1, xmm1

    movdqa XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
    movdqa XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    movdqa XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    sub rax, byte 2*SIZEOF_XMMWORD
    jz short .nextrow

    movdqa xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

    movdqa xmm3, xmm2
    punpcklbw xmm2, xmm2
    punpckhbw xmm3, xmm3

    movdqa XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
    movdqa XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
    movdqa XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

    sub rax, byte 2*SIZEOF_XMMWORD
    jz short .nextrow

    add rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
    add rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
    jmp short .columnloop

.nextrow:
    pop rsi
    pop rdi

    add rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub rcx, byte 2                  ; rowctr
    jg near .rowloop

.return:
    pop rbx
    uncollect_args 4
    pop rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align 32