; jdsample-mmx.asm (25 KB)
  1. ;
  2. ; jdsample.asm - upsampling (MMX)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2016, D. R. Commander.
  6. ;
  7. ; Based on the x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler); it
  12. ; can *not* be assembled with Microsoft's MASM or any compatible
  13. ; assembler (including Borland's Turbo Assembler).
  14. ; NASM is available from http://nasm.sourceforge.net/ or
  15. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16. %include "jsimdext.inc"
  17. ; --------------------------------------------------------------------------
      ; Constant table used by the two "fancy" (triangle filter) upsamplers
      ; in this file. Each entry is four 16-bit words so that it can be used
      ; directly as a 64-bit MMX memory operand.
  18. SECTION SEG_CONST
  19. alignz 32
  20. GLOBAL_DATA(jconst_fancy_upsample_mmx)
  21. EXTN(jconst_fancy_upsample_mmx):
  22. PW_ONE times 4 dw 1 ; rounding bias added to the even outputs in h2v1 fancy
  23. PW_TWO times 4 dw 2 ; rounding bias added to the odd outputs in h2v1 fancy
  24. PW_THREE times 4 dw 3 ; weight applied to the nearer sample (pmullw operand)
  25. PW_SEVEN times 4 dw 7 ; rounding bias added to the odd outputs in h2v2 fancy
  26. PW_EIGHT times 4 dw 8 ; rounding bias added to the even outputs in h2v2 fancy
  27. alignz 32
  28. ; --------------------------------------------------------------------------
  29. SECTION SEG_TEXT
  30. BITS 32 ; everything below is 32-bit x86 code
  31. ;
  32. ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
  33. ;
  34. ; The upsampling algorithm is linear interpolation between pixel centers,
  35. ; also known as a "triangle filter". This is a good compromise between
  36. ; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
  37. ; of the way between input pixel centers.
  38. ;
      ; For every input sample s[i] the code below produces two output samples:
      ;   out[2i]   = (3 * s[i] + s[i-1] + 1) >> 2
      ;   out[2i+1] = (3 * s[i] + s[i+1] + 2) >> 2
      ; For the first and last sample of a row the missing neighbour is
      ; replaced by the edge sample itself.
      ;
      ; Register roles inside the loops:
      ;   eax = colctr (samples left in the row, rounded up to a whole mmword)
      ;   ecx = rowctr
      ;   esi = input_data (outer loop) / inptr (inner loop)
      ;   edi = output_data (outer loop) / outptr (inner loop)
      ;   ebx = GOT address used by GOTOFF() (macros come from jsimdext.inc)
      ;   mm0 = all zeros (for byte -> word unpacking)
      ;   mm7 = left neighbour carried over from the previous block
      ;   mm6 = right neighbour taken from the next block
      ;
      ; NOTE: when the width is not a multiple of SIZEOF_MMWORD, one dummy
      ; sample is written just past the end of each *input* row.
      ;
  39. ; GLOBAL(void)
  40. ; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor,
  41. ; JDIMENSION downsampled_width,
  42. ; JSAMPARRAY input_data,
  43. ; JSAMPARRAY *output_data_ptr);
  44. ;
  45. %define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
  46. %define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
  47. %define input_data(b) (b) + 16 ; JSAMPARRAY input_data
  48. %define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
  49. align 32
  50. GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx)
  51. EXTN(jsimd_h2v1_fancy_upsample_mmx):
  52. push ebp
  53. mov ebp, esp ; arguments are addressed relative to ebp
  54. pushpic ebx ; ebx is used below to hold the GOT address
  55. ; push ecx ; need not be preserved
  56. ; push edx ; need not be preserved
  57. push esi
  58. push edi
  59. get_GOT ebx ; get GOT address
  60. mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr
  61. test eax, eax
  62. jz near .return ; zero width: nothing to do
  63. mov ecx, INT [max_v_samp(ebp)] ; rowctr
  64. test ecx, ecx
  65. jz near .return ; zero rows: nothing to do
  66. mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
  67. mov edi, POINTER [output_data_ptr(ebp)]
  68. mov edi, JSAMPARRAY [edi] ; output_data
  69. alignx 16, 7
  70. .rowloop:
  71. push eax ; colctr
  72. push edi ; keep the row-pointer cursors; esi/edi are reused as sample pointers
  73. push esi
  74. mov esi, JSAMPROW [esi] ; inptr
  75. mov edi, JSAMPROW [edi] ; outptr
  76. test eax, SIZEOF_MMWORD-1 ; is the width a whole number of mmwords?
  77. jz short .skip
  78. mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] ; dl = last real sample of the row
  79. mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
  80. .skip:
  81. pxor mm0, mm0 ; mm0=(all 0's)
  82. pcmpeqb mm7, mm7 ; mm7=(all 1's)
  83. psrlq mm7, (SIZEOF_MMWORD-1)*BYTE_BIT ; keep only the lowest byte of the mask
  84. pand mm7, MMWORD [esi+0*SIZEOF_MMWORD] ; mm7=sample 0: left neighbour for the first block
  85. add eax, byte SIZEOF_MMWORD-1
  86. and eax, byte -SIZEOF_MMWORD ; round colctr up to a multiple of SIZEOF_MMWORD
  87. cmp eax, byte SIZEOF_MMWORD
  88. ja short .columnloop ; more than one block left: a next block exists
  89. alignx 16, 7
  90. .columnloop_last:
      ; Last block of the row: there is no next block, so the right neighbour
      ; of the final sample is the final sample of this block itself.
  91. pcmpeqb mm6, mm6 ; mm6=(all 1's)
  92. psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT ; keep only the highest byte of the mask
  93. pand mm6, MMWORD [esi+0*SIZEOF_MMWORD] ; mm6=last sample of this block, in the top byte
  94. jmp short .upsample
  95. alignx 16, 7
  96. .columnloop:
  97. movq mm6, MMWORD [esi+1*SIZEOF_MMWORD] ; load the next block
  98. psllq mm6, (SIZEOF_MMWORD-1)*BYTE_BIT ; move its first sample into the top byte
  99. .upsample:
  100. movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
  101. movq mm2, mm1
  102. movq mm3, mm1 ; mm1=( 0 1 2 3 4 5 6 7)
  103. psllq mm2, BYTE_BIT ; mm2=( - 0 1 2 3 4 5 6)
  104. psrlq mm3, BYTE_BIT ; mm3=( 1 2 3 4 5 6 7 -)
  105. por mm2, mm7 ; mm2=(-1 0 1 2 3 4 5 6)
  106. por mm3, mm6 ; mm3=( 1 2 3 4 5 6 7 8)
  107. movq mm7, mm1
  108. psrlq mm7, (SIZEOF_MMWORD-1)*BYTE_BIT ; mm7=( 7 - - - - - - -)
      ; mm7 now carries sample 7 as the left neighbour for the next block.
      ; Widen all three vectors from bytes to 16-bit words (low and high halves).
  109. movq mm4, mm1
  110. punpcklbw mm1, mm0 ; mm1=( 0 1 2 3)
  111. punpckhbw mm4, mm0 ; mm4=( 4 5 6 7)
  112. movq mm5, mm2
  113. punpcklbw mm2, mm0 ; mm2=(-1 0 1 2)
  114. punpckhbw mm5, mm0 ; mm5=( 3 4 5 6)
  115. movq mm6, mm3
  116. punpcklbw mm3, mm0 ; mm3=( 1 2 3 4)
  117. punpckhbw mm6, mm0 ; mm6=( 5 6 7 8)
  118. pmullw mm1, [GOTOFF(ebx,PW_THREE)] ; 3 * current sample
  119. pmullw mm4, [GOTOFF(ebx,PW_THREE)]
  120. paddw mm2, [GOTOFF(ebx,PW_ONE)] ; left neighbour + 1 (even-output bias)
  121. paddw mm5, [GOTOFF(ebx,PW_ONE)]
  122. paddw mm3, [GOTOFF(ebx,PW_TWO)] ; right neighbour + 2 (odd-output bias)
  123. paddw mm6, [GOTOFF(ebx,PW_TWO)]
  124. paddw mm2, mm1
  125. paddw mm5, mm4
  126. psrlw mm2, 2 ; mm2=OutLE=( 0 2 4 6)
  127. psrlw mm5, 2 ; mm5=OutHE=( 8 10 12 14)
  128. paddw mm3, mm1
  129. paddw mm6, mm4
  130. psrlw mm3, 2 ; mm3=OutLO=( 1 3 5 7)
  131. psrlw mm6, 2 ; mm6=OutHO=( 9 11 13 15)
  132. psllw mm3, BYTE_BIT ; move odd results into the high byte of each word
  133. psllw mm6, BYTE_BIT
  134. por mm2, mm3 ; mm2=OutL=( 0 1 2 3 4 5 6 7)
  135. por mm5, mm6 ; mm5=OutH=( 8 9 10 11 12 13 14 15)
  136. movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
  137. movq MMWORD [edi+1*SIZEOF_MMWORD], mm5
  138. sub eax, byte SIZEOF_MMWORD ; one input block consumed
  139. add esi, byte 1*SIZEOF_MMWORD ; inptr
  140. add edi, byte 2*SIZEOF_MMWORD ; outptr
  141. cmp eax, byte SIZEOF_MMWORD
  142. ja near .columnloop ; at least two blocks left
  143. test eax, eax
  144. jnz near .columnloop_last ; exactly one block left
  145. pop esi ; restore the row-pointer cursors saved at .rowloop
  146. pop edi
  147. pop eax ; restore the unrounded colctr
  148. add esi, byte SIZEOF_JSAMPROW ; input_data
  149. add edi, byte SIZEOF_JSAMPROW ; output_data
  150. dec ecx ; rowctr
  151. jg near .rowloop
  152. emms ; empty MMX state
  153. .return:
  154. pop edi
  155. pop esi
  156. ; pop edx ; need not be preserved
  157. ; pop ecx ; need not be preserved
  158. poppic ebx
  159. pop ebp
  160. ret
  161. ; --------------------------------------------------------------------------
  162. ;
  163. ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
  164. ; Again a triangle filter; see comments for h2v1 case, above.
  165. ;
      ; Each loop iteration consumes one input row (plus the rows above and
      ; below it) and produces two output rows, in two stages:
      ;   1. vertical:   Int0 = 3 * row[0] + row[-1]   (upper output row)
      ;                  Int1 = 3 * row[0] + row[+1]   (lower output row)
      ;      The 16-bit intermediates are parked in the output rows themselves.
      ;   2. horizontal: out[2i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
      ;                  out[2i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
      ;
      ; Register roles inside the loops:
      ;   eax = colctr, ecx = rowctr (outer) / inptr1 above (inner)
      ;   ebx = inptr0, temporarily swapped for the GOT address around the
      ;         GOTOFF() uses via pushpic/movpic/poppic (macros from jsimdext.inc)
      ;   esi = input_data (outer) / inptr1 below (inner)
      ;   edx = outptr0, edi = output_data (outer) / outptr1 (inner)
      ;   wk(0), wk(1) = left-neighbour carry for the upper / lower row
      ;   wk(2), wk(3) = right-neighbour word for the upper / lower row
      ;
      ; NOTE: when the width is not a multiple of SIZEOF_MMWORD, one dummy
      ; sample is written just past the end of each of the three *input* rows.
      ;
  166. ; GLOBAL(void)
  167. ; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor,
  168. ; JDIMENSION downsampled_width,
  169. ; JSAMPARRAY input_data,
  170. ; JSAMPARRAY *output_data_ptr);
  171. ;
  172. %define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
  173. %define downsamp_width(b) (b) + 12 ; JDIMENSION downsampled_width
  174. %define input_data(b) (b) + 16 ; JSAMPARRAY input_data
  175. %define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
  176. %define original_ebp ebp + 0
  177. %define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
  178. %define WK_NUM 4
  179. %define gotptr wk(0) - SIZEOF_POINTER ; void *gotptr
  180. align 32
  181. GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx)
  182. EXTN(jsimd_h2v2_fancy_upsample_mmx):
  183. push ebp
  184. mov eax, esp ; eax = original ebp
  185. sub esp, byte 4 ; room for the saved frame pointer
  186. and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
  187. mov [esp], eax ; stored at [aligned ebp]; reloaded by "pop esp" in the epilogue
  188. mov ebp, esp ; ebp = aligned ebp
  189. lea esp, [wk(0)] ; reserve the wk[WK_NUM] scratch area below ebp
  190. pushpic eax ; make a room for GOT address
  191. push ebx
  192. ; push ecx ; need not be preserved
  193. ; push edx ; need not be preserved
  194. push esi
  195. push edi
  196. get_GOT ebx ; get GOT address
  197. movpic POINTER [gotptr], ebx ; save GOT address
  198. mov edx, eax ; edx = original ebp
  199. mov eax, JDIMENSION [downsamp_width(edx)] ; colctr
  200. test eax, eax
  201. jz near .return ; zero width: nothing to do
  202. mov ecx, INT [max_v_samp(edx)] ; rowctr
  203. test ecx, ecx
  204. jz near .return ; zero rows: nothing to do
  205. mov esi, JSAMPARRAY [input_data(edx)] ; input_data
  206. mov edi, POINTER [output_data_ptr(edx)]
  207. mov edi, JSAMPARRAY [edi] ; output_data
  208. alignx 16, 7
  209. .rowloop:
  210. push eax ; colctr
  211. push ecx ; rowctr (ecx is reused as a sample pointer below)
  212. push edi
  213. push esi
  214. mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above)
  215. mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
  216. mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below)
  217. mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
  218. mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
  219. test eax, SIZEOF_MMWORD-1 ; is the width a whole number of mmwords?
  220. jz short .skip
  221. push edx ; dl is needed as a scratch byte
  222. mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
  223. mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl ; duplicate last sample of the row above
  224. mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
  225. mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl ; duplicate last sample of the current row
  226. mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
  227. mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
  228. pop edx
  229. .skip:
  230. ; -- process the first column block
  231. movq mm0, MMWORD [ebx+0*SIZEOF_MMWORD] ; mm0=row[ 0][0]
  232. movq mm1, MMWORD [ecx+0*SIZEOF_MMWORD] ; mm1=row[-1][0]
  233. movq mm2, MMWORD [esi+0*SIZEOF_MMWORD] ; mm2=row[+1][0]
  234. pushpic ebx ; save inptr0 while ebx holds the GOT address
  235. movpic ebx, POINTER [gotptr] ; load GOT address
  236. pxor mm3, mm3 ; mm3=(all 0's)
  237. movq mm4, mm0
  238. punpcklbw mm0, mm3 ; mm0=row[ 0][0]( 0 1 2 3)
  239. punpckhbw mm4, mm3 ; mm4=row[ 0][0]( 4 5 6 7)
  240. movq mm5, mm1
  241. punpcklbw mm1, mm3 ; mm1=row[-1][0]( 0 1 2 3)
  242. punpckhbw mm5, mm3 ; mm5=row[-1][0]( 4 5 6 7)
  243. movq mm6, mm2
  244. punpcklbw mm2, mm3 ; mm2=row[+1][0]( 0 1 2 3)
  245. punpckhbw mm6, mm3 ; mm6=row[+1][0]( 4 5 6 7)
  246. pmullw mm0, [GOTOFF(ebx,PW_THREE)] ; 3 * current row
  247. pmullw mm4, [GOTOFF(ebx,PW_THREE)]
  248. pcmpeqb mm7, mm7 ; mm7=(all 1's)
  249. psrlq mm7, (SIZEOF_MMWORD-2)*BYTE_BIT ; mask that keeps only the lowest word
  250. paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3)
  251. paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7)
  252. paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3)
  253. paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7)
  254. movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; temporarily save
  255. movq MMWORD [edx+1*SIZEOF_MMWORD], mm5 ; the intermediate data
  256. movq MMWORD [edi+0*SIZEOF_MMWORD], mm2
  257. movq MMWORD [edi+1*SIZEOF_MMWORD], mm6
  258. pand mm1, mm7 ; mm1=( 0 - - -)
  259. pand mm2, mm7 ; mm2=( 0 - - -)
  260. movq MMWORD [wk(0)], mm1 ; left neighbour of the first column is column 0 itself
  261. movq MMWORD [wk(1)], mm2
  262. poppic ebx ; ebx = inptr0 again
  263. add eax, byte SIZEOF_MMWORD-1
  264. and eax, byte -SIZEOF_MMWORD ; round colctr up to a multiple of SIZEOF_MMWORD
  265. cmp eax, byte SIZEOF_MMWORD
  266. ja short .columnloop ; more than one block left: a next block exists
  267. alignx 16, 7
  268. .columnloop_last:
  269. ; -- process the last column block
  270. pushpic ebx
  271. movpic ebx, POINTER [gotptr] ; load GOT address
  272. pcmpeqb mm1, mm1 ; mm1=(all 1's)
  273. psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT ; mask that keeps only the highest word
  274. movq mm2, mm1
  275. pand mm1, MMWORD [edx+1*SIZEOF_MMWORD] ; mm1=( - - - 7)
  276. pand mm2, MMWORD [edi+1*SIZEOF_MMWORD] ; mm2=( - - - 7)
  277. movq MMWORD [wk(2)], mm1 ; right neighbour of the last column is that column itself
  278. movq MMWORD [wk(3)], mm2
  279. jmp short .upsample
  280. alignx 16, 7
  281. .columnloop:
  282. ; -- process the next column block
  283. movq mm0, MMWORD [ebx+1*SIZEOF_MMWORD] ; mm0=row[ 0][1]
  284. movq mm1, MMWORD [ecx+1*SIZEOF_MMWORD] ; mm1=row[-1][1]
  285. movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; mm2=row[+1][1]
  286. pushpic ebx ; save inptr0 while ebx holds the GOT address
  287. movpic ebx, POINTER [gotptr] ; load GOT address
  288. pxor mm3, mm3 ; mm3=(all 0's)
  289. movq mm4, mm0
  290. punpcklbw mm0, mm3 ; mm0=row[ 0][1]( 0 1 2 3)
  291. punpckhbw mm4, mm3 ; mm4=row[ 0][1]( 4 5 6 7)
  292. movq mm5, mm1
  293. punpcklbw mm1, mm3 ; mm1=row[-1][1]( 0 1 2 3)
  294. punpckhbw mm5, mm3 ; mm5=row[-1][1]( 4 5 6 7)
  295. movq mm6, mm2
  296. punpcklbw mm2, mm3 ; mm2=row[+1][1]( 0 1 2 3)
  297. punpckhbw mm6, mm3 ; mm6=row[+1][1]( 4 5 6 7)
  298. pmullw mm0, [GOTOFF(ebx,PW_THREE)] ; 3 * current row
  299. pmullw mm4, [GOTOFF(ebx,PW_THREE)]
  300. paddw mm1, mm0 ; mm1=Int0L=( 0 1 2 3)
  301. paddw mm5, mm4 ; mm5=Int0H=( 4 5 6 7)
  302. paddw mm2, mm0 ; mm2=Int1L=( 0 1 2 3)
  303. paddw mm6, mm4 ; mm6=Int1H=( 4 5 6 7)
  304. movq MMWORD [edx+2*SIZEOF_MMWORD], mm1 ; temporarily save
  305. movq MMWORD [edx+3*SIZEOF_MMWORD], mm5 ; the intermediate data
  306. movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
  307. movq MMWORD [edi+3*SIZEOF_MMWORD], mm6
  308. psllq mm1, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm1=( - - - 0)
  309. psllq mm2, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm2=( - - - 0)
  310. movq MMWORD [wk(2)], mm1 ; right neighbour = first word of the next block
  311. movq MMWORD [wk(3)], mm2
  312. .upsample:
  313. ; -- process the upper row
  314. movq mm7, MMWORD [edx+0*SIZEOF_MMWORD] ; mm7=Int0L=( 0 1 2 3)
  315. movq mm3, MMWORD [edx+1*SIZEOF_MMWORD] ; mm3=Int0H=( 4 5 6 7)
  316. movq mm0, mm7
  317. movq mm4, mm3
  318. psrlq mm0, 2*BYTE_BIT ; mm0=( 1 2 3 -)
  319. psllq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( - - - 4)
  320. movq mm5, mm7
  321. movq mm6, mm3
  322. psrlq mm5, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm5=( 3 - - -)
  323. psllq mm6, 2*BYTE_BIT ; mm6=( - 4 5 6)
  324. por mm0, mm4 ; mm0=( 1 2 3 4)
  325. por mm5, mm6 ; mm5=( 3 4 5 6)
  326. movq mm1, mm7
  327. movq mm2, mm3
  328. psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2)
  329. psrlq mm2, 2*BYTE_BIT ; mm2=( 5 6 7 -)
  330. movq mm4, mm3
  331. psrlq mm4, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm4=( 7 - - -)
  332. por mm1, MMWORD [wk(0)] ; mm1=(-1 0 1 2)
  333. por mm2, MMWORD [wk(2)] ; mm2=( 5 6 7 8)
  334. movq MMWORD [wk(0)], mm4 ; carry word 7 as the left neighbour for the next block
  335. pmullw mm7, [GOTOFF(ebx,PW_THREE)] ; 3 * current column
  336. pmullw mm3, [GOTOFF(ebx,PW_THREE)]
  337. paddw mm1, [GOTOFF(ebx,PW_EIGHT)] ; left neighbour + 8 (even-output bias)
  338. paddw mm5, [GOTOFF(ebx,PW_EIGHT)]
  339. paddw mm0, [GOTOFF(ebx,PW_SEVEN)] ; right neighbour + 7 (odd-output bias)
  340. paddw mm2, [GOTOFF(ebx,PW_SEVEN)]
  341. paddw mm1, mm7
  342. paddw mm5, mm3
  343. psrlw mm1, 4 ; mm1=Out0LE=( 0 2 4 6)
  344. psrlw mm5, 4 ; mm5=Out0HE=( 8 10 12 14)
  345. paddw mm0, mm7
  346. paddw mm2, mm3
  347. psrlw mm0, 4 ; mm0=Out0LO=( 1 3 5 7)
  348. psrlw mm2, 4 ; mm2=Out0HO=( 9 11 13 15)
  349. psllw mm0, BYTE_BIT ; move odd results into the high byte of each word
  350. psllw mm2, BYTE_BIT
  351. por mm1, mm0 ; mm1=Out0L=( 0 1 2 3 4 5 6 7)
  352. por mm5, mm2 ; mm5=Out0H=( 8 9 10 11 12 13 14 15)
  353. movq MMWORD [edx+0*SIZEOF_MMWORD], mm1 ; final samples overwrite the parked intermediates
  354. movq MMWORD [edx+1*SIZEOF_MMWORD], mm5
  355. ; -- process the lower row
  356. movq mm6, MMWORD [edi+0*SIZEOF_MMWORD] ; mm6=Int1L=( 0 1 2 3)
  357. movq mm4, MMWORD [edi+1*SIZEOF_MMWORD] ; mm4=Int1H=( 4 5 6 7)
  358. movq mm7, mm6
  359. movq mm3, mm4
  360. psrlq mm7, 2*BYTE_BIT ; mm7=( 1 2 3 -)
  361. psllq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( - - - 4)
  362. movq mm0, mm6
  363. movq mm2, mm4
  364. psrlq mm0, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm0=( 3 - - -)
  365. psllq mm2, 2*BYTE_BIT ; mm2=( - 4 5 6)
  366. por mm7, mm3 ; mm7=( 1 2 3 4)
  367. por mm0, mm2 ; mm0=( 3 4 5 6)
  368. movq mm1, mm6
  369. movq mm5, mm4
  370. psllq mm1, 2*BYTE_BIT ; mm1=( - 0 1 2)
  371. psrlq mm5, 2*BYTE_BIT ; mm5=( 5 6 7 -)
  372. movq mm3, mm4
  373. psrlq mm3, (SIZEOF_MMWORD-2)*BYTE_BIT ; mm3=( 7 - - -)
  374. por mm1, MMWORD [wk(1)] ; mm1=(-1 0 1 2)
  375. por mm5, MMWORD [wk(3)] ; mm5=( 5 6 7 8)
  376. movq MMWORD [wk(1)], mm3 ; carry word 7 as the left neighbour for the next block
  377. pmullw mm6, [GOTOFF(ebx,PW_THREE)] ; 3 * current column
  378. pmullw mm4, [GOTOFF(ebx,PW_THREE)]
  379. paddw mm1, [GOTOFF(ebx,PW_EIGHT)] ; left neighbour + 8 (even-output bias)
  380. paddw mm0, [GOTOFF(ebx,PW_EIGHT)]
  381. paddw mm7, [GOTOFF(ebx,PW_SEVEN)] ; right neighbour + 7 (odd-output bias)
  382. paddw mm5, [GOTOFF(ebx,PW_SEVEN)]
  383. paddw mm1, mm6
  384. paddw mm0, mm4
  385. psrlw mm1, 4 ; mm1=Out1LE=( 0 2 4 6)
  386. psrlw mm0, 4 ; mm0=Out1HE=( 8 10 12 14)
  387. paddw mm7, mm6
  388. paddw mm5, mm4
  389. psrlw mm7, 4 ; mm7=Out1LO=( 1 3 5 7)
  390. psrlw mm5, 4 ; mm5=Out1HO=( 9 11 13 15)
  391. psllw mm7, BYTE_BIT ; move odd results into the high byte of each word
  392. psllw mm5, BYTE_BIT
  393. por mm1, mm7 ; mm1=Out1L=( 0 1 2 3 4 5 6 7)
  394. por mm0, mm5 ; mm0=Out1H=( 8 9 10 11 12 13 14 15)
  395. movq MMWORD [edi+0*SIZEOF_MMWORD], mm1
  396. movq MMWORD [edi+1*SIZEOF_MMWORD], mm0
  397. poppic ebx ; ebx = inptr0 again
  398. sub eax, byte SIZEOF_MMWORD ; one input block consumed
  399. add ecx, byte 1*SIZEOF_MMWORD ; inptr1(above)
  400. add ebx, byte 1*SIZEOF_MMWORD ; inptr0
  401. add esi, byte 1*SIZEOF_MMWORD ; inptr1(below)
  402. add edx, byte 2*SIZEOF_MMWORD ; outptr0
  403. add edi, byte 2*SIZEOF_MMWORD ; outptr1
  404. cmp eax, byte SIZEOF_MMWORD
  405. ja near .columnloop ; at least two blocks left
  406. test eax, eax
  407. jnz near .columnloop_last ; exactly one block left
  408. pop esi ; restore the values saved at .rowloop
  409. pop edi
  410. pop ecx
  411. pop eax
  412. add esi, byte 1*SIZEOF_JSAMPROW ; input_data
  413. add edi, byte 2*SIZEOF_JSAMPROW ; output_data
  414. sub ecx, byte 2 ; rowctr
  415. jg near .rowloop
  416. emms ; empty MMX state
  417. .return:
  418. pop edi
  419. pop esi
  420. ; pop edx ; need not be preserved
  421. ; pop ecx ; need not be preserved
  422. pop ebx
  423. mov esp, ebp ; esp <- aligned ebp
  424. pop esp ; esp <- original ebp
  425. pop ebp
  426. ret
  427. ; --------------------------------------------------------------------------
  428. ;
  429. ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
  430. ; It's still a box filter.
  431. ;
      ; Every input sample is simply written twice in a row. The column loop
      ; is unrolled twice; each half reads one mmword and stores two.
      ; The output width is rounded up to a multiple of 2*SIZEOF_MMWORD, so
      ; stores are always done in whole two-mmword units.
      ;
      ; Register roles: edx = rounded output width, eax = colctr,
      ; ecx = rowctr, esi = input_data / inptr, edi = output_data / outptr.
      ;
  432. ; GLOBAL(void)
  433. ; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
  434. ; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
  435. ;
  436. %define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
  437. %define output_width(b) (b) + 12 ; JDIMENSION output_width
  438. %define input_data(b) (b) + 16 ; JSAMPARRAY input_data
  439. %define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
  440. align 32
  441. GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx)
  442. EXTN(jsimd_h2v1_upsample_mmx):
  443. push ebp
  444. mov ebp, esp ; arguments are addressed relative to ebp
  445. ; push ebx ; unused
  446. ; push ecx ; need not be preserved
  447. ; push edx ; need not be preserved
  448. push esi
  449. push edi
  450. mov edx, JDIMENSION [output_width(ebp)]
  451. add edx, byte (2*SIZEOF_MMWORD)-1
  452. and edx, byte -(2*SIZEOF_MMWORD) ; round up to a multiple of 2*SIZEOF_MMWORD
  453. jz short .return ; zero width: nothing to do
  454. mov ecx, INT [max_v_samp(ebp)] ; rowctr
  455. test ecx, ecx
  456. jz short .return ; zero rows: nothing to do
  457. mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
  458. mov edi, POINTER [output_data_ptr(ebp)]
  459. mov edi, JSAMPARRAY [edi] ; output_data
  460. alignx 16, 7
  461. .rowloop:
  462. push edi ; keep the row-pointer cursors; esi/edi are reused as sample pointers
  463. push esi
  464. mov esi, JSAMPROW [esi] ; inptr
  465. mov edi, JSAMPROW [edi] ; outptr
  466. mov eax, edx ; colctr
  467. alignx 16, 7
  468. .columnloop:
  469. movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
  470. movq mm1, mm0
  471. punpcklbw mm0, mm0 ; interleave with itself: low half, each sample doubled
  472. punpckhbw mm1, mm1 ; high half, each sample doubled
  473. movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
  474. movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
  475. sub eax, byte 2*SIZEOF_MMWORD ; two output mmwords written
  476. jz short .nextrow
  477. movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; second (unrolled) half of the loop body
  478. movq mm3, mm2
  479. punpcklbw mm2, mm2
  480. punpckhbw mm3, mm3
  481. movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
  482. movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
  483. sub eax, byte 2*SIZEOF_MMWORD
  484. jz short .nextrow
  485. add esi, byte 2*SIZEOF_MMWORD ; inptr
  486. add edi, byte 4*SIZEOF_MMWORD ; outptr
  487. jmp short .columnloop
  488. alignx 16, 7
  489. .nextrow:
  490. pop esi ; restore the row-pointer cursors saved at .rowloop
  491. pop edi
  492. add esi, byte SIZEOF_JSAMPROW ; input_data
  493. add edi, byte SIZEOF_JSAMPROW ; output_data
  494. dec ecx ; rowctr
  495. jg short .rowloop
  496. emms ; empty MMX state
  497. .return:
  498. pop edi
  499. pop esi
  500. ; pop edx ; need not be preserved
  501. ; pop ecx ; need not be preserved
  502. ; pop ebx ; unused
  503. pop ebp
  504. ret
  505. ; --------------------------------------------------------------------------
  506. ;
  507. ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
  508. ; It's still a box filter.
  509. ;
      ; Every input sample is written twice horizontally, and the same doubled
      ; data is stored to two consecutive output rows. The column loop is
      ; unrolled twice. The output width is rounded up to a multiple of
      ; 2*SIZEOF_MMWORD, so stores are always done in whole two-mmword units.
      ;
      ; Register roles: edx = rounded output width, eax = colctr,
      ; ecx = rowctr (decremented by 2 per input row), esi = input_data / inptr,
      ; ebx = outptr0, edi = output_data / outptr1.
      ;
  510. ; GLOBAL(void)
  511. ; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
  512. ; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
  513. ;
  514. %define max_v_samp(b) (b) + 8 ; int max_v_samp_factor
  515. %define output_width(b) (b) + 12 ; JDIMENSION output_width
  516. %define input_data(b) (b) + 16 ; JSAMPARRAY input_data
  517. %define output_data_ptr(b) (b) + 20 ; JSAMPARRAY *output_data_ptr
  518. align 32
  519. GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx)
  520. EXTN(jsimd_h2v2_upsample_mmx):
  521. push ebp
  522. mov ebp, esp ; arguments are addressed relative to ebp
  523. push ebx ; ebx is used as outptr0 below
  524. ; push ecx ; need not be preserved
  525. ; push edx ; need not be preserved
  526. push esi
  527. push edi
  528. mov edx, JDIMENSION [output_width(ebp)]
  529. add edx, byte (2*SIZEOF_MMWORD)-1
  530. and edx, byte -(2*SIZEOF_MMWORD) ; round up to a multiple of 2*SIZEOF_MMWORD
  531. jz near .return ; zero width: nothing to do
  532. mov ecx, INT [max_v_samp(ebp)] ; rowctr
  533. test ecx, ecx
  534. jz short .return ; zero rows: nothing to do
  535. mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
  536. mov edi, POINTER [output_data_ptr(ebp)]
  537. mov edi, JSAMPARRAY [edi] ; output_data
  538. alignx 16, 7
  539. .rowloop:
  540. push edi ; keep the row-pointer cursors; esi/edi are reused as sample pointers
  541. push esi
  542. mov esi, JSAMPROW [esi] ; inptr
  543. mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0
  544. mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1
  545. mov eax, edx ; colctr
  546. alignx 16, 7
  547. .columnloop:
  548. movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
  549. movq mm1, mm0
  550. punpcklbw mm0, mm0 ; interleave with itself: low half, each sample doubled
  551. punpckhbw mm1, mm1 ; high half, each sample doubled
  552. movq MMWORD [ebx+0*SIZEOF_MMWORD], mm0 ; same data goes to both output rows
  553. movq MMWORD [ebx+1*SIZEOF_MMWORD], mm1
  554. movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
  555. movq MMWORD [edi+1*SIZEOF_MMWORD], mm1
  556. sub eax, byte 2*SIZEOF_MMWORD ; two output mmwords written per row
  557. jz short .nextrow
  558. movq mm2, MMWORD [esi+1*SIZEOF_MMWORD] ; second (unrolled) half of the loop body
  559. movq mm3, mm2
  560. punpcklbw mm2, mm2
  561. punpckhbw mm3, mm3
  562. movq MMWORD [ebx+2*SIZEOF_MMWORD], mm2
  563. movq MMWORD [ebx+3*SIZEOF_MMWORD], mm3
  564. movq MMWORD [edi+2*SIZEOF_MMWORD], mm2
  565. movq MMWORD [edi+3*SIZEOF_MMWORD], mm3
  566. sub eax, byte 2*SIZEOF_MMWORD
  567. jz short .nextrow
  568. add esi, byte 2*SIZEOF_MMWORD ; inptr
  569. add ebx, byte 4*SIZEOF_MMWORD ; outptr0
  570. add edi, byte 4*SIZEOF_MMWORD ; outptr1
  571. jmp short .columnloop
  572. alignx 16, 7
  573. .nextrow:
  574. pop esi ; restore the row-pointer cursors saved at .rowloop
  575. pop edi
  576. add esi, byte 1*SIZEOF_JSAMPROW ; input_data
  577. add edi, byte 2*SIZEOF_JSAMPROW ; output_data
  578. sub ecx, byte 2 ; rowctr
  579. jg short .rowloop
  580. emms ; empty MMX state
  581. .return:
  582. pop edi
  583. pop esi
  584. ; pop edx ; need not be preserved
  585. ; pop ecx ; need not be preserved
  586. pop ebx
  587. pop ebp
  588. ret
  589. ; For some reason, the OS X linker does not honor the request to align the
  590. ; segment unless we do this.
  591. align 32 ; pad the end of the code emitted by this file to a 32-byte boundary