jfdctflt-3dn.asm 12 KB


  1. ;
  2. ; jfdctflt-3dn.asm - floating-point FDCT (3DNow!)
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2016, D. R. Commander.
  6. ;
  7. ; Based on the x86 SIMD extension for IJG JPEG library
  8. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  9. ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10. ;
  11. ; This file should be assembled with NASM (Netwide Assembler); it
  12. ; can *not* be assembled with Microsoft's MASM or any compatible
  13. ; assembler (including Borland's Turbo Assembler).
  14. ; NASM is available from http://nasm.sourceforge.net/ or
  15. ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16. ;
  17. ; This file contains a floating-point implementation of the forward DCT
  18. ; (Discrete Cosine Transform). The following code is based directly on
  19. ; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
  20. %include "jsimdext.inc"
  21. %include "jdct.inc"
  22. ; --------------------------------------------------------------------------
  ; Constant table for the 3DNow! floating-point forward DCT below.
  ; Each multiplier is emitted twice (times 2 dd), i.e. as one 8-byte pair of
  ; identical single-precision values, so that a single pfmul with this
  ; memory operand scales both packed floats of an mm register by the same
  ; factor.
  ; NOTE(review): SEG_CONST, alignz, GLOBAL_DATA and EXTN come from the
  ; included .inc files and are not visible here; alignz presumably pads
  ; with zero bytes to the given boundary -- confirm in jsimdext.inc.
  23. SECTION SEG_CONST
  24. alignz 32
  25. GLOBAL_DATA(jconst_fdct_float_3dnow)
  26. EXTN(jconst_fdct_float_3dnow):
  27. PD_0_382 times 2 dd 0.382683432365089771728460 ; cos(3*pi/8); multiplier for z5
  28. PD_0_707 times 2 dd 0.707106781186547524400844 ; cos(pi/4) = 1/sqrt(2); multiplier for z1 and z3
  29. PD_0_541 times 2 dd 0.541196100146196984399723 ; cos(pi/8) - cos(3*pi/8); multiplier for z2
  30. PD_1_306 times 2 dd 1.306562964876376527856643 ; cos(pi/8) + cos(3*pi/8); multiplier for z4
  31. alignz 32
  32. ; --------------------------------------------------------------------------
  33. SECTION SEG_TEXT
  34. BITS 32
  35. ;
  36. ; Perform the forward DCT on one block of samples.
  37. ;
  38. ; GLOBAL(void)
  39. ; jsimd_fdct_float_3dnow(FAST_FLOAT *data)
  40. ;
  ; The transform is done in place: every value is loaded from and stored
  ; back to the block that `data` points to.
  ;
  ; Stack-frame helpers used by the function body:
  ;
  ; data(b): address of the first stack argument, where b is a register
  ; holding esp as it was right after `push ebp` (b+0 = saved ebp,
  ; b+4 = return address, b+8 = data).
  41. %define data(b) (b) + 8 ; FAST_FLOAT *data
  ; original_ebp: the dword at the aligned ebp in which the prologue stores
  ; that pre-alignment frame pointer.  It is not referenced by name in this
  ; file; the epilogue reloads it with `pop esp`.
  42. %define original_ebp ebp + 0
  ; wk(i): i-th mmword of the scratch area directly below the aligned ebp;
  ; wk(0) is the lowest address and wk(WK_NUM-1) ends at ebp.  WK_NUM is
  ; defined after this line, which is fine because a %define body is only
  ; expanded where wk() is used.
  43. %define wk(i) ebp - (WK_NUM - (i)) * SIZEOF_MMWORD ; mmword wk[WK_NUM]
  ; Two scratch slots: wk(0) holds tmp6 and wk(1) holds tmp7 while the even
  ; part of each pass uses all eight mm registers.
  44. %define WK_NUM 2
  ; jsimd_fdct_float_3dnow: two-pass forward DCT over the block at `data`,
  ; computed with packed-float 3DNow! instructions (two floats per mm
  ; register) and written back over the input.
  ;
  ; Register roles in the code below:
  ;   eax      original ebp (frame base); read again at the start of pass 2
  ;   ebx      GOT address from get_GOT, base for the GOTOFF() constants
  ;   ecx      loop counter, DCTSIZE/2 iterations per pass
  ;   edx      pointer into the block, advanced at the end of each iteration
  ;   mm0-mm7  working values; tmp6/tmp7 are spilled to wk(0)/wk(1)
  ;
  ; NOTE(review): pushpic/poppic, get_GOT, GOTOFF, MMBLOCK and alignx are
  ; macros from the included .inc files and are not visible here; the
  ; comments below rely on the existing register comments for the element
  ; addressed by each MMBLOCK() operand.
  45. align 32
  46. GLOBAL_FUNCTION(jsimd_fdct_float_3dnow)
  47. EXTN(jsimd_fdct_float_3dnow):
  48. push ebp ; save the caller's ebp
  49. mov eax, esp ; eax = original ebp (esp right after the push; data is at [eax+8])
  50. sub esp, byte 4 ; room for one dword: the saved original ebp
  51. and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
  52. mov [esp], eax ; store original ebp at the aligned address
  53. mov ebp, esp ; ebp = aligned ebp
  54. lea esp, [wk(0)] ; reserve the WK_NUM-mmword scratch area below ebp
  55. pushpic ebx ; NOTE(review): presumably saves ebx for PIC builds -- confirm in jsimdext.inc
  56. ; push ecx ; need not be preserved
  57. ; push edx ; need not be preserved
  58. ; push esi ; unused
  59. ; push edi ; unused
  60. get_GOT ebx ; get GOT address
  61. ; ---- Pass 1: process rows.
  ; Each iteration transforms two rows at once: 2x2 groups of floats are
  ; transposed with punpckldq/punpckhdq so that every mm register holds the
  ; same column element of both rows.  The results are stored without
  ; transposing back, so after this pass each 2x2 group in memory is
  ; transposed; pass 2 accounts for that when it loads.
  62. mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
  63. mov ecx, DCTSIZE/2 ; two rows per iteration
  64. alignx 16, 7
  65. .rowloop:
  66. movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
  67. movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
  68. movq mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
  69. movq mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
  70. ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
  71. movq mm4, mm0 ; transpose coefficients
  72. punpckldq mm0, mm1 ; mm0=(00 10)=data0
  73. punpckhdq mm4, mm1 ; mm4=(01 11)=data1
  74. movq mm5, mm2 ; transpose coefficients
  75. punpckldq mm2, mm3 ; mm2=(06 16)=data6
  76. punpckhdq mm5, mm3 ; mm5=(07 17)=data7
  77. movq mm6, mm4 ; copy data1 for the sum
  78. movq mm7, mm0 ; copy data0 for the sum
  79. pfsub mm4, mm2 ; mm4=data1-data6=tmp6
  80. pfsub mm0, mm5 ; mm0=data0-data7=tmp7
  81. pfadd mm6, mm2 ; mm6=data1+data6=tmp1
  82. pfadd mm7, mm5 ; mm7=data0+data7=tmp0
  83. movq mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
  84. movq mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
  85. movq mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
  86. movq mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
  87. ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
  88. movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
  89. movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
  90. movq mm4, mm1 ; transpose coefficients
  91. punpckldq mm1, mm3 ; mm1=(02 12)=data2
  92. punpckhdq mm4, mm3 ; mm4=(03 13)=data3
  93. movq mm0, mm2 ; transpose coefficients
  94. punpckldq mm2, mm5 ; mm2=(04 14)=data4
  95. punpckhdq mm0, mm5 ; mm0=(05 15)=data5
  96. movq mm3, mm4 ; copy data3 for the difference
  97. movq mm5, mm1 ; copy data2 for the difference
  98. pfadd mm4, mm2 ; mm4=data3+data4=tmp3
  99. pfadd mm1, mm0 ; mm1=data2+data5=tmp2
  100. pfsub mm3, mm2 ; mm3=data3-data4=tmp4
  101. pfsub mm5, mm0 ; mm5=data2-data5=tmp5
  102. ; -- Even part
  103. movq mm2, mm7 ; copy tmp0
  104. movq mm0, mm6 ; copy tmp1
  105. pfsub mm7, mm4 ; mm7=tmp13
  106. pfsub mm6, mm1 ; mm6=tmp12
  107. pfadd mm2, mm4 ; mm2=tmp10
  108. pfadd mm0, mm1 ; mm0=tmp11
  109. pfadd mm6, mm7 ; mm6=tmp12+tmp13
  110. pfmul mm6, [GOTOFF(ebx,PD_0_707)] ; mm6=z1
  111. movq mm4, mm2 ; copy tmp10
  112. movq mm1, mm7 ; copy tmp13
  113. pfsub mm2, mm0 ; mm2=data4
  114. pfsub mm7, mm6 ; mm7=data6
  115. pfadd mm4, mm0 ; mm4=data0
  116. pfadd mm1, mm6 ; mm1=data2
  117. movq MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2 ; slot (04 05) <- data4 of both rows
  118. movq MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7 ; slot (06 07) <- data6 of both rows
  119. movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 ; slot (00 01) <- data0 of both rows
  120. movq MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1 ; slot (02 03) <- data2 of both rows
  121. ; -- Odd part
  122. movq mm0, MMWORD [wk(0)] ; mm0=tmp6
  123. movq mm6, MMWORD [wk(1)] ; mm6=tmp7
  124. pfadd mm3, mm5 ; mm3=tmp10
  125. pfadd mm5, mm0 ; mm5=tmp11
  126. pfadd mm0, mm6 ; mm0=tmp12, mm6=tmp7
  127. pfmul mm5, [GOTOFF(ebx,PD_0_707)] ; mm5=z3
  128. movq mm2, mm3 ; mm2=tmp10
  129. pfsub mm3, mm0 ; mm3=tmp10-tmp12
  130. pfmul mm3, [GOTOFF(ebx,PD_0_382)] ; mm3=z5
  131. pfmul mm2, [GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
  132. pfmul mm0, [GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
  133. pfadd mm2, mm3 ; mm2=z2
  134. pfadd mm0, mm3 ; mm0=z4
  135. movq mm7, mm6 ; copy tmp7
  136. pfsub mm6, mm5 ; mm6=z13
  137. pfadd mm7, mm5 ; mm7=z11
  138. movq mm4, mm6 ; copy z13
  139. movq mm1, mm7 ; copy z11
  140. pfsub mm6, mm2 ; mm6=data3
  141. pfsub mm7, mm0 ; mm7=data7
  142. pfadd mm4, mm2 ; mm4=data5
  143. pfadd mm1, mm0 ; mm1=data1
  144. movq MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6 ; slot (12 13) <- data3 of both rows
  145. movq MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7 ; slot (16 17) <- data7 of both rows
  146. movq MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4 ; slot (14 15) <- data5 of both rows
  147. movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 ; slot (10 11) <- data1 of both rows
  148. add edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; advance to the next pair of rows
  149. dec ecx
  150. jnz near .rowloop
  151. ; ---- Pass 2: process columns.
  ; Each iteration transforms two adjacent columns.  The loads see the
  ; 2x2-transposed layout left by pass 1 (hence mm0=(00 10) below); the
  ; punpck step restores natural order, so each register holds one row's
  ; two column elements and the results are stored in natural order.
  ; No visible instruction writes eax after the prologue, so it is still the
  ; original ebp here (assuming get_GOT leaves eax alone -- TODO confirm).
  152. mov edx, POINTER [data(eax)] ; (FAST_FLOAT *)
  153. mov ecx, DCTSIZE/2 ; two columns per iteration
  154. alignx 16, 7
  155. .columnloop:
  156. movq mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
  157. movq mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
  158. movq mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
  159. movq mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
  160. ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
  161. movq mm4, mm0 ; transpose coefficients
  162. punpckldq mm0, mm1 ; mm0=(00 01)=data0
  163. punpckhdq mm4, mm1 ; mm4=(10 11)=data1
  164. movq mm5, mm2 ; transpose coefficients
  165. punpckldq mm2, mm3 ; mm2=(60 61)=data6
  166. punpckhdq mm5, mm3 ; mm5=(70 71)=data7
  167. movq mm6, mm4 ; copy data1 for the sum
  168. movq mm7, mm0 ; copy data0 for the sum
  169. pfsub mm4, mm2 ; mm4=data1-data6=tmp6
  170. pfsub mm0, mm5 ; mm0=data0-data7=tmp7
  171. pfadd mm6, mm2 ; mm6=data1+data6=tmp1
  172. pfadd mm7, mm5 ; mm7=data0+data7=tmp0
  173. movq mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
  174. movq mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
  175. movq mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
  176. movq mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
  177. ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
  178. movq MMWORD [wk(0)], mm4 ; wk(0)=tmp6
  179. movq MMWORD [wk(1)], mm0 ; wk(1)=tmp7
  180. movq mm4, mm1 ; transpose coefficients
  181. punpckldq mm1, mm3 ; mm1=(20 21)=data2
  182. punpckhdq mm4, mm3 ; mm4=(30 31)=data3
  183. movq mm0, mm2 ; transpose coefficients
  184. punpckldq mm2, mm5 ; mm2=(40 41)=data4
  185. punpckhdq mm0, mm5 ; mm0=(50 51)=data5
  186. movq mm3, mm4 ; copy data3 for the difference
  187. movq mm5, mm1 ; copy data2 for the difference
  188. pfadd mm4, mm2 ; mm4=data3+data4=tmp3
  189. pfadd mm1, mm0 ; mm1=data2+data5=tmp2
  190. pfsub mm3, mm2 ; mm3=data3-data4=tmp4
  191. pfsub mm5, mm0 ; mm5=data2-data5=tmp5
  192. ; -- Even part
  193. movq mm2, mm7 ; copy tmp0
  194. movq mm0, mm6 ; copy tmp1
  195. pfsub mm7, mm4 ; mm7=tmp13
  196. pfsub mm6, mm1 ; mm6=tmp12
  197. pfadd mm2, mm4 ; mm2=tmp10
  198. pfadd mm0, mm1 ; mm0=tmp11
  199. pfadd mm6, mm7 ; mm6=tmp12+tmp13
  200. pfmul mm6, [GOTOFF(ebx,PD_0_707)] ; mm6=z1
  201. movq mm4, mm2 ; copy tmp10
  202. movq mm1, mm7 ; copy tmp13
  203. pfsub mm2, mm0 ; mm2=data4
  204. pfsub mm7, mm6 ; mm7=data6
  205. pfadd mm4, mm0 ; mm4=data0
  206. pfadd mm1, mm6 ; mm1=data2
  207. movq MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2 ; (40 41) <- data4
  208. movq MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7 ; (60 61) <- data6
  209. movq MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4 ; (00 01) <- data0
  210. movq MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1 ; (20 21) <- data2
  211. ; -- Odd part
  212. movq mm0, MMWORD [wk(0)] ; mm0=tmp6
  213. movq mm6, MMWORD [wk(1)] ; mm6=tmp7
  214. pfadd mm3, mm5 ; mm3=tmp10
  215. pfadd mm5, mm0 ; mm5=tmp11
  216. pfadd mm0, mm6 ; mm0=tmp12, mm6=tmp7
  217. pfmul mm5, [GOTOFF(ebx,PD_0_707)] ; mm5=z3
  218. movq mm2, mm3 ; mm2=tmp10
  219. pfsub mm3, mm0 ; mm3=tmp10-tmp12
  220. pfmul mm3, [GOTOFF(ebx,PD_0_382)] ; mm3=z5
  221. pfmul mm2, [GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
  222. pfmul mm0, [GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
  223. pfadd mm2, mm3 ; mm2=z2
  224. pfadd mm0, mm3 ; mm0=z4
  225. movq mm7, mm6 ; copy tmp7
  226. pfsub mm6, mm5 ; mm6=z13
  227. pfadd mm7, mm5 ; mm7=z11
  228. movq mm4, mm6 ; copy z13
  229. movq mm1, mm7 ; copy z11
  230. pfsub mm6, mm2 ; mm6=data3
  231. pfsub mm7, mm0 ; mm7=data7
  232. pfadd mm4, mm2 ; mm4=data5
  233. pfadd mm1, mm0 ; mm1=data1
  234. movq MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6 ; (30 31) <- data3
  235. movq MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7 ; (70 71) <- data7
  236. movq MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4 ; (50 51) <- data5
  237. movq MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1 ; (10 11) <- data1
  238. add edx, byte 2*SIZEOF_FAST_FLOAT ; advance to the next pair of columns
  239. dec ecx
  240. jnz near .columnloop
  241. femms ; empty MMX/3DNow! state
  242. ; pop edi ; unused
  243. ; pop esi ; unused
  244. ; pop edx ; need not be preserved
  245. ; pop ecx ; need not be preserved
  246. poppic ebx ; counterpart of the pushpic in the prologue
  247. mov esp, ebp ; esp <- aligned ebp
  248. pop esp ; esp <- original ebp
  249. pop ebp ; restore the caller's ebp
  250. ret
  251. ; For some reason, the OS X linker does not honor the request to align the
  252. ; segment unless we do this.
  253. align 32