jsimdext.inc 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479
  1. ;
  2. ; jsimdext.inc - common declarations
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2010, 2016, 2019, D. R. Commander.
  6. ; Copyright (C) 2018, Matthieu Darbois.
  7. ;
  8. ; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
  9. ;
  10. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  11. ;
  12. ; This software is provided 'as-is', without any express or implied
  13. ; warranty. In no event will the authors be held liable for any damages
  14. ; arising from the use of this software.
  15. ;
  16. ; Permission is granted to anyone to use this software for any purpose,
  17. ; including commercial applications, and to alter it and redistribute it
  18. ; freely, subject to the following restrictions:
  19. ;
  20. ; 1. The origin of this software must not be misrepresented; you must not
  21. ; claim that you wrote the original software. If you use this software
  22. ; in a product, an acknowledgment in the product documentation would be
  23. ; appreciated but is not required.
  24. ; 2. Altered source versions must be plainly marked as such, and must not be
  25. ; misrepresented as being the original software.
  26. ; 3. This notice may not be removed or altered from any source distribution.
  27. ; ==========================================================================
  28. ; System-dependent configurations
  29. %ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)--------
  30. ; * Microsoft Visual C++
  31. ; * MinGW (Minimalist GNU for Windows)
  32. ; * CygWin
  33. ; * LCC-Win32
  34. ; -- segment definition --
  35. ;
  36. %ifdef __YASM_VER__
  37. %define SEG_TEXT .text align=32
  38. %define SEG_CONST .rdata align=32
  39. %else
  40. %define SEG_TEXT .text align=32 public use32 class=CODE
  41. %define SEG_CONST .rdata align=32 public use32 class=CONST
  42. %endif
  43. %elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)--------
  44. ; * Microsoft Visual C++
  45. ; -- segment definition --
  46. ;
  47. %ifdef __YASM_VER__
  48. %define SEG_TEXT .text align=32
  49. %define SEG_CONST .rdata align=32
  50. %else
  51. %define SEG_TEXT .text align=32 public use64 class=CODE
  52. %define SEG_CONST .rdata align=32 public use64 class=CONST
  53. %endif
  54. %define EXTN(name) name ; foo() -> foo
  55. %elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)----------
  56. ; * Borland C++ (Win32)
  57. ; -- segment definition --
  58. ;
  59. %define SEG_TEXT _text align=32 public use32 class=CODE
  60. %define SEG_CONST _data align=32 public use32 class=DATA
  61. %elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
  62. ; * Linux
  63. ; * *BSD family Unix using elf format
  64. ; * Unix System V, including Solaris x86, UnixWare and SCO Unix
  65. ; mark stack as non-executable
  66. section .note.GNU-stack noalloc noexec nowrite progbits
  67. ; -- segment definition --
  68. ;
  69. %ifdef __x86_64__
  70. %define SEG_TEXT .text progbits align=32
  71. %define SEG_CONST .rodata progbits align=32
  72. %else
  73. %define SEG_TEXT .text progbits alloc exec nowrite align=32
  74. %define SEG_CONST .rodata progbits alloc noexec nowrite align=32
  75. %endif
  76. ; To make the code position-independent, append -DPIC to the commandline
  77. ;
  78. %define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC
  79. %define EXTN(name) name ; foo() -> foo
  80. %elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)----
  81. ; * Older Linux using a.out format (nasm -f aout -DAOUT ...)
  82. ; * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...)
  83. ; -- segment definition --
  84. ;
  85. %define SEG_TEXT .text
  86. %define SEG_CONST .data
  87. ; To make the code position-independent, append -DPIC to the commandline
  88. ;
  89. %define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC
  90. %elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
  91. ; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
  92. ; -- segment definition --
  93. ;
  94. %define SEG_TEXT .text ;align=32 ; nasm doesn't accept align=32. why?
  95. %define SEG_CONST .rodata align=32
  96. ; The generation of position-independent code (PIC) is the default on Darwin.
  97. ;
  98. %define PIC
  99. %define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing
  100. %else ; ----(Other case)----------------------
  101. ; -- segment definition --
  102. ;
  103. %define SEG_TEXT .text
  104. %define SEG_CONST .data
  105. %endif ; ----------------------------------------------
  106. ; ==========================================================================
  107. ; --------------------------------------------------------------------------
  108. ; Common types
  109. ;
  110. %ifdef __x86_64__
  111. %define POINTER qword ; general pointer type
  112. %define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
  113. %define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT
  114. %else
  115. %define POINTER dword ; general pointer type
  116. %define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
  117. %define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
  118. %endif
  119. %define INT dword ; signed integer type
  120. %define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT)
  121. %define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT
  122. %define FP32 dword ; IEEE754 single
  123. %define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32)
  124. %define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT
  125. %define MMWORD qword ; int64 (MMX register)
  126. %define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD)
  127. %define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT
  128. ; NASM is buggy and doesn't properly handle operand sizes for SSE
  129. ; instructions, so for now we have to define XMMWORD as blank.
  130. %define XMMWORD ; int128 (SSE register)
  131. %define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD)
  132. %define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT
  133. %define YMMWORD ; int256 (AVX register)
  134. %define SIZEOF_YMMWORD SIZEOF_YWORD ; sizeof(YMMWORD)
  135. %define YMMWORD_BIT YWORD_BIT ; sizeof(YMMWORD)*BYTE_BIT
  136. ; Similar hacks for when we load a dword or MMWORD into an xmm# register
  137. %define XMM_DWORD
  138. %define XMM_MMWORD
  139. %define SIZEOF_BYTE 1 ; sizeof(byte)
  140. %define SIZEOF_WORD 2 ; sizeof(word)
  141. %define SIZEOF_DWORD 4 ; sizeof(dword)
  142. %define SIZEOF_QWORD 8 ; sizeof(qword)
  143. %define SIZEOF_OWORD 16 ; sizeof(oword)
  144. %define SIZEOF_YWORD 32 ; sizeof(yword)
  145. %define BYTE_BIT 8 ; CHAR_BIT in C
  146. %define WORD_BIT 16 ; sizeof(word)*BYTE_BIT
  147. %define DWORD_BIT 32 ; sizeof(dword)*BYTE_BIT
  148. %define QWORD_BIT 64 ; sizeof(qword)*BYTE_BIT
  149. %define OWORD_BIT 128 ; sizeof(oword)*BYTE_BIT
  150. %define YWORD_BIT 256 ; sizeof(yword)*BYTE_BIT
  151. ; --------------------------------------------------------------------------
  152. ; External Symbol Name
  153. ;
  154. %ifndef EXTN
  155. %define EXTN(name) _ %+ name ; foo() -> _foo
  156. %endif
  157. ; --------------------------------------------------------------------------
  158. ; Hidden symbols
  159. ;
  160. %ifdef ELF ; ----(nasm -felf[64] -DELF ...)--------
  161. %define GLOBAL_FUNCTION(name) global EXTN(name):function hidden
  162. %define GLOBAL_DATA(name) global EXTN(name):data hidden
  163. %elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
  164. %ifdef __YASM_VER__
  165. %define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
  166. %define GLOBAL_DATA(name) global EXTN(name):private_extern
  167. %else
  168. %if __NASM_VERSION_ID__ >= 0x020E0000
  169. %define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
  170. %define GLOBAL_DATA(name) global EXTN(name):private_extern
  171. %endif
  172. %endif
  173. %endif
  174. %ifndef GLOBAL_FUNCTION
  175. %define GLOBAL_FUNCTION(name) global EXTN(name)
  176. %endif
  177. %ifndef GLOBAL_DATA
  178. %define GLOBAL_DATA(name) global EXTN(name)
  179. %endif
  180. ; --------------------------------------------------------------------------
  181. ; Macros for position-independent code (PIC) support
  182. ;
  183. %ifndef GOT_SYMBOL
  184. %undef PIC
  185. %endif
  186. %ifdef PIC ; -------------------------------------------
  187. %ifidn GOT_SYMBOL, _MACHO_PIC_ ; --------------------
  188. ; At present, nasm doesn't seem to support PIC generation for Mach-O.
  189. ; The PIC support code below is a little tricky.
  190. SECTION SEG_CONST
  191. const_base:
  192. %define GOTOFF(got, sym) (got) + (sym) - const_base
  193. %imacro get_GOT 1
  194. ; NOTE: this macro destroys ecx resister.
  195. call %%geteip
  196. add ecx, byte (%%ref - $)
  197. jmp short %%adjust
  198. %%geteip:
  199. mov ecx, POINTER [esp]
  200. ret
  201. %%adjust:
  202. push ebp
  203. xor ebp, ebp ; ebp = 0
  204. %ifidni %1, ebx ; (%1 == ebx)
  205. ; db 0x8D,0x9C + jmp near const_base =
  206. ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
  207. db 0x8D, 0x9C ; 8D,9C
  208. jmp near const_base ; E9,(const_base-%%ref)
  209. %%ref:
  210. %else ; (%1 != ebx)
  211. ; db 0x8D,0x8C + jmp near const_base =
  212. ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
  213. db 0x8D, 0x8C ; 8D,8C
  214. jmp near const_base ; E9,(const_base-%%ref)
  215. %%ref:
  216. mov %1, ecx
  217. %endif ; (%1 == ebx)
  218. pop ebp
  219. %endmacro
  220. %else ; GOT_SYMBOL != _MACHO_PIC_ ----------------
  221. %define GOTOFF(got, sym) (got) + (sym) wrt ..gotoff
  222. %imacro get_GOT 1
  223. extern GOT_SYMBOL
  224. call %%geteip
  225. add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
  226. jmp short %%done
  227. %%geteip:
  228. mov %1, POINTER [esp]
  229. ret
  230. %%done:
  231. %endmacro
  232. %endif ; GOT_SYMBOL == _MACHO_PIC_ ----------------
  233. %imacro pushpic 1.nolist
  234. push %1
  235. %endmacro
  236. %imacro poppic 1.nolist
  237. pop %1
  238. %endmacro
  239. %imacro movpic 2.nolist
  240. mov %1, %2
  241. %endmacro
  242. %else ; !PIC -----------------------------------------
  243. %define GOTOFF(got, sym) (sym)
  244. %imacro get_GOT 1.nolist
  245. %endmacro
  246. %imacro pushpic 1.nolist
  247. %endmacro
  248. %imacro poppic 1.nolist
  249. %endmacro
  250. %imacro movpic 2.nolist
  251. %endmacro
  252. %endif ; PIC -----------------------------------------
  253. ; --------------------------------------------------------------------------
  254. ; Align the next instruction on {2,4,8,16,..}-byte boundary.
  255. ; ".balign n,,m" in GNU as
  256. ;
  257. %define MSKLE(x, y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
  258. %define FILLB(b, n) (($$-(b)) & ((n)-1))
  259. %imacro alignx 1-2.nolist 0xFFFF
  260. %%bs: \
  261. times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
  262. db 0x90 ; nop
  263. times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
  264. db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00 ; lea ebx,[ebx+0x00000000]
  265. times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
  266. db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000]
  267. times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
  268. db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000]
  269. times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
  270. db 0x8D, 0x6C, 0x25, 0x00 ; lea ebp,[ebp+0x00]
  271. times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
  272. db 0x8D, 0x6D, 0x00 ; lea ebp,[ebp+0x00]
  273. times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
  274. db 0x8B, 0xED ; mov ebp,ebp
  275. times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
  276. db 0x90 ; nop
  277. %endmacro
  278. ; Align the next data on {2,4,8,16,..}-byte boundary.
  279. ;
  280. %imacro alignz 1.nolist
  281. align %1, db 0 ; filling zeros
  282. %endmacro
  283. %ifdef __x86_64__
  284. %ifdef WIN64
  285. %imacro collect_args 1
  286. sub rsp, SIZEOF_XMMWORD
  287. movaps XMMWORD [rsp], xmm6
  288. sub rsp, SIZEOF_XMMWORD
  289. movaps XMMWORD [rsp], xmm7
  290. mov r10, rcx
  291. %if %1 > 1
  292. mov r11, rdx
  293. %endif
  294. %if %1 > 2
  295. push r12
  296. mov r12, r8
  297. %endif
  298. %if %1 > 3
  299. push r13
  300. mov r13, r9
  301. %endif
  302. %if %1 > 4
  303. push r14
  304. mov r14, [rax+48]
  305. %endif
  306. %if %1 > 5
  307. push r15
  308. mov r15, [rax+56]
  309. %endif
  310. push rsi
  311. push rdi
  312. %endmacro
  313. %imacro uncollect_args 1
  314. pop rdi
  315. pop rsi
  316. %if %1 > 5
  317. pop r15
  318. %endif
  319. %if %1 > 4
  320. pop r14
  321. %endif
  322. %if %1 > 3
  323. pop r13
  324. %endif
  325. %if %1 > 2
  326. pop r12
  327. %endif
  328. movaps xmm7, XMMWORD [rsp]
  329. add rsp, SIZEOF_XMMWORD
  330. movaps xmm6, XMMWORD [rsp]
  331. add rsp, SIZEOF_XMMWORD
  332. %endmacro
  333. %imacro push_xmm 1
  334. sub rsp, %1 * SIZEOF_XMMWORD
  335. movaps XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
  336. %if %1 > 1
  337. movaps XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
  338. %endif
  339. %if %1 > 2
  340. movaps XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
  341. %endif
  342. %if %1 > 3
  343. movaps XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
  344. %endif
  345. %endmacro
  346. %imacro pop_xmm 1
  347. movaps xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
  348. %if %1 > 1
  349. movaps xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
  350. %endif
  351. %if %1 > 2
  352. movaps xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
  353. %endif
  354. %if %1 > 3
  355. movaps xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
  356. %endif
  357. add rsp, %1 * SIZEOF_XMMWORD
  358. %endmacro
  359. %else
  360. %imacro collect_args 1
  361. push r10
  362. mov r10, rdi
  363. %if %1 > 1
  364. push r11
  365. mov r11, rsi
  366. %endif
  367. %if %1 > 2
  368. push r12
  369. mov r12, rdx
  370. %endif
  371. %if %1 > 3
  372. push r13
  373. mov r13, rcx
  374. %endif
  375. %if %1 > 4
  376. push r14
  377. mov r14, r8
  378. %endif
  379. %if %1 > 5
  380. push r15
  381. mov r15, r9
  382. %endif
  383. %endmacro
  384. %imacro uncollect_args 1
  385. %if %1 > 5
  386. pop r15
  387. %endif
  388. %if %1 > 4
  389. pop r14
  390. %endif
  391. %if %1 > 3
  392. pop r13
  393. %endif
  394. %if %1 > 2
  395. pop r12
  396. %endif
  397. %if %1 > 1
  398. pop r11
  399. %endif
  400. pop r10
  401. %endmacro
  402. %imacro push_xmm 1
  403. %endmacro
  404. %imacro pop_xmm 1
  405. %endmacro
  406. %endif
  407. %endif
  408. ; --------------------------------------------------------------------------
  409. ; Defines picked up from the C headers
  410. ;
  411. %include "jsimdcfg.inc"
  412. ; --------------------------------------------------------------------------