jsimdext.inc 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376
  1. ;
  2. ; jsimdext.inc - common declarations
  3. ;
  4. ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  5. ; Copyright (C) 2010, D. R. Commander.
  6. ;
  7. ; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
  8. ;
  9. ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  10. ;
  11. ; This software is provided 'as-is', without any express or implied
  12. ; warranty. In no event will the authors be held liable for any damages
  13. ; arising from the use of this software.
  14. ;
  15. ; Permission is granted to anyone to use this software for any purpose,
  16. ; including commercial applications, and to alter it and redistribute it
  17. ; freely, subject to the following restrictions:
  18. ;
  19. ; 1. The origin of this software must not be misrepresented; you must not
  20. ; claim that you wrote the original software. If you use this software
  21. ; in a product, an acknowledgment in the product documentation would be
  22. ; appreciated but is not required.
  23. ; 2. Altered source versions must be plainly marked as such, and must not be
  24. ; misrepresented as being the original software.
  25. ; 3. This notice may not be removed or altered from any source distribution.
  26. ;
  27. ; [TAB8]
  28. ; ==========================================================================
  29. ; System-dependent configurations
  ; Per-object-format settings.  One branch of the chain below is taken,
  ; chosen by a symbol supplied with -D on the assembler command line.
  ; Every branch defines SEG_TEXT and SEG_CONST (the section name plus its
  ; attributes); some branches additionally define EXTN and/or GOT_SYMBOL.
  %ifdef WIN32
    ; ---- nasm -fwin32 -DWIN32 ...
    ;   * Microsoft Visual C++
    ;   * MinGW (Minimalist GNU for Windows)
    ;   * CygWin
    ;   * LCC-Win32
    %ifdef __YASM_VER__
      %define SEG_TEXT  .text align=16
      %define SEG_CONST .rdata align=16
    %else
      %define SEG_TEXT  .text align=16 public use32 class=CODE
      %define SEG_CONST .rdata align=16 public use32 class=CONST
    %endif

  %elifdef WIN64
    ; ---- nasm -fwin64 -DWIN64 ...
    ;   * Microsoft Visual C++
    %ifdef __YASM_VER__
      %define SEG_TEXT  .text align=16
      %define SEG_CONST .rdata align=16
    %else
      %define SEG_TEXT  .text align=16 public use64 class=CODE
      %define SEG_CONST .rdata align=16 public use64 class=CONST
    %endif
    ; External names are used undecorated: foo() -> foo
    %define EXTN(name) name

  %elifdef OBJ32
    ; ---- nasm -fobj -DOBJ32 ...
    ;   * Borland C++ (Win32)
    %define SEG_TEXT  _text align=16 public use32 class=CODE
    %define SEG_CONST _data align=16 public use32 class=DATA

  %elifdef ELF
    ; ---- nasm -felf[64] -DELF ...
    ;   * Linux
    ;   * *BSD family Unix using elf format
    ;   * Unix System V, including Solaris x86, UnixWare and SCO Unix

    ; Mark the stack as non-executable.
    section .note.GNU-stack noalloc noexec nowrite progbits

    %ifdef __x86_64__
      %define SEG_TEXT  .text progbits align=16
      %define SEG_CONST .rodata progbits align=16
    %else
      %define SEG_TEXT  .text progbits alloc exec nowrite align=16
      %define SEG_CONST .rodata progbits alloc noexec nowrite align=16
    %endif

    ; ELF supports PIC; append -DPIC to the command line to request
    ; position-independent code.
    %define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_
    ; External names are used undecorated: foo() -> foo
    %define EXTN(name) name

  %elifdef AOUT
    ; ---- nasm -faoutb/aout -DAOUT ...
    ;   * Older Linux using a.out format      (nasm -f aout  -DAOUT ...)
    ;   * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...)
    %define SEG_TEXT  .text
    %define SEG_CONST .data

    ; BSD-style a.out supports PIC; append -DPIC to the command line to
    ; request position-independent code.
    %define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_

  %elifdef MACHO
    ; ---- nasm -fmacho -DMACHO ...
    ;   * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

    ; No align=16 on the text section: nasm doesn't accept it here.
    %define SEG_TEXT  .text
    %define SEG_CONST .rodata align=16

    ; Position-independent code is the default on Darwin, so PIC is forced
    ; on.  _MACHO_PIC_ selects the Mach-O style code-relative addressing.
    %define PIC
    %define GOT_SYMBOL _MACHO_PIC_

  %else
    ; ---- Any other case: plain section names, no attributes.
    %define SEG_TEXT  .text
    %define SEG_CONST .data
  %endif
  107. ; ==========================================================================
  108. ; --------------------------------------------------------------------------
  109. ; Common types
  110. ;
  ; Primitive sizes, in bytes.
  %define SIZEOF_BYTE  1                ; sizeof(BYTE)
  %define SIZEOF_WORD  2                ; sizeof(WORD)
  %define SIZEOF_DWORD 4                ; sizeof(DWORD)
  %define SIZEOF_QWORD 8                ; sizeof(QWORD)
  %define SIZEOF_OWORD 16               ; sizeof(OWORD)

  ; Primitive widths, in bits.
  %define BYTE_BIT  8                   ; CHAR_BIT in C
  %define WORD_BIT  16                  ; sizeof(WORD)*BYTE_BIT
  %define DWORD_BIT 32                  ; sizeof(DWORD)*BYTE_BIT
  %define QWORD_BIT 64                  ; sizeof(QWORD)*BYTE_BIT
  %define OWORD_BIT 128                 ; sizeof(OWORD)*BYTE_BIT

  ; General pointer type: qword when __x86_64__ is defined, dword otherwise.
  %ifdef __x86_64__
    %define POINTER        qword
    %define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
    %define POINTER_BIT    QWORD_BIT    ; sizeof(POINTER)*BYTE_BIT
  %else
    %define POINTER        dword
    %define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
    %define POINTER_BIT    DWORD_BIT    ; sizeof(POINTER)*BYTE_BIT
  %endif

  ; Signed integer type.
  %define INT        dword
  %define SIZEOF_INT SIZEOF_DWORD       ; sizeof(INT)
  %define INT_BIT    DWORD_BIT          ; sizeof(INT)*BYTE_BIT

  ; IEEE754 single-precision float.
  %define FP32        dword
  %define SIZEOF_FP32 SIZEOF_DWORD      ; sizeof(FP32)
  %define FP32_BIT    DWORD_BIT         ; sizeof(FP32)*BYTE_BIT

  ; int64 (MMX register).
  %define MMWORD        qword
  %define SIZEOF_MMWORD SIZEOF_QWORD    ; sizeof(MMWORD)
  %define MMWORD_BIT    QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT

  ; int128 (SSE register).  NASM is buggy and doesn't properly handle
  ; operand sizes for SSE instructions, so for now XMMWORD is deliberately
  ; defined as blank and expands to nothing.
  %define XMMWORD
  %define SIZEOF_XMMWORD SIZEOF_OWORD   ; sizeof(XMMWORD)
  %define XMMWORD_BIT    OWORD_BIT      ; sizeof(XMMWORD)*BYTE_BIT

  ; Same hack for loading a dword or an MMWORD into an xmm# register:
  ; both size keywords expand to nothing.
  %define XMM_DWORD
  %define XMM_MMWORD
  147. ; --------------------------------------------------------------------------
  148. ; External Symbol Name
  149. ;
  ; Fallback symbol decoration, used only when nothing earlier has defined
  ; EXTN: prepend a single underscore, i.e. foo() -> _foo.
  %ifndef EXTN
    %define EXTN(name) _ %+ name
  %endif
  153. ; --------------------------------------------------------------------------
  154. ; Macros for position-independent code (PIC) support
  155. ;
  ; PIC is only honoured when GOT_SYMBOL has been defined; without it any
  ; PIC request is dropped here.
  %ifndef GOT_SYMBOL
    %undef PIC
  %endif
  ; PIC helper interface.  All three configurations below provide the same
  ; names:
  ;   GOTOFF(got,sym)  address expression for sym, given base register got
  ;   get_GOT reg      load the PIC base into reg
  ;   pushpic reg      push reg    (expands to nothing without PIC)
  ;   poppic  reg      pop reg     (expands to nothing without PIC)
  ;   movpic  dst,src  mov dst,src (expands to nothing without PIC)
  %ifdef PIC
    %ifidn GOT_SYMBOL,_MACHO_PIC_
      ; ---- Mach-O ------------------------------------------------------
      ; At present, nasm doesn't seem to support PIC generation for
      ; Mach-O, so the base address is the start of the constant section
      ; (const_base) and is computed by hand.  The code below is a little
      ; tricky.
      SECTION SEG_CONST
  const_base:

      %define GOTOFF(got,sym) (got) + (sym) - const_base

      ; get_GOT reg
      ;   NOTE: this macro destroys the ecx register.
      ;   ebp is saved and restored around the sequence.
      %imacro get_GOT 1
          call    %%fetch_eip             ; ecx = address of the next insn
          add     ecx, byte (%%anchor - $) ; ecx = address of %%anchor
          jmp     short %%rebase
      %%fetch_eip:
          mov     ecx, POINTER [esp]      ; pick up our own return address
          ret
      %%rebase:
          push    ebp
          xor     ebp,ebp                 ; ebp = 0
        %ifidni %1,ebx                    ; (%1 == ebx)
          ; The two opcode bytes followed by the bytes of the near jmp
          ; decode together as one instruction:
          ;   db 0x8D,0x9C + jmp near const_base =
          ;   lea ebx, [ecx+ebp*8+(const_base-%%anchor)] ; 8D,9C,E9,(offset32)
          db      0x8D,0x9C               ; 8D,9C
          jmp     near const_base         ; E9,(const_base-%%anchor)
      %%anchor:
        %else                             ; (%1 != ebx)
          ; Same trick, with ecx as the destination:
          ;   db 0x8D,0x8C + jmp near const_base =
          ;   lea ecx, [ecx+ebp*8+(const_base-%%anchor)] ; 8D,8C,E9,(offset32)
          db      0x8D,0x8C               ; 8D,8C
          jmp     near const_base         ; E9,(const_base-%%anchor)
      %%anchor:
          mov     %1, ecx
        %endif                            ; (%1 == ebx)
          pop     ebp
      %endmacro

    %else
      ; ---- GOT_SYMBOL != _MACHO_PIC_ -------------------------------------
      %define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff

      ; get_GOT reg
      ;   Reads the address of the add instruction into reg through a
      ;   call/ret pair, then adds the ..gotpc-relative displacement of
      ;   GOT_SYMBOL to it.
      %imacro get_GOT 1
          extern  GOT_SYMBOL
          call    %%fetch_eip
          add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
          jmp     short %%got_ready
      %%fetch_eip:
          mov     %1, POINTER [esp]       ; pick up our own return address
          ret
      %%got_ready:
      %endmacro
    %endif                                ; GOT_SYMBOL == _MACHO_PIC_

    ; With PIC enabled the register-shuffling helpers are real instructions.
    %imacro pushpic 1.nolist
          push    %1
    %endmacro

    %imacro poppic 1.nolist
          pop     %1
    %endmacro

    %imacro movpic 2.nolist
          mov     %1,%2
    %endmacro

  %else
    ; ---- !PIC --------------------------------------------------------------
    ; Symbols are addressed directly and every helper expands to nothing.
    %define GOTOFF(got,sym) (sym)

    %imacro get_GOT 1.nolist
    %endmacro

    %imacro pushpic 1.nolist
    %endmacro

    %imacro poppic 1.nolist
    %endmacro

    %imacro movpic 2.nolist
    %endmacro
  %endif                                  ; PIC
  225. ; --------------------------------------------------------------------------
  226. ; Align the next instruction on {2,4,8,16,..}-byte boundary.
  227. ; ".balign n,,m" in GNU as
  228. ;
  ; MSKLE(x,y): for operands below 0x10000, evaluates to a mask with all
  ;             low-order bits set when x <= y, and to 0 when x > y.
  ; FILLB(b,n): number of bytes between address b and the next n-byte
  ;             boundary (n a power of two), measured from the section
  ;             start $$.
  %define MSKLE(x,y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
  %define FILLB(b,n) (($$-(b)) & ((n)-1))

  ; alignx n [,m]
  ;   Pad up to the next n-byte boundary, but only when the padding needed
  ;   at the start of the macro is at most m bytes (m defaults to 0xFFFF).
  ;   A gap of 16 bytes or more is filled entirely with single-byte nops.
  ;   A shorter gap is filled greedily: each "times" line below emits its
  ;   filler only while the gap still remaining is large enough.
  ;
  ;   NOTE(review): the multi-byte fillers are 32-bit encodings that write
  ;   ebx/ebp.  When assembled for a 64-bit target such writes clear the
  ;   upper half of rbx/rbp, so there the fillers are not true no-ops if
  ;   execution falls through them -- confirm how alignx is used in x86-64
  ;   code.
  %imacro alignx 1-2.nolist 0xFFFF
  %%start: times MSKLE(FILLB(%%start,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
          db 0x90                                 ; nop
          times MSKLE(FILLB(%%start,%1),%2) & FILLB($,%1)/9 \
          db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00   ; lea ebx,[ebx+0x00000000]
          times MSKLE(FILLB(%%start,%1),%2) & FILLB($,%1)/7 \
          db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00   ; lea ebp,[ebp+0x00000000]
          times MSKLE(FILLB(%%start,%1),%2) & FILLB($,%1)/6 \
          db 0x8D,0xAD,0x00,0x00,0x00,0x00        ; lea ebp,[ebp+0x00000000]
          times MSKLE(FILLB(%%start,%1),%2) & FILLB($,%1)/4 \
          db 0x8D,0x6C,0x25,0x00                  ; lea ebp,[ebp+0x00]
          times MSKLE(FILLB(%%start,%1),%2) & FILLB($,%1)/3 \
          db 0x8D,0x6D,0x00                       ; lea ebp,[ebp+0x00]
          times MSKLE(FILLB(%%start,%1),%2) & FILLB($,%1)/2 \
          db 0x8B,0xED                            ; mov ebp,ebp
          times MSKLE(FILLB(%%start,%1),%2) & FILLB($,%1)/1 \
          db 0x90                                 ; nop
  %endmacro

  ; alignz n
  ;   Align the next data on an n-byte boundary, padding with zero bytes.
  %imacro alignz 1.nolist
          align   %1, db 0                        ; filling zeros
  %endmacro
  249. ; Align the next data on {2,4,8,16,..}-byte boundary.
  250. ;
  251. %imacro alignz 1.nolist
  252. align %1, db 0 ; filling zeros
  253. %endmacro
  ; collect_args / uncollect_args (64-bit builds only)
  ;
  ; collect_args copies the incoming arguments into r10..r15, so code that
  ; follows it can use the same registers for the same arguments in both
  ; variants below.  uncollect_args undoes every stack adjustment made by
  ; collect_args, in reverse order; the two must stay in step.
  %ifdef __x86_64__
    %ifdef WIN64
      ; ---- WIN64 variant ---------------------------------------------------
      ; Saves r12-r15, rsi, rdi, xmm6 and xmm7 on the stack.
      ; rcx, rdx, r8, r9 are copied to r10, r11, r12, r13.
      ; r14 and r15 are loaded from [rax+48] and [rax+56]; rax is not set
      ; here.  NOTE(review): presumably the caller puts its stack pointer
      ; in rax first, so that these are the 5th and 6th arguments -- confirm
      ; at the call sites.
      ; The movaps stores require rsp to be 16-byte aligned at that point.
      %imacro collect_args 0
          push    r12
          push    r13
          push    r14
          push    r15
          mov     r10, rcx
          mov     r11, rdx
          mov     r12, r8
          mov     r13, r9
          mov     r14, [rax+48]
          mov     r15, [rax+56]
          push    rsi
          push    rdi
          sub     rsp, SIZEOF_XMMWORD
          movaps  XMMWORD [rsp], xmm6
          sub     rsp, SIZEOF_XMMWORD
          movaps  XMMWORD [rsp], xmm7
      %endmacro

      ; Restores, in reverse order, everything the macro above saved.
      %imacro uncollect_args 0
          movaps  xmm7, XMMWORD [rsp]
          add     rsp, SIZEOF_XMMWORD
          movaps  xmm6, XMMWORD [rsp]
          add     rsp, SIZEOF_XMMWORD
          pop     rdi
          pop     rsi
          pop     r15
          pop     r14
          pop     r13
          pop     r12
      %endmacro

    %else
      ; ---- non-WIN64 variant -----------------------------------------------
      ; Saves r10-r15 on the stack, then copies rdi, rsi, rdx, rcx, r8, r9
      ; to r10, r11, r12, r13, r14, r15.
      %imacro collect_args 0
          push    r10
          push    r11
          push    r12
          push    r13
          push    r14
          push    r15
          mov     r10, rdi
          mov     r11, rsi
          mov     r12, rdx
          mov     r13, rcx
          mov     r14, r8
          mov     r15, r9
      %endmacro

      ; Restores r15..r10 in the reverse of the push order above.
      %imacro uncollect_args 0
          pop     r15
          pop     r14
          pop     r13
          pop     r12
          pop     r11
          pop     r10
      %endmacro
    %endif
  %endif
  311. ; --------------------------------------------------------------------------
  312. ; Defines picked up from the C headers
  313. ;
  314. %include "jsimdcfg.inc"
  315. ; --------------------------------------------------------------------------