/*
Copyright (C) 1996-1997 Id Software, Inc.

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/
//
// d_draw.s
// x86 assembly-language horizontal 8-bpp span-drawing code.
//

#include "asm_i386.h"
#include "quakeasm.h"
#include "asm_draw.h"
#include "d_ifacea.h"

#if	id386

//----------------------------------------------------------------------
// 8-bpp horizontal span drawing code for polygons, with no transparency.
//
// Assumes there is at least one span in pspans, and that every span
// contains at least one pixel
//----------------------------------------------------------------------
  30. .text
  31. // out-of-line, rarely-needed clamping code
  32. LClampHigh0:
  33. movl C(bbextents),%esi
  34. jmp LClampReentry0
  35. LClampHighOrLow0:
  36. jg LClampHigh0
  37. xorl %esi,%esi
  38. jmp LClampReentry0
  39. LClampHigh1:
  40. movl C(bbextentt),%edx
  41. jmp LClampReentry1
  42. LClampHighOrLow1:
  43. jg LClampHigh1
  44. xorl %edx,%edx
  45. jmp LClampReentry1
  46. LClampLow2:
  47. movl $2048,%ebp
  48. jmp LClampReentry2
  49. LClampHigh2:
  50. movl C(bbextents),%ebp
  51. jmp LClampReentry2
  52. LClampLow3:
  53. movl $2048,%ecx
  54. jmp LClampReentry3
  55. LClampHigh3:
  56. movl C(bbextentt),%ecx
  57. jmp LClampReentry3
  58. LClampLow4:
  59. movl $2048,%eax
  60. jmp LClampReentry4
  61. LClampHigh4:
  62. movl C(bbextents),%eax
  63. jmp LClampReentry4
  64. LClampLow5:
  65. movl $2048,%ebx
  66. jmp LClampReentry5
  67. LClampHigh5:
  68. movl C(bbextentt),%ebx
  69. jmp LClampReentry5
  70. #define pspans 4+16
  71. .align 4
  72. .globl C(D_DrawSpans8)
  73. C(D_DrawSpans8):
  74. pushl %ebp // preserve caller's stack frame
  75. pushl %edi
  76. pushl %esi // preserve register variables
  77. pushl %ebx
  78. //
  79. // set up scaled-by-8 steps, for 8-long segments; also set up cacheblock
  80. // and span list pointers
  81. //
  82. // TODO: any overlap from rearranging?
  83. flds C(d_sdivzstepu)
  84. fmuls fp_8
  85. movl C(cacheblock),%edx
  86. flds C(d_tdivzstepu)
  87. fmuls fp_8
  88. movl pspans(%esp),%ebx // point to the first span descriptor
  89. flds C(d_zistepu)
  90. fmuls fp_8
  91. movl %edx,pbase // pbase = cacheblock
  92. fstps zi8stepu
  93. fstps tdivz8stepu
  94. fstps sdivz8stepu
  95. LSpanLoop:
  96. //
  97. // set up the initial s/z, t/z, and 1/z on the FP stack, and generate the
  98. // initial s and t values
  99. //
  100. // FIXME: pipeline FILD?
  101. fildl espan_t_v(%ebx)
  102. fildl espan_t_u(%ebx)
  103. fld %st(1) // dv | du | dv
  104. fmuls C(d_sdivzstepv) // dv*d_sdivzstepv | du | dv
  105. fld %st(1) // du | dv*d_sdivzstepv | du | dv
  106. fmuls C(d_sdivzstepu) // du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
  107. fld %st(2) // du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
  108. fmuls C(d_tdivzstepu) // du*d_tdivzstepu | du*d_sdivzstepu |
  109. // dv*d_sdivzstepv | du | dv
  110. fxch %st(1) // du*d_sdivzstepu | du*d_tdivzstepu |
  111. // dv*d_sdivzstepv | du | dv
  112. faddp %st(0),%st(2) // du*d_tdivzstepu |
  113. // du*d_sdivzstepu + dv*d_sdivzstepv | du | dv
  114. fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv |
  115. // du*d_tdivzstepu | du | dv
  116. fld %st(3) // dv | du*d_sdivzstepu + dv*d_sdivzstepv |
  117. // du*d_tdivzstepu | du | dv
  118. fmuls C(d_tdivzstepv) // dv*d_tdivzstepv |
  119. // du*d_sdivzstepu + dv*d_sdivzstepv |
  120. // du*d_tdivzstepu | du | dv
  121. fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv |
  122. // dv*d_tdivzstepv | du*d_tdivzstepu | du | dv
  123. fadds C(d_sdivzorigin) // sdivz = d_sdivzorigin + dv*d_sdivzstepv +
  124. // du*d_sdivzstepu; stays in %st(2) at end
  125. fxch %st(4) // dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |
  126. // s/z
  127. fmuls C(d_zistepv) // dv*d_zistepv | dv*d_tdivzstepv |
  128. // du*d_tdivzstepu | du | s/z
  129. fxch %st(1) // dv*d_tdivzstepv | dv*d_zistepv |
  130. // du*d_tdivzstepu | du | s/z
  131. faddp %st(0),%st(2) // dv*d_zistepv |
  132. // dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z
  133. fxch %st(2) // du | dv*d_tdivzstepv + du*d_tdivzstepu |
  134. // dv*d_zistepv | s/z
  135. fmuls C(d_zistepu) // du*d_zistepu |
  136. // dv*d_tdivzstepv + du*d_tdivzstepu |
  137. // dv*d_zistepv | s/z
  138. fxch %st(1) // dv*d_tdivzstepv + du*d_tdivzstepu |
  139. // du*d_zistepu | dv*d_zistepv | s/z
  140. fadds C(d_tdivzorigin) // tdivz = d_tdivzorigin + dv*d_tdivzstepv +
  141. // du*d_tdivzstepu; stays in %st(1) at end
  142. fxch %st(2) // dv*d_zistepv | du*d_zistepu | t/z | s/z
  143. faddp %st(0),%st(1) // dv*d_zistepv + du*d_zistepu | t/z | s/z
  144. flds fp_64k // fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z
  145. fxch %st(1) // dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z
  146. fadds C(d_ziorigin) // zi = d_ziorigin + dv*d_zistepv +
  147. // du*d_zistepu; stays in %st(0) at end
  148. // 1/z | fp_64k | t/z | s/z
  149. //
  150. // calculate and clamp s & t
  151. //
  152. fdivr %st(0),%st(1) // 1/z | z*64k | t/z | s/z
  153. //
  154. // point %edi to the first pixel in the span
  155. //
  156. movl C(d_viewbuffer),%ecx
  157. movl espan_t_v(%ebx),%eax
  158. movl %ebx,pspantemp // preserve spans pointer
  159. movl C(tadjust),%edx
  160. movl C(sadjust),%esi
  161. movl C(d_scantable)(,%eax,4),%edi // v * screenwidth
  162. addl %ecx,%edi
  163. movl espan_t_u(%ebx),%ecx
  164. addl %ecx,%edi // pdest = &pdestspan[scans->u];
  165. movl espan_t_count(%ebx),%ecx
  166. //
  167. // now start the FDIV for the end of the span
  168. //
  169. cmpl $8,%ecx
  170. ja LSetupNotLast1
  171. decl %ecx
  172. jz LCleanup1 // if only one pixel, no need to start an FDIV
  173. movl %ecx,spancountminus1
  174. // finish up the s and t calcs
  175. fxch %st(1) // z*64k | 1/z | t/z | s/z
  176. fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z
  177. fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z
  178. fxch %st(1) // z*64k | s | 1/z | t/z | s/z
  179. fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z
  180. fxch %st(1) // s | t | 1/z | t/z | s/z
  181. fistpl s // 1/z | t | t/z | s/z
  182. fistpl t // 1/z | t/z | s/z
  183. fildl spancountminus1
  184. flds C(d_tdivzstepu) // C(d_tdivzstepu) | spancountminus1
  185. flds C(d_zistepu) // C(d_zistepu) | C(d_tdivzstepu) | spancountminus1
  186. fmul %st(2),%st(0) // C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1
  187. fxch %st(1) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
  188. fmul %st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
  189. fxch %st(2) // scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1
  190. fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |
  191. // C(d_tdivzstepu)*scm1
  192. fxch %st(1) // C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |
  193. // C(d_tdivzstepu)*scm1
  194. faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
  195. fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
  196. faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1
  197. faddp %st(0),%st(3)
  198. flds fp_64k
  199. fdiv %st(1),%st(0) // this is what we've gone to all this trouble to
  200. // overlap
  201. jmp LFDIVInFlight1
  202. LCleanup1:
  203. // finish up the s and t calcs
  204. fxch %st(1) // z*64k | 1/z | t/z | s/z
  205. fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z
  206. fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z
  207. fxch %st(1) // z*64k | s | 1/z | t/z | s/z
  208. fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z
  209. fxch %st(1) // s | t | 1/z | t/z | s/z
  210. fistpl s // 1/z | t | t/z | s/z
  211. fistpl t // 1/z | t/z | s/z
  212. jmp LFDIVInFlight1
  213. .align 4
  214. LSetupNotLast1:
  215. // finish up the s and t calcs
  216. fxch %st(1) // z*64k | 1/z | t/z | s/z
  217. fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z
  218. fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z
  219. fxch %st(1) // z*64k | s | 1/z | t/z | s/z
  220. fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z
  221. fxch %st(1) // s | t | 1/z | t/z | s/z
  222. fistpl s // 1/z | t | t/z | s/z
  223. fistpl t // 1/z | t/z | s/z
  224. fadds zi8stepu
  225. fxch %st(2)
  226. fadds sdivz8stepu
  227. fxch %st(2)
  228. flds tdivz8stepu
  229. faddp %st(0),%st(2)
  230. flds fp_64k
  231. fdiv %st(1),%st(0) // z = 1/1/z
  232. // this is what we've gone to all this trouble to
  233. // overlap
  234. LFDIVInFlight1:
  235. addl s,%esi
  236. addl t,%edx
  237. movl C(bbextents),%ebx
  238. movl C(bbextentt),%ebp
  239. cmpl %ebx,%esi
  240. ja LClampHighOrLow0
  241. LClampReentry0:
  242. movl %esi,s
  243. movl pbase,%ebx
  244. shll $16,%esi
  245. cmpl %ebp,%edx
  246. movl %esi,sfracf
  247. ja LClampHighOrLow1
  248. LClampReentry1:
  249. movl %edx,t
  250. movl s,%esi // sfrac = scans->sfrac;
  251. shll $16,%edx
  252. movl t,%eax // tfrac = scans->tfrac;
  253. sarl $16,%esi
  254. movl %edx,tfracf
  255. //
  256. // calculate the texture starting address
  257. //
  258. sarl $16,%eax
  259. movl C(cachewidth),%edx
  260. imull %edx,%eax // (tfrac >> 16) * cachewidth
  261. addl %ebx,%esi
  262. addl %eax,%esi // psource = pbase + (sfrac >> 16) +
  263. // ((tfrac >> 16) * cachewidth);
  264. //
  265. // determine whether last span or not
  266. //
  267. cmpl $8,%ecx
  268. jna LLastSegment
  269. //
  270. // not the last segment; do full 8-wide segment
  271. //
  272. LNotLastSegment:
  273. //
  274. // advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
  275. // get there
  276. //
  277. // pick up after the FDIV that was left in flight previously
  278. fld %st(0) // duplicate it
  279. fmul %st(4),%st(0) // s = s/z * z
  280. fxch %st(1)
  281. fmul %st(3),%st(0) // t = t/z * z
  282. fxch %st(1)
  283. fistpl snext
  284. fistpl tnext
  285. movl snext,%eax
  286. movl tnext,%edx
  287. movb (%esi),%bl // get first source texel
  288. subl $8,%ecx // count off this segments' pixels
  289. movl C(sadjust),%ebp
  290. movl %ecx,counttemp // remember count of remaining pixels
  291. movl C(tadjust),%ecx
  292. movb %bl,(%edi) // store first dest pixel
  293. addl %eax,%ebp
  294. addl %edx,%ecx
  295. movl C(bbextents),%eax
  296. movl C(bbextentt),%edx
  297. cmpl $2048,%ebp
  298. jl LClampLow2
  299. cmpl %eax,%ebp
  300. ja LClampHigh2
  301. LClampReentry2:
  302. cmpl $2048,%ecx
  303. jl LClampLow3
  304. cmpl %edx,%ecx
  305. ja LClampHigh3
  306. LClampReentry3:
  307. movl %ebp,snext
  308. movl %ecx,tnext
  309. subl s,%ebp
  310. subl t,%ecx
  311. //
  312. // set up advancetable
  313. //
  314. movl %ecx,%eax
  315. movl %ebp,%edx
  316. sarl $19,%eax // tstep >>= 16;
  317. jz LZero
  318. sarl $19,%edx // sstep >>= 16;
  319. movl C(cachewidth),%ebx
  320. imull %ebx,%eax
  321. jmp LSetUp1
  322. LZero:
  323. sarl $19,%edx // sstep >>= 16;
  324. movl C(cachewidth),%ebx
  325. LSetUp1:
  326. addl %edx,%eax // add in sstep
  327. // (tstep >> 16) * cachewidth + (sstep >> 16);
  328. movl tfracf,%edx
  329. movl %eax,advancetable+4 // advance base in t
  330. addl %ebx,%eax // ((tstep >> 16) + 1) * cachewidth +
  331. // (sstep >> 16);
  332. shll $13,%ebp // left-justify sstep fractional part
  333. movl sfracf,%ebx
  334. shll $13,%ecx // left-justify tstep fractional part
  335. movl %eax,advancetable // advance extra in t
  336. movl %ecx,tstep
  337. addl %ecx,%edx // advance tfrac fractional part by tstep frac
  338. sbbl %ecx,%ecx // turn tstep carry into -1 (0 if none)
  339. addl %ebp,%ebx // advance sfrac fractional part by sstep frac
  340. adcl advancetable+4(,%ecx,4),%esi // point to next source texel
  341. addl tstep,%edx
  342. sbbl %ecx,%ecx
  343. movb (%esi),%al
  344. addl %ebp,%ebx
  345. movb %al,1(%edi)
  346. adcl advancetable+4(,%ecx,4),%esi
  347. addl tstep,%edx
  348. sbbl %ecx,%ecx
  349. addl %ebp,%ebx
  350. movb (%esi),%al
  351. adcl advancetable+4(,%ecx,4),%esi
  352. addl tstep,%edx
  353. sbbl %ecx,%ecx
  354. movb %al,2(%edi)
  355. addl %ebp,%ebx
  356. movb (%esi),%al
  357. adcl advancetable+4(,%ecx,4),%esi
  358. addl tstep,%edx
  359. sbbl %ecx,%ecx
  360. movb %al,3(%edi)
  361. addl %ebp,%ebx
  362. movb (%esi),%al
  363. adcl advancetable+4(,%ecx,4),%esi
  364. //
  365. // start FDIV for end of next segment in flight, so it can overlap
  366. //
  367. movl counttemp,%ecx
  368. cmpl $8,%ecx // more than one segment after this?
  369. ja LSetupNotLast2 // yes
  370. decl %ecx
  371. jz LFDIVInFlight2 // if only one pixel, no need to start an FDIV
  372. movl %ecx,spancountminus1
  373. fildl spancountminus1
  374. flds C(d_zistepu) // C(d_zistepu) | spancountminus1
  375. fmul %st(1),%st(0) // C(d_zistepu)*scm1 | scm1
  376. flds C(d_tdivzstepu) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
  377. fmul %st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
  378. fxch %st(1) // C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1
  379. faddp %st(0),%st(3) // C(d_tdivzstepu)*scm1 | scm1
  380. fxch %st(1) // scm1 | C(d_tdivzstepu)*scm1
  381. fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
  382. fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
  383. faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1
  384. flds fp_64k // 64k | C(d_sdivzstepu)*scm1
  385. fxch %st(1) // C(d_sdivzstepu)*scm1 | 64k
  386. faddp %st(0),%st(4) // 64k
  387. fdiv %st(1),%st(0) // this is what we've gone to all this trouble to
  388. // overlap
  389. jmp LFDIVInFlight2
  390. .align 4
  391. LSetupNotLast2:
  392. fadds zi8stepu
  393. fxch %st(2)
  394. fadds sdivz8stepu
  395. fxch %st(2)
  396. flds tdivz8stepu
  397. faddp %st(0),%st(2)
  398. flds fp_64k
  399. fdiv %st(1),%st(0) // z = 1/1/z
  400. // this is what we've gone to all this trouble to
  401. // overlap
  402. LFDIVInFlight2:
  403. movl %ecx,counttemp
  404. addl tstep,%edx
  405. sbbl %ecx,%ecx
  406. movb %al,4(%edi)
  407. addl %ebp,%ebx
  408. movb (%esi),%al
  409. adcl advancetable+4(,%ecx,4),%esi
  410. addl tstep,%edx
  411. sbbl %ecx,%ecx
  412. movb %al,5(%edi)
  413. addl %ebp,%ebx
  414. movb (%esi),%al
  415. adcl advancetable+4(,%ecx,4),%esi
  416. addl tstep,%edx
  417. sbbl %ecx,%ecx
  418. movb %al,6(%edi)
  419. addl %ebp,%ebx
  420. movb (%esi),%al
  421. adcl advancetable+4(,%ecx,4),%esi
  422. addl $8,%edi
  423. movl %edx,tfracf
  424. movl snext,%edx
  425. movl %ebx,sfracf
  426. movl tnext,%ebx
  427. movl %edx,s
  428. movl %ebx,t
  429. movl counttemp,%ecx // retrieve count
  430. //
  431. // determine whether last span or not
  432. //
  433. cmpl $8,%ecx // are there multiple segments remaining?
  434. movb %al,-1(%edi)
  435. ja LNotLastSegment // yes
  436. //
  437. // last segment of scan
  438. //
  439. LLastSegment:
  440. //
  441. // advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
  442. // get there. The number of pixels left is variable, and we want to land on the
  443. // last pixel, not step one past it, so we can't run into arithmetic problems
  444. //
  445. testl %ecx,%ecx
  446. jz LNoSteps // just draw the last pixel and we're done
  447. // pick up after the FDIV that was left in flight previously
  448. fld %st(0) // duplicate it
  449. fmul %st(4),%st(0) // s = s/z * z
  450. fxch %st(1)
  451. fmul %st(3),%st(0) // t = t/z * z
  452. fxch %st(1)
  453. fistpl snext
  454. fistpl tnext
  455. movb (%esi),%al // load first texel in segment
  456. movl C(tadjust),%ebx
  457. movb %al,(%edi) // store first pixel in segment
  458. movl C(sadjust),%eax
  459. addl snext,%eax
  460. addl tnext,%ebx
  461. movl C(bbextents),%ebp
  462. movl C(bbextentt),%edx
  463. cmpl $2048,%eax
  464. jl LClampLow4
  465. cmpl %ebp,%eax
  466. ja LClampHigh4
  467. LClampReentry4:
  468. movl %eax,snext
  469. cmpl $2048,%ebx
  470. jl LClampLow5
  471. cmpl %edx,%ebx
  472. ja LClampHigh5
  473. LClampReentry5:
  474. cmpl $1,%ecx // don't bother
  475. je LOnlyOneStep // if two pixels in segment, there's only one step,
  476. // of the segment length
  477. subl s,%eax
  478. subl t,%ebx
  479. addl %eax,%eax // convert to 15.17 format so multiply by 1.31
  480. addl %ebx,%ebx // reciprocal yields 16.48
  481. imull reciprocal_table-8(,%ecx,4) // sstep = (snext - s) / (spancount-1)
  482. movl %edx,%ebp
  483. movl %ebx,%eax
  484. imull reciprocal_table-8(,%ecx,4) // tstep = (tnext - t) / (spancount-1)
  485. LSetEntryvec:
  486. //
  487. // set up advancetable
  488. //
  489. movl entryvec_table(,%ecx,4),%ebx
  490. movl %edx,%eax
  491. movl %ebx,jumptemp // entry point into code for RET later
  492. movl %ebp,%ecx
  493. sarl $16,%edx // tstep >>= 16;
  494. movl C(cachewidth),%ebx
  495. sarl $16,%ecx // sstep >>= 16;
  496. imull %ebx,%edx
  497. addl %ecx,%edx // add in sstep
  498. // (tstep >> 16) * cachewidth + (sstep >> 16);
  499. movl tfracf,%ecx
  500. movl %edx,advancetable+4 // advance base in t
  501. addl %ebx,%edx // ((tstep >> 16) + 1) * cachewidth +
  502. // (sstep >> 16);
  503. shll $16,%ebp // left-justify sstep fractional part
  504. movl sfracf,%ebx
  505. shll $16,%eax // left-justify tstep fractional part
  506. movl %edx,advancetable // advance extra in t
  507. movl %eax,tstep
  508. movl %ecx,%edx
  509. addl %eax,%edx
  510. sbbl %ecx,%ecx
  511. addl %ebp,%ebx
  512. adcl advancetable+4(,%ecx,4),%esi
  513. jmp *jumptemp // jump to the number-of-pixels handler
  514. //----------------------------------------
  515. LNoSteps:
  516. movb (%esi),%al // load first texel in segment
  517. subl $7,%edi // adjust for hardwired offset
  518. jmp LEndSpan
  519. LOnlyOneStep:
  520. subl s,%eax
  521. subl t,%ebx
  522. movl %eax,%ebp
  523. movl %ebx,%edx
  524. jmp LSetEntryvec
  525. //----------------------------------------
  526. .globl Entry2_8
  527. Entry2_8:
  528. subl $6,%edi // adjust for hardwired offsets
  529. movb (%esi),%al
  530. jmp LLEntry2_8
  531. //----------------------------------------
  532. .globl Entry3_8
  533. Entry3_8:
  534. subl $5,%edi // adjust for hardwired offsets
  535. addl %eax,%edx
  536. movb (%esi),%al
  537. sbbl %ecx,%ecx
  538. addl %ebp,%ebx
  539. adcl advancetable+4(,%ecx,4),%esi
  540. jmp LLEntry3_8
  541. //----------------------------------------
  542. .globl Entry4_8
  543. Entry4_8:
  544. subl $4,%edi // adjust for hardwired offsets
  545. addl %eax,%edx
  546. movb (%esi),%al
  547. sbbl %ecx,%ecx
  548. addl %ebp,%ebx
  549. adcl advancetable+4(,%ecx,4),%esi
  550. addl tstep,%edx
  551. jmp LLEntry4_8
  552. //----------------------------------------
  553. .globl Entry5_8
  554. Entry5_8:
  555. subl $3,%edi // adjust for hardwired offsets
  556. addl %eax,%edx
  557. movb (%esi),%al
  558. sbbl %ecx,%ecx
  559. addl %ebp,%ebx
  560. adcl advancetable+4(,%ecx,4),%esi
  561. addl tstep,%edx
  562. jmp LLEntry5_8
  563. //----------------------------------------
  564. .globl Entry6_8
  565. Entry6_8:
  566. subl $2,%edi // adjust for hardwired offsets
  567. addl %eax,%edx
  568. movb (%esi),%al
  569. sbbl %ecx,%ecx
  570. addl %ebp,%ebx
  571. adcl advancetable+4(,%ecx,4),%esi
  572. addl tstep,%edx
  573. jmp LLEntry6_8
  574. //----------------------------------------
  575. .globl Entry7_8
  576. Entry7_8:
  577. decl %edi // adjust for hardwired offsets
  578. addl %eax,%edx
  579. movb (%esi),%al
  580. sbbl %ecx,%ecx
  581. addl %ebp,%ebx
  582. adcl advancetable+4(,%ecx,4),%esi
  583. addl tstep,%edx
  584. jmp LLEntry7_8
  585. //----------------------------------------
  586. .globl Entry8_8
  587. Entry8_8:
  588. addl %eax,%edx
  589. movb (%esi),%al
  590. sbbl %ecx,%ecx
  591. addl %ebp,%ebx
  592. adcl advancetable+4(,%ecx,4),%esi
  593. addl tstep,%edx
  594. sbbl %ecx,%ecx
  595. movb %al,1(%edi)
  596. addl %ebp,%ebx
  597. movb (%esi),%al
  598. adcl advancetable+4(,%ecx,4),%esi
  599. addl tstep,%edx
  600. LLEntry7_8:
  601. sbbl %ecx,%ecx
  602. movb %al,2(%edi)
  603. addl %ebp,%ebx
  604. movb (%esi),%al
  605. adcl advancetable+4(,%ecx,4),%esi
  606. addl tstep,%edx
  607. LLEntry6_8:
  608. sbbl %ecx,%ecx
  609. movb %al,3(%edi)
  610. addl %ebp,%ebx
  611. movb (%esi),%al
  612. adcl advancetable+4(,%ecx,4),%esi
  613. addl tstep,%edx
  614. LLEntry5_8:
  615. sbbl %ecx,%ecx
  616. movb %al,4(%edi)
  617. addl %ebp,%ebx
  618. movb (%esi),%al
  619. adcl advancetable+4(,%ecx,4),%esi
  620. addl tstep,%edx
  621. LLEntry4_8:
  622. sbbl %ecx,%ecx
  623. movb %al,5(%edi)
  624. addl %ebp,%ebx
  625. movb (%esi),%al
  626. adcl advancetable+4(,%ecx,4),%esi
  627. LLEntry3_8:
  628. movb %al,6(%edi)
  629. movb (%esi),%al
  630. LLEntry2_8:
  631. LEndSpan:
  632. //
  633. // clear s/z, t/z, 1/z from FP stack
  634. //
  635. fstp %st(0)
  636. fstp %st(0)
  637. fstp %st(0)
  638. movl pspantemp,%ebx // restore spans pointer
  639. movl espan_t_pnext(%ebx),%ebx // point to next span
  640. testl %ebx,%ebx // any more spans?
  641. movb %al,7(%edi)
  642. jnz LSpanLoop // more spans
  643. popl %ebx // restore register variables
  644. popl %esi
  645. popl %edi
  646. popl %ebp // restore the caller's stack frame
  647. ret
  648. //----------------------------------------------------------------------
  649. // 8-bpp horizontal span z drawing codefor polygons, with no transparency.
  650. //
  651. // Assumes there is at least one span in pzspans, and that every span
  652. // contains at least one pixel
  653. //----------------------------------------------------------------------
  654. .text
  655. // z-clamp on a non-negative gradient span
  656. LClamp:
  657. movl $0x40000000,%edx
  658. xorl %ebx,%ebx
  659. fstp %st(0)
  660. jmp LZDraw
  661. // z-clamp on a negative gradient span
  662. LClampNeg:
  663. movl $0x40000000,%edx
  664. xorl %ebx,%ebx
  665. fstp %st(0)
  666. jmp LZDrawNeg
  667. #define pzspans 4+16
  668. .globl C(D_DrawZSpans)
  669. C(D_DrawZSpans):
  670. pushl %ebp // preserve caller's stack frame
  671. pushl %edi
  672. pushl %esi // preserve register variables
  673. pushl %ebx
  674. flds C(d_zistepu)
  675. movl C(d_zistepu),%eax
  676. movl pzspans(%esp),%esi
  677. testl %eax,%eax
  678. jz LFNegSpan
  679. fmuls Float2ToThe31nd
  680. fistpl izistep // note: we are relying on FP exceptions being turned
  681. // off here to avoid range problems
  682. movl izistep,%ebx // remains loaded for all spans
  683. LFSpanLoop:
  684. // set up the initial 1/z value
  685. fildl espan_t_v(%esi)
  686. fildl espan_t_u(%esi)
  687. movl espan_t_v(%esi),%ecx
  688. movl C(d_pzbuffer),%edi
  689. fmuls C(d_zistepu)
  690. fxch %st(1)
  691. fmuls C(d_zistepv)
  692. fxch %st(1)
  693. fadds C(d_ziorigin)
  694. imull C(d_zrowbytes),%ecx
  695. faddp %st(0),%st(1)
  696. // clamp if z is nearer than 2 (1/z > 0.5)
  697. fcoms float_point5
  698. addl %ecx,%edi
  699. movl espan_t_u(%esi),%edx
  700. addl %edx,%edx // word count
  701. movl espan_t_count(%esi),%ecx
  702. addl %edx,%edi // pdest = &pdestspan[scans->u];
  703. pushl %esi // preserve spans pointer
  704. fnstsw %ax
  705. testb $0x45,%ah
  706. jz LClamp
  707. fmuls Float2ToThe31nd
  708. fistpl izi // note: we are relying on FP exceptions being turned
  709. // off here to avoid problems when the span is closer
  710. // than 1/(2**31)
  711. movl izi,%edx
  712. // at this point:
  713. // %ebx = izistep
  714. // %ecx = count
  715. // %edx = izi
  716. // %edi = pdest
  717. LZDraw:
  718. // do a single pixel up front, if necessary to dword align the destination
  719. testl $2,%edi
  720. jz LFMiddle
  721. movl %edx,%eax
  722. addl %ebx,%edx
  723. shrl $16,%eax
  724. decl %ecx
  725. movw %ax,(%edi)
  726. addl $2,%edi
  727. // do middle a pair of aligned dwords at a time
  728. LFMiddle:
  729. pushl %ecx
  730. shrl $1,%ecx // count / 2
  731. jz LFLast // no aligned dwords to do
  732. shrl $1,%ecx // (count / 2) / 2
  733. jnc LFMiddleLoop // even number of aligned dwords to do
  734. movl %edx,%eax
  735. addl %ebx,%edx
  736. shrl $16,%eax
  737. movl %edx,%esi
  738. addl %ebx,%edx
  739. andl $0xFFFF0000,%esi
  740. orl %esi,%eax
  741. movl %eax,(%edi)
  742. addl $4,%edi
  743. andl %ecx,%ecx
  744. jz LFLast
  745. LFMiddleLoop:
  746. movl %edx,%eax
  747. addl %ebx,%edx
  748. shrl $16,%eax
  749. movl %edx,%esi
  750. addl %ebx,%edx
  751. andl $0xFFFF0000,%esi
  752. orl %esi,%eax
  753. movl %edx,%ebp
  754. movl %eax,(%edi)
  755. addl %ebx,%edx
  756. shrl $16,%ebp
  757. movl %edx,%esi
  758. addl %ebx,%edx
  759. andl $0xFFFF0000,%esi
  760. orl %esi,%ebp
  761. movl %ebp,4(%edi) // FIXME: eliminate register contention
  762. addl $8,%edi
  763. decl %ecx
  764. jnz LFMiddleLoop
  765. LFLast:
  766. popl %ecx // retrieve count
  767. popl %esi // retrieve span pointer
  768. // do the last, unaligned pixel, if there is one
  769. andl $1,%ecx // is there an odd pixel left to do?
  770. jz LFSpanDone // no
  771. shrl $16,%edx
  772. movw %dx,(%edi) // do the final pixel's z
  773. LFSpanDone:
  774. movl espan_t_pnext(%esi),%esi
  775. testl %esi,%esi
  776. jnz LFSpanLoop
  777. jmp LFDone
  778. LFNegSpan:
  779. fmuls FloatMinus2ToThe31nd
  780. fistpl izistep // note: we are relying on FP exceptions being turned
  781. // off here to avoid range problems
  782. movl izistep,%ebx // remains loaded for all spans
  783. LFNegSpanLoop:
  784. // set up the initial 1/z value
  785. fildl espan_t_v(%esi)
  786. fildl espan_t_u(%esi)
  787. movl espan_t_v(%esi),%ecx
  788. movl C(d_pzbuffer),%edi
  789. fmuls C(d_zistepu)
  790. fxch %st(1)
  791. fmuls C(d_zistepv)
  792. fxch %st(1)
  793. fadds C(d_ziorigin)
  794. imull C(d_zrowbytes),%ecx
  795. faddp %st(0),%st(1)
  796. // clamp if z is nearer than 2 (1/z > 0.5)
  797. fcoms float_point5
  798. addl %ecx,%edi
  799. movl espan_t_u(%esi),%edx
  800. addl %edx,%edx // word count
  801. movl espan_t_count(%esi),%ecx
  802. addl %edx,%edi // pdest = &pdestspan[scans->u];
  803. pushl %esi // preserve spans pointer
  804. fnstsw %ax
  805. testb $0x45,%ah
  806. jz LClampNeg
  807. fmuls Float2ToThe31nd
  808. fistpl izi // note: we are relying on FP exceptions being turned
  809. // off here to avoid problems when the span is closer
  810. // than 1/(2**31)
  811. movl izi,%edx
  812. // at this point:
  813. // %ebx = izistep
  814. // %ecx = count
  815. // %edx = izi
  816. // %edi = pdest
  817. LZDrawNeg:
  818. // do a single pixel up front, if necessary to dword align the destination
  819. testl $2,%edi
  820. jz LFNegMiddle
  821. movl %edx,%eax
  822. subl %ebx,%edx
  823. shrl $16,%eax
  824. decl %ecx
  825. movw %ax,(%edi)
  826. addl $2,%edi
  827. // do middle a pair of aligned dwords at a time
  828. LFNegMiddle:
  829. pushl %ecx
  830. shrl $1,%ecx // count / 2
  831. jz LFNegLast // no aligned dwords to do
  832. shrl $1,%ecx // (count / 2) / 2
  833. jnc LFNegMiddleLoop // even number of aligned dwords to do
  834. movl %edx,%eax
  835. subl %ebx,%edx
  836. shrl $16,%eax
  837. movl %edx,%esi
  838. subl %ebx,%edx
  839. andl $0xFFFF0000,%esi
  840. orl %esi,%eax
  841. movl %eax,(%edi)
  842. addl $4,%edi
  843. andl %ecx,%ecx
  844. jz LFNegLast
  845. LFNegMiddleLoop:
  846. movl %edx,%eax
  847. subl %ebx,%edx
  848. shrl $16,%eax
  849. movl %edx,%esi
  850. subl %ebx,%edx
  851. andl $0xFFFF0000,%esi
  852. orl %esi,%eax
  853. movl %edx,%ebp
  854. movl %eax,(%edi)
  855. subl %ebx,%edx
  856. shrl $16,%ebp
  857. movl %edx,%esi
  858. subl %ebx,%edx
  859. andl $0xFFFF0000,%esi
  860. orl %esi,%ebp
  861. movl %ebp,4(%edi) // FIXME: eliminate register contention
  862. addl $8,%edi
  863. decl %ecx
  864. jnz LFNegMiddleLoop
  865. LFNegLast:
  866. popl %ecx // retrieve count
  867. popl %esi // retrieve span pointer
  868. // do the last, unaligned pixel, if there is one
  869. andl $1,%ecx // is there an odd pixel left to do?
  870. jz LFNegSpanDone // no
  871. shrl $16,%edx
  872. movw %dx,(%edi) // do the final pixel's z
  873. LFNegSpanDone:
  874. movl espan_t_pnext(%esi),%esi
  875. testl %esi,%esi
  876. jnz LFNegSpanLoop
  877. LFDone:
  878. popl %ebx // restore register variables
  879. popl %esi
  880. popl %edi
  881. popl %ebp // restore the caller's stack frame
  882. ret
  883. #endif // id386