texture_asm.hh 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689
  1. /********************************************************************** <BR>
  2. This file is part of Crack dot Com's free source code release of
  3. Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for
  4. information about compiling & licensing issues visit this URL</a>
  5. <PRE> If that doesn't help, contact Jonathan Clark at
  6. golgotha_source@usa.net (Subject should have "GOLG" in it)
  7. ***********************************************************************/
  8. #ifndef _WIN95_ASM_HH_
  9. #define _WIN95_ASM_HH_
  10. //June 7th... Jonathan finally talked me into using inline asm...
  11. //globals... These are referenced as memory offsets,
  12. //so they need to be global 'C' type things.
  13. //When these are totally finalized, re-arrange them in
  14. //tmapper soft so that the variables used together are in
  15. //the same cache line
  16. //external references
  //cTable: color lookup table. draw_scanline_white_pc adds two entries per
  //pixel: one at byte offset 8000h (the second 256*32 entries) and one from
  //the start of the table. Each index is built as (light byte << 8) | texel byte.
  17. extern "C" w32 cTable[2*256*32]; //low and high bits of color
  //base address of the texture. draw_scanline_white_pc returns without
  //drawing when this address is odd, because it halves the pointer and
  //addresses texels as [index*2] and [1+index*2].
  18. extern "C" w16 *r1_software_texture_ptr;
  //log2 of the texture width; written at run time into the shift count of the
  //instructions at the whtp_twidth_log2_patch* labels.
  19. extern "C" w8 r1_software_twidth_log2;
  //NOTE(review): pixel_on is not referenced in the visible part of this file;
  //draw_scanline_white_pc reads r1_software_pixel_on instead, which is not
  //declared here. Confirm which name is the intended one.
  20. extern "C" w16 *pixel_on;
  21. //external definitions (storage for these globals is allocated here)
  //Globals shared between the C++ side and the inline assembly in
  //draw_scanline_white_pc. They are addressed directly by name from the asm,
  //including while ebp is being used as a general register.
  //NOTE(review): these are definitions inside a header; including this file
  //from more than one translation unit would presumably produce duplicate
  //symbols. Confirm it is only included once.
  22. extern "C"
  23. {
  //float -> 16.16 fixed point scale (used for UFixed / VFixed)
  24. i4_float FixedScale = 65536.f;
  //65536/8: turns the u/v difference across an 8 pixel span into a per pixel
  //16.16 step
  25. i4_float FixedScale8 = 8192.f;
  //31*65536: scale applied to the edge light value before it is stored in
  //LVal / LDelta
  26. i4_float LightScale = 2031616.f;
  27. w16 zBuffer[640*480]; //640*480 w16 entries; only referenced from commented-out code in this file
  //constant 1.0 used as the memory operand of fdivr
  28. i4_float One = 1.f;
  //scratch target for fstp when a value only needs to be popped off the FPU stack
  29. i4_float FloatTemp;
  //texture address steps per pixel: [0] is used when the v fraction carried,
  //[1] when it did not (selected with an index of -1 or 0)
  30. w32 UVintVfracStepV[2];
  //fractional parts of the per pixel v and u steps, shifted into the top 16 bits
  31. w32 DeltaVFrac;
  32. w32 DeltaUFrac;
  //current light value and its per pixel step, both scaled by LightScale
  33. w32 LVal;
  34. w32 LDelta;
  //first used as a temp for the span width (so it can be fild'ed), then holds
  //r1_software_texture_ptr shifted right by one
  35. w32 pTex;
  36. }
  //draw_scanline_white_pc
  //Draws one horizontal span between the edges 'left' and 'right' using MSVC
  //inline x86 assembly and the x87 FPU.
  // - returns without drawing if r1_software_texture_ptr is odd, or if
  //   right->X - left->X is <= 0
  // - reads X, r, UOverZ, VOverZ and OneOverZ from both edgeGrad structures
  //   and the d*OverZdX / d*OverZdX8 / d?dXModifier members of 'grads'
  //   (neither edgeGrad nor grads is declared in the visible part of this file)
  // - U/Z, V/Z and 1/Z are stepped once per 8 pixel subdivision and divided
  //   out with one fdiv per subdivision; inside a subdivision u and v step
  //   linearly in 16.16 fixed point
  // - every pixel writes two bytes (al, ah) at edi; the value is the sum of two
  //   cTable entries indexed by a light byte and a texel byte
  // - the shift counts at the whtp_twidth_log2_patch* labels are overwritten at
  //   run time with r1_software_twidth_log2 (self modifying code)
  // - the FPU control word is saved on entry and restored at whtp_Return
  //Do not reorder or resize instructions casually: the _emit jump below has a
  //hard coded byte distance and the patch labels assume a fixed encoding.
  37. void draw_scanline_white_pc(edgeGrad *left, edgeGrad *right)
  38. {
  39. //locals on the stack
  40. w32 Subdivisions;
  41. w32 WidthModLength;
  42. w32 DeltaU;
  43. w32 DeltaV;
  44. w32 UFixed;
  45. w32 VFixed;
  46. w16 FPUCW;
  47. w16 OldFPUCW;
  48. sw32 blarg;
  //the texture pointer is halved further down (shr edi,1), which would drop
  //the low bit of an odd address, so odd pointers are rejected here
  49. if (((sw32)r1_software_texture_ptr) & 1)
  50. {
  //this store goes to a local and the function returns right after it,
  //so it has no lasting effect
  51. _asm
  52. {
  53. mov Subdivisions,0
  54. }
  55. return;
  56. }
  57. _asm {
  58. //this should be moved out
  59. ; put the FPU in 32 bit mode
  60. fstcw [OldFPUCW] ; store copy of CW
  61. mov ax,OldFPUCW ; get it in ax
  //keeping only the low 8 bits clears bits 8-15 of the control word, i.e. the
  //x87 precision control and rounding control fields both become 00
  62. and eax,0ffh ; 24 bit precision
  63. mov [FPUCW],ax ; store it
  64. fldcw [FPUCW] ; load the FPU
  65. mov ebx,left ;
  //NOTE(review): r1_software_pixel_on is not declared in the visible part of
  //this file (only pixel_on is); confirm it is declared before this header
  66. mov esi,r1_software_pixel_on ; grab screen pointer
  67. mov edx,right ;
  68. mov eax,[ebx]edgeGrad.X ; left x
  //NOTE: the next two line comments say edx, but it is ecx that receives
  //right x and then the width; edx still holds the 'right' pointer
  69. mov ecx,[edx]edgeGrad.X ; edx = right x
  70. sub ecx,eax ; edx = width
  71. jle whtp_Return ; no pixels to draw, get out
  72. mov [pTex],ecx ; just for a temp so it can be fild'ed
  73. //note below the fdiv... if more values need to be interpolated based on the scanline
  74. //width... you should do 1/WID and use muls... this is faster cuz theres only one
  75. ; calc this scanlines light step ; FPU Stack
  76. ; st0 st1 st2 st3 st4 st5 st6 st7
  77. fild dword ptr [pTex] ; WID
  78. fld dword ptr [edx]edgeGrad.r ; LR WID
  79. fld dword ptr [ebx]edgeGrad.r ; LL LR WID
  80. fld st ; LL LL LR WID
  81. fmul [LightScale] ; LL16 LL LR WID
  82. fistp [LVal] ; LL LR WID
  83. fsubp st(1),st ; LD WID
  84. fdiv st,st(1) ; LS WID
  85. mov ebx,eax
  86. //heres an unused scanline z fill (inaccurate)
  87. ; mov ax,word ptr [sortKey]
  88. ; mov edi,zBuffer
  89. mov edx,ecx
  90. mov edi,r1_software_texture_ptr
  91. //rest of the z fill thing that isn't used
  92. ; rep stosw
  //self modifying code: cl (texture width log2) is written into the byte at
  //offset 2 of the shift instruction that follows each patch label, replacing
  //the placeholder shift count of 3. Presumably this requires the code to be
  //writable at run time.
  93. mov cl,r1_software_twidth_log2
  94. mov byte ptr whtp_twidth_log2_patch0+2, cl
  //pTex ends up holding the texture address divided by two; texels are later
  //fetched as bytes with [esi*2] and [1+esi*2]
  95. shr edi,1 ; pray that it's aligned!
  96. mov byte ptr whtp_twidth_log2_patch1+2, cl
  97. mov pTex,edi
  98. mov byte ptr whtp_twidth_log2_patch2+2, cl
  99. mov edi,esi ; edi = dest pointer
  100. mov byte ptr whtp_twidth_log2_patch3+2, cl
  101. //is pixel_on.add being used? If not the shl is needed
  102. ; shl ebx,1
  103. mov eax,edx ; eax and edx = width
  104. //this is also needed if pixel_on.add isn't used
  105. ; add edi,ebx
  106. ; edi = pointer to start pixel in dest dib
  107. ; edx = spanwidth
  //NOTE: the quotient of the shr below is left in edx (the line comment says ecx)
  108. shr edx,3 ; ecx = width / subdivision length
  109. and eax,7 ; eax = width mod subdivision length
  110. //the next line is avoiding a far jump which was the
  111. //only opcode inline would generate... it simply skips down to
  112. //to mov [Subdivisions],edx two instructions down
  //0x75 0x06 is a short jnz with a displacement of 6 bytes, which must equal
  //the encoded size of the two instructions that follow it (dec edx, mov eax,8)
  113. __asm _emit 0x75 __asm _emit 0x06 ;jnz @f any leftover?
  114. dec edx ; no, so special case last span
  115. mov eax,8 ; it's 8 pixels long
  116. mov [Subdivisions],edx ; store widths
  117. mov [WidthModLength],eax
  118. fmul [LightScale]
  119. fxch st(1)
  //the first fistp writes WID into LDelta only to pop it off the FPU stack;
  //the second fistp overwrites LDelta with the scaled light step
  120. fistp [LDelta]
  121. fistp [LDelta]
  122. mov ebx,left ; get left edge pointer
  123. //try not to play around with the ordering here later...
  124. //this is the most optimal stack ordering possible
  125. ; calculate ULeft and VLeft ; FPU Stack (ZL = ZLeft)
  126. ; st0 st1 st2 st3 st4 st5 st6 st7
  127. fld [ebx]edgeGrad.VOverZ ; V/ZL
  128. fld [ebx]edgeGrad.UOverZ ; U/ZL V/ZL
  129. fld [ebx]edgeGrad.OneOverZ ; 1/ZL U/ZL V/ZL
  130. fld1 ; 1 1/ZL U/ZL V/ZL
  131. fdiv st,st(1) ; ZL 1/ZL U/ZL V/ZL
  132. //here there is room for alot of integer ops, but this is only done
  133. //once... This would be a good spot for the zbuffer shitcan method
  134. fld st ; ZL ZL 1/ZL U/ZL V/ZL
  135. fmul st,st(4) ; VL ZL 1/ZL U/ZL V/ZL
  136. fxch st(1) ; ZL VL 1/ZL U/ZL V/ZL
  137. fmul st,st(3) ; UL VL 1/ZL U/ZL V/ZL
  138. fstp st(5) ; VL 1/ZL U/ZL V/ZL UL
  139. fstp st(5) ; 1/ZL U/ZL V/ZL UL VL
  140. //the dword ptrs below shut up compiler warnings (though they don't do qwords)
  141. //notice the adds... that simply steps to the end of the next span ahead of time
  142. ; calculate right side OverZ terms ; st0 st1 st2 st3 st4 st5 st6 st7
  143. fadd dword ptr grads.dOneOverZdX8 ; 1/ZR U/ZL V/ZL UL VL
  144. fxch st(1) ; U/ZL 1/ZR V/ZL UL VL
  145. fadd dword ptr grads.dUOverZdX8 ; U/ZR 1/ZR V/ZL UL VL
  146. fxch st(2) ; V/ZL 1/ZR U/ZR UL VL
  147. fadd dword ptr grads.dVOverZdX8 ; V/ZR 1/ZR U/ZR UL VL
  148. //here's the equivlent of the fpu loop at the bottom, only theres an fdiv
  149. //here that has plenty of room for int ops between
  150. ; calculate right side coords ; st0 st1 st2 st3 st4 st5 st6 st7
  151. fld1 ; 1 V/ZR 1/ZR U/ZR UL VL
  152. ; @todo overlap this guy
  153. fdiv st,st(2) ; ZR V/ZR 1/ZR U/ZR UL VL
  154. fld st ; ZR ZR V/ZR 1/ZR U/ZR UL VL
  155. fmul st,st(2) ; VR ZR V/ZR 1/ZR U/ZR UL VL
  156. fxch st(1) ; ZR VR V/ZR 1/ZR U/ZR UL VL
  157. fmul st,st(4) ; UR VR V/ZR 1/ZR U/ZR UL VL
  //edx still holds width / 8 here (it was stored to Subdivisions above)
  158. test edx,edx ; check for any full spans
  159. jz whtp_HandleLeftoverPixels
  160. whtp_SpanLoop:
  161. ; at this point the FPU contains ; st0 st1 st2 st3 st4 st5 st6 st7
  162. ; UR VR V/ZR 1/ZR U/ZR UL VL
  163. ; convert left side coords
  164. fld st(5) ; UL UR VR V/ZR 1/ZR U/ZR UL VL
  165. fmul [FixedScale] ; UL8 UR VR V/ZR 1/ZR U/ZR UL VL
  166. fistp [UFixed] ; UR VR V/ZR 1/ZR U/ZR UL VL
  167. fld st(6) ; VL UR VR V/ZR 1/ZR U/ZR UL VL
  168. fmul [FixedScale] ; VL8 UR VR V/ZR 1/ZR U/ZR UL VL
  169. fistp [VFixed] ; UR VR V/ZR 1/ZR U/ZR UL VL
  170. ; calculate deltas ; st0 st1 st2 st3 st4 st5 st6 st7
  171. fsubr st(5),st ; UR VR V/ZR 1/ZR U/ZR dU VL
  172. fxch st(1) ; VR UR V/ZR 1/ZR U/ZR dU VL
  173. fsubr st(6),st ; VR UR V/ZR 1/ZR U/ZR dU dV
  174. fxch st(6) ; dV UR V/ZR 1/ZR U/ZR dU VR
  175. fmul [FixedScale8] ; dV8 UR V/ZR 1/ZR U/ZR dU VR
  176. fistp [DeltaV] ; UR V/ZR 1/ZR U/ZR dU VR
  177. fxch st(4) ; dU V/ZR 1/ZR U/ZR UR VR
  178. fmul [FixedScale8] ; dU8 V/ZR 1/ZR U/ZR UR VR
  179. fistp [DeltaU] ; V/ZR 1/ZR U/ZR UR VR
  180. ; increment terms for next span ; st0 st1 st2 st3 st4 st5 st6 st7
  181. ; Right terms become Left terms----> ; V/ZL 1/ZL U/ZL UL VL
  182. fadd dword ptr grads.dVOverZdX8 ; V/ZR 1/ZL U/ZL UL VL
  183. fxch st(1) ; 1/ZL V/ZR U/ZL UL VL
  184. fadd dword ptr grads.dOneOverZdX8 ; 1/ZR V/ZR U/ZL UL VL
  185. fxch st(2) ; U/ZL V/ZR 1/ZR UL VL
  186. fadd dword ptr grads.dUOverZdX8 ; U/ZR V/ZR 1/ZR UL VL
  187. fxch st(2) ; 1/ZR V/ZR U/ZR UL VL
  188. fxch st(1) ; V/ZR 1/ZR U/ZR UL VL
  189. //here's the span fdiv... avoid any fpu ops after this, or any imuls
  190. ; calculate right side coords ; st0 st1 st2 st3 st4 st5 st6 st7
  191. fld1 ; 1 V/ZR 1/ZR U/ZR UL VL
  192. fdiv st,st(2) ; ZR V/ZR 1/ZR U/ZR UL VL
  193. //see hecker's articles for an explaination of below... it's too big
  194. //to comment in
  195. ; do pre span delta setup and start modify
  196. mov esi,[DeltaV] ; get v 16.16 step
  197. mov ebx,esi ; split up into int and frac
  198. sar esi,16
  199. shl ebx,16
  200. mov [DeltaVFrac],ebx
  //the shift count 3 below is a placeholder, overwritten with the texture width log2
  201. whtp_twidth_log2_patch0:
  202. shl esi,3
  203. mov ebx,[DeltaU] ; get u 16.16 step
  204. mov eax,ebx
  205. shl eax,16
  206. sar ebx,16
  207. mov [DeltaUFrac],eax
  208. add esi,ebx
  209. mov ebx,1
  210. mov [4+UVintVfracStepV],esi ; non V carry slot
  211. whtp_twidth_log2_patch1:
  212. shl ebx,3 ; this is to get the texwidth
  213. add esi,ebx
  214. mov [UVintVfracStepV],esi ; V carry slot
  215. ; setup initial coordinates
  216. //see the file gradients.txt in hecker's code pack for an explaination
  217. //of the modifiers... it's pretty long
  218. mov esi,[UFixed] ; get u 16.16 fixedpoint coordinate
  219. add esi,dword ptr grads.dUdXModifier
  220. mov ecx,[VFixed] ; and v... before ebp gets pushed
  221. add ecx,dword ptr grads.dVdXModifier
  //blarg is written here but never read anywhere in this function
  222. mov blarg,ebp
  //from here until the matching pop ebp, ebp holds the u fraction; only the
  //file scope globals are accessed in between (presumably because locals and
  //parameters are addressed relative to ebp)
  223. push ebp
  224. mov ebp,esi
  225. sar esi,16
  226. mov edx,ecx
  227. shl ebp,16
  228. sar edx,16
  229. shl ecx,16
  230. whtp_twidth_log2_patch2:
  231. shl edx,3
  232. add esi,edx ; move texture to offset
  233. xor eax,eax ; clear eax
  234. mov edx,[LVal]
  235. add esi,pTex
  236. ; edi = dest dib bits at current pixel
  237. ; esi = texture pointer at current texel
  238. ; ebp = u fraction 0.32
  239. ; ecx = v fraction 0.32
  240. ; edx = Light value and bit storage
  241. ; ebx = v carry scratch
  242. ;
  243. ; edx rolls to act as temp storage, and also to
  244. ; make the light value an index into the ctable
  245. //theres a stall in this loop... it's only one cycle but it might be
  246. //somehow avoided... I ran outta time. It's the V pipe add below rol edx, 16
  //The 8 pixels of the span are unrolled below; stores go to [edi+0] .. [edi+15],
  //two bytes per pixel, and the texel fetch and coordinate step for the next
  //pixel are interleaved with the table lookups of the current one.
  //Per pixel stepping: add ecx,[DeltaVFrac] sets carry when the v fraction
  //wraps; sbb ebx,ebx turns that into 0 or -1, which picks UVintVfracStepV[1]
  //(no carry) or UVintVfracStepV[0] (carry) in the adc; the adc also adds the
  //carry produced by add ebp,[DeltaUFrac].
  247. mov al,[1+esi*2]
  248. rol edx,16 ; get the low word usable
  249. add ecx,[DeltaVFrac]
  250. sbb ebx,ebx ; U pipe only
  251. mov ah,dl
  252. mov dh,[2*esi]
  253. add ebp,[DeltaUFrac]
  254. adc esi,[4+UVintVfracStepV+ebx*4] ; U pipe only
  255. mov bl,dh
  256. mov bh,dl
  257. mov eax,dword ptr[cTable+8000h+eax*4]
  258. and ebx,0ffffh
  259. mov dh,0
  260. rol edx,16
  261. add eax,dword ptr[cTable+ebx*4] ; One cycle stall here U exp flow i think
  262. add edx,[LDelta]
  263. mov [edi],al
  264. rol edx,16
  265. mov [edi+1],ah
  266. add ecx,[DeltaVFrac]
  267. mov ah,dl
  268. sbb ebx,ebx ; U pipe only
  269. mov dh,[esi*2]
  270. add ebp,[DeltaUFrac]
  271. mov al,[1+esi*2]
  272. adc esi,[4+UVintVfracStepV+ebx*4] ; U pipe only
  273. mov bl,dh
  274. mov eax,dword ptr[cTable+8000h+eax*4]
  275. mov bh,dl
  276. mov dh,0
  277. and ebx,0ffffh
  278. rol edx,16
  279. add eax,dword ptr[cTable+ebx*4]
  280. add edx,[LDelta]
  281. mov [edi+2],al
  282. rol edx,16
  283. mov [edi+3],ah
  284. add ecx,[DeltaVFrac]
  285. mov ah,dl
  286. sbb ebx,ebx ; U pipe only
  287. mov dh,[esi*2]
  288. add ebp,[DeltaUFrac]
  289. mov al,[1+esi*2]
  290. adc esi,[4+UVintVfracStepV+ebx*4] ; U pipe only
  291. mov bl,dh
  292. mov eax,dword ptr[cTable+8000h+eax*4]
  293. mov bh,dl
  294. mov dh,0
  295. and ebx,0ffffh
  296. rol edx,16
  297. add eax,dword ptr[cTable+ebx*4]
  298. add edx,[LDelta]
  299. mov [edi+4],al
  300. rol edx,16
  301. mov [edi+5],ah
  302. add ecx,[DeltaVFrac]
  303. mov ah,dl
  304. sbb ebx,ebx ; U pipe only
  305. mov dh,[esi*2]
  306. add ebp,[DeltaUFrac]
  307. mov al,[1+esi*2]
  308. adc esi,[4+UVintVfracStepV+ebx*4] ; U pipe only
  309. mov bl,dh
  310. mov eax,dword ptr[cTable+8000h+eax*4]
  311. mov bh,dl
  312. mov dh,0
  313. and ebx,0ffffh
  314. rol edx,16
  315. add eax,dword ptr[cTable+ebx*4]
  316. add edx,[LDelta]
  317. mov [edi+6],al
  318. rol edx,16
  319. mov [edi+7],ah
  320. add ecx,[DeltaVFrac]
  321. mov ah,dl
  322. sbb ebx,ebx ; U pipe only
  323. mov dh,[esi*2]
  324. add ebp,[DeltaUFrac]
  325. mov al,[1+esi*2]
  326. adc esi,[4+UVintVfracStepV+ebx*4] ; U pipe only
  327. mov bl,dh
  328. mov eax,dword ptr[cTable+8000h+eax*4]
  329. mov bh,dl
  330. mov dh,0
  331. and ebx,0ffffh
  332. rol edx,16
  333. add eax,dword ptr[cTable+ebx*4]
  334. add edx,[LDelta]
  335. mov [edi+8],al
  336. rol edx,16
  337. mov [edi+9],ah
  338. add ecx,[DeltaVFrac]
  339. mov ah,dl
  340. sbb ebx,ebx ; U pipe only
  341. mov dh,[esi*2]
  342. add ebp,[DeltaUFrac]
  343. mov al,[1+esi*2]
  344. adc esi,[4+UVintVfracStepV+ebx*4] ; U pipe only
  345. mov bl,dh
  346. mov eax,dword ptr[cTable+8000h+eax*4]
  347. mov bh,dl
  348. mov dh,0
  349. and ebx,0ffffh
  350. rol edx,16
  351. add eax,dword ptr[cTable+ebx*4]
  352. add edx,[LDelta]
  353. mov [edi+10],al
  354. rol edx,16
  355. mov [edi+11],ah
  356. add ecx,[DeltaVFrac]
  357. mov ah,dl
  358. sbb ebx,ebx ; U pipe only
  359. mov dh,[esi*2]
  360. add ebp,[DeltaUFrac]
  361. mov al,[1+esi*2]
  362. adc esi,[4+UVintVfracStepV+ebx*4] ; U pipe only
  363. mov bl,dh
  364. mov eax,dword ptr[cTable+8000h+eax*4]
  365. mov bh,dl
  366. mov dh,0
  367. and ebx,0ffffh
  368. rol edx,16
  369. add eax,dword ptr[cTable+ebx*4]
  370. add edx,[LDelta]
  371. mov [edi+12],al
  372. rol edx,16
  373. mov [edi+13],ah
  374. add ecx,[DeltaVFrac]
  375. mov ah,dl
  376. sbb ebx,ebx ; U pipe only
  377. mov dh,[esi*2]
  378. add ebp,[DeltaUFrac]
  379. mov al,[1+esi*2]
  380. adc esi,[4+UVintVfracStepV+ebx*4] ; U pipe only
  381. mov bl,dh
  382. mov eax,dword ptr[cTable+8000h+eax*4]
  383. mov bh,dl
  384. mov dh,0
  385. and ebx,0ffffh
  386. rol edx,16
  387. add eax,dword ptr[cTable+ebx*4]
  388. add edx,[LDelta]
  389. mov [edi+14],al
  //the stepped light value is written back so the next span (or the leftover
  //loop) continues from it
  390. mov [LVal],edx
  391. mov [edi+15],ah
  392. pop ebp
  393. ; ************** Okay to Access Stack Frame ****************
  394. ; ************** Okay to Access Stack Frame ****************
  395. ; ************** Okay to Access Stack Frame ****************
  396. ; the fdiv is done, finish right ; st0 st1 st2 st3 st4 st5 st6 st7
  397. ; ZR V/ZR 1/ZR U/ZR UL VL
  398. fld st ; ZR ZR V/ZR 1/ZR U/ZR UL VL
  399. fmul st,st(2) ; VR ZR V/ZR 1/ZR U/ZR UL VL
  400. fxch st(1) ; ZR VR V/ZR 1/ZR U/ZR UL VL
  401. fmul st,st(4) ; UR VR V/ZR 1/ZR U/ZR UL VL
  //16 bytes = 8 pixels of two bytes each
  402. add edi,16 ; increment to next span
  403. dec [Subdivisions] ; decrement span count
  404. jnz whtp_SpanLoop ; loop back
  405. whtp_HandleLeftoverPixels:
  406. mov esi,pTex
  407. ; edi = dest dib bits
  408. ; esi = texture dib bits
  409. ; at this point the FPU contains ; st0 st1 st2 st3 st4 st5 st6 st7
  410. ; inv. means invalid numbers ; inv. inv. inv. inv. inv. UL VL
  411. cmp [WidthModLength],0 ; are there remaining pixels to draw?
  412. jz whtp_FPUReturn ; nope, pop the FPU and bail
  413. mov ebx,right ; get right edge pointer
  414. ; convert left side coords ; st0 st1 st2 st3 st4 st5 st6 st7
  415. fld st(5) ; UL inv. inv. inv. inv. inv. UL VL
  416. fmul [FixedScale] ; UL8 inv. inv. inv. inv. inv. UL VL
  417. fistp [UFixed] ; inv. inv. inv. inv. inv. UL VL
  418. fld st(6) ; VL inv. inv. inv. inv. inv. UL VL
  419. fmul [FixedScale] ; VL8 inv. inv. inv. inv. inv. UL VL
  420. fistp [VFixed] ; inv. inv. inv. inv. inv. UL VL
  //with a single leftover pixel DeltaU / DeltaV are not recomputed; the loop
  //below reads the coordinates before adding them, so only the first sample is used
  421. cmp [WidthModLength],1 ; calc how many steps to take
  422. jz whtp_OnePixelSpan ; just one, don't do deltas
  423. ; calculate right edge coordinates ; st0 st1 st2 st3 st4 st5 st6 st7
  424. ; r -> R+1
  425. ; @todo rearrange things so we don't need these two instructions
  //these two stores exist only to pop two stale entries off the FPU stack
  426. fstp [FloatTemp] ; inv. inv. inv. inv. UL VL
  427. fstp [FloatTemp] ; inv. inv. inv. UL VL
  428. //notice the fsubs... hecker doesn't explain this at all (damn him)
  429. //These are the compensation for the dudxmodifiers, the fill convention
  430. //is left biased so this makes the edge perfect
  431. fld [ebx]edgeGrad.VOverZ ; V/Zr inv. inv. inv. UL VL
  432. fsub dword ptr grads.dVOverZdX ; V/ZR inv. inv. inv. UL VL
  433. fld [ebx]edgeGrad.UOverZ ; U/Zr V/ZR inv. inv. inv. UL VL
  434. fsub dword ptr grads.dUOverZdX ; U/ZR V/ZR inv. inv. inv. UL VL
  435. fld [ebx]edgeGrad.OneOverZ ; 1/Zr U/ZR V/ZR inv. inv. inv. UL VL
  436. fsub dword ptr grads.dOneOverZdX ; 1/ZR U/ZR V/ZR inv. inv. inv. UL VL
  437. fdivr [One] ; ZR U/ZR V/ZR inv. inv. inv. UL VL
  438. fmul st(1),st ; ZR UR V/ZR inv. inv. inv. UL VL
  439. fmulp st(2),st ; UR VR inv. inv. inv. UL VL
  440. ; calculate deltas ; st0 st1 st2 st3 st4 st5 st6 st7
  441. fsubr st(5),st ; UR VR inv. inv. inv. dU VL
  442. fxch st(1) ; VR UR inv. inv. inv. dU VL
  443. fsubr st(6),st ; VR UR inv. inv. inv. dU dV
  444. fxch st(6) ; dV UR inv. inv. inv. dU VR
  //here the deltas are divided by the actual leftover pixel count and scaled
  //with FixedScale, instead of using the fixed 1/8 factor in FixedScale8
  445. fidiv [WidthModLength] ; dv UR inv. inv. inv. dU VR
  446. fmul [FixedScale] ; dv8 UR inv. inv. inv. dU VR
  447. fistp [DeltaV] ; UR inv. inv. inv. dU VR
  448. fxch st(4) ; dU inv. inv. inv. UR VR
  449. fidiv [WidthModLength] ; du inv. inv. inv. UR VR
  450. fmul [FixedScale] ; du8 inv. inv. inv. UR VR
  451. fistp [DeltaU] ; inv. inv. inv. UR VR
  452. //this is hecker's klooge for keeping his fpu stack aligned (ught)
  453. ; @todo gross! these are to line up with the other loop
  //two dummy pushes bring the stack back to 7 entries, matching the depth on
  //the one pixel path so whtp_FPUReturn can free the same registers
  454. fld st(1) ; inv. inv. inv. inv. UR VR
  455. fld st(2) ; inv. inv. inv. inv. inv. UR VR
  456. whtp_OnePixelSpan:
  457. mov esi,[UFixed] ; get starting coordinates
  458. add esi,dword ptr grads.dUdXModifier
  459. mov ecx,[VFixed] ; and v... before ebp gets pushed
  460. add ecx,dword ptr grads.dVdXModifier
  461. ; leftover pixels loop
  462. ; edi = dest dib bits
  463. ; esi = texture dib bits
  464. ; esi = u 16.16
  465. ; ecx = v 16.16
  466. xor eax,eax
  467. //this loop isn't too good... didn't get time to tune it well
  468. //but it's only for the end of the scanline if there are <8 pixels
  469. whtp_LeftoverLoop:
  470. mov edx,ecx ; copy v
  471. mov ebx,esi
  472. sar edx,16 ; int(v)
  //edi is advanced first; the pixel is stored at [edi-2] and [edi-1] below
  473. add edi,2
  474. whtp_twidth_log2_patch3:
  475. sal edx,3
  476. add esi,[DeltaU] ; increment u coordinate
  477. sar ebx,16 ; int(u)
  478. add edx,pTex
  479. add ecx,[DeltaV] ; increment v coordinate
  480. add ebx,edx
  481. mov edx,[LVal]
  482. mov al,byte ptr[1+ebx*2] ; get source texel
  483. rol edx,16
  484. mov bl,byte ptr[ebx*2]
  485. mov ah,dl
  486. mov eax,dword ptr[cTable+8000h+eax*4]
  487. mov bh,dl
  488. and ebx,0ffffh
  489. rol edx,16
  490. add eax,dword ptr[cTable+ebx*4]
  491. add edx,[LDelta]
  492. mov [edi-2],al
  493. mov [LVal],edx
  494. mov [edi-1],ah
  495. dec [WidthModLength] ; decrement loop count
  496. jnz whtp_LeftoverLoop ; finish up
  497. whtp_FPUReturn:
  498. ; busy FPU registers: ; st0 st1 st2 st3 st4 st5 st6 st7
  499. ; xxx xxx xxx xxx xxx xxx xxx
  //mark the seven FPU registers still in use as empty
  500. ffree st(0)
  501. ffree st(1)
  502. ffree st(2)
  503. ffree st(3)
  504. ffree st(4)
  505. ffree st(5)
  506. ffree st(6)
  //the early exit for an empty span jumps straight here; nothing has been
  //pushed on the FPU stack at that point, so only the control word is restored
  507. whtp_Return:
  508. fldcw [OldFPUCW] ; restore the FPU
  509. }
  510. }
  511. #endif