;**********************************
;*        -=DviJoke 3D=-          *
;* Small 3D texture mapped engine *
;*    (C)oded by Sergey Chaban    *
;**********************************

;* Speed-optimized version, average 40 FPS (up to 100+ FPS :-) on P200 MMX
;* Inner loop in a scanconversion optimized a bit (EAX is used to keep the
;  lit values for both edges).
;* Span draw routine optimized. FDIV overlaps with more ALU instructions.
;* Code size reduced by using local variables on the stack and also
;  recycling some fields in the span structure.


COMMENT ~
After spending two weeks on program optimization, cycle timings and
instruction shuffling, I said "Enough! The code runs pretty fast anyway."
My goal was to find compromise between the size, the speed
and the "usability" of the engine and I think I have achieved success.
When I used local variables in spandraw loop I thought this was only the size
optimization. I thought that global variables are faster since they are cached
once for the first span and then are used repeatedly.
This is probably true for the machines with small cache.
But on nowadays machines locals are faster (in this particular case at least).
The only thing we need to do is to align stack on 16-byte boundary at the
start of spandraw routine. Note that this also helps a scanconversion code.
The span structure is 48 (i. e. 16*3) bytes long, what is quite good.
The last optimization I have made was removal of those "subspansizemuls"
out from the inner loop and keeping interpolation step values in FPU stack
instead of memory vars.

There are still things which could be improved.
The first optimization can be done for short spans. Now we perform unnecessary
fmuls and ffrees for such spans though we could avoid them.
But such optimization requires some special case code which is quite bad
for the size.
So the current version is not so big 8), pretty fast and does perspective
correct texture mapped polys, Gouraud shading, Z-plane clipping,
graphical clipping (the edge clipping) and maybe something more I forgot :).

~


; Rotate_Points: transform every point of Points_List into camera space.
; For each point: load three 16-bit integer coordinates, subtract the camera
; position, multiply the resulting vector by Camera_RotationMatrix and store
; three dword floats (X, Y, Z = 12 bytes) into Rotated_Points.
; In:   the point count is the immediate operand of "mov cx" below; NumPoints
;       names that immediate (presumably patched by code outside this view).
; Out:  Rotated_Points filled; the FPU stack depth is the same as on entry.
; Uses: BX (source pointer), DI (destination pointer), CX (counter), flags.
; NOTE(review): Rotate_Vector and Rotate_SelfModi are not referenced in the
; visible code -- presumably they are entry/patch points used elsewhere, so
; keep the instruction layout between the labels unchanged.
Rotate_Points proc
 mov bx,offset Points_List      ; BX -> source points (3 words per point)
 mov si,offset Camera_Data      ; SI -> camera position and rotation matrix
 mov di,offset Rotated_Points   ; DI -> destination (3 dwords per point)
 mov cx,1234h   ; numPoints (placeholder immediate)
NumPoints EQU $-2               ; address of the immediate word of "mov cx" above
Rotate_Next_Point:
 ; Translate: get the point coordinates relative to the camera (a vector)
 fild word ptr [bx]   ; X
 fsub dword ptr [si].Camera_X   ; FPU: relX
 fild word ptr [bx+2] ; Y
 fsub dword ptr [si].Camera_Y   ; FPU: relY, relX
 fild word ptr [bx+4] ; Z 
 fsub dword ptr [si].Camera_Z   ; FPU: relZ, relY, relX

 ; multiply vector by matrix (row*column etc.)
Rotate_Vector:
 ; rotated X = relX*Xx + relY*Yx + relZ*Zx
 fld st(2) ; relX
 fmul dword ptr [si].Camera_RotationMatrix.matr_Xx
 add bx,2*3 ; next point coords
 fld st(2) ; relY
 fmul dword ptr [si].Camera_RotationMatrix.matr_Yx
 fadd                 ; add and pop: relX*Xx + relY*Yx
 fld st(1)            ; relZ
 fmul dword ptr [si].Camera_RotationMatrix.matr_Zx
 fadd                 ; + relZ*Zx
 fstp dword ptr [di] ; rotated X

 ; rotated Y = relX*Xy + relY*Yy + relZ*Zy
 fld st(2) ; relX
 fmul dword ptr [si].Camera_RotationMatrix.matr_Xy
 add di,4             ; DI -> Y slot of the destination
 fld st(2) ; relY
 fmul dword ptr [si].Camera_RotationMatrix.matr_Yy
 fadd
 fld st(1)            ; relZ
 fmul dword ptr [si].Camera_RotationMatrix.matr_Zy
 fadd
 fstp dword ptr [di] ; rotated Y

 ; rotated Z = relZ*Zz + relY*Yz + relX*Xz; this time the three relative
 ; coordinates are consumed in place, which empties the FPU stack.
 fmul dword ptr [si].Camera_RotationMatrix.matr_Zz ; relZ*Zz, relY, relX
 fxch                 ; relY, relZ*Zz, relX
 fmul dword ptr [si].Camera_RotationMatrix.matr_Yz ; relY*Yz, relZ*Zz, relX
 fadd                 ; relY*Yz+relZ*Zz, relX
 add di,8             ; DI -> next destination point (Z slot is now at DI-4)
 fld dword ptr [si].Camera_RotationMatrix.matr_Xz
 fmulp st(2),st       ; relY*Yz+relZ*Zz, relX*Xz
 dec cx               ; sets ZF for the jnz below; the FPU ops in between leave it alone
 fadd
 fstp dword ptr [di-4] ; rotated Z
Rotate_SelfModi:
 jnz Rotate_Next_Point ; loop until CX reaches zero
ProjExit:
 ret                  ; also used as the early-exit target of Project_Polygon
Rotate_Points endp


; Project_Polygon: fetch, light, Z-clip and project one 4-vertex polygon.
; In:  SI = Poly script:
;        byte  bit 7 = highlight flag, bits 0-6 = polygon lit value
;        byte  texture number * 2
;        4 words, one per vertex: #point*12 + offset Rotated_Points
; The code below this block continues with the rasterizer setup; the early
; exits here leave through ProjExit (a plain RET) or AbortPolygon.
Project_Polygon proc
 mov word ptr ds:[Div0_SP],sp ; remember SP (presumably for the abort/div-by-zero path -- confirm)
 push ds
 pop es      ; ES=DS for the string instructions (movsw/stosw) used below
 mov cx,4    ; Vertices per poly
 lodsb
 add al,al   ; Shift bit 7 onto carry
 setc dl     ; DL=1 if hilight bit is set
 shr al,1    ; Shift lit value back with bit 7 clear
 mov byte ptr ds:[PolyLit],al ; patch the high byte of the "mov dx,7F00h" immediate below
 mov al,-50h ; shading bound (negated)
 imul dl     ; AX=-50h*DL
 add ah,2    ; AX+200h=200h-(50h*DL)
 xchg bp,ax  ; BP=200h-ShadingBound, kept for the whole vertex loop
 mov di,offset Vertices
 lodsb ; texture#*2
 mov byte ptr ds:[CurrTextureNum],al
Get_Vertex_Data:
 lodsw ; #point*12 + offset Rotated_Points
 xchg bx,ax               ; BX -> rotated point
 fld dword ptr [bx].VecX  ; X (already rotated)
 mov eax,[bx].VecY        ; Y
 fstp dword ptr [di].Vertex_X3D
 mov [di].Vertex_Y3D,eax
 fld dword ptr [bx].VecZ  ; Z
 fist word ptr [di].Vertex_intZ ; integer copy of Z for the cheap tests
 fstp dword ptr [di].Vertex_Z3D
 mov ax,200h
 sub ax,word ptr [di].Vertex_intZ ; AX=200h-z
 js ProjExit ; Polygon is too far away.
 ;Here comes some tricky code to avoid branches (faster on Pentium Pro?).
 ; Net effect: AX is forced to 1FFh when BP<AX, clamped to 1FFh otherwise,
 ; then scaled by 128 and multiplied by the polygon lit value.
 mov bx,1FFh
 cmp bp,ax   ; CMP (200h-ShadingBound),(200h-z)
 sbb dx,dx   ; DX=0FFFFh if BP<AX (unsigned), else 0
 and dh,bh   ; DX=01FFh or 0
 or ax,dx    ; AX=1FFh if z<ShadingBound
 cmp ax,bx
 sbb dx,dx   ; DX=0FFFFh if AX<1FFh, else 0
 ; and ax,bx ; we can safely ignore these two
 ; and dx,bx
 xor dx,bx   ; DX=0FE00h if AX<1FFh, else 01FFh
 or ax,dx    ; bits 9-15 are garbage here; they are shifted out below
 shl ax,7    ; AX=(low 9 bits)*128
 mov dx,7F00h ; DH=Lit
PolyLit EQU $-1 ; address of the high byte of the immediate above
 mul dx      ; DX=high word of AX*DX
 mov [di].Vertex_Lit,dx
 add di,SIZE VertexData
 dec cx
 jnz Get_Vertex_Data

 lea si,[di-SIZE VertexData*4] ; Point SI to polygon vertice 0
 ;**** Set (u,v) texture coordinates for the current polygon
 ;****  V0(0,0)---------V1(3Fh,0)
 ;****     |               |
 ;****  V3(0,3Fh)------V2(3Fh,3Fh)
 sub eax,eax
 mov [si].Vertex_u,eax                                 ; u0
 mov [si].Vertex_v,eax                                 ; v0
 mov [si + SIZE VertexData].Vertex_v,eax               ; v1
 mov [si + SIZE VertexData * 3].Vertex_u,eax           ; u3
 ; The next three lines assemble "mov eax,imm32" and then overwrite the
 ; immediate with a float constant, i.e. EAX = bit pattern of that float.
 mov eax,12345678h
 ORG $-4                                               ;)
 dd 624230400.0 ; (63)=619315200.0=4128768.0*Magic     ; (63.5)*65536*Magic
 ; Use the same values for the both width and height.
 mov dword ptr [si + SIZE VertexData].Vertex_u,eax     ; u1
 mov dword ptr [si + SIZE VertexData * 2].Vertex_u,eax ; u2
 mov dword ptr [si + SIZE VertexData * 2].Vertex_v,eax ; v2
 mov dword ptr [si + SIZE VertexData * 3].Vertex_v,eax ; v3


 ;*** Now perform clipping on Z-planes
 ;************************************
 ; ST(0) holds the front plane Z (1.0) during the whole loop.
 ; DL is the state of the previous vertex: 0FFh = inside (Z>=1.0), 0 = outside.
 fld1        ; FRONT_PLANE_Z=1
 fcom dword ptr [di-SIZE VertexData].Vertex_Z3D ; last vertex Z
 fnstsw ax
 sahf        ; CF=C0, ZF=C3
 setbe dl    ; DL=1 if 1.0<=Z
 neg dl      ; DL=-1 if the last vertex has Z>=1.0 (inside), else 0
 mov di,offset Vertices2Draw
 sub bp,bp   ; #points counter
 mov dh,4    ; vertices left to examine
ZClip_Loop:
 cmp word ptr [si].Vertex_intZ,0 ; quick reject: integer Z<=0 is certainly out
 jle Z_Out
 ; more precise checking
 fcom dword ptr [si].Vertex_Z3D  ; compare 1.0 with the exact Z
 fnstsw ax
 sahf
 ja Z_Out       ; 1.0>Z => vertex is in front of the plane (out)
 test dl,dl     ; was the previous vertex inside?
 jnz @@JustCopy ; yes - no intersection to add
 call ZPlane_Clipping ; Current Z is in, but previous is out
 not dl         ; previous := inside
@@JustCopy:
 mov cx,SIZE VertexData SHR 1
 rep movsw      ; copy the vertex to ES:DI; SI and DI advance by one vertex
 inc bp
 jmp @@CheckNext
Z_Out:
 test dl,dl
 jz @@BothOut   ; previous is out too - nothing to emit
 call ZPlane_Clipping ; Current Z is out, but previous is in
 not dl         ; previous := outside
@@BothOut:
 add si,SIZE VertexData
@@CheckNext:
 dec dh
 jnz ZClip_Loop
 fstp st(0)  ; pop the FRONT_PLANE_Z
 cmp bp,3
 jb ProjExit ; we don't want the point, line or nothing
 mov cx,bp   ; CX=number of vertices to project
 mov si,offset Vertices2Draw
 push cx si  ; restored after the projection loop
 ; Perspective projection of every vertex in Vertices2Draw.
 ; NOTE(review): on the AbortPolygon exits CX/SI are still pushed and 1/z is
 ; still on the FPU stack; AbortPolygon is outside this view and presumably
 ; cleans both up (Div0_SP) -- confirm.
ProjectEm:
 fld dword ptr [si].Vertex_Z3D
 fiadd word ptr ds:Camera_Data.Camera_FocalDistance
 fdivr dword ptr ds:[fp_One]   ; 1/(z+camera_focal_distance)

 fld dword ptr [si].Vertex_u   ; Texture u * Magic
 fmul st,st(1)                 ; u/z
 ;* skip fimul word ptr ds:[Magic] *** already done
 fstp dword ptr [si].Vertex_u
 fld dword ptr [si].Vertex_v   ; Texture v * Magic
 fmul st,st(1)                 ; v/z
 ;* skip fimul word ptr ds:[Magic] *** already done
 fstp dword ptr [si].Vertex_v


 fld dword ptr [si].Vertex_X3D
 fimul word ptr ds:[Scale_X]
 fmul st,st(1)                             ; X*Scale_X/z
 fiadd word ptr ds:[ScreenCenter_X]
 frndint                                   ; round it
 fmul dword ptr ds:[fp_Scale16]            ; *10000h
 fistp dword ptr [si].Vertex_ScreenX_Frac  ; 16.16 fixed point number
 fnstsw ax
 test al,1                                 ; invalid-operation flag set?
 jnz AbortPolygon
 fld dword ptr [si].Vertex_Y3D
 fimul word ptr ds:[Scale_Y]
 fmul st,st(1)                             ; Y*Scale_Y/z
 fiadd word ptr ds:[ScreenCenter_Y]
 fistp dword ptr [si].Vertex_ScreenY
 fnstsw ax
 test al,1                                 ; invalid-operation flag set?
 jnz AbortPolygon
 fmul dword ptr ds:[Magic16]               ; 1/z * Magic * 65536
 fstp dword ptr [si].Vertex_invZ           ; pops 1/z - FPU stack is balanced again
 add si,SIZE VertexData
 loop ProjectEm
 pop si      ; SI -> Vertices2Draw
 pop cx      ; CX=number of vertices


;***************************************************************************
;* Here comes the heart of the engine -- rasterizer/texture mapper.
;* Perspective correct texture mapped polys with CLUT-based (depth) shading.
;* (C)oded by Sergey Chaban ! Yes, by ME !
;* 
;* This piece of code may look like some sort of abracadabra or
;* black magic text. But... Yes, it is necromancy.
;***************************************************************************
;* This part: In: SI -> projected vertices, CX = number of vertices.
;* 1. Converts the vertices into 20-byte PolyCache records in Coord_Buf.
;*    Every field is stored twice, at [DI] and at [DI+BP] with BP=n*20, so
;*    the list is followed by a second copy of itself.
;* 2. Tracks min/max X and Y and the address of the topmost vertex.
;* 3. Rejects polygons which are completely off screen.
;* 4. Pushes the addresses of the needed clip routines (and Check_Done below
;*    them); each routine ends with RET, which starts the next one.
;* 5. Performs backface culling.
;* Several immediates are patched at run time through "label+1".

  movzx ebp,cx ; BP=CX=Number of polygon vertices
  ;imul bp,20;20 bytes for each vertex -> dw lit,frac_x,x,y; dd tuz, tvz, invz
  lea bp,[ebp+ebp*4]
  shl bp,2     ; BP=n*20 = byte size of the vertex list
  mov word ptr ds:[PolyCount+1],bp ; patch "mov cx,0" at PolyCount
  ; CX=Number of coordinate pairs (in words)
  dec cx      ; One pair is loaded outside the loop.
  dec cx      ; We will check for sign.
  shl ecx,16  ; loop counter lives in the high word of ECX; CX is free for min Y
  mov di,offset Coord_Buf
  mov word ptr [Min_Addr+1],di ; default top vertex = the first one
  ; --- first vertex (initializes the min/max trackers) ---
  mov ax,[si].Vertex_Lit
  mov es:[di+bp],ax
  stosw
  mov eax,dword ptr [si].Vertex_ScreenX_Frac
  mov es:[di+bp],eax
  stosd
  shr eax,16  ; integer part of X
  mov bx,ax   ; Min X
  mov dx,ax   ; Max X
  mov ax,[si].Vertex_ScreenY
  mov cx,ax   ; Min Y
  mov word ptr [Max_Y+1],ax ; max Y is kept in the immediate of "cmp ax,0" at Max_Y
  mov es:[di+bp],ax
  stosw
  mov eax,[si].Vertex_u
  mov es:[di+bp],eax
  stosd
  mov eax,[si].Vertex_v
  mov es:[di+bp],eax
  stosd
  mov eax,[si].Vertex_invZ
  mov es:[di+bp],eax
  stosd
  jmp FillCoordsQueue
  EVEN
  ; --- remaining vertices ---
FillCoordsQueue:
  add si,SIZE VertexData
  mov ax,[si].Vertex_Lit
  mov es:[di+bp],ax
  stosw
  mov eax,dword ptr [si].Vertex_ScreenX_Frac   ; x
  mov es:[di+bp],eax
  stosd 
  shr eax,16                                   ; get integer part of x
  cmp ax,bx
  jnl No_Min_X
  mov bx,ax                                    ; new min X
No_Min_X:
  cmp ax,dx
  jng No_Max_X
  mov dx,ax                                    ; new max X
No_Max_X:
  mov ax,[si].Vertex_ScreenY                   ; y
  cmp ax,cx
  jge Max_Y
  ; new min Y: DI is at the Y field of the current record, so this is its start
  lea cx,[di-(offset PolyCache_Y)]
  mov word ptr [Min_Addr+1],cx                 ; remember the topmost vertex
  mov cx,ax                                    ; new min Y
Max_Y:
  cmp ax,0                                     ; immediate = current max Y (patched)
  jng No_Max_Y
  mov word ptr [Max_Y+1],ax                    ; new max Y
No_Max_Y:
  mov es:[di+bp],ax
  stosw
  mov eax,[si].Vertex_u         ; u/z
  mov es:[di+bp],eax
  stosd
  mov eax,[si].Vertex_v         ; v/z
  mov es:[di+bp],eax
  stosd
  mov eax,[si].Vertex_invZ      ; 1/z
  mov es:[di+bp],eax
  stosd
  sub ecx,1 SHL 16              ; count down in the high word only
  jns FillCoordsQueue
  mov si,cx ; min Y
PolyCount:
  mov cx,0  ; immediate = n*20 (patched above)
  mov ax,word ptr ds:[Max_Y+1]
  mov bp,ax ; BP=max Y
  or ax,dx
  js P_Exit ; Either max_X or max_Y is < 0  -> poly is out of view
  ; Here: BX=min X, DX=max X, SI=min Y, BP=max Y, CX=list size in bytes
PolyMain:
  cmp si,word ptr ds:[Max_Screen_Y+1]
  jge P_Exit ; min_Y>=MaxScreenY
  cmp bx,word ptr ds:[Max_Screen_X+1]
  jg P_Exit ; min_X>MaxScreenX
  mov word ptr ds:[P_Stack+1],sp ; patch "mov sp,0" at P_Stack: SP to restore on abort
  push offset Check_Done         ; runs after the last clip routine
Clip:
  test bx,bx ; Min_X<0 -> clip the left side
  jns Clip_1
  push offset Clip_Left
Clip_1:
  cmp dx,word ptr ds:[Max_Screen_X+1] ; Max_X>MaxScreenX -> clip the right side
  jng Clip_2
  push offset Clip_Right
Clip_2:
  test si,si ; Min_Y<0 -> clip the top side
  jns Clip_3
  push offset Clip_Top
Clip_3:
  cmp bp,word ptr ds:[Max_Screen_Y+1] ; Max_Y>MaxScreenY -> clip the bottom side
  jng Min_Addr
  push offset Clip_Bottom
Min_Addr:
  mov si,1234h  ; Ptr to upper vertex of polygon
Count EQU $-2 ;) ReUse this mem :)  -- the immediate above doubles as the clip loop counter
  ;***** Backface culling
  ; Uses the topmost vertex (0), its successor (1) and its predecessor (n),
  ; the latter addressed as [si+size-SIZE PolyCache].
  mov bx,cx
  mov di,[si].PolyCache_Y
  mov ax,[si + SIZE PolyCache].PolyCache_Y
  sub ax,di ; Y1-Y0
  mov bp,[si].PolyCache_X
  mov dx,[si+bx-SIZE PolyCache].PolyCache_X
  sub dx,bp ; Xn-X0
  imul dx   ; DX:AX=(Y1-Y0)*(Xn-X0)
  xchg ax,di ; AX=Y0, DI=low word of the product
  sub ax,[si+bx-SIZE PolyCache].PolyCache_Y ; Y0-Yn
  mov bx,dx  ; BX=high word of the product
  sub bp,[si+SIZE PolyCache].PolyCache_X    ; X0-X1
  imul bp   ; DX:AX=(Y0-Yn)*(X0-X1)
  ; (Y0-Yn)*(X0-X1)-(Y1-Y0)*(Xn-X0) must be > 0 !see line equation
  sub ax,di
  sbb dx,bx ; 32-bit subtraction
  jl P_Stack ; negative -> culled: drop the pushed clip addresses and return
P_Exit:
  ret       ; after the pushes above this starts the last pushed clip routine;
            ; from the early rejects it simply returns to the caller

;EVEN

COMMENT /
Real edge clipping is performed here, not a line-by-line hack!
Many people don't like this kind of 2D clipping because of its complexity,
but this way is much faster...
/

; Clip_Left: clip the polygon against the line X=0.
; In:  SI -> list of PolyCache records, CX = its size in bytes.
;      The record after the last one is read as the end point of the last edge.
; Out: SI -> Clip_Buf_1 (or one record before it, see Clp_Fin),
;      CX = size of the clipped list in bytes.
;      Jumps to P_Stack (polygon dropped) when nothing is left.
; Every edge (V0,V1) is examined: V0 is copied when it is inside, and an
; intersection vertex is added whenever the edge crosses X=0.
Clip_Left:
  mov di,offset Clip_Buf_1
  mov ax,cx
  mov dl,SIZE PolyCache
  div dl
  ; Num vertices
  mov byte ptr ds:[Count],al
ClpL_1:
  mov bp,[si].PolyCache_X ; X0
  mov dx,[si].PolyCache_Y ; Y0
  add si,SIZE PolyCache   ; SI -> V1; V0 is at [si-SIZE PolyCache]
  test bp,bp
  js ClpL_1stOut          ; V0 is left of the screen
  ; V0 is inside - copy the whole record
  fld dword ptr [si-SIZE PolyCache].PolyCache_u
  fld dword ptr [si-SIZE PolyCache].PolyCache_invZ
  mov ax,[si-SIZE PolyCache].PolyCache_X_Frac
  mov cx,[si-SIZE PolyCache].PolyCache_Lit
  mov [di].PolyCache_X,bp
  mov [di].PolyCache_Y,dx
  mov [di].PolyCache_X_Frac,ax
  mov eax,dword ptr [si-SIZE PolyCache].PolyCache_v
  mov [di].PolyCache_Lit,cx
  fstp dword ptr [di].PolyCache_invZ
  mov dword ptr [di].PolyCache_v,eax
  fstp dword ptr [di].PolyCache_u
  add di,SIZE PolyCache
  mov cx,[si].PolyCache_X ; X1
  test cx,cx
  js ClpL_4               ; V0 inside, V1 outside -> add the intersection
ClpL_NextEdge:
  dec byte ptr ds:[Count]
  jnz ClpL_1
  ; All edges done
  mov si,offset Clip_Buf_1
  mov cx,di
  sub cx,si               ; CX=size of the clipped list
  jz P_Stack              ; nothing left - drop the polygon
  mov ax,[si].PolyCache_Y                ; Y of the first vertex
  mov bx,[di-SIZE PolyCache].PolyCache_Y ; Y of the last vertex
  cmp ax,bx
  jle Clp_Ret             ; first vertex is not below the last one - done
  ; Clp_Fin (shared with Clip_Right): copy the last vertex into the slot just
  ; in front of the first one and let SI point to it.
  ; In: SI -> first record, DI -> end of list, BX = Y of the last record.
  ; NOTE(review): this writes one record BELOW the start of the clip buffer;
  ; presumably the buffers are declared with spare room in front -- confirm.
Clp_Fin:
  fld  dword ptr [di-SIZE PolyCache].PolyCache_invZ
  sub   si,SIZE PolyCache
  mov   eax,dword ptr [di-SIZE PolyCache].PolyCache_X_Frac ; X_Frac and X at once
  fld   dword ptr [di-SIZE PolyCache].PolyCache_v
  mov   dx,[di-SIZE PolyCache].PolyCache_Lit
  mov   [si].PolyCache_Y,bx
  mov   dword ptr [si].PolyCache_X_Frac,eax
  mov   eax,[di-SIZE PolyCache].PolyCache_u
  fstp  dword ptr [si].PolyCache_v
  mov   [si].PolyCache_Lit,dx
  mov   [si].PolyCache_u,eax
  fstp dword ptr [si].PolyCache_invZ
  ret                     ; continue with the next pushed routine

;EVEN

; ClpL_1stOut: left clip, edge whose start V0 is outside (X0<0).
; In: BP=X0, DX=Y0, SI -> V1 (V0 at [si-SIZE PolyCache]), DI -> output slot.
; Writes the intersection with X=0 to [DI] when V1 is inside, then returns
; to ClpL_NextEdge.
ClpL_1stOut:
  mov cx,[si].PolyCache_X ; X1
  test cx,cx
  js ClpL_NextEdge  ; Both X0 and X1 negative -> skip this edge
  ; X0 negative and X1 positive
  ; looks like this:
  ; .A
  ;  \ |   Let  A(X0,Y0), B(X1,Y1), C(X,Y)
  ;   \C           we remember the following formula from the schooltime:
  ;    \|             Y-Y1     X-X1
  ;     \.B          ----- = -----
  ;     |            Y0-Y1    X0-X1
  ; define X=0 =>
  ;        (0-X1)*(Y0-Y1)
  ; Y=Y1 + --------------     DONE!
  ;           X0-X1
  ;
  ; That is, we interpolate Y.
  ;
  ; Okay, but what about the light value and all those texture stuff?
  ; Yeah. Interpolate again!
  ; For the lit:
  ; Interpolation step = delta(Lit)/ABS(delta(X))
  ; and number of steps = ABS(X-X0)=-X0 (due to X=0 and X0<0)
  ; Lit(C)=Lit0+ABS(-X0)*(Lit1-Lit0)/ABS(X1-X0)
  ; We also interpolate u/z, v/z & 1/z in the same way.
  ; Now do it...
  ; (integer and FPU instructions are interleaved; the FPU stack contents
  ;  are listed top first)
  fld dword ptr [si-SIZE PolyCache].PolyCache_invZ ; invZ0
  fld dword ptr [si].PolyCache_invZ                ; invZ1
  mov bx,[si].PolyCache_Y                         ; Y1
  fsub st,st(1)                                   ; invZ1-invZ0
  neg bx                                          ; -Y1
  add bx,dx                                       ; Y0-Y1
  fld dword ptr [si-SIZE PolyCache].PolyCache_u   ; u0
  fld dword ptr [si].PolyCache_u                  ; u1
  sub eax,eax                                     ; new X = 0
  fsub st,st(1)                                   ; u1-u0
  mov dword ptr [di].PolyCache_X_Frac,eax ; X=0 (clears X_Frac and X)
  sub ax,cx                                       ; 0-X1=-X1
  fld dword ptr [si-SIZE PolyCache].PolyCache_v   ; v0
  fld dword ptr [si].PolyCache_v                  ; v1
  mov cx,bp                                       ; CX=X0 (always negative)
  add bp,ax                                       ; X0+(-X1)=X0-X1
  fsub st,st(1)                                   ; v1-v0
  ; FPU stack: v1-v0, v0, u1-u0, u0, invZ1-invZ0, invZ0
  imul bx                                         ; DX:AX=(-X1)*(Y0-Y1)
  idiv bp                                         ; AX=(-X1)*(Y0-Y1)/(X0-X1)
  mov word ptr ds:[FPU_Temp],cx                   ; X0 (negative)
  fimul word ptr ds:[FPU_Temp]                    ; (v1-v0)*(X0)
  fxch st(2)                                      ; swap dv,du
  add ax,[si].PolyCache_Y ; Y=Y1+(-X1)*(Y0-Y1)/(X0-X1)
  mov [di].PolyCache_Y,ax
  fimul word ptr ds:[FPU_Temp] ; du*X0, v0, dv*X0, u0, dz, z0
  mov ax,[si].PolyCache_Lit                       ; Lit1
  fxch st(4)                                      ; swap du,dz
  mov bx,[si-SIZE PolyCache].PolyCache_Lit ; Lit0
  fimul word ptr ds:[FPU_Temp] ; dz*X0, v0, dv*X0, u0, du*X0, z0
  sub ax,bx                                       ; Lit1-Lit0
  ; The divisor must be negative here so that it cancels the sign of X0.
  test bp,bp
  js ClpL_dX_SignOk ; Use inverse checking => BP=(-1)*SIGN(dX)*ABS(dX)
  neg bp
ClpL_dX_SignOk:
  mov word ptr ds:[FPU_Temp],bp
  imul cx                                  ; *X0
  fild word ptr ds:[FPU_Temp]
  fdivr dword ptr ds:[fp_One] ; 1/dx, dz*X0, v0, dv*X0, u0, du*X0, z0
  fxch                        ; dz*X0, 1/dx, v0, dv*X0, u0, du*X0, z0
  idiv bp                                  ; /(-ABS(X0-X1))
  fmul st,st(1)               ; dz*X0/dx
  faddp st(6),st              ; +z0-> 1/dx, v0, dv*X0, u0, du*X0, z0+dz*X0/dx
  fxch st(2)                  ; dv*X0, v0, 1/dx, u0, du*X0, z0+dz*X0/dx
  fmul st,st(2)               ; dv*X0/dx
  add ax,bx                                ; +Lit0
  mov [di].PolyCache_Lit,ax
  faddp st(1),st              ; +v0-> v0+dv*X0/dx
  fstp [di].PolyCache_v       ; 1/dx, u0, du*X0, z0+dz*X0/dx
  fmulp st(2),st              ; u0, du*X0/dx, z0+dz*X0/dx
  faddp st(1),st              ; u0+du*X0/dx, z0+dz*X0/dx
  fstp [di].PolyCache_u
  fstp [di].PolyCache_invZ    ; FPU stack is empty again
  add di,SIZE PolyCache
  jmp ClpL_NextEdge

; ClpL_4: left clip, edge leaving the screen (X0>=0, X1<0).
; In: BP=X0, DX=Y0, CX=X1, SI -> V1 (V0 at [si-SIZE PolyCache]),
;     DI -> output slot. V0 itself has already been copied by Clip_Left.
; Writes the intersection with X=0:
;   Y   = Y0 + (0-X0)*(Y1-Y0)/(X1-X0)
;   val = val0 + X0*(val1-val0)/ABS(X1-X0)   for Lit, u, v and invZ
; then returns to ClpL_NextEdge.
ClpL_4:
  ; X0 positive but X1 is negative
  ; No more ASCII art :(
  fld dword ptr [si-SIZE PolyCache].PolyCache_invZ ; invZ0
  fld dword ptr [si].PolyCache_invZ                ; invZ1
  mov bx,[si].PolyCache_Y   ; Y1
  fsub st,st(1)                                   ; invZ1-invZ0
  sub bx,dx                                       ; Y1-Y0
  fld dword ptr [si-SIZE PolyCache].PolyCache_u   ; u0
  fld dword ptr [si].PolyCache_u                  ; u1
  xor eax,eax                                     ; new X = 0
  fsub st,st(1)                                   ; u1-u0
  mov dword ptr [di].PolyCache_X_Frac,eax ; X=0 (clears X_Frac and X)
  sub ax,bp                                       ; 0-X0
  fld dword ptr [si-SIZE PolyCache].PolyCache_v   ; v0
  fld dword ptr [si].PolyCache_v                  ; v1
  add cx,ax                                       ; X1-X0
  fsub st,st(1)                                   ; v1-v0
  ; FPU stack: v1-v0, v0, u1-u0, u0, invZ1-invZ0, invZ0
  imul bx                                         ; DX:AX=(-X0)*(Y1-Y0)
  idiv cx                                         ; /(X1-X0)
  mov word ptr ds:[FPU_Temp],bp                   ; X0 (positive)
  fimul word ptr ds:[FPU_Temp]                    ; (v1-v0)*(X0)
  fxch st(2)                                      ; swap dv,du
  add ax,[si-SIZE PolyCache].PolyCache_Y          ; +Y0
  mov [di].PolyCache_Y,ax
  fimul word ptr ds:[FPU_Temp] ; du*X0, v0, dv*X0, u0, dz, z0
  mov ax,[si].PolyCache_Lit                       ; Lit1
  fxch st(4)                                      ; swap du,dz
  mov bx,[si-SIZE PolyCache].PolyCache_Lit        ; Lit0
  fimul word ptr ds:[FPU_Temp] ; dz*X0, v0, dv*X0, u0, du*X0, z0
  sub ax,bx                                       ; Lit1-Lit0
  test cx,cx
  jns ClpL_dX_SignOk2 ; X0 is always positive
  neg cx              ; CX=ABS(X1-X0)
ClpL_dX_SignOk2:
  mov word ptr ds:[FPU_Temp],cx
  imul bp                                  ; *X0
  fild word ptr ds:[FPU_Temp]
  fdivr dword ptr ds:[fp_One]              ; 1/dx
  fxch                        ; dz*X0, 1/dx, v0, dv*X0, u0, du*X0, z0
  idiv cx                                  ; /ABS(X0-X1)
  fmul st,st(1)               ; dz*X0/dx
  faddp st(6),st              ; +z0
  fxch st(2)                  ; dv*X0, v0, 1/dx, u0, du*X0, z0+dz*X0/dx
  fmul st,st(2)               ; dv*X0/dx
  add ax,bx                                ; +Lit0
  mov [di].PolyCache_Lit,ax
  faddp st(1),st              ; +v0
  fstp [di].PolyCache_v       ; 1/dx, u0, du*X0, z0+dz*X0/dx
  fmulp st(2),st              ; u0, du*X0/dx, z0+dz*X0/dx
  faddp st(1),st              ; u0+du*X0/dx, z0+dz*X0/dx
  fstp [di].PolyCache_u
  fstp [di].PolyCache_invZ    ; FPU stack is empty again
  add di,SIZE PolyCache
  jmp ClpL_NextEdge


;EVEN

; P_Stack: abandon the current polygon. The immediate of "mov sp" is patched
; in PolyMain with the SP value from before the clip routine addresses were
; pushed, so the RET below returns straight to the caller of the polygon code.
; Clp_Ret: plain RET used by the clip routines to continue with the next
; pushed routine.
P_Stack:
   mov sp,0 ; Restore Stack point (immediate patched via P_Stack+1)
Clp_Ret:
   ret      ; and exit

;EVEN

; Clip_Right: clip the polygon against the line X=Max_Screen_X.
; In:  SI -> list of PolyCache records, CX = its size in bytes.
;      The record after the last one is read as the end point of the last edge.
; Out: SI -> clipped list in Clip_Buf_2 (or one record before it, see
;      Clp_Fin), CX = its size in bytes; one extra copy of a vertex is added
;      by Clp_Fin / Clp_Fin_2.
;      Jumps to P_Stack (polygon dropped) when nothing is left.
Clip_Right:
  mov ax,cx
  mov dl,SIZE PolyCache
  div dl
  mov byte ptr ds:[Count],al ; number of edges to process
  mov di,offset Clip_Buf_2
Max_Screen_X:
  mov cx,319                 ; CX=right bound; the immediate is also read as [Max_Screen_X+1]
ClpR_1:
  mov bp,[si].PolyCache_X    ; X0
  mov dx,[si].PolyCache_Y    ; Y0
  add si,SIZE PolyCache      ; SI -> V1; V0 is at [si-SIZE PolyCache]
  cmp bp,cx
  jg ClpR_3                  ; V0 is right of the screen
  ; V0 is inside - copy the whole record
  fld dword ptr [si-SIZE PolyCache].PolyCache_u
  fld dword ptr [si-SIZE PolyCache].PolyCache_invZ
  mov ax,[si-SIZE PolyCache].PolyCache_X_Frac
  mov [di].PolyCache_X,bp
  mov [di].PolyCache_X_Frac,ax
  mov ax,[si-SIZE PolyCache].PolyCache_Lit
  mov [di].PolyCache_Y,dx
  mov [di].PolyCache_Lit,ax
  fstp dword ptr [di].PolyCache_invZ
  mov eax,dword ptr [si-SIZE PolyCache].PolyCache_v
  mov dword ptr [di].PolyCache_v,eax
  fstp dword ptr [di].PolyCache_u
  add di,SIZE PolyCache
  mov ax,[si].PolyCache_X    ; X1
  cmp ax,cx
  jg ClpR_5                  ; V0 inside, V1 outside -> add the intersection
ClpR_NextEdge:
  dec byte ptr ds:[Count]
  jnz ClpR_1
  ; All edges done
  mov si,offset Clip_Buf_2
  mov cx,di
  sub cx,si                  ; CX=size of the clipped list
  jz P_Stack                 ; nothing left - drop the polygon
  mov ax,[si].PolyCache_Y                ; Y of the first vertex
  mov bx,[di-SIZE PolyCache].PolyCache_Y ; Y of the last vertex
  cmp ax,bx
  jng Clp_Fin_2              ; first is not below last: append a copy of the first
  jmp Clp_Fin                ; otherwise: put a copy of the last in front

; ClpR_3: right clip, edge whose start V0 is outside (X0>bound_X).
; In: BP=X0, DX=Y0, CX=bound_X, SI -> V1 (V0 at [si-SIZE PolyCache]),
;     DI -> output slot.
; Writes the intersection with X=bound_X to [DI] when V1 is inside, reloads
; CX with the bound and returns to ClpR_NextEdge.
ClpR_3:
  mov ax,[si].PolyCache_X ; X1
  cmp ax,cx
  jg ClpR_NextEdge ; Both X0 and X1 is out of view - so sad :(
  ; X0>bound_X, X1 Ok
  ; X=bound_X
  ;
  ;          (bound_X-X1)*(Y0-Y1)
  ; Y = Y1 + --------------------
  ;                (X0-X1)
  ;
  ;
  ;               Lit1-Lit0
  ; Lit = Lit0 + ----------- * ABS(Bound_X-X0)
  ;              ABS(X0-X1)
  ; Interpolate u/z, v/z, 1/z in the same manner.
  fld dword ptr [si].PolyCache_InvZ
  mov bx,[si].PolyCache_Y ; Y1
  fsub dword ptr [si-SIZE PolyCache].PolyCache_InvZ ; Z1-Z0
  sub dx,bx ; Y0-Y1
  mov bx,dx
  xchg ax,cx ; AX=bound_X, CX=X1
  fld dword ptr [si].PolyCache_u
  fsub dword ptr [si-SIZE PolyCache].PolyCache_u     ; u1-u0
  mov word ptr [di].PolyCache_X_Frac,0 ; fraction=0
  mov [di].PolyCache_X,ax ; X=bound_X
  fld dword ptr [si].PolyCache_v
  fsub dword ptr [si-SIZE PolyCache].PolyCache_v     ; v1-v0
  ; FPU stack: dv, du, dz
  mov dx,bp
  sub dx,ax ; X0-bound_X (always positive due to X0>bound_X)
  sub ax,cx ; bound_X-X1
  sub bp,cx ; X0-X1
  mov cx,dx ; CX=X0-bound_X = number of interpolation steps
  mov word ptr ds:[FPU_Temp],cx
  imul bx   ; DX:AX=(bound_X-X1)*(Y0-Y1)
  fimul word ptr ds:[FPU_Temp]  ; dv*(X0-bound_X)
  fxch      ; du, dv*t, dz
  idiv bp   ; /(X0-X1)
  fimul word ptr ds:[FPU_Temp]  ; du*(X0-bound_X)
  fxch st(2) ; dz, dv*t, du*t
  add ax,[si].PolyCache_Y ; +Y1
  mov [di].PolyCache_Y,ax
  fimul word ptr ds:[FPU_Temp]  ; dz*(X0-bound_X)
  mov ax,[si].PolyCache_Lit                ; Lit1
  mov bx,[si-SIZE PolyCache].PolyCache_Lit ; Lit0
  sub ax,bx                                ; Lit1-Lit0
  test bp,bp
  jns ClpR_dX_SignOk_1
  neg bp    ; BP=ABS(X0-X1)
ClpR_dX_SignOk_1:
  mov word ptr ds:[FPU_Temp],bp
  imul cx   ; (Lit1-Lit0)*(X0-bound_X)
  fild word ptr ds:[FPU_Temp]
  fdivr dword ptr ds:[fp_One] ; 1/dx, dz*t, dv*t, du*t
  fxch     ; dz*t, 1/dx, dv*t, du*t
  idiv bp  ; /ABS(X0-X1)
  fmul st,st(1)
  fadd dword ptr [si-SIZE PolyCache].PolyCache_invZ ; +z0
  fstp dword ptr [di].PolyCache_invZ
  fxch     ; dv*t, 1/dx, du*t
  fmul st,st(1)
  add ax,bx ; +Lit0
  mov [di].PolyCache_Lit,ax
  fadd dword ptr [si-SIZE PolyCache].PolyCache_v    ; +v0
  fstp dword ptr [di].PolyCache_v
  fmulp st(1),st ; du*t/dx
  mov cx,word ptr ds:[Max_Screen_X+1] ; restore CX=bound_X for the edge loop
  fadd dword ptr [si-SIZE PolyCache].PolyCache_u    ; +u0
  fstp dword ptr [di].PolyCache_u     ; FPU stack is empty again
  add di,SIZE PolyCache
  jmp ClpR_NextEdge

; ClpR_5: right clip, edge leaving the screen (X0 ok, X1 out).
; In: BP=X0, DX=Y0, AX=X1, CX=bound_X, SI -> V1 (V0 at [si-SIZE PolyCache]),
;     DI -> output slot. V0 itself has already been copied by Clip_Right.
; Writes the intersection with X=bound_X:
;   Y   = Y0 + (bound_X-X0)*(Y1-Y0)/(X1-X0)
;   val = val0 + (bound_X-X0)*(val1-val0)/ABS(X1-X0)  for Lit, u, v and invZ
; then reloads CX with the bound and returns to ClpR_NextEdge.
ClpR_5:
  fld dword ptr [si].PolyCache_invZ                  ; z1
  mov bx,[si].PolyCache_Y ; Y1
  fsub dword ptr [si-SIZE PolyCache].PolyCache_invZ  ; z1-z0
  sub bx,dx  ; Y1-Y0
  xchg ax,cx ; AX=bound_X, CX=X1
  fld dword ptr [si].PolyCache_u                      ; u1
  fsub dword ptr [si-SIZE PolyCache].PolyCache_u      ; u1-u0
  mov word ptr [di].PolyCache_X_Frac,0 ; fraction=0
  mov [di].PolyCache_X,ax ; X=bound_X
  fld dword ptr [si].PolyCache_v                      ; v1
  fsub dword ptr [si-SIZE PolyCache].PolyCache_v      ; v1-v0
  ; FPU stack: dv, du, dz
  sub ax,bp  ; bound_X-X0
  sub cx,bp  ; X1-X0
  mov bp,ax  ; BP=bound_X-X0 -- always positive
  mov word ptr ds:[FPU_Temp],ax
  imul bx    ; DX:AX=(bound_X-X0)*(Y1-Y0); DX:AX stays untouched until the idiv
  fimul word ptr ds:[FPU_Temp]  ; dv*(bound_X-X0)
  fxch       ; du, dv*t, dz
  fimul word ptr ds:[FPU_Temp]  ; du*(bound_X-X0)
  fxch st(2) ; dz, dv*t, du*t
  fimul word ptr ds:[FPU_Temp]  ; dz*(), dv*(), du*()
  idiv cx    ; /(X1-X0)
  add ax,[si-SIZE PolyCache].PolyCache_Y ; +Y0
  mov [di].PolyCache_Y,ax
  mov ax,[si].PolyCache_Lit                ; Lit1
  mov bx,[si-SIZE PolyCache].PolyCache_Lit ; Lit0
  sub ax,bx                                ; Lit1-Lit0
  test cx,cx
  jns ClpR_dX_SignOk_2
  neg cx     ; CX=ABS(X1-X0)
ClpR_dX_SignOk_2:
  mov word ptr ds:[FPU_Temp],cx
  imul bp    ; (Lit1-Lit0)*(bound_X-X0)
  ; [?] FDIV seems cannot overlap with IMUL
  fild word ptr ds:[FPU_Temp]
  fdivr dword ptr ds:[fp_One] ; 1/dx
  fxch                        ; dz*(), 1/dx, dv*(), du*()
  idiv cx    ; /ABS(X1-X0)
  fmul st,st(1)
  fadd dword ptr [si-SIZE PolyCache].PolyCache_invZ   ; +z0
  fstp dword ptr [di].PolyCache_invZ
  fxch                        ; dv*(), 1/dx, du*()
  fmul st,st(1)
  add ax,bx  ; +Lit0
  mov [di].PolyCache_Lit,ax
  fadd dword ptr [si-SIZE PolyCache].PolyCache_v      ; +v0
  fstp dword ptr [di].PolyCache_v
  fmulp st(1),st              ; du*()/dx
  mov cx,word ptr ds:[Max_Screen_X+1] ; restore CX=bound_X for the edge loop
  fadd dword ptr [si-SIZE PolyCache].PolyCache_u      ; +u0
  fstp dword ptr [di].PolyCache_u     ; FPU stack is empty again
  add di,SIZE PolyCache
  jmp ClpR_NextEdge


; Clp_Fin_2: append a copy of the first vertex after the last one.
; In: SI -> first record, DI -> first free slot behind the list,
;     AX = Y of the first record. SI and CX are left unchanged.
Clp_Fin_2:
  fld dword ptr [si].PolyCache_u
  mov [di].PolyCache_Y,ax
  fld dword ptr [si].PolyCache_invZ
  mov ax,[si].PolyCache_Lit
  mov [di].PolyCache_Lit,ax
  fstp dword ptr [di].PolyCache_invZ
  mov eax,dword ptr [si].PolyCache_X_Frac ; X_Frac and X at once
  mov dword ptr [di].PolyCache_X_Frac,eax
  fstp dword ptr [di].PolyCache_u
  mov eax,[si].PolyCache_v
  mov [di].PolyCache_v,eax
  ret                          ; continue with the next pushed routine

;EVEN

; Clip_Top: clip the polygon against the line Y=0.
; In:  SI -> list of PolyCache records whose first vertex has the minimal
;      (and therefore negative) Y.
; This first part skips the leading vertices with Y<0 and writes the
; intersection of the entering edge (Y0<0, Y1>=0) with Y=0 to Top_Buf:
;   X   = X1 + (0-Y1)*(X0-X1)/(Y0-Y1)
;   val = val0 + ABS(Y0)*(val1-val0)/ABS(Y0-Y1)   for Lit, u, v and invZ
; It then falls through into Cl_Top_Copy with SI -> first visible vertex.
Clip_Top:
  mov di,offset Top_Buf
  ;NOTE: since the first Y is minimal it must be negative for this case
Cl_Top_1:
  add si,SIZE PolyCache
  mov ax,[si].PolyCache_Y ; Y1
  test ax,ax
  js Cl_Top_1 ; at least one Y must be > 0
  fld dword ptr [si].PolyCache_invZ                   ; z1
  mov bx,[si-SIZE PolyCache].PolyCache_Y   ; Y0
  sub bx,ax                                ; Y0-Y1
  fsub dword ptr [si-SIZE PolyCache].PolyCache_invZ   ; z1-z0
  neg ax                                   ; 0-Y1
  mov dx,[si-SIZE PolyCache].PolyCache_X   ; X0
  fld dword ptr [si].PolyCache_u                      ; u1
  fsub dword ptr [si-SIZE PolyCache].PolyCache_u      ; u1-u0
  mov bp,[si].PolyCache_X                  ; X1
  sub dx,bp                                ; X0-X1
  fld dword ptr [si].PolyCache_v                      ; v1
  fsub dword ptr [si-SIZE PolyCache].PolyCache_v      ; v1-v0
  ; FPU stack: dv, du, dz
  imul dx                                  ; DX:AX=(0-Y1)*(X0-X1)
  idiv bx                                  ; /(Y0-Y1)
  add ax,bp              ; X=X1+(0-Y1)*(X0-X1)/(Y0-Y1)
  shl eax,16                                          ; fixed point
  mov dword ptr [di].PolyCache_X_Frac,eax             ; X_Frac=0, X=result
  mov bp,[si-SIZE PolyCache].PolyCache_Y              ; Y0 (negative)
  mov [di].PolyCache_Y,ax ; AX=0
  neg bp  ; ABS(Y0)
  mov word ptr ds:[FPU_Temp],bp
  fimul word ptr ds:[FPU_Temp]                        ; dv*ABS(0-Y0)
  fxch                                                ; du, dv*(), dz
  mov ax,[si].PolyCache_Lit ; Lit1
  fimul word ptr ds:[FPU_Temp]                        ; du*()
  fxch st(2)                                          ; dz, dv*(), du*()
  sub ax,[si-SIZE PolyCache].PolyCache_Lit ; Lit1-Lit0
  fimul word ptr ds:[FPU_Temp]                        ; dz*(), dv*(), du*()
  test bx,bx
  jns ClTop_dY_Pos1
  neg bx  ; ABS(Y0-Y1)
ClTop_dY_Pos1:
  mov word ptr ds:[FPU_Temp],bx
  imul bp                     ; (Lit1-Lit0)*ABS(Y0)
  fild word ptr ds:[FPU_Temp]
  fdivr dword ptr ds:[fp_One] ; 1/dy
  fxch                        ; dz*(), 1/dy, dv*(), du*()
  idiv bx                     ; /ABS(Y0-Y1)
  fmul st,st(1)
  fadd dword ptr [si-SIZE PolyCache].PolyCache_invZ  ; +z0
  fstp dword ptr [di].PolyCache_invZ
  fxch                        ; dv*(), 1/dy, du*()
  fmul st,st(1)
  add ax,[si-SIZE PolyCache].PolyCache_Lit ; +Lit0
  ; Lit=Lit0+ABS(Y0-0)*(Lit1-Lit0)/ABS(Y0-Y1)
  mov [di].PolyCache_Lit,ax
  fadd dword ptr [si-SIZE PolyCache].PolyCache_v      ; +v0
  fstp dword ptr [di].PolyCache_v
  fmulp st(1),st              ; du*()/dy
  fadd dword ptr [si-SIZE PolyCache].PolyCache_u      ; +u0
  fstp dword ptr [di].PolyCache_u                     ; FPU stack is empty again
  add di,SIZE PolyCache

; Cl_Top_Copy: second part of the top clip.
; In:  SI -> first vertex with Y>=0, DI -> next free slot in Top_Buf.
; Copies every vertex with Y>=0, then writes the intersection of the
; leaving edge (Y0>=0, Y1<0) with Y=0:
;   X   = X0 + (0-Y0)*(X1-X0)/(Y1-Y0)
;   val = val0 + Y0*(val1-val0)/ABS(Y1-Y0)   for Lit, u, v and invZ
; Out: SI = offset Top_Buf, DI -> end of the clipped list; execution falls
;      through into ClY_Fin.
; (A dead reload of X0 into DX was removed from the interpolation code: DX
;  was never read before "imul bp" overwrote it.)
Cl_Top_Copy:
  ; copy current - it SHOULD be positive
  fld dword ptr [si].PolyCache_u
  fld dword ptr [si].PolyCache_invZ
  mov ebp,dword ptr [si].PolyCache_X_Frac ; X_Frac and X at once
  mov ebx,dword ptr [si].PolyCache_v
  mov ax,word ptr [si].PolyCache_Lit
  fstp dword ptr [di].PolyCache_invZ
  mov dword ptr [di].PolyCache_X_Frac,ebp
  mov [di].PolyCache_Lit,ax
  mov dword ptr [di].PolyCache_v,ebx
  mov ax,[si].PolyCache_Y ; Y0
  fstp dword ptr [di].PolyCache_u
  shr ebp,16              ; X0
  mov [di].PolyCache_Y,ax
  add si,SIZE PolyCache
  add di,SIZE PolyCache
  mov bx,[si].PolyCache_Y ; Y1
  test bx,bx
  jns Cl_Top_Copy ; copy all following positives
  ; Now we back to negatives
  ; Here: AX=Y0 and BP=X0 of the last copied vertex, BX=Y1 (negative)
  fld dword ptr [si].PolyCache_invZ                 ; z1
  mov dx,[si].PolyCache_X ; X1
  fsub dword ptr [si-SIZE PolyCache].PolyCache_invZ ; z1-z0
  sub bx,ax ; Y1-Y0
  fld dword ptr [si].PolyCache_u
  fsub dword ptr [si-SIZE PolyCache].PolyCache_u     ; u1-u0
  neg ax    ; 0-Y0
  sub dx,bp ; X1-X0
  fld dword ptr [si].PolyCache_v
  fsub dword ptr [si-SIZE PolyCache].PolyCache_v     ; v1-v0
  ; FPU stack: dv, du, dz
  imul dx   ; DX:AX=(0-Y0)*(X1-X0)
  idiv bx   ; /(Y1-Y0)
  add ax,bp ; X=X0+(0-Y0)*(X1-X0)/(Y1-Y0)
  shl eax,16 ; convert to fixed point
  mov dword ptr [di].PolyCache_X_Frac,eax            ; X_Frac=0, X=result
  mov bp,[si-SIZE PolyCache].PolyCache_Y             ; Y0 - always positive
  mov word ptr ds:[FPU_Temp],bp
  fimul word ptr ds:[FPU_Temp]                       ; dv*()
  fxch                                               ; du, dv*(), dz
  mov word ptr [di].PolyCache_Y,ax ; AX=0
  fimul word ptr ds:[FPU_Temp]                       ; du*()
  fxch st(2)                                         ; dz, dv*(), du*()
  mov ax,[si].PolyCache_Lit ; Lit1
  fimul word ptr ds:[FPU_Temp]                       ; dz*(), dv*(), du*()
  sub ax,[si-SIZE PolyCache].PolyCache_Lit ; Lit1-Lit0
  test bx,bx
  jns ClTop_dY_Pos2
  neg bx ; ABS(Y1-Y0)
ClTop_dY_Pos2:
  mov word ptr ds:[FPU_Temp],bx
  imul bp                     ; (Lit1-Lit0)*Y0
  fild word ptr ds:[FPU_Temp]
  fdivr dword ptr ds:[fp_One] ; 1/dy
  fxch                        ; dz*(), 1/dy, dv*(), du*()
  idiv bx                     ; /ABS(Y1-Y0)
  fmul st,st(1)
  ; Lit=Lit0+ABS(Y0-0)*(Lit1-Lit0)/ABS(Y1-Y0)
  add ax,[si-SIZE PolyCache].PolyCache_Lit           ; +Lit0
  fadd dword ptr [si-SIZE PolyCache].PolyCache_invZ  ; +z0
  fstp dword ptr [di].PolyCache_invZ
  fxch                     ; dv*(), 1/dy, du*()
  fmul st,st(1)
  mov [di].PolyCache_Lit,ax
  fadd dword ptr [si-SIZE PolyCache].PolyCache_v     ; +v0
  fstp dword ptr [di].PolyCache_v
  fmulp st(1),st           ; du*()/dy
  fadd dword ptr [si-SIZE PolyCache].PolyCache_u     ; +u0
  mov si,offset Top_Buf    ; SI -> start of the clipped list (for ClY_Fin)
  fstp dword ptr [di].PolyCache_u                    ; FPU stack is empty again
  add di,SIZE PolyCache
; ClY_Fin: common tail of the top and bottom clip routines.
; In:  SI -> start of the clipped list, DI -> first free slot behind it.
; Out: CX = size of the list in bytes; a copy of the first vertex is written
;      behind the last one. Jumps to P_Stack (polygon dropped) when the list
;      is empty.
ClY_Fin:
  mov cx,di
  sub cx,si
  jz P_Stack
  fld dword ptr [si].PolyCache_u
  fld dword ptr [si].PolyCache_invZ
  mov eax,dword ptr [si].PolyCache_X_Frac ; X_Frac and X at once
  mov dword ptr [di].PolyCache_X_Frac,eax
  mov ax,[si].PolyCache_Y
  mov [di].PolyCache_Y,ax
  fstp dword ptr [di].PolyCache_invZ
  mov ax,[si].PolyCache_Lit
  mov [di].PolyCache_Lit,ax
  mov eax,dword ptr [si].PolyCache_v
  fstp dword ptr [di].PolyCache_u
  mov dword ptr [di].PolyCache_v,eax
  ret                          ; continue with the next pushed routine

;EVEN

; Clip_Bottom: clip the polygon against the line Y=Max_Screen_Y.
; In:  SI -> list of PolyCache records, CX = its size in bytes.
;      The record after the last one is read as the end point of the last edge.
; Out: via ClY_Fin: SI -> Bottom_Buf, CX = size of the clipped list, a copy
;      of the first vertex appended behind it.
Clip_Bottom:
  mov ax,cx
  mov dl,SIZE PolyCache
  div dl
  mov byte ptr ds:[Count],al ; number of edges to process
  mov di,offset Bottom_Buf
Max_Screen_Y:
  mov cx,200                 ; CX=bottom bound; the immediate is also read as [Max_Screen_Y+1]
Btm_1:
  mov dx,[si].PolyCache_X    ; X0
  mov bp,[si].PolyCache_Y    ; Y0
  add si,SIZE PolyCache      ; SI -> V1; V0 is at [si-SIZE PolyCache]
  cmp bp,cx
  jg Btm_1stOut              ; V0 is below the bound
  ; V0 is inside - copy the whole record
  fld dword ptr [si-SIZE PolyCache].PolyCache_u
  fld dword ptr [si-SIZE PolyCache].PolyCache_invZ
  mov ax,[si-SIZE PolyCache].PolyCache_X_Frac
  mov [di].PolyCache_X,dx
  mov bx,[si-SIZE PolyCache].PolyCache_Lit
  mov [di].PolyCache_Y,bp
  mov [di].PolyCache_X_Frac,ax
  mov eax,dword ptr [si-SIZE PolyCache].PolyCache_v
  fstp dword ptr [di].PolyCache_invZ
  mov [di].PolyCache_Lit,bx
  fstp dword ptr [di].PolyCache_u
  mov [di].PolyCache_v,eax
  add di,SIZE PolyCache
  mov ax,[si].PolyCache_Y    ; Y1
  cmp ax,cx
  jg Btm_2ndOut              ; V0 inside, V1 outside -> add the intersection
Btm_NextEdge:
  dec byte ptr ds:[Count]
  jnz Btm_1
  mov si,offset Bottom_Buf
  jmp ClY_Fin                ; compute the size and append the first vertex
; Btm_1stOut: bottom clip, edge whose start V0 is outside (Y0>bound_Y).
; In: DX=X0, BP=Y0, CX=bound_Y, SI -> V1 (V0 at [si-SIZE PolyCache]),
;     DI -> output slot.
; When V1 is inside, writes the intersection with Y=bound_Y to [DI]:
;   X   = X1 + (bound_Y-Y1)*(X0-X1)/(Y0-Y1)
;   val = val0 + (Y0-bound_Y)*(val1-val0)/(Y0-Y1)   for Lit, u, v and invZ
; and returns to Btm_NextEdge.
; (A dead load of X0 into DX was removed: DX was reloaded with Y0 before it
;  was ever read.)
Btm_1stOut:
  mov ax,[si].PolyCache_Y ; Y1
  cmp ax,cx
  jg Btm_NextEdge ; both out - skip this edge
  fld dword ptr [si].PolyCache_invZ
  mov bx,[si].PolyCache_X ; X1
  sub dx,bx ; X0-X1
  fsub dword ptr [si-SIZE PolyCache].PolyCache_invZ ; z1-z0
  neg ax    ; -Y1
  fld dword ptr [si].PolyCache_u
  fsub dword ptr [si-SIZE PolyCache].PolyCache_u ; u1-u0
  add bp,ax ; Y0-Y1
  mov word ptr ds:[FPU_Temp],bp
  fld dword ptr [si].PolyCache_v
  add ax,cx ; bound_Y-Y1
  fsub dword ptr [si-SIZE PolyCache].PolyCache_v ; dv, du, dz
  imul dx   ; X1+(bound_Y-Y1)*(X0-X1)/(Y0-Y1)
  fild word ptr ds:[FPU_Temp]
  fdivr dword ptr ds:[fp_One]  ; 1/dy
  fxch                         ; dv, 1/dy, du, dz
  idiv bp   ; /(Y0-Y1)
  fmul st,st(1)                ; dv/()
  fxch st(2)
  fmul st,st(1)                ; du/(), 1/(), dv/(), dz
  fxch st(3)
  fmulp st(1),st               ; dz/(), dv/(), du/()
  add ax,bx ; +X1
  shl eax,16 ; convert to fixed point
  mov dword ptr [di].PolyCache_X_Frac,eax ; X_Frac=0, X=result
  mov dx,[si-SIZE PolyCache].PolyCache_Y ; Y0>Y
  sub dx,cx  ; Y0-Y=ABS(Y0-Y) due to Y0>Y
  mov word ptr [di].PolyCache_Y,cx ; Y=bound_Y
  mov word ptr ds:[FPU_Temp],dx
  fimul word ptr ds:[FPU_Temp] ; dz/dy*(Y0-bound_Y)
  ; Y0>bound_y, Y1<bound_y => BP=Y0-Y1>0
  mov ax,[si].PolyCache_Lit ; Lit1
  mov bx,[si-SIZE PolyCache].PolyCache_Lit ; Lit0
  sub ax,bx  ; Lit1-Lit0
  fadd dword ptr [si-SIZE PolyCache].PolyCache_invZ ; +z0
  imul dx    ; (Lit1-Lit0)*(Y0-bound_Y)
  fstp dword ptr [di].PolyCache_invZ
  fimul word ptr ds:[FPU_Temp] ; dv/dy*(Y0-bound_Y)
  idiv bp    ; /(Y0-Y1)
  fadd dword ptr [si-SIZE PolyCache].PolyCache_v ; +v0
  fstp dword ptr [di].PolyCache_v
  fimul word ptr ds:[FPU_Temp] ; du/dy*(Y0-bound_Y)
  add ax,bx ; Lit=Lit0+ABS(bound_Y-Y0)*(Lit1-Lit0)/ABS(Y0-Y1)
  mov [di].PolyCache_Lit,ax
  fadd dword ptr [si-SIZE PolyCache].PolyCache_u ; +u0
  fstp dword ptr [di].PolyCache_u                ; FPU stack is empty again
  add di,SIZE PolyCache
  jmp Btm_NextEdge

; ---------------------------------------------------------------------------
; Btm_2ndOut - bottom-boundary clipping of one edge whose first vertex
; (V0 = [si-SIZE PolyCache]) is inside and whose second vertex (V1 = [si])
; is outside.  The intersection with Y=bound_Y is interpolated and appended
; at [di].
; On entry: AX = Y1 (loaded just before the branch here), CX = bound_Y,
;           DX = X0, BP = Y0 (the values stored for V0 before the branch).
; Clobbers AX, BX, DX, BP and FPU_Temp; DI advances by SIZE PolyCache.
; The FPU stack depth is unchanged on exit.
; ---------------------------------------------------------------------------
Btm_2ndOut:
  fld dword ptr [si].PolyCache_invZ
  mov bx,[si].PolyCache_X ; X1
  sub dx,bx ; X0-X1
  fsub dword ptr [si-SIZE PolyCache].PolyCache_invZ ; z1-z0
  neg ax
  fld dword ptr [si].PolyCache_u
  add bp,ax  ; Y0-Y1
  fsub dword ptr [si-SIZE PolyCache].PolyCache_u ; u1-u0
  add ax,cx  ; bound_Y-Y1
  fld dword ptr [si].PolyCache_v
  fsub dword ptr [si-SIZE PolyCache].PolyCache_v ; dv, du, dz
  mov word ptr ds:[FPU_Temp],bp
  imul dx    ; X1+(bound_Y-Y1)*(X0-X1)/(Y0-Y1)
  fild word ptr ds:[FPU_Temp]
  fdivr dword ptr ds:[fp_One]  ; 1/(), dv, du, dz
  fxch                         ; dv, 1/(), du, dz
  idiv bp
  fmul st,st(1)                ; dv/(), 1/(), du, dz
  fxch st(2)
  fmul st,st(1)                ; du/(), 1/(), dv/(), dz
  fxch st(3)
  fmulp st(1),st               ; dz/(), dv/(), du/()
  add ax,bx
  shl eax,16 ; convert to fixed point
  ; one dword store: X_Frac gets 0, the following word gets the integer X
  mov dword ptr [di].PolyCache_X_Frac,eax
  mov dx,[si-SIZE PolyCache].PolyCache_Y ; Y0
  mov word ptr [di].PolyCache_Y,cx ; Y=bound_Y
  sub dx,cx ; Y0<bound_Y => Y0-bound_Y<0
            ;! BP is <0 too => we can avoid checks and negs safely
  mov word ptr ds:[FPU_Temp],dx
  fimul word ptr ds:[FPU_Temp]     ; (dz/dy)*(Y0-bound_Y)
  mov ax,[si].PolyCache_Lit ; Lit1
  mov bx,[si-SIZE PolyCache].PolyCache_Lit ; Lit0
  fadd dword ptr [si-SIZE PolyCache].PolyCache_invZ ; + z0
  sub ax,bx
  fstp dword ptr [di].PolyCache_invZ
  fimul word ptr ds:[FPU_Temp]     ; (dv/dy)*(Y0-bound_Y)
  imul dx                          ; (Lit1-Lit0)*(Y0-bound_Y)
  idiv bp                          ; /(Y0-Y1)
  fadd dword ptr [si-SIZE PolyCache].PolyCache_v
  fstp dword ptr [di].PolyCache_v
  fimul word ptr ds:[FPU_Temp]     ; (du/dy)*(Y0-bound_Y)
  add ax,bx  ; Lit=Lit0+ABS(bound_Y-Y0)*(Lit1-Lit0)/ABS(Y0-Y1)
  mov [di].PolyCache_Lit,ax
  fadd dword ptr [si-SIZE PolyCache].PolyCache_u
  fstp dword ptr [di].PolyCache_u
  add di,SIZE PolyCache            ; one vertex emitted
  jmp Btm_NextEdge


;EVEN

; ---------------------------------------------------------------------------
; Compute_RightSlope - set up interpolation for the right-hand polygon edge
; running from vertex [si] to vertex [si+SIZE PolyCache].
; IN:   SI  = PolyCache entry the edge starts at
; OUT:  EBP = X of [si] in 16.16 (also stored in Polygon_SpanRight)
;       CX  = Y of [si]
;       SI  = advanced by SIZE PolyCache
;       Polygon_SpanRightInc, Polygon_Right_{u,v,z}, Polygon_Right_{u,v,z}_inc,
;       Polygon_RightLit, Polygon_RightLitInc and the Poly_RightEdge_{u,v,z},
;       Poly_RightEdgeLit copies are written.
; Zero-height edge: all steps are stored as 0.
; Next vertex above the current one: the return address is popped and
; control goes directly to Start_Paint (this routine does not return).
; Clobbers EAX, ECX, EDX, DI, FPU_Temp.  FPU stack depth unchanged on exit.
; ---------------------------------------------------------------------------
Compute_RightSlope:
 mov ebp,dword ptr [si].PolyCache_X_Frac ; X(n) in 16.16 fixed point
 mov dword ptr ds:[Polygon_SpanRight],ebp
 mov ax,[si+SIZE PolyCache].PolyCache_Y ; Y(n+1)
 sub ax,[si].PolyCache_Y   ; Y(n+1)-Y(n)
 movsx ecx,ax              ; ECX=dY; MOVSX leaves the flags of SUB intact
 jle Invalid_Y_Difference  ; dY<=0 is handled separately
 mov eax,dword ptr [si+SIZE PolyCache].PolyCache_X_Frac ;X(n+1) in 16.16
 sub eax,ebp
 cdq
 idiv ecx ;dX(n)/dY(n) = interpolation step
SetSpanRightInc:
 mov dword ptr ds:[Polygon_SpanRightInc],eax
 ; Start values of the edge, stored twice: as the running right-edge
 ; values and as the Poly_RightEdge_* copies.
 mov eax,dword ptr [si].PolyCache_u
 mov dword ptr ds:[Polygon_Right_u],eax
 mov dword ptr ds:[Poly_RightEdge_u],eax
 mov eax,dword ptr [si].PolyCache_v
 mov dword ptr ds:[Polygon_Right_v],eax
 mov dword ptr ds:[Poly_RightEdge_v],eax
 mov eax,dword ptr [si].PolyCache_invZ
 mov dword ptr ds:[Polygon_Right_z],eax
 mov dword ptr ds:[Poly_RightEdge_z],eax
 mov di,[si].PolyCache_Lit ; Lit0
 mov ax,cx                 ; if dY=0 this 0 becomes the lit step
 jcxz SetSpanRightLitStep  ; dY=0: the three zero steps are already on the FPU
 mov word ptr ds:[FPU_Temp],cx
 fld dword ptr [si+SIZE PolyCache].PolyCache_u
 fsub dword ptr [si].PolyCache_u
 mov ax,[si+SIZE PolyCache].PolyCache_Lit ; Lit1-
 sub ax,di                                ; -Lit0
 cwd
 fild word ptr ds:[FPU_Temp]
 fdivr dword ptr ds:[fp_One]   ; 1/(), du
 idiv cx                       ; AX=(Lit1-Lit0)/dY
 fmul st(1),st                 ; 1/(), du/()
 fld dword ptr [si+SIZE PolyCache].PolyCache_v
 fsub dword ptr [si].PolyCache_v
 fmul st,st(1)
 fxch                          ; 1/(), dv/(), du/()
 fld dword ptr [si+SIZE PolyCache].PolyCache_invZ
 fsub dword ptr [si].PolyCache_invZ
 fmulp st(1),st                ; zstep, vstep, ustep
SetSpanRightLitStep:
 mov word ptr ds:[Polygon_RightLitInc],ax
 mov word ptr ds:[Polygon_RightLit],di
 mov word ptr ds:[Poly_RightEdgeLit],di
 fstp dword ptr ds:[Polygon_Right_z_inc]
 fstp dword ptr ds:[Polygon_Right_v_inc]
 fstp dword ptr ds:[Polygon_Right_u_inc]
 mov cx,[si].PolyCache_Y ; Y(n)
 ; inc ebp ; /**????**/
 add si,SIZE PolyCache
 ret
Invalid_Y_Difference:
 jnz JumpToPaint ; dY<0 (flags still from the SUB above)
 fldz           ; zstep=vstep=ustep=0
 fld st(0)
 fld st(0)
 sub eax,eax    ; X step=0
 jmp SetSpanRightInc
JumpToPaint:
 pop ax              ; CLEAR STACK (drop our return address)
 jmp Start_Paint     ; JUMP IF Y2<Y1

;EVEN

; ---------------------------------------------------------------------------
; Check_Done / Rasterize - start of scan conversion.
; Check_Done: SI = first PolyCache vertex (its Y is patched into the
; immediate at PolygonsMinimumY as the polygon's minimum Y).  BX is set to
; SI+CX - presumably CX is the byte size of the vertex list, so BX points
; just past the last vertex; confirm at the jump site.
; Rasterize: steps BX back one vertex and computes the left-edge steps
; from the edge start values held in the Poly_RightEdge_* variables, EBP
; (start X in 16.16) and CX (start Y).
; ---------------------------------------------------------------------------
Check_Done:
 IFDEF __STATISTICS
 add dword ptr ds:[PolysDone],1
 adc word ptr ds:[PolysDone+4],0
 ENDIF
 mov bx,si 
 add bx,cx 
 mov ax,[si].PolyCache_Y ; AX=Min_Y
 mov word ptr ds:[PolygonsMinimumY+1],ax ; patch operand of MOV DI,imm16
 call Compute_RightSlope ; returns EBP=start X, CX=start Y, SI=next vertex
Rasterize:
 sub bx,SIZE PolyCache  ; BX points on left side
 mov ax,[bx].PolyCache_Y
 sub ax,cx ; dY for the left side
 jle Is_This_Done ; should be >= MinY

 fld dword ptr [bx].PolyCache_u
 fsub dword ptr ds:[Poly_RightEdge_u]
 movsx edi,ax                            ; EDI=dY (divisor for X and lit)
 fld dword ptr [bx].PolyCache_v
 fsub dword ptr ds:[Poly_RightEdge_v]
 mov word ptr ds:[FPU_Temp],ax

 mov dx,0FADEh                           ; self-modified: edge start lit
Poly_RightEdgeLit EQU $-2

 fld dword ptr [bx].PolyCache_invZ
 mov ax,[bx].PolyCache_Lit
 mov word ptr ds:[Polygon_LeftLit],dx
 fsub dword ptr ds:[Poly_RightEdge_z]   ; dz, dv, du
 fild word ptr ds:[FPU_Temp]
 sub ax,dx                              ; lit delta along the left edge
 cwd
 fdivr dword ptr ds:[fp_One]            ; 1/dY, dz, dv, du
 idiv di
 mov word ptr ds:[Polygon_LeftLitInc],ax
 fmul st(1),st
 fmul st(2),st
 fmulp st(3),st                          ; dz/dy, dv/dy, du/dy

 ; left edge starts from the same values the edge start copies hold
 mov eax,dword ptr ds:[Poly_RightEdge_u]
 mov dword ptr ds:[Polygon_Left_u],eax
 mov eax,dword ptr ds:[Poly_RightEdge_v]
 mov dword ptr ds:[Polygon_Left_v],eax
 mov eax,dword ptr ds:[Poly_RightEdge_z]
 mov dword ptr ds:[Polygon_Left_z],eax

 fstp dword ptr ds:[Polygon_Left_z_Inc]
 mov eax,dword ptr [bx].PolyCache_X_Frac ; X(n-1)
 sub eax,ebp   ; X(n-1)-X(n)
 fstp dword ptr ds:[Polygon_Left_v_Inc]
 cdq
 idiv edi                                ; left X step per scanline (16.16)
 fstp dword ptr ds:[Polygon_Left_u_Inc]

SetSpanLeftInc:
 mov dword ptr ds:[Polygon_SpanLeftInc],eax
 mov dword ptr ds:[Polygon_SpanLeft],ebp ;left X in 16.16
; ---------------------------------------------------------------------------
; ProdNextEdge / Store_Polygon_Lines - emit one Span record per scanline
; from the current Y (CX) down to the nearer of the two edge ends.
; Each record is built on the stack: three dwords are pushed (left X,
; right X, packed lit) and the remaining (SIZE Span - 12) bytes are
; reserved below them; six of those dwords are filled by FIST with
; left/right 1/z, v, u.
; The MOV/ADD immediates below are self-modified operands: the EQU names
; address the immediate fields and are written elsewhere.
; ---------------------------------------------------------------------------
ProdNextEdge:
 mov ax,[bx].PolyCache_Y ; Y(n-1)
 cmp ax,[si].PolyCache_Y ; cmp Y(n-1),Y(n)
 jle PredLessNext
 mov ax,[si].PolyCache_Y ; AX=min[Y(n-1),Y(n)]
PredLessNext:
 cmp bx,si
 jbe ReturnToBeginning ; left and right vertex pointers met: last section
 sub ax,cx          ;Yn-Ymin
 jle Next_Step      ; nothing to draw in this section
 xchg ax,cx         ;CX=NextY-CurrY

 mov word ptr ds:[Poly_SaveBX+1],bx ; EBX is reused below; saved in MOV BX,imm16

 ;****** interpolation

 fld dword ptr ds:[Polygon_Right_u]
 fld dword ptr ds:[Polygon_Left_u]
 fld dword ptr ds:[Polygon_Right_v]
 fld dword ptr ds:[Polygon_Left_v]
 fld dword ptr ds:[Polygon_Right_z]
 fld dword ptr ds:[Polygon_Left_z]
 ; FPU: lz, rz, lv, rv, lu, ru

 mov edi,0DEADFADEh
Polygon_SpanLeft EQU $-4

 mov edx,0ACDCABBAh
Polygon_SpanRight EQU $-4

 mov eax,0C0DECAFEh
Polygon_LeftLit EQU $-2   ; E[ax]
Polygon_RightLit EQU $-4  ; AX
Polygon_Lit EQU $-4

 mov ebx,12345678h
Polygon_SpanLeftInc EQU $-4

Store_Polygon_Lines:
 push edi                    ; Left X
 push edx                    ; Right X
 push eax                    ; E[ax]=Left lit, AX=Right lit
 mov bp,sp
 sub sp,(SIZE Span - 3*4)    ; reserve the rest of the Span record
 fist dword ptr [bp-4]       ; Left 1/z in 16.16*Magic
 fadd dword ptr ds:[Polygon_Left_z_Inc]
 fxch                        ; rz, lz, lv, rv, lu, ru
 fist dword ptr [bp-8]       ; Right 1/z in 16.16*Magic
 fadd dword ptr ds:[Polygon_Right_z_Inc]
 fxch st(2)                  ; lv, lz, rz, rv, lu, ru
 fist dword ptr [bp-12]      ; Left 1/v in 16.16*Magic
 fadd dword ptr ds:[Polygon_Left_v_Inc]
 fxch st(3)                  ; rv, lz, rz, lv, lu, ru

 fist dword ptr [bp-16]       ; Right 1/v in 16.16*Magic
 fadd dword ptr ds:[Polygon_Right_v_Inc]
 fxch st(4)                   ; lu, lz, rz, lv, rv, ru

 fist dword ptr [bp-20]       ; Left 1/u in 16.16*Magic
 fadd dword ptr ds:[Polygon_Left_u_Inc]
 fxch st(5)                   ; ru, lz, rz, lv, rv, lu

 fist dword ptr [bp-24]       ; Right 1/u in 16.16*Magic
 fadd dword ptr ds:[Polygon_Right_u_Inc]


 add edi,ebx                  ; Left edge x
 add edx,12345678h            ; Right edge x
Polygon_SpanRightInc EQU $-4
 add eax,12345678h            ; both lit values stepped by one 32-bit add
Polygon_LeftLitInc EQU $-2
Polygon_RightLitInc EQU $-4
Polygon_LitInc EQU $-4

 ; restore the FPU order lz, rz, lv, rv, lu, ru for the next line
 fxch st(5)
 fxch st(4)
 fxch st(3)
 fxch st(2)
 fxch

 dec cx
 jnz Store_Polygon_Lines

 ;*************************

 ; write the running values back so the next section continues from them
 fstp dword ptr ds:[Polygon_Left_z]
 fstp dword ptr ds:[Polygon_Right_z]
 fstp dword ptr ds:[Polygon_Left_v]
 fstp dword ptr ds:[Polygon_Right_v]
 fstp dword ptr ds:[Polygon_Left_u]
 fstp dword ptr ds:[Polygon_Right_u]
 mov dword ptr ds:[Polygon_SpanLeft],edi
 mov dword ptr ds:[Polygon_SpanRight],edx
 mov dword ptr ds:[Polygon_Lit],eax

Poly_SaveBX:
 mov bx,1234h                 ; self-modified: restores the saved BX

; ---------------------------------------------------------------------------
; Next_Step - make the vertex at [bx] the new edge start: its lit, X (EBP),
; Y (CX), u, v and 1/z are loaded into the Poly_RightEdge_* variables.
; If that vertex is not below the vertex at [si], the next left edge is set
; up (Rasterize); otherwise the next right edge is set up
; (Compute_RightSlope) and span emission continues at ProdNextEdge.
; Is_This_Done - reached from Rasterize with the flags of the left-edge dY:
; dY<0 ends scan conversion (Start_Paint); dY=0 installs the vertex values
; with all left-edge steps set to zero.
; ---------------------------------------------------------------------------
Next_Step:
 mov ax,[bx].PolyCache_Lit
 mov ebp,dword ptr [bx].PolyCache_X_Frac
 mov cx,[bx].PolyCache_Y
 mov word ptr ds:[Poly_RightEdgeLit],ax
 mov eax,[bx].PolyCache_u
 mov dword ptr ds:[Poly_RightEdge_u],eax
 mov eax,[bx].PolyCache_v
 mov dword ptr ds:[Poly_RightEdge_v],eax
 mov eax,[bx].PolyCache_invZ
 mov dword ptr ds:[Poly_RightEdge_z],eax
 cmp cx,[si].PolyCache_Y
 jle Rasterize            ; left vertex not below the right one
 call Compute_RightSlope  ; reloads EBP/CX from [si] and advances SI
 jmp ProdNextEdge
Is_This_Done:
 jnz Start_Paint          ; dY<0: polygon finished
 ; dY=0: horizontal left edge - take the vertex values, no stepping
 mov ax,[bx].PolyCache_Lit
 mov word ptr ds:[Polygon_LeftLit],ax
 mov eax,[bx].PolyCache_u
 mov dword ptr ds:[Polygon_Left_u],eax
 mov eax,[bx].PolyCache_v
 mov dword ptr ds:[Polygon_Left_v],eax
 mov eax,[bx].PolyCache_invZ
 mov dword ptr ds:[Polygon_Left_z],eax
 sub eax,eax
 mov word ptr ds:[Polygon_LeftLitInc],ax
 mov dword ptr ds:[Polygon_Left_z_Inc],eax
 mov dword ptr ds:[Polygon_Left_u_Inc],eax
 mov dword ptr ds:[Polygon_Left_v_Inc],eax
 jmp SetSpanLeftInc       ; EAX=0 is stored as the left X step
; ---------------------------------------------------------------------------
; ReturnToBeginning / Store_Last_Lines - final section of the polygon,
; entered from ProdNextEdge when the left and right vertex pointers have
; met.  AX = Y of the section end, CX = current Y.
; Emits AX-CX Span records exactly like Store_Polygon_Lines, but reads the
; running values from memory (ESI holds the right X step here) and discards
; the FPU values at the end instead of storing them back.
; Falls through into Start_Paint.
; ---------------------------------------------------------------------------
ReturnToBeginning:
 sub ax,cx
 jng Start_Paint             ; no lines left

 xchg ax,cx                  ; CX=number of lines
 fld dword ptr ds:[Polygon_Right_u]
 fld dword ptr ds:[Polygon_Left_u]
 fld dword ptr ds:[Polygon_Right_v]
 fld dword ptr ds:[Polygon_Left_v]
 fld dword ptr ds:[Polygon_Right_z]
 fld dword ptr ds:[Polygon_Left_z]
 ; FPU: lz, rz, lv, rv, lu, ru
 mov edi,dword ptr ds:[Polygon_SpanLeft]
 mov edx,dword ptr ds:[Polygon_SpanRight]
 mov ebx,dword ptr ds:[Polygon_SpanLeftInc]
 mov esi,dword ptr ds:[Polygon_SpanRightInc]
 mov eax,dword ptr ds:[Polygon_Lit]


Store_Last_Lines:
 push edi ; Left X
 push edx ; Right X
 push eax ; E[ax]=Left lit, AX=Right Lit
 mov bp,sp
 sub sp,(SIZE Span - 3*4)    ; reserve the rest of the Span record
 fist dword ptr [bp-4]       ; Left 1/z
 fadd dword ptr ds:[Polygon_Left_z_Inc]
 fxch                        ; rz, lz, lv, rv, lu, ru
 fist dword ptr [bp-8]       ; Right 1/z
 fadd dword ptr ds:[Polygon_Right_z_Inc]
 fxch st(2)                  ; lv, lz, rz, rv, lu, ru
 fist dword ptr [bp-12]      ; Left 1/v
 fadd dword ptr ds:[Polygon_Left_v_Inc]
 fxch st(3)                  ; rv, lz, rz, lv, lu, ru

 fist dword ptr [bp-16]       ; Right 1/v
 fadd dword ptr ds:[Polygon_Right_v_Inc]
 fxch st(4)                   ; lu, lz, rz, lv, rv, ru


 fist dword ptr [bp-20]       ; Left 1/u
 fadd dword ptr ds:[Polygon_Left_u_Inc]
 fxch st(5)                   ; ru, lz, rz, lv, rv, lu

 fist dword ptr [bp-24]       ; Right 1/u
 fadd dword ptr ds:[Polygon_Right_u_Inc]

 add edi,ebx                  ; step left X
 add edx,esi                  ; step right X
 add eax,dword ptr ds:[Polygon_LitInc] ; step both lit values

 ; restore the FPU order lz, rz, lv, rv, lu, ru for the next line
 fxch st(5)
 fxch st(4)
 fxch st(3)
 fxch st(2)
 fxch

 dec cx
 jnz Store_Last_Lines

 ; Clear the FPU stack
 ; One FCOMPP is used instead of two FSTPs -
 ; this is faster on P5 and saves a few bytes!
 REPT 6/2
  fcompp
 ENDM

; ---------------------------------------------------------------------------
; Start_Paint - draw the Span records accumulated on the stack.
; The record count is (word at P_Stack+1 - SP) / SIZE Span; presumably
; P_Stack+1 is a patched immediate holding SP from before the records were
; pushed - confirm where P_Stack is defined.  Zero records -> P_Exit.
; The most recently pushed record is on top of the stack, so drawing starts
; at scanline MinY+count-1 and walks upwards (SI decreases by 320 per line).
; SpansLoop setup: per-span deltas are stored into recycled Span fields and
; the three per-subspan steps are left on the FPU stack.
; ---------------------------------------------------------------------------
Start_Paint:
 mov cx,word ptr ds:[P_Stack+1]
 sub cx,sp
 jz P_Exit
 xchg ax,cx
 sub dx,dx
 mov cx,SIZE Span
 div cx      ; Number of scanlines to fill
 xchg ax,cx  ; Put it in CX
 mov di,0    ; TxtNum*2
CurrTextureNum EQU $-2
 mov fs,word ptr ds:[Textures+di] ; FS:0 ptr to texture image
PolygonsMinimumY:
 mov di,0    ; self-modified: polygon's minimum Y
 add di,cx ; Calculate Maximal Y point (+1)
 dec di
 mov es,word ptr ds:[Frame_Buf]

;*********************DrawTextureSpans proc

 add di,di   ; word index into Screen_Table
 fldcw word ptr ds:[FPUCW_RoundToZero] ; Prepare FPU for integer calcs
 mov si,word ptr ds:[Screen_Table+di]  ; screen ptr
SpansLoop:
 ;************ Span setup code **************
 mov bp,sp                             ; BP -> current Span record
 mov di,word ptr [bp].Span_rx+2        ; int(RH x)
 mov bx,word ptr [bp].Span_lx+2        ; int(LH x)
 sub di,bx                             ; dx=delta_x=int(rx)-int(lx)+1
 js NextScanline                       ; right < left: nothing to draw
 inc di                                ; DI=dx
 shl di,2                              ; dword index into DeltaXRecips
 fld dword ptr ds:[DeltaXRecips+di]    ; 1/dx
 fild dword ptr [bp].Span_lz
 fdivr dword ptr ds:[fp_Scale8]        ; 256*left_z

 mov eax,dword ptr [bp].Span_ru        ; [16.16]*Magic
 sub eax,dword ptr [bp].Span_lu        ; delta u/z
 mov dword ptr [bp].Span_uzstep,eax

 mov eax,dword ptr [bp].Span_rv        ; [16.16]*Magic
 sub eax,dword ptr [bp].Span_lv        ; delta v/z
 mov dword ptr [bp].Span_vzstep,eax

 mov eax,dword ptr [bp].Span_rz        ; [16.16]*Magic
 sub eax,dword ptr [bp].Span_lz        ; delta 1/z
 mov dword ptr [bp].Span_invzstep,eax

 mov ax,word ptr [bp].Span_rlit        ; [8.8]
 mov dx,[bp].Span_llit
 sub ax,dx                             ; delta lit
 mov word ptr ds:[Span_Lit],dx         ; patch inner-loop start lit
 mov word ptr ds:[Span_LitInc],ax      ; temporarily holds the raw delta
 
 ; Set u,v values for the end of 'previous' subspan.
 fild dword ptr [bp].Span_lu           ; left u
 fmul st,st(1)
 fistp word ptr [bp].Span_texture_u2
 fimul dword ptr [bp].Span_lv          ; left v
 fistp word ptr [bp].Span_texture_v2

 ;* Lit is interpolated linearly for the whole span.
 fild word ptr ds:[Span_LitInc]        ; (RHlit-LHlit)
 fmul st,st(1)                         ; (RHlit-LHlit)/dx
 fistp word ptr ds:[Span_LitInc]

 ;* Calc interpolation step values, which are added every subspan,
 ;  and keep 'em in FPU stack instead of memory vars.
 fmul dword ptr ds:[fp_SubspanSize]    ; (1/dx)*SubspanSize
 fild dword ptr [bp].Span_uzstep
 fild dword ptr [bp].Span_vzstep
 fild dword ptr [bp].Span_invzstep
 fmul st,st(3)                         ; 16*d_invz/dx, d_vz, d_uz, 16/dx
 fxch
 fmul st,st(3)                         ; 16*d_vz/dx, 16*d_invz/dx, d_uz, 16/dx
 fxch st(2)
 fmulp st(3),st                        ; 16*d_invz/dx, 16*d_vz/dx, 16*d_uz/dx
 ;* FPU: (1/z step)*16, (v/z step)*16, (u/z step)*16


 push cx          ; Save number of scanlines
 push si          ; Save scanline addr
 mov cx,di
 shr cx,2         ; Span size
 lea di,[si+bx]   ; Span start addr = scanline_start+X1

 ;************** Outter loop *************

; ---------------------------------------------------------------------------
; SpanRound - one subspan of the current span.
; IN:  CX = pixels left in the span, DI = screen offset of the subspan
;      start, BP -> Span record, FPU: z16, v16, u16 (per-subspan steps).
; Computes the perspective-correct u,v at the right end of the subspan,
; derives linear 8.8 per-pixel steps from them, then runs the affine inner
; loop DrawSubSpan.  The last (or only partial) subspan takes its right-end
; values directly from the Span record and frees the step registers.
; ---------------------------------------------------------------------------
SpanRound:
 mov si,16
SubspanSize EQU $-4
 ; NOTE(review): MOV SI,imm16 keeps its immediate at $-2; $-4 addresses a
 ; byte before this instruction (it would match a 32-bit MOV ESI,imm32).
 ; Users of SubspanSize are not visible here - confirm before relying on it.
 cmp si,cx
 jbe SubspanSizeOk
 mov si,cx       ; fewer pixels left than a full subspan (flags unchanged)
SubspanSizeOk:
 jb NotFinalInvZ ; Full subspan, business as usual
 ;* We dont need the subspan size in case of partial subspan, so skip it.
 ;* Get RH 1/z directly from span's structure.
 fild dword ptr [bp].Span_rz
 jmp InvZDivide
 ;* Calculate RH 1/z for the full subspan
NotFinalInvZ:
 fild dword ptr [bp].Span_lz             ; interpolate 1/z
 fadd st,st(1)
 fist dword ptr [bp].Span_lz             ; RightHand 1/z
InvZDivide:
 fdivr dword ptr ds:[fp_Scale8]          ; RH 256*z
 mov bx,si
 cmp ax,es:[di]                          ; take care about caching screen mem
 add di,si                               ; prepare DI for the inner loop
 neg si                                  ; SI=-(pixel count)
 shl bx,2                                ; BX=index in recips table
 add cx,si                               ; this controls ZF
 push cx                                 ; save span's remainder
 ;* End of the previous subspan becomes start of the current one.
 mov dx,word ptr [bp].Span_texture_u2
 mov cx,word ptr [bp].Span_texture_v2
 mov word ptr [bp].Span_texture_u1,dx
 mov word ptr [bp].Span_texture_v1,cx
 fld dword ptr ds:[DeltaXRecips+bx]      ; 1/dx, RH 256*z, z16, v16, u16
 jnz NotFinalSubspan                     ; ZF was changed by ADD CX,SI above
 ;* Remove all those step16 values from FPU since we don't need 'em anymore.
 ffree st(2)
 ffree st(3)
 ffree st(4)
 ;* We can skip some calcs for the final/partial subspan.
 ;* Get RH values directly from span's structure instead of calculating 'em.
 fild dword ptr [bp].Span_ru             ; RH u/z
 fmul st,st(2)                           ; RH u * 256 = (RH u/z) * (RH 256*z)
 fisub word ptr [bp].Span_texture_u1     ; 256*(Ru-Lu)
 fmul st,st(1)                           ; 256*(Ru-Lu)/dx
 fistp word ptr ds:[Texture_u_Inc]
 fild dword ptr [bp].Span_rv             ; RH v/z
 fmulp st(2),st
 fxch
 fisub word ptr [bp].Span_texture_v1     ; 256*(RHv-LHv)
 jmp Set_v_Inc
 ;* Common case - do all needed calcs for the full (and not final) subspan.
 ;* Interpolate u/z and v/z by adding those subspan step values provided by
 ;  setup code above, then calc RH uv pair and linear subspans gradients for
 ;  the inner loop.
NotFinalSubspan:
 fild dword ptr [bp].Span_lu             ; u/z, 1/dx, RH 256*z, z16, v16, u16
 fadd st,st(5)
 fist dword ptr [bp].Span_lu             ; next u/z
 fmul st,st(2)                           ; 256*RHu=(RH u/z)*(RH 256*z)
 fist word ptr [bp].Span_texture_u2
 fisub word ptr [bp].Span_texture_u1     ; 256*(RHu-LHu)
 fmul st,st(1)                           ; u_LinearInc=256*(RHu-LHu)/dx
 fistp word ptr ds:[Texture_u_Inc]

 fild dword ptr [bp].Span_lv
 fadd st,st(4)
 fist dword ptr [bp].Span_lv             ; next v/z
 fmulp st(2),st                          ; RHv=(RH v/z)*(RH z)
 fxch
 fist word ptr [bp].Span_texture_v2
 fisub word ptr [bp].Span_texture_v1     ; 256*(RHv-LHv)
Set_v_Inc:
 fmulp st(1),st                          ; v_LinearInc=256*(RHv-LHv)/dx
 push bp                                 ; Save span's structure ptr
 mov bp,si                               ; BP=-(pixel count), counts up to 0
 fistp word ptr ds:[FPU_Temp]            ; Texture_v_Inc

 jmp short $+2 ;* Make sure that all self-modified code
               ;* will be reloaded into the cache.

 mov si,word ptr ds:[FPU_Temp]           ; Texture_v_Inc

 mov ax,0                                ; self-modified: current lit (8.8)
Span_Lit EQU $-2


 ;*************** Inner loop ******************
 ; DX = u (8.8), CX = v (8.8), AX = lit (8.8), SI = v step,
 ; BP = negative pixel counter, ES:DI = one past the subspan end.

DrawSubSpan:
 ; Convert u/v pair into texture pointer
 mov bh,ch                    ; v
 sub bl,bl
 shr bx,2                     ; int(v)*64
 or bl,dh                     ; u
 add dx,1234h                 ; Update texture u (8.8 int/frac)
Texture_u_Inc EQU $-2
 mov bl,fs:[bx]               ; Get color value from texture data
 mov bh,ah                    ; lit
 add cx,si                    ; Update texture v (8.8 int/frac)
 IFDEF _FARCLUT
 segss
 ENDIF
 mov bl,byte ptr CLUT[bx]     ; Get color depending on lit value
 add ax,1234h                 ; Interpolate lit value (Gouraud-style :)
Span_LitInc EQU $-2
 mov es:[di+bp],bl            ; Write pixel
 inc bp                       ; Decrement counter -> next point
 jnz DrawSubSpan
 mov word ptr ds:[Span_Lit],ax ; carry the lit value into the next subspan

 pop bp                       ; Span record pointer
 pop cx                       ; span's remainder
 test cx,cx
 jnz SpanRound
 pop si                       ; scanline address
 pop cx                       ; number of scanlines
NextScanline:
 add sp,SIZE Span             ; drop the Span record just drawn
 sub si,320   ; -bytes_per_line -> next scanline address
 dec cx
 jnz SpansLoop
 fldcw word ptr ds:[FPUCW_Normal]  ; Put FPU back to normal mode
 ret
Project_Polygon endp


ZPlane_Clipping proc
 ; Emits the point where the edge (previous vertex -> vertex [si])
 ; crosses the front clipping plane.
 ; IN:  ST = FRONT_PLANE_Z = 1.0
 ;      SI = current vertex, DI = output vertex slot, BP = point counter
 ; OUT: [di] receives Z3D=ST and interpolated u, v, X3D, Y3D plus a lit
 ;      word; DI advances by SIZE VertexData; BP is incremented.
 ;      BX = previous vertex, AX is destroyed, DX and ST are preserved.

 ; BX -> vertex preceding SI; before the first one wrap to the 4th entry.
 lea bx,[si-SIZE VertexData]
 cmp bx,offset Vertices
 jae @@HavePrev
 mov bx,offset Vertices + (SIZE VertexData * 3)
@@HavePrev:
 inc bp                             ; one more point in the output

 ; t = (FRONT_PLANE_Z - PrevZ) / (CurrentZ - PrevZ)
 fst dword ptr [di].Vertex_Z3D      ; new vertex sits on the plane
 fld dword ptr [bx].Vertex_Z3D      ; PrevZ, FRONT_Z
 fld st(0)                          ; PrevZ, PrevZ, FRONT_Z
 fsubr st,st(2)                     ; FRONT_Z-PrevZ, PrevZ, FRONT_Z
 fxch                               ; PrevZ, FRONT_Z-PrevZ, FRONT_Z
 fsubr dword ptr [si].Vertex_Z3D    ; CurrentZ-PrevZ, ...
 fdivp st(1),st                     ; t, FRONT_Z

 push dx                            ; MUL below overwrites DX

 ; u = prev_u + (cur_u - prev_u)*t
 fld dword ptr [si].Vertex_u
 fsub dword ptr [bx].Vertex_u
 fmul st,st(1)
 fadd dword ptr [bx].Vertex_u
 fstp dword ptr [di].Vertex_u

 ; v = prev_v + (cur_v - prev_v)*t
 fld dword ptr [si].Vertex_v
 fsub dword ptr [bx].Vertex_v
 fmul st,st(1)
 fadd dword ptr [bx].Vertex_v
 fstp dword ptr [di].Vertex_v

 ; X = prev_X + (cur_X - prev_X)*t
 fld dword ptr [si].Vertex_X3D
 fsub dword ptr [bx].Vertex_X3D
 fmul st,st(1)
 fadd dword ptr [bx].Vertex_X3D
 fstp dword ptr [di].Vertex_X3D

 ; Y = prev_Y + (cur_Y - prev_Y)*t ; t is consumed by the multiply
 fld dword ptr [si].Vertex_Y3D
 fsub dword ptr [bx].Vertex_Y3D
 fmulp st(1),st
 fadd dword ptr [bx].Vertex_Y3D

 ; Lit word = high half of (1FFh SHL 7) * word at PolyLit-1
 mov ax,1FFh SHL 7
 mul word ptr ds:[PolyLit-1]
 mov word ptr [di].Vertex_Lit,dx

 fstp dword ptr [di].Vertex_Y3D     ; ST is FRONT_PLANE_Z again
 pop dx
 add di,SIZE VertexData             ; next output slot
 ret
ZPlane_Clipping endp

Render_Scene proc
 ; Depth-sorts all polygons and draws them in sorted order.
 ; 1. For every polygon record (10 bytes: lit, texture, 4 vertex pointers)
 ;    a 4-byte ZSort_List entry is written: word key = sum of the four
 ;    vertex Z values (FISTP), word pointer to the polygon record.
 ; 2. An entry with pointer 0FFFFh is appended as the end marker.
 ; 3. Quick_Sort orders the entries by key, largest key first.
 ; 4. Project_Polygon is called with SI = polygon pointer for each entry.
 ; The routine returns through the RET at Render_Done (inside Quick_Sort).
 ; The polygon count is the patched immediate at Num3DPolys; a count of 0
 ; yields an empty list instead of wrapping the LOOP counter.
 ; Clobbers all general registers.
Z_Sort:
 mov si,offset Polygons3D
 mov di,offset ZSort_List-4
 lea bp,[di+4]              ; BP = first list entry (left bound for the sort)
 mov cx,1234h   ; TOTAL_POLYS
Num3DPolys EQU $-2
 jcxz ZSort_MarkEnd         ; no polygons: LOOP would run 65536 times
ZSort_GetAverageZ:
 add di,4
 mov [di+2],si              ; entry.poly = polygon record
 mov bx,[si+2]              ; Point1  #*12 (skip lit/texture data)
 fld dword ptr [bx].VecZ    ; z1
 mov bx,[si+4]              ; Point2
 fadd dword ptr [bx].VecZ   ; +z2
 mov bx,[si+6]              ; Point3
 fadd dword ptr [bx].VecZ   ; +z3
 mov bx,[si+8]              ; Point4
 fadd dword ptr [bx].VecZ   ; +z4
 add si,1+1+2*4             ; lit+texture+#Points*4
 fistp word ptr [di]        ; unweighted sum of ZETs
 loop ZSort_GetAverageZ
ZSort_MarkEnd:
 dec cx                     ; 0FFFFh
 mov [di+4+2],cx            ; mark list end
 push bp                    ; left  = first entry
 push di                    ; right = last entry
 call Quick_Sort            ; pops both arguments itself
 mov bx,offset ZSort_List
Render_Loop:
 mov si,[bx+2]
 cmp si,0FFFFh
 jz Render_Done             ; end marker reached -> RET
 push bx
 call Project_Polygon
 pop bx
 add bx,4
 jmp Render_Loop
Render_Scene endp

; ---------------------------------------------------------------------------
; Quick_Sort - recursive in-place quicksort of 4-byte entries.
; Entry layout: signed word key at +0; the full dword is swapped as a unit.
; Result order: descending by key.
; Calling convention: the caller pushes the pointer to the first entry
; (left), then the pointer to the last entry (right), then CALLs.  The
; routine pops its own return address and both arguments, so the caller
; must not clean the stack.
; Clobbers AX, BX, CX, DX, SI, DI; BP is left at 4 (the entry size).
; Render_Done labels the final RET and is also used by Render_Scene.
; ---------------------------------------------------------------------------
Quick_Sort proc
 mov bp,4  ; Step
QSort_Recurse:
 pop ax    ; return address
 pop cx    ; right
 pop dx    ; left
 cmp dx,cx
 jb Do_Sort
 jmp ax    ; fewer than two entries: return (arguments already popped)
Do_Sort:
 mov bx,dx ; Left
 mov di,cx ; Right
 push ax   ; put the return address back for the final RET
 mov si,[bx] ; pivot = key of the leftmost entry
QSort_1:
 cmp si,[bx]
 jge QSort_2 ; stop at the first key <= pivot (signed compare)
 add bx,bp
 jmp QSort_1
QSort_2:
 cmp si,[di]
 jle QSort_3 ; stop at the first key >= pivot
 sub di,bp
 jmp QSort_2
QSort_3:
 cmp bx,di
 ja QSort_4
 mov eax,[bx] ; swap the two entries (key and payload together)
 xchg eax,[di]
 mov [bx],eax
 add bx,bp
 sub di,bp
QSort_4:
 cmp bx,di
 jbe QSort_1 ; pointers have not crossed yet
 push bx cx  ; arguments for the second call: left=BX, right=CX
 push dx di  ; arguments for the first call:  left=DX, right=DI
 call QSort_Recurse
 call QSort_Recurse
Render_Done:
 ret
Quick_Sort endp
