;-----------------------------------------------------------------------
;  TGZM_SpanLoop
;
;  Summary: draws one horizontal span. Every pixel gets a 16-bit Z-buffer
;  test, a texture fetch, modulation by an interpolated colour and a
;  32-bit store. U/V are perspective-corrected only at sub-span
;  boundaries (at most 16 pixels apart) and stepped linearly in between;
;  Z is re-synchronised to its exactly computed value at each boundary.
;
;  Span loop Input
;  mm0, mm1:  RGB, delta (mm1 is added to mm0 once per pixel)
;  ebx:    span width in pixels (nothing is drawn when ebx <= 0)
;  edi:    span ptr (destination, 4 bytes per pixel)
;  esi:    Zbuffer ptr (2 bytes per pixel)
;  xmm0:    ZUVX initial left (lane 0 is reciprocated to obtain z)
;  xmm2:    ZUVX constant d/dx
;
;  Output: none in registers; pixels and Z values are written to memory.
;
;  Clobbers: eax, ebx, ecx, edx, esi, edi, ebp, flags,
;            mm0, mm2-mm5, xmm0, xmm1, xmm3-xmm7.
;            esp is repurposed inside the routine and restored on exit.
;
;  Memory operands defined outside this view: save_esp, SubspanCounter,
;  SubSpanWidth, SubspanZ, dudx, dvdx, _g_zscale, pf65536, DivTable,
;  RITable.
;
;  Self-modifying code: every "org $-4 / <label> dd 0BADCAB1Eh" pair (and
;  the "org $-1 / <label> db 8" pairs) rewinds the location counter over
;  the immediate of the instruction just emitted and re-emits it under a
;  label. The 0BADCAB1Eh values are placeholders; presumably setup code
;  elsewhere overwrites them with texture-size dependent masks, shift
;  counts and the texture base address before this routine runs.
;
;  NOTE(review): no emms is executed before ret; presumably the caller
;  issues it before any x87 code -- confirm.
;-----------------------------------------------------------------------
TGZM_SpanLoop      proc  near

  ; DOS INCOMPATIBLE
  ; esp becomes the destination pixel pointer below, so nothing may use
  ; the stack (push/pop/call) until esp is reloaded at Terminate.
  mov [save_esp],esp

  ; Reject empty or negative spans. With a width of 0 the inner loop's
  ; counter would start at 0 and "add ecx,2 / jnz" would not terminate
  ; until ecx wrapped around, overrunning both buffers; a negative width
  ; would be clamped to 16 by the unsigned compare in Outer. Terminate
  ; reloads esp from save_esp, which has already been stored above.
  test     ebx,  ebx
  jle      Terminate

  mov esp,edi
;  mov [t_edi], edi

  ; releases ebx.
  mov [SubspanCounter], ebx
  
  ; logic
  ; z = 1/interpolated rz
  ; Z = 0xFF80 - Fist(g_zscale * z);

  ; allocation
  ; xmm4 = z
  ; eax = Z, edx = dZdx
  mov      eax,  0ff80h

  ; rcpss is an approximate reciprocal; the exact (slower) divss variant
  ; is kept below for reference.
  rcpss    xmm1,  xmm0
;  movss    xmm1,  [f1]
;  divss    xmm1,  xmm0

  movss    xmm4,  xmm1
  mulss    xmm1,  _g_zscale
  cvtss2si  ebx,  xmm1
  sub      eax,  ebx

  ; xmm4, xmm7 = UV affine (16bit fractional)
  ; Broadcast z*65536, multiply all ZUVX lanes by it, then rotate the
  ; lanes so that U ends up in lane 0 and V in lane 1.
  movss    xmm5,  [pf65536]
  mulss    xmm4,  xmm5
  shufps    xmm4,  xmm4,  00000000b
  mulps    xmm4,  xmm0
  shufps    xmm4,  xmm4,  00001001b
  movaps    xmm7,  xmm4      ; xmm7 = UV at the start of the sub-span

  ; mm2 = [Ui:Uf]        [Vi:Vf] (frees xmm4)
  ; ebp = [Ui:Uf], edi = [Vi:Vf] (frees mm2)
  cvtps2pi  mm2,  xmm4


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; placeholder: ebp/edi init
  ; ebp = u wwww ww00 0000 00ww 0fff ffff ffff ffff
  ; U is split into three fields: a 15-bit fraction (U >> 1), the two low
  ; integer bits kept at bits 16-17, and the remaining integer bits
  ; selected by a patched mask and moved up by a patched shift count.
  movd    edi,  mm2
  
  and      edi,  0BADCAB1Eh    ; TB0(logWidth)
  org      $-4
  smc_TB0Wd0  dd    0BADCAB1Eh

  shl      edi,  8        ; logHeight
  org      $-1
  smc_Hb0    db    8
  
  movd    ebp,  mm2
  movd    ebx,  mm2
  shr      ebp,  1
  and      ebx,  000030000h
  or      ebx,  edi
  and      ebp,  07fffh
  or      ebp,  ebx  

  ; swap the two dwords of mm2 so that V is in the low dword
  pshufw    mm2,  mm2,  01001110b

  ; edi = v 0000 00ww wwww ww00 0fff ffff ffff ffff
  ; V keeps a 15-bit fraction (V >> 1); its masked integer bits are
  ; shifted left by two so they sit above the two low U integer bits.
  movd    edi,  mm2
  movd    ebx,  mm2
  shr      edi,  1  
  
  and      ebx,  0BADCAB1Eh      ; T0(logHeight)
  org      $-4
  smc_T0Hd0  dd    0BADCAB1Eh

  add      ebx,  ebx
  add      ebx,  ebx
  
  and      edi,  07fffh
  or      edi,  ebx

  ; swap back: U in the low dword again
  pshufw    mm2,  mm2,  01001110b

Outer:

  ; Compute current sub-span width
  ; ebx = min(SubspanCounter, 16), compared as unsigned values
  mov      ecx,  16
  mov      ebx,  [SubspanCounter]
  cmp      ebx,  ecx
  cmova    ebx,  ecx

  ; Prepare scaled deltas for multiplication
  cvtsi2ss  xmm3,  ebx
  shufps    xmm3,  xmm3,  0
  mov      [SubSpanWidth],  ebx

  ; Compute offset into DivTable
  ; (16 bytes per entry, indexed by sub-span width)
  shl      ebx,  4
  mulps    xmm3,  xmm2

  ; step ZUVX
  ; xmm0 now holds ZUVX at the END of this sub-span
  addps    xmm0,  xmm3

  ; xmm4 = z
  ; edx = Z
  rcpss    xmm5,  xmm0
;  movss    xmm5,  [f1]
;  divss    xmm5,  xmm0

  mov      edx,  0ff80h
  movss    xmm4,  xmm5
  mulss    xmm5,  _g_zscale
  cvtss2si  ecx,  xmm5
  sub      edx,  ecx

  ; xmm4 = UV affine (16bit fractional)
  ; same broadcast / multiply / rotate sequence as in the prologue
  movss    xmm5,  [pf65536]
  mulss    xmm4,  xmm5
  shufps    xmm4,  xmm4,  00000000b
  mulps    xmm4,  xmm0
  shufps    xmm4,  xmm4,  00001001b

  ; use rcpps?
  ; xmm6 = (UV at end - UV at start) * DivTable[width]
  ; presumably DivTable[n] holds 1/n in every lane -- TODO confirm
  movaps    xmm5,  [DivTable + ebx]
  movaps    xmm6,  xmm4
  subps    xmm6,  xmm7
  mulps    xmm6,  xmm5

  ; mm3 = [dUi:dUf] [dVi:dVf] (frees xmm4)
  ; stores Z
  ; edx = dzdx
  cvtps2pi  mm3,  xmm6
;  movq    mm7,  mm3
  mov      [SubspanZ], edx    ; Z at the end of the sub-span, reloaded into eax after Inner
;  psubd    mm3,  mm2
  sub      edx,  eax          ; edx = Z(end) - Z(start); SF feeds the cmovs below

  ; Fix Rounding Errors
  ; Bias the Z difference by 8 away from zero. The two mov instructions
  ; leave the flags alone, so cmovs still tests the sign of the sub above.
  mov      ebx,  8
  mov      ecx,  -8
  cmovs    ebx,  ecx
  add      edx,  ebx
  
;  psrad    mm3,  4
  ; should divide by subspan width, not 16.
;  sar      edx,  4

  ; edx = high dword of (RITable[width] * edx). eax (current Z) is parked
  ; in ebx because imul overwrites edx:eax.
  ; presumably RITable[n] is a fixed-point reciprocal of n -- TODO confirm
  mov      ecx,  [SubSpanWidth]
  add      ecx,  ecx
  add      ecx,  ecx
  mov      ebx,  eax

  mov      eax,  [RITable+ecx]
  imul    edx
  mov      eax,  ebx

  ; dudx = [Ui:Uf], dvdx = [Vi:Vf] (frees mm3)

  ; dudx = wwww ww11 1111 11ww 1fff ffff ffff ffff
  ; Same field split as for ebp in the prologue, plus a patched OR mask.
  ; Judging by the layout comment above, that mask sets the gap bits so
  ; carries can travel across them when dudx is added to ebp -- the
  ; actual value is patched in elsewhere, so confirm against the patcher.
  movd    ebx,  mm3
  movd    ecx,  mm3
  
  and      ecx,    0BADCAB1Eh  ; TB0(logWidth)
  org      $-4
  smc_TB0Wd1  dd    0BADCAB1Eh

  shl      ecx,  8      ; logHeight
  org      $ - 1
  smc_Hb1    db    8

  and      ebx,  000030000h

  or      ebx,  0BADCAB1Eh  ;  TB1(logHeight)
  org      $-4
  smc_TB1Hd1  dd    0BADCAB1Eh

  or      ebx,  ecx

  movd    ecx,  mm3
  shr      ecx,  1

  and      ecx,  07fffh
  or      ebx,  ecx
  mov      [dudx],  ebx

  ;     0000 0000 wwww wwww ffff ffff ffff fff*
  ; dvdx = 1111 11ww wwww ww11 1fff ffff ffff ffff
  ; Same field split as for edi in the prologue, plus a patched OR mask.
  pshufw    mm3,  mm3,  01001110b
  movd    ecx,  mm3
  movd    ebx,  mm3
    
  shr      ecx,  1

  and      ebx,  0BADCAB1Eh  ;  T0(logHeight)
  org      $-4
  smc_T0Hd2  dd    0BADCAB1Eh

  add      ebx,  ebx
  add      ebx,  ebx

  and      ecx,  07fffh

  or      ebx,  0BADCAB1Eh  ;  T2_22(logHeight)
  org      $-4
  smc_TB2Hd3  dd    0BADCAB1Eh

  or      ebx,  ecx
  mov      [dvdx],  ebx

  pshufw    mm3,  mm3,  01001110b

;  ; step ZUVX
;  addps    xmm0,  xmm3

  mov      ebx,  [SubSpanWidth]
;  mov      ecx,  16
;  mov      ebx,  [SubspanCounter]
;  cmp      ebx,  ecx
;  cmova    ebx,  ecx

  ; ecx = -2*width. Both pointers are moved to the END of the sub-span
  ; (esi by 2 bytes per pixel, esp by 4 bytes per pixel via two
  ; subtractions) and Inner indexes backwards from there with a negative
  ; offset that counts up to zero.
  lea ecx,[ebx+ebx]
  neg ecx

  sub esp, ecx
;  sub [t_edi], ecx
  sub esi, ecx
  sub esp, ecx
;  sub [t_edi], ecx

; Register allocation table

; Generic
; esp - Page ptr

; ZBuffer
; eax - current Z (low word)
; edx - dZ/dx (low word)
; esi - ZBuffer ptr
; ecx - offset counter (low byte)

; Gouraud
; mm0 - Color
; mm1 - dColor/dx

; Texture
; ebp - packed u, edi - packed v (stepped by [dudx] / [dvdx])
; mm2 - u, v
; mm7 - next u, v
; NOTE(review): mm2 is not read inside Inner and mm7 is never written in
; this procedure (the "movq mm7, mm3" above is commented out), so the two
; mm entries above look stale.
; Required: u, v, du, dv (8:8 each); Texture ptr (dword)
; add (u, v), (du, dv) will be made on an mmx register

; Free registers
; CPU: none
; MMX: mm3-6
; SSE: xmm1,xmm5-6


;align 16
  Inner:
    ; Z test on the low 16 bits only: draw when the new Z is strictly
    ; above (unsigned) the stored value.
    cmp ax, word ptr [esi+ecx]
    jbe nodraw ; remark to disable zbuffer

    ; get texture
    ; texel index = (packed v + packed u) >> 16; both operands keep
    ; bit 15 clear, so the two 15-bit fractions cannot carry into the
    ; index.
    lea    ebx,  [edi+ebp]
    shr    ebx,  16

    mov  word ptr [esi+ecx], ax

    pxor  mm4,  mm4
    
    ; 32-bit texel load; the displacement is the patched texture base
    movd  mm5,  [ebx*4 + 0BADCAB1Eh]
    org $ - 4
    SMC_Texture dd 0BADCAB1Eh

;    movd    mm5,  [whitecolor]

    ; widen the texel bytes to words, take the high word of
    ; texel * colour, double it, and pack back to bytes with unsigned
    ; saturation
    punpcklbw  mm5,  mm4
    pmulhuw    mm5,  mm0
    psllw    mm5,  1
    packuswb  mm5,  mm5
    
    ; write to screen
    ; (ecx steps by 2 per pixel, so ecx*2 is the 4-byte pixel offset)
    movd [esp+ecx*2], mm5
nodraw:
    ; step gouraud
    paddw  mm0,  mm1

    ; step texture
    add ebp,  [dudx]
    add edi,  [dvdx]

    ; step current Z
    add eax, edx
;    ror eax,  4

    ; the two patched masks below are applied after every step
    ; (presumably to clear the gap bits and wrap the coordinates)
    and ebp,  0BADCAB1Eh   ; (TB2(logHeight)-1) & ((1<<(logWidth+logHeight+16))-1)
    org $ - 4
    smc_TB2Wd2  dd  0BADCAB1Eh

    and  edi,  0BADCAB1Eh  ; TB1(logHeight)-1
    org $ - 4
    smc_TB1Hd4  dd  0BADCAB1Eh
    
    ; step to next pixel
    add ecx, 2
    jnz Inner

;  mov dword ptr [esp-4], 000ffffffh

  
  ; NOTE(review): mm7 is never written in this procedure, so mm2 receives
  ; whatever the caller left in mm7; mm2 is not read again after this.
  movq  mm2,  mm7
  movaps  xmm7,  xmm4      ; UV at the end becomes the next sub-span's start
;  pslld  mm3,  4
;  paddd  mm2,  mm3
  
  ; resynchronise Z to the value computed for the sub-span end, then
  ; loop while pixels remain (mov leaves the flags of the sub intact)
  mov eax, [SubspanZ]  
  mov edx, [SubspanCounter]
  sub edx, 16
  mov [SubspanCounter], edx
  jg Outer
Terminate:
  mov esp,[save_esp]
  ret
TGZM_SpanLoop    endp