ON ERROR: ON ERROR OFF: PRINT REPORT$;" at line ";ERL:END

REM Build driver. Assembles a preview build, runs it until Escape to measure
REM the frame rate, then assembles the release build and saves it to disc when
REM the Tunnel256$Dir system variable is available.

REM Buffer that receives the assembled machine code.
DIM code% 4096

REM Preview build. It counts frames and returns the count through USR.
PROCassemble(TRUE, 800, 600)

SYS "OS_ReadMonotonicTime" TO start_time%
num_frames% = USR code%
SYS "OS_ReadMonotonicTime" TO end_time%

REM Release build. P% is left just past the last assembled byte.
PROCassemble(FALSE, 800, 600)

REM The timer counts centiseconds, hence the factor of 100. A preview that
REM ends within a single tick gives a zero difference, which must not be
REM allowed to abort the build with a division by zero.
elapsed% = end_time% - start_time%
IF elapsed% > 0 THEN
  PRINT "fps: " num_frames% / elapsed% * 100
ELSE
  PRINT "fps: (run too short to measure)"
ENDIF
PRINT "Code size: " P%-code%

REM Probe for the variable without reading it (buffer length -1). The save is
REM only attempted when the returned length% is below -1, which is taken to
REM mean the variable exists with a non-empty value. NOTE(review) confirm the
REM exact return convention against the OS_ReadVarVal documentation.
SYS "XOS_ReadVarVal", "Tunnel256$Dir", 0, -1, 0, 0 TO ,,length%
IF length% < -1 THEN
  REM OS_File 10 saves the block code% to P% with file type &ff8.
  SYS "OS_File", 10, "<Tunnel256$Dir>.Elsecaller-UnOpt", &ff8, 0, code%, P%
  PRINT '"Saved"
ENDIF
END

DEF PROCassemble(preview, xres%, yres%)
FOR pass%=0 TO 2 STEP 2
  P% = code%
  IF preview THEN [OPT pass%: mov r12, #0:]
  [OPT pass%
  ; Tunnel renderer, assembled at code% in two passes (OPT 0 then OPT 2).
  ; xres% and yres% are baked into the code as immediates. With preview set,
  ; r12 counts frames and the routine returns that count in r0 once an
  ; escape condition is seen, otherwise the frame loop never exits.
  ; Create a VFP/NEON context with r0 = flags (bits 1 and 31), r1 = 32, r2 = 0
  mov r0, #(1<<1) + (1<<31)
  mov r1, #32
  mov r2, #0
  swi "VFPSupport_CreateContext"

  adr r1, constants
  ; Load the four 16-bit constants, each replicated into every halfword lane
  ; of d11, d12, d13 and d14. The writeback leaves r1 just past the 8-byte
  ; table, which is label mode.
  vld4.16 {d11[], d12[], d13[], d14[]}, [r1]!
  mov r0, #15                ; OS_ScreenMode reason code, r1 = mode string
  swi "OS_ScreenMode"

  ; r1 is still at label mode from the call above, so the values read for
  ; the vdu list land on top of the mode string (assuming the SWI writes its
  ; results to the block at r1) and ldr r5, mode below fetches the first one
  adr r0, vdu
  swi "OS_ReadVduVariables"

  .mainloop
  ldr r5, mode               ; r5 = output pointer (first word stored at mode)
  swi "OS_ReadMonotonicTime"
  vmov d17, r0, r0           ; d17 = time
  vshl.u32 d16, d17, #1      ; d16 = time * 2

  mov r2, #FNimm_save(yres%) ; r2 = rows left
.yloop
  mov r1, #xres%             ; r1 = columns left, two pixels per iteration
  sub r3, r2, #FNimm_save(yres% / 2)
  vmov d25, r3, r3           ; d25 = y offset from centre in both lanes
.xloop
  sub r3, r1, #FNimm_save(xres% / 2)
  add r4, r3, #1
  vmov d24, r4, r3           ; d24 = x offsets of the two pixels (x+1, x)
  vcvt.f32.s32 q0, q12       ; d0 = x, d1 = y
  vmul.f32 q13, q0, q0       ; d26 = x^2, d27 = y^2
  vadd.f32 d26, d26, d27     ; d26 = x^2+y^2 = len^2
  vrsqrte.f32 d26, d26       ; d26 = 1/len (estimate)

  vabs.f32 q1, q0            ; d2,d3 = abs(x),abs(y)
  vacgt.f32 d4, d0, d1       ; d4 = abs(x) > abs(y)
  veor d5, d0, d1            ; d5 = x,y sign parity
  veor d5, d5, d4            ; d5 = sign of sin(angle) + garbage bits
  vsri.32 d5, d19, #1        ; d5 = sign bit only (lower bits come from d19, expected zero)
  vmin.f32 d4, d2, d3        ; d4 = min(abs(x), abs(y))
  veor d4, d4, d5            ; d4 = smaller side with sign for quadrant
  vmul.f32 d15, d26, d4      ; d15 = d4 / LEN = SIN(angle)
  vmul.f32 d27, d15, d15     ; d27 = d15^2
  vmul.f32 q10, q13, q7      ; d20 = d26 * d14 = depth, d21 = d27 * d15 = d15^3
  vfma.f32 d15, d21, d13     ; d15 = x + 0.2215x^3 (asin approx)
  vmul.f32 d27, d15, d12     ; d27 = angle * d12 (the 4/PI/512 constant)

  vmin.f32 d20, d20, d13     ; d20 = min(d13, depth)
  vsub.f32 d20, d13, d20     ; d20 = d13 - min(d13, depth), never negative
  vmul.f32 d20, d20, d20     ; d20 = d20^2 (nicer falloff)
  vcvt.s32.f32 d20, d20, #12 ; d20 = shade as integer with 12 fraction bits

  vcvt.s32.f32 q15, q13, #17 ; d30/d31 = texture u/v
  vadd.s32 q15, q15, q8      ; d30/d31 = texture u/v + TIME
  veor d21, d30, d31         ; d21 = eor texture
  vsli.u32 d21, d19, #8      ; d21 = low 8 bits of eor texture (upper bits from d19, expected zero)

  vmul.u32 q10, q10, d11[0]  ; d20/d21 = value duplicated into all bytes (d11 holds &0101 halfwords)
  vmull.u8 q11, d21, d14     ; multiply texture with color
  vqshrn.u16 d18, q11, #6    ; shift back
  vqsub.u8 q9, q9, q6        ; d18 minus d12 bytes (second color), d19 minus d13 saturates towards zero
  vqabs.s8 d18, d18          ; saturating abs of each byte treated as signed

  vmull.u8 q11, d18, d20     ; multiply with depth shade value
  vqshrn.u16 d18, q11, #6    ; shift back

  vstmia r5!, { d18 }        ; store 8 bytes of pixel data and advance r5
  subs r1, r1, #2
  bne xloop
  subs r2, r2, #1
  bne yloop
]
IF preview THEN
  [OPT pass%
  add r12, r12, #1           ; one more frame rendered
  swi "OS_ReadEscapeState"
  bcc mainloop               ; carry clear, keep rendering
  mov r0, #124
  swi "OS_Byte"              ; OS_Byte 124, presumably clears the escape condition
  mov r0, r12                ; frame count is the value returned to the caller
  mov pc,r14
  ]
ELSE
[OPT pass%
  b mainloop
]
ENDIF
[OPT pass%

.constants
  ; 16 bits per constant, enough precision for us
  ; halfwords in memory order are loaded into d11, d12, d13, d14
  dcd &101 OR (FNf16(4/PI/512) << 16)
  dcd FNf16(0.2215) OR (&413e << 16)

.mode
  equs "32 C16M": dcb 0
.vdu
  dcd 148                    ; VDU variable number to read (stored at mode)
  dcd -1                     ; presumably the list terminator
  ]
NEXT
ENDPROC

DEF FNf16(value)
REM Returns the upper 16 bits of the single-precision bit pattern of value
REM (as produced by FNf32), rounded by adding bit 15 of the discarded half.
REM The result is always in the range 0 to &FFFF so that it can be OR-ed into
REM either halfword of a word. An arithmetic shift would sign-extend negative
REM constants and corrupt the upper halfword of the combined word.
LOCAL bits%
  bits% = FNf32(value)
=((bits% >>> 16) + ((bits% >>> 15) AND 1)) AND &FFFF

DEF FNf32(value)
REM Returns the IEEE-754 single-precision bit pattern of value as a 32-bit
REM integer. The mantissa is truncated rather than rounded. Zero returns 0.
REM Values outside the normalised single-precision range are not handled.
LOCAL sign, exp%, mant, bits%
  IF value = 0 THEN =0
  sign = SGN value
  value = ABS value
  REM First estimate of the binary exponent. Rounding in LN can leave this one
  REM out at or near exact powers of two, so mant is normalised below.
  exp% = INT(LN value / LN 2)
  mant = value / EXP(exp% * LN 2)
  REM Force mant into the range 1 <= mant < 2, adjusting the exponent to match.
  WHILE mant >= 2
    mant = mant / 2
    exp% = exp% + 1
  ENDWHILE
  WHILE mant < 1
    mant = mant * 2
    exp% = exp% - 1
  ENDWHILE
  REM 23 fraction bits with the implicit leading one masked off.
  bits% = (mant * &800000) AND &7fffff
  REM Biased exponent (bias 127) goes into bits 23 to 30.
  bits% = bits% + (((exp% + &7f) AND &ff) << 23)
  IF sign < 0 THEN bits% = bits% OR &80000000
=bits%

DEF FNimm_save(value%)
REM Rounds value% down to a number that fits in 8 bits shifted left by an even
REM amount, by discarding low-order bits two at a time. Values of 255 or less
REM (including negative values) are returned unchanged.
LOCAL rot%
  rot% = 0
  REM Find the smallest even shift that brings the value down to 8 bits.
  WHILE (value% >> rot%) > 255
    rot% = rot% + 2
  ENDWHILE
=(value% >> rot%) << rot%
