;--------------------------------
;"pa2" by Kuemmel
;
; requires AVX2 compatible CPU
; tested on FreeDOS
;--------------------------------
; speed up by
; - early exit if all pixels are hit
; - 32 pixel loop instead of 16
;--------------------------------

org 100h                ;code is assembled for load offset 0x100 (.COM style image)

;---parameters
;   assemble-time constants consumed by the main loop further down
effect_speed_shift=9    ;default: 9 ; shift count of "shld ax,bp,..": al then holds the top bits of frame counter bp, its low 2 bits select the shape mask
effect_01=1000000000b   ;default: 1000000000b should kind of fit to effect_speed_shift ; bit of bp tested to add the mask to y (with shift 9 the mask cycles through 4 values every 512 frames = this bit)
depth_initial=8         ;default: 8 ; left shift applied to the all-ones words to form the start depth of every ray
depth_steps=512         ;raycasting steps 0...65535
scr_width=1024          ;screen width
scr_height=768          ;screen height
x_offset=512+130        ;center x
y_offset=384+50         ;center y

;---create words with 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,... for x-offset mask
;   256 consecutive words are written to 0x300..0x4ff. The x_loop later adds
;   [0x300] (words 0..15) and [0x320] (words 16..31) to the broadcast x value.
;   NOTE(review): nothing here initialises ax, cx, es or the direction flag;
;   the code presumably relies on the usual DOS .COM entry state
;   (ax=0, cx=0x00ff, es=ds, DF clear) - confirm for the target DOS.
   inc cx        ;cx=0x100
   mov di,0x300
   loop0123:
      stosw
      inc ax
   loop loop0123 ;after loop di is 0x500

   mov si,di     ;si = 0x500 ; base of the scratch/buffer area used by all code below

;---screen mode stuff by JIN-X LFB setting, screen address will be in 0x528
;   The mode info block is written to ES:DI with di = 0x500 (left by the table
;   loop above); the main loop fetches the screen address from [si+0x28] = 0x528.
   mov ax,0x4f01    ;Get Mode Info  INT 0x10, AX=0x4F01, CX=mode, ES:DI=256 byte buffer
   mov cx,0x4105    ;mode 0x105 with bit 14 set (the LFB request per the note above)
   int 0x10
   mov bx,cx        ;reuse the same mode number for the set call (assumes the BIOS call preserved cx)
   mov ax,0x4f02    ;Set Video Mode INT 0x10, AX=0x4F02, BX=mode, ES:DI=CRTCInfoBlock
   int 0x10

;---pmode stuff by JIN-X
;   Sets the PE bit, enables OS support bits in CR4 and switches on the
;   x87/SSE/AVX state components in XCR0 so that VEX instructions can be used.
   cli              ;no interrupts from here on
   pop fs           ;NOTE(review): presumably pops the zero word DOS leaves on top of the stack => fs = 0 - confirm
   mov eax,0x40603  ;one constant serves both lmsw (low bits) and cr4 (all bits)
   lmsw ax          ;low nibble 0011b => CR0.PE and CR0.MP set
   mov cr4,eax      ;bits 0,1,9,10,18 => VME, PVI, OSFXSR, OSXMMEXCPT, OSXSAVE
   xor ecx,ecx      ;ecx = 0 selects XCR0
   xgetbv           ;edx:eax = XCR0
   or al,0x7        ;bits 0..2 => x87, SSE and AVX state enabled
   xsetbv           ;write back XCR0

;---set first palette entry (background) to orange
;   Writes index 0 to port 0x3c8, then three component bytes to port 0x3c9
;   in the order 0xff, 47, 0.
   xchg ax,cx   ;clear ax with cx (cx is still 0 from "xor ecx,ecx" above)
   mov dx,0x3c8
   out dx,al    ;palette entry 0
   inc dx       ;dx = 0x3c9, palette data port
   dec ax       ;ax = 0xffff, al = 0xff => 63 (presumably only the low 6 bits count - confirm)
   out dx,al    ;1st component
   xor al,208   ;al = 47 (for 31 use 224) and carry flag clear with xor
   out dx,al    ;2nd component
   salc         ;al =  0 (salc copies the carry flag, cleared by the xor, into al)
   out dx,al    ;3rd component

;init stuff
;   register state that must survive across all frames of the main loop
   vzeroall                 ;for xmm6 counter...may be not needed.... (ymm6 is the timer added to x and y in the ray loop)
   vpcmpeqw ymm2,ymm2,ymm2  ;all bits to 1 => = ymm2 = -1|-1|... (constant: depth decrement and source of the start depth)
   xor bp,bp                ;bp = frame counter, starts at 0
   
   ;---one iteration of mainloop renders one full frame, 32 pixels (bytes) per x_loop pass
   ;   registers: bp = frame counter, cx = y - center, ax = x - center, bx = ray steps left,
   ;              edi = write address in the frame buffer (accessed through fs)
   ;              ymm1 = hit colours, ymm2 = -1 words, ymm4 = depth, ymm6 = timer, ymm7 = mask bytes
   ;   scratch:   [si]      = scalar to broadcast, later the 16 x_a words
   ;              [si+0x28] = screen address taken from the mode info block
   ;              [si+0x40] = 16 x_b words, [si+0x60] = 16 y words
   mainloop:

   shld ax,bp,effect_speed_shift ;low bits of ax = top bits of frame counter bp
   and al,00000011b         ;keep 2 bits => 4 mask variations
   or  al,00010000b         ;mask calculation/variation => 000100??b
   mov byte [si],al
   vpbroadcastb ymm7,[si]   ;needed on all 32 bytes
   
   mov edi,[si+0x28]        ;get screen address
   mov cx,-scr_height+y_offset ;first row, relative to center y
   y_loop:
      mov [si],cx
      vpbroadcastw ymm0,[si]    ;y in all 16 words
      ;change geometry after every 4 shapes
      test bp,effect_01
      jz skip_effect_01
        vpaddw ymm0,ymm0,ymm7   ;add the byte-broadcast mask to the y words
      skip_effect_01:
      vmovdqu [si+0x60],ymm0    ;backup y to memory so ymm0 is free as scratch
      mov ax,-scr_width+x_offset ;first column, relative to center x
      x_loop:
         vpsllw       ymm4,ymm2,depth_initial ;start depth = all-ones words shifted left
         vpxor        ymm1,ymm1,ymm1         ;hit_colours = 0
         mov [si],ax
         vpbroadcastw ymm0,[si]
         vpaddw       ymm3,ymm0,[0x320]      ;x_b = x+16|...|x+31
         vpaddw       ymm0,ymm0,[0x300]      ;x_a = x+0 |...|x+15
         vmovdqu      [si],     ymm0         ;store x_a
         vmovdqu      [si+0x40],ymm3         ;store x_b
         mov bx,depth_steps                  ;ray step counter (original note: for >255 steps "bx" is needed, costs 1 byte more)
         ray_loop:
            vpaddw       ymm4,ymm4,ymm2      ;depth = depth - 1
            vpmullw      ymm3,ymm4,[si]      ;x_a = (x-center)*depth 0...15
            vpmullw      ymm5,ymm4,[si+0x40] ;x_b = (x-center)*depth 16..31
            vpmullw      ymm0,ymm4,[si+0x60] ;y =   (y-center)*depth
            vpaddw       ymm3,ymm3,ymm6      ;x_a + timer
            vpaddw       ymm0,ymm0,ymm6      ;y   + timer
            vpaddw       ymm5,ymm5,ymm6      ;x_b + timer
            vpand        ymm3,ymm3,ymm0      ;x_a AND y
            vpand        ymm5,ymm5,ymm0      ;x_b AND y
            vpsraw       ymm3,ymm3,10        ;x_a shift
            vpsraw       ymm5,ymm5,10        ;x_b shift
            vpand        ymm3,ymm3,ymm4      ;x_a initial plattenbau geometry
            vpand        ymm5,ymm5,ymm4      ;x_b initial plattenbau geometry
            vperm2i128   ymm0,ymm3,ymm5,00100000b ;create words 0... 7 | 16...23
            vperm2i128   ymm5,ymm3,ymm5,00110001b ;create words 8...15 | 24...31
            vpacksswb    ymm3,ymm0,ymm5           ;create bytes 0...31 (signed saturating pack, pixel order restored)
            vpandn       ymm0,ymm1,ymm7      ;mask only if hit_colour NOT set already
            vptest       ymm0,ymm0           ;early exit if all zero
            jz exit_depth_loop
            vpand        ymm0,ymm0,ymm3      ;check if hit occurred => if current color contains the mask
            vpcmpeqb     ymm0,ymm0,ymm7      ;if hit occurred set byte to 11111111 
            vpblendvb    ymm1,ymm1,ymm3,ymm0 ;update only the 11111111 byte's of hit_colours
            dec bx                           ;reordered
         jnz ray_loop                        ;using LOOP is much slower
         exit_depth_loop:
         vmovdqa [fs:edi],ymm1               ;plot all 32 pixels bytes (aligned store - assumes the screen address is 32 byte aligned)
         add ax,32                           ;next group of 32 pixels
         add edi,32
         cmp ax,x_offset
      jl x_loop                              ;until x reaches the right border (signed compare)
      inc cx
      cmp cx,y_offset
   jne y_loop                                ;until y reaches the bottom border

   vpsubw ymm6,ymm6,ymm4                     ;advance timer by the depth left in ymm4 by the last pixel group of the frame
   inc bp
   cmp bp,1024
   jne mainloop                              ;run 1024 frames, then fall through to the exit code

   ;---exit: drop the PE bit again, switch to video mode 3 and terminate
   mov cr0,eax      ;ax still holds x_offset from the last x_loop compare (0x282 by default, bit 0 clear => PE off)
                    ;NOTE(review): the other bits of eax are not set up explicitly - confirm they are harmless in CR0
   mov ax,0x3       ;ah = 0 (set video mode), al = 3
   int 0x10
   int 0x20         ;terminate program

