// A Simple test for FPU-Optimization

#include <stdio.h>
#include <math.h>

// Three-component single-precision vector used by the FPU timing tests in
// this file. It is kept a plain aggregate (public data, no constructors) so
// it can be brace-initialised; the assembly routines in this file address
// its members at byte offsets 0, 4 and 8 (x, y, z).
class V3D
{
  public:
  	float x,y,z;

  	// Stores the component-wise sum a + b into this vector.
  	// The operands are only read, so they are taken by const reference;
  	// existing calls with non-const vectors keep working unchanged.
  	void __fastcall Add (const V3D &a, const V3D &b);
};

// Plain C++ version of the vector add, the baseline the assembly versions
// are compared against (21 cycles by the original author's count).
// Each component is read before the same component is written, so passing
// *this as either operand is safe.
void __fastcall V3D::Add (const V3D &a, const V3D &b) {
	x = a.x + b.x;
	y = a.y + b.y;
	z = a.z + b.z;
}

void _V3Add(V3D &a, V3D &b, V3D &c);		 		 // 15 cycles
#pragma aux V3Add1_ = \
	"fld  dword [eax][0]"\
	"fadd dword [ebx][0]"\
	"fld  dword [eax][4]"\
	"fadd dword [ebx][4]"\
	"fld  dword [eax][8]"\
	"fadd dword [ebx][8]"\
	"fstp dword [ecx][8]"\
	"fstp dword [ecx][4]"\
	"fstp dword [ecx][0]"\
	parm [eax] [ebx] [ecx];

// V3Add2_: c = a + b, x87 version that interleaves stores with the adds.
//
// Per the parm clause, a, b and c (references, i.e. addresses) arrive in
// EAX, EBX and ECX. Offsets 0/4/8 address x/y/z of V3D.
//
// FPU stack trace:
//   fld/fadd x        st0 = x
//   fld/fadd y        st0 = y, st1 = x
//   fxch              st0 = x, st1 = y
//   fstp [ecx+0]      c.x stored; st0 = y
//   fld/fadd z        st0 = z, st1 = y
//   fxch              st0 = y, st1 = z
//   fstp [ecx+4]      c.y stored
//   fstp [ecx+8]      c.z stored; stack back to its entry depth
//
// Each fxch brings the older sum to the top so that it is the one stored
// next -- presumably to give the most recent fadd time to complete
// (NOTE(review): scheduling rationale and cycle count are the author's;
// not verifiable from this file).
void V3Add2_ (V3D &a, V3D &b, V3D &c);	 		 // 14 cycles
#pragma aux V3Add2_ = \
	"fld  dword ptr [eax+0]"\
	"fadd dword ptr [ebx+0]"\
	"fld  dword ptr [eax+4]"\
	"fadd dword ptr [ebx+4]"\
	"fxch"\
	"fstp dword ptr [ecx+0]"\
	"fld  dword ptr [eax+8]"\
	"fadd dword ptr [ebx+8]"\
	"fxch"\
	"fstp dword ptr [ecx+4]"\
	"fstp dword ptr [ecx+8]"\
	parm [eax] [ebx] [ecx];

// V3Add3_: c = a + b, x87 version that issues all three adds before any
// store.
//
// Per the parm clause, a, b and c (references, i.e. addresses) arrive in
// EAX, EBX and ECX. Offsets 0/4/8 address x/y/z of V3D.
//
// FPU stack trace:
//   fld/fadd x        st0 = x
//   fld/fadd y        st0 = y, st1 = x
//   fxch              st0 = x, st1 = y
//   fld/fadd z        st0 = z, st1 = x, st2 = y
//   fxch              st0 = x, st1 = z, st2 = y
//   fstp [ecx+0]      c.x stored; st0 = z, st1 = y
//   fstp [ecx+8]      c.z stored; st0 = y
//   fstp [ecx+4]      c.y stored; stack back to its entry depth
//
// Note the stores are deliberately out of order (x, z, y): that is simply
// the order in which the sums sit on the stack after the second fxch.
// NOTE(review): the cycle count is the author's; not verifiable from this
// file.
void V3Add3_ (V3D &a, V3D &b, V3D &c);	 		 // 13-cycles
#pragma aux V3Add3_ = \
	"fld  dword ptr [eax+0]"\
	"fadd dword ptr [ebx+0]"\
	"fld  dword ptr [eax+4]"\
	"fadd dword ptr [ebx+4]"\
	"fxch"\
	"fld  dword ptr [eax+8]"\
	"fadd dword ptr [ebx+8]"\
	"fxch"\
	"fstp dword ptr [ecx+0]"\
	"fstp dword ptr [ecx+8]"\
	"fstp dword ptr [ecx+4]"\
	parm [eax] [ebx] [ecx];

void main (void)
{
	V3D a={1,2,3},b={1,2,3},c;
	printf ("before call\n");
	c.Add (a,b);
	printf ("c = %f %f %f\n", c.x,c.y,c.z);
	V3Add2_	(a,b,c);
	printf ("c = %f %f %f\n", c.x,c.y,c.z);
	V3Add3_	(a,b,c);
	printf ("c = %f %f %f\n", c.x,c.y,c.z);
}

