#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "get_time.h"

// Entry count of the double-precision sine table. It is a power of two, so
// an index can be wrapped into range with a bitwise AND against the mask.
#define SIN_TABLE_SIZE		(1<<16)
#define SIN_TABLE_BITMASK	(SIN_TABLE_SIZE-1)

// Entry count and wrap mask of the single-precision sine table.
#define SIN_TABLE2_SIZE		(1<<16)
#define SIN_TABLE2_BITMASK	(SIN_TABLE2_SIZE-1)


// The lookup tables themselves; init_sin_table() fills them so that entry i
// holds sin(i * 2*PI / table size).
double _sin_t[SIN_TABLE_SIZE];
float sin_table2[SIN_TABLE2_SIZE];

// Fallback definition of pi, used only when no PI macro exists already.
#ifndef PI
#	define PI 3.14159265358979
#endif

// A 64-bit integer that can also be addressed as two 32-bit halves.
// `low` is declared first and therefore sits at the lower address, so the
// member names only match the numeric halves of `whole` on a little-endian
// target.
union split_int_64 {
	Uint64 whole;
	struct blah {
		Uint32 low;
		Uint32 high;
	} s;
};
// macro_LCG64b_32c(dest, src, multiplier, additive)
//
// One step of a linear congruential generator on split_int_64 values:
//     dest = src * multiplier + additive      (wrapping at 64 bits)
// The two assembly variants build the 64-bit product out of 32-bit
// multiplies, treating `multiplier` and `additive` as 32-bit quantities.
//
// MSVC inline-assembly variant.
// NOTE(review): MSVC normally predefines _M_IX86 rather than __i386__ for
// 32-bit x86, so this branch is probably only selected when the build
// defines __i386__ itself -- confirm.
#if defined(_MSC_VER) && defined(__i386__) && !defined(NO_ASM)
#	define macro_LCG64b_32c(dest,src,multiplier,additive) \
		{\
		enum { M = multiplier, A = additive };\
		\
		split_int_64 _tmp = src;\
		_asm { mov eax, [_tmp.s.high] } \
		_asm { mov ebx, M } \
		_asm { imul eax, ebx } \
		_asm { mov ecx, eax } \
		_asm { mov eax, [_tmp.s.low] } \
		_asm { mul ebx } \
		_asm { add eax, A } \
		_asm { adc ecx, edx} \
		_asm { mov [_tmp.s.low], eax } \
		_asm { mov [_tmp.s.high], ecx } \
		dest = _tmp; \
		}
#elif defined(__GNUC__) && defined(__i386__) && !defined(NO_ASM)
// GCC extended-asm variant for 32-bit x86.
// Operands: %0/%3 = edx (multiplier on entry, upper product word afterwards),
//           %1/%4 = eax (low half), %2/%5 = esi (high half),
//           %6    = additive as an immediate.
// Sequence: esi = high*mult; edx:eax = low*mult; eax += additive;
//           esi += edx + carry.
#	define macro_LCG64b_32c(dest,src,multiplier,additive) \
		{ \
			Uint32 _blah;\
			asm ( \
				"imull %3, %5"	"\n\t"\
				"mull  %3"		"\n\t"\
				"addl  %6, %1"	"\n\t"\
				"adcl  %0, %2"	\
				: "=d" (_blah),      "=a" (dest.s.low), "=S" (dest.s.high)\
				: "0"  (multiplier), "1"  (src.s.low),  "2" (src.s.high),  "i" (additive) \
			); \
		}
#else
// Portable variant: ordinary 64-bit arithmetic on the `whole` member.
#	define macro_LCG64b_32c(dest,src,multiplier,additive) {dest.whole = src.whole * multiplier + additive; }
#endif
// Returns the upper 32 bits of the 64-bit product m * r, i.e.
// floor(m * r / 2^32). For a 32-bit r the result is always below m (and is 0
// when m is 0), which is how randi() turns a raw 32-bit draw into [0, m).
//
// m: the multiplier (exclusive upper bound when used by randi())
// r: the 32-bit value to scale
inline Uint32 _rng_dist_32_flat ( Uint32 m, Uint32 r ) {
	// MSVC inline-assembly variant: mul leaves the upper product word in edx.
	// NOTE(review): MSVC normally predefines _M_IX86, not __i386__, so this
	// branch is probably never selected unless the build defines __i386__ -- confirm.
#	if defined(_MSC_VER) && defined(__i386__) && !defined(NO_ASM)
		_asm {
			mov eax, [r]
			mov ebx, [m]
			mul ebx
			mov [r], edx
		}
		return r;
	// GCC variant for 32-bit x86: r is tied to edx and m to eax, so after
	// "mull" r holds the upper product word.
#	elif defined(__GNUC__) && defined(__i386__) && !defined(NO_ASM)
		asm ("mull %0" : "=d" (r), "=a" (m) : "%0" (r), "1" (m) );
		return r;
	// Portable variant: widen to 64 bits, multiply, keep the upper word.
#	else
		return (Uint32)((r * (Uint64)m) >> 32);
#	endif
}
// Mixin that adds narrower/wider raw outputs and simple distributions on top
// of a generator whose native output is 32 bits.
//
// base_rng must provide a member function `Uint32 raw32()`.
//
// raw32() lives in a base class that depends on the template parameter, so
// every call goes through `this->`. An unqualified call is not looked up in
// a dependent base and is rejected by compilers that implement two-phase
// name lookup.
template<class base_rng>
class RNG_HELPER_TEMPLATE_32 : public base_rng {
public:
	// Low 8 / low 16 bits of one 32-bit draw.
	Uint8  raw8()  {return (Uint8)this->raw32();}
	Uint16 raw16() {return (Uint16)this->raw32();}
	// Two 32-bit draws glued together. The draws are made in separate
	// statements so that the first one always becomes the upper half; inside
	// a single expression the call order would be unspecified.
	Uint64 raw64() {
		const Uint64 upper = this->raw32();
		const Uint64 lower = this->raw32();
		return (upper << 32) + lower;
	}
//		simple distributions:
	// Integer floor(max * raw32() / 2^32), i.e. a value below max (0 if max is 0).
	Uint32 randi (Uint32 max) {return _rng_dist_32_flat(max, this->raw32());}
	// max scaled by raw32() / 2^32.
	double randf (double max) {return max * this->raw32() / 4294967296.0;}
//		simple overloads:
	// Same distributions shifted by min, with a span of (max - min).
	Uint32 randi (Uint32 min, Uint32 max) {return min+randi (max-min);}
	double randf (double min, double max) {return min+randf (max-min);}
};
// Linear congruential generator with 64 bits of state. Every draw advances
// the state by one LCG step and hands back the upper 32 bits of the new state.
class _RNG_lcg64a {
public:
	enum {
		OUTPUT_BITS = 32,   //number of bits produced internally (must be 8, 16, 32, or 64)
		INTERNAL_BITS = 64, //number of bits used in state
		ROBUST = 2          //2 = all states valid (even low entropy ones), 1 = all states valid, 0 = not all states valid
	};
	enum {LOCKED = 1, LOCKED_SEED = 0, LOCKED_MD5 = 0};

	// Advance the state once and return its upper half.
	Uint32 raw32() {
		macro_LCG64b_32c ( s64, s64, MULTIPLIER, ADDITIVE );
		return s64.s.high;
	}

	// Use the (converted) seed value directly as the new state.
	void seed ( int s ) {
		s64.whole = s;
	}

	// Skip ahead by how_far draws; only declared here.
	void fast_forward ( Uint64 how_far );

	// Step backwards by how_far draws. ~how_far + 1 is the two's-complement
	// negation, so this is a fast_forward by -how_far modulo 2^64.
	void rewind ( Uint64 how_far ) {
		fast_forward(~how_far + 1);
	}

	// Direct access to the full 64-bit state.
	Uint64 get_state64() const {
		return s64.whole;
	}
	void set_state64 (Uint64 s) {
		s64.whole = s;
	}

protected:
	// The generator state, addressable as a whole or as two halves.
	split_int_64 s64;

	// Constants of the LCG step: state = state * MULTIPLIER + ADDITIVE.
	enum {
		MULTIPLIER = 1812433253,
		ADDITIVE   = 123456789
	};
};

// The 64-bit LCG combined with the 32-bit helper mixin, and the single
// global instance of it that the benchmark draws from.
typedef RNG_HELPER_TEMPLATE_32<_RNG_lcg64a> RNG_lcg64a;
RNG_lcg64a rng;

// Populates both sine lookup tables. Slot k of a table with N entries
// receives sin(k * 2*PI / N); the second table stores the value as float.
void init_sin_table() {
	// double-precision table
	for (int slot = 0; slot != SIN_TABLE_SIZE; ++slot) {
		const double angle = (double)slot * PI * 2 / SIN_TABLE_SIZE;
		_sin_t[slot] = sin(angle);
	}

	// single-precision table
	for (int slot = 0; slot != SIN_TABLE2_SIZE; ++slot) {
		const double angle = (double)slot * PI * 2 / SIN_TABLE2_SIZE;
		sin_table2[slot] = (float)sin(angle);
	}
}

// Sine of d (radians) read from the double-precision table. The angle is
// converted to table slots, truncated to an int by a plain cast, and wrapped
// into the table with the bit mask.
double Get_SinT (double d)
{
	const double in_slots = d * (SIN_TABLE_SIZE/PI/2);
	const int slot = (int)in_slots & SIN_TABLE_BITMASK;
	return _sin_t[slot];
}

// Converts a to an int by rounding according to the floating-point unit's
// current rounding mode (round-to-nearest, ties to even, unless the mode has
// been changed).
//
// The x87 fld/fistp pair is only used where MSVC inline assembly exists
// (32-bit x86) and NO_ASM is not defined. Everywhere else lrint() is used,
// which also honours the current rounding mode, so the function builds on
// other compilers and on 64-bit targets.
int iround(double a) {
#if defined(_MSC_VER) && defined(_M_IX86) && !defined(NO_ASM)
	int r; 
	__asm fld a   ;
	__asm fistp r ;
	return r;
#else
	return (int)lrint(a);
#endif
}
// 1.5 * 2^52. Adding it to a double of small magnitude pushes the sum into
// the range where one unit in the last place equals 1, so the addition itself
// rounds the value to an integer and leaves that integer (in two's
// complement) in the low bits of the mantissa.
#define BIGDOUBLE 6755399441055744.0
// Rounds a to the nearest int with the "add a big constant" trick, without
// any float-to-int conversion instruction. Only meaningful while the rounded
// result fits in 32 bits.
//
// The bit pattern is copied out with memcpy instead of being read through a
// cast int pointer: that avoids the aliasing violation, and taking the low
// 32 bits of the 64-bit pattern does not depend on which half of the double
// is stored first. Assumes double and unsigned long long are both 64 bits wide.
int iround_binary(double a) {
	a = a + (BIGDOUBLE + 0);
	unsigned long long bits;
	memcpy(&bits, &a, sizeof bits);
	// keep the low 32 bits and reinterpret them as a signed value
	return (int)(unsigned int)(bits & 0xFFFFFFFFull);
}
// Sine of d (radians) read from the single-precision table; the slot is
// chosen by rounding with iround() and wrapping with the bit mask.
double get_sin_via_table2 ( double d) {
	const double in_slots = d * (SIN_TABLE2_SIZE/PI/2);
	const int slot = iround(in_slots) & SIN_TABLE2_BITMASK;
	return sin_table2[slot];
}
// Same lookup as get_sin_via_table2(), except that the slot is computed with
// iround_binary() instead of iround().
double get_sin_via_table2_binary_iround ( double d) {
	const double in_slots = d * (SIN_TABLE2_SIZE/PI/2);
	const int slot = iround_binary(in_slots) & SIN_TABLE2_BITMASK;
	return sin_table2[slot];
}
// Sine of d (radians) from the single-precision table, linearly interpolated
// between the two table entries that bracket the requested angle.
double get_sin_via_table2_with_interpolation ( double d) {
	// position of the angle measured in table slots
	double f = d * (SIN_TABLE2_SIZE/PI/2);
	// Pick the slot at or below f. Rounding f itself would leave (f-i) in
	// [-0.5, 0.5], and a negative fraction extrapolates backwards along the
	// wrong segment. Rounding (f - 0.5) keeps the fraction within [0, 1]
	// (assuming iround() rounds to nearest, which is the default FPU mode).
	int i = iround(f - 0.5);
	double s1 = sin_table2[i & SIN_TABLE2_BITMASK];
	double s2 = sin_table2[(i+1) & SIN_TABLE2_BITMASK];
	return s1 + (s2 - s1) * (f-i);
}
// Identity function; benchmark() times it alongside the real sine
// implementations.
double dummy(double a) {
	return a;
}

// Times `func` and prints the average cost per call together with a checksum.
//
// func: the function under test, fixed at compile time
// n:    number of outer iterations; each one makes 400 calls to func
// seed: seed for the global rng, so every function sees the same inputs
//
// The running value `sum` is both the source of the arguments and the sink
// for the results. Printing it at the end keeps the calls from being
// optimised away and allows a rough comparison between implementations.
template<double(*func)(double)>
void _benchmark(int n, int seed) {
	double time1 = get_time2();
	const double c = 0;//PI * pow(2.0f,24.0f);
	double sum = 2.1;
	rng.seed(seed);
	for (int i = 0; i < n; i++) {
		double step = rng.randf(-PI,PI);
		for (int j = 0; j < 100; j++) {
			// Advance the argument first, then accumulate the result. The two
			// updates of `sum` are separate statements because doing both in
			// one expression is only well-defined from C++17 onwards.
			sum += step; sum += func(c+sum);
			sum += step; sum += func(c+sum);
			sum += step; sum += func(c+sum);
			sum += step; sum += func(c+sum);
		}
		// fold the accumulator back into [0, 2*PI) with some fresh randomness
		sum = fmod(10*PI*sum+rng.randf(PI*2), PI*2);
		if (sum < 0) sum += PI * 2;
	}
//	sum = fmod(sum, 2.0);
	double time2 = get_time2();
	// n * 400 = total number of calls (100 inner iterations of 4 calls each).
	// NOTE(review): the label says ns, which holds only if get_time2() returns
	// milliseconds -- confirm against get_time.h.
	double dt = (time2 - time1) / (n * 400.0) * 1000000;
	printf("%8.3f ns / call; check:%.10f\n", dt, sum);
}
// Single-precision sine wrapped in the double(double) signature used by the
// benchmark: the argument is narrowed to float, the result widened back.
inline double _sinf(double a) {
	const float narrowed = (float)a;
	return sinf(narrowed);
}
// Receives the cosine that _fsincos() computes as a by-product. Storing it in
// a global keeps that half of the work from being discarded.
double pointless_global_x;
// Returns sin(a) and stores cos(a) in pointless_global_x.
//
// With MSVC on 32-bit x86 (and NO_ASM undefined) this uses the x87 fsincos
// instruction, which leaves the cosine on top of the FPU stack and the sine
// below it. MSVC-style inline assembly is not available elsewhere, so other
// builds fall back to the library sin() and cos(); the timing then reflects
// those two calls rather than the instruction.
double _fsincos(double a) {
#if defined(_MSC_VER) && defined(_M_IX86) && !defined(NO_ASM)
	double y;
	__asm fld a
	__asm fsincos
	__asm fstp pointless_global_x
	__asm fstp y
	return y;
#else
	pointless_global_x = cos(a);
	return sin(a);
#endif
}
// Runs the timing loop once for every sine implementation and prints one
// line per implementation: its name, then the result line from _benchmark().
//
// n:    number of outer iterations handed to each run
// seed: rng seed handed to each run, so all runs see the same inputs
void benchmark ( int n, int seed ) {
	// Prints the function's name, then times it. Wrapped in do/while so that
	// the macro behaves as a single statement.
#define BENCH(a) do { printf("%40s ", #a); _benchmark<a>(n, seed); } while (0)
	BENCH(dummy);
	BENCH(Get_SinT);
	BENCH(get_sin_via_table2);
	BENCH(get_sin_via_table2_binary_iround);
	BENCH(get_sin_via_table2_with_interpolation);
	BENCH(_sinf);
	BENCH(sin);
	BENCH(_fsincos);
#undef BENCH
}

// Entry point: sets up the timer and the lookup tables, then runs the whole
// benchmark suite once. The command-line arguments are not used.
int main(int argc, char **argv) {
	init_time();
	init_sin_table();

	const int outer_iterations = 50000;
	const int rng_seed = 0;
	benchmark( outer_iterations, rng_seed );
	return 0;
}





