#ifndef __SIMD_X86_H
#define __SIMD_X86_H

#include <xmmintrin.h> // SSE
#include <pmmintrin.h> // SSE3
#include <emmintrin.h> // SSE2

#if 0
#ifdef __SSE__
#warning SSE
#endif
#ifdef __SSE2__
#warning SSE2
#endif
#ifdef __SSE3__
#warning SSE3
#endif
#endif

/* Vector type used by the simd_* helpers below: an alias for the SSE
 * __m128 type (four packed single-precision floats). */
typedef __m128 simd_v4f;

/*
 * Load four consecutive floats starting at f into a vector.
 *
 * This uses the aligned load (_mm_load_ps), so f must point to
 * 16-byte aligned storage.
 */
inline simd_v4f simd_load(const float *f)
{
	const simd_v4f loaded = _mm_load_ps(f);

	return loaded;
}

/*
 * Build a vector from four scalars.
 *
 * The arguments are handed to _mm_set_ps in the order given, and that
 * intrinsic takes its operands from the highest lane down: f0 ends up in
 * lane 3 and f3 in lane 0.
 *
 * NOTE(review): this is the reverse of the lane order simd_load produces
 * from memory -- confirm that callers expect it.
 */
inline simd_v4f simd_set(float f0, float f1, float f2, float f3)
{
	const simd_v4f packed = _mm_set_ps(f0, f1, f2, f3);

	return packed;
}

/*
 * Broadcast a single scalar: returns a vector whose four lanes all hold f.
 */
inline simd_v4f simd_set(float f)
{
	const simd_v4f splat = _mm_set1_ps(f);

	return splat;
}

/*
 * Return a vector with all four lanes set to 0.0f.
 */
inline simd_v4f simd_zero()
{
	return _mm_setzero_ps();
}

/*
 * Return lane `pos` of v as a scalar.
 *
 * pos must be in the range 0..3; it is not range-checked here.
 *
 * The vector is stored to a small local array and indexed instead of
 * going through __builtin_ia32_vec_ext_v4sf: that builtin only accepts a
 * compile-time constant selector, so feeding it the function parameter
 * compiled only when the call happened to be inlined with a literal
 * argument, and it is a compiler-private builtin rather than a documented
 * intrinsic.
 */
inline float simd_extract(simd_v4f v, int pos)
{
	float lanes[4];

	/* Unaligned store: a plain float array carries no 16-byte guarantee. */
	_mm_storeu_ps(lanes, v);
	return lanes[pos];
}

/*
 * Lane-wise addition: result[i] = v1[i] + v2[i].
 */
inline simd_v4f simd_add(simd_v4f v1, simd_v4f v2)
{
	const simd_v4f sum = _mm_add_ps(v1, v2);

	return sum;
}

/*
 * Lane-wise subtraction: result[i] = v1[i] - v2[i].
 */
inline simd_v4f simd_sub(simd_v4f v1, simd_v4f v2)
{
	const simd_v4f difference = _mm_sub_ps(v1, v2);

	return difference;
}

/*
 * Lane-wise multiplication: result[i] = v1[i] * v2[i].
 */
inline simd_v4f simd_mul(simd_v4f v1, simd_v4f v2)
{
	const simd_v4f product = _mm_mul_ps(v1, v2);

	return product;
}

/*
 * Lane-wise division: result[i] = v1[i] / v2[i].
 */
inline simd_v4f simd_div(simd_v4f v1, simd_v4f v2)
{
	const simd_v4f quotient = _mm_div_ps(v1, v2);

	return quotient;
}

/*
 * Four-component dot product:
 *   v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2] + v1[3]*v2[3]
 *
 * The lane products p0..p3 are summed as (p0 + p1) + (p2 + p3).
 *
 * Only documented intrinsics are used (no compiler-private builtins or
 * vector casts), and the shuffles stay in the float domain.
 *
 * An SSE3 variant built on two _mm_hadd_ps steps was previously kept here
 * disabled, with the note that it seemed slow on a Core 2; this shuffle/add
 * reduction is the one that was actually compiled.
 */
inline float simd_dot(simd_v4f v1, simd_v4f v2)
{
	simd_v4f sum = _mm_mul_ps(v1, v2);

	/* 0x31: lane 0 <- lane 1, lane 2 <- lane 3.
	 * After the add, lane 0 = p0 + p1 and lane 2 = p2 + p3. */
	sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0x31));

	/* 0x02: lane 0 <- lane 2. After the add, lane 0 holds the full sum. */
	sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, 0x02));

	/* Read lane 0 as a scalar. */
	return _mm_cvtss_f32(sum);
}

/*
 * Transpose, in place, the 4x4 matrix whose rows are row0..row3: on return
 * rowN holds what was lane N of each of the four inputs, in row order.
 *
 * All four inputs are read into temporaries before any row is overwritten.
 */
inline void simd_transpose(simd_v4f &row0, simd_v4f &row1, simd_v4f &row2, simd_v4f &row3)
{
	/* Interleave the low and high halves of each pair of rows. */
	const simd_v4f lo01 = _mm_unpacklo_ps(row0, row1); /* r0[0] r1[0] r0[1] r1[1] */
	const simd_v4f lo23 = _mm_unpacklo_ps(row2, row3); /* r2[0] r3[0] r2[1] r3[1] */
	const simd_v4f hi01 = _mm_unpackhi_ps(row0, row1); /* r0[2] r1[2] r0[3] r1[3] */
	const simd_v4f hi23 = _mm_unpackhi_ps(row2, row3); /* r2[2] r3[2] r2[3] r3[3] */

	/* Stitch matching halves together to form the columns. */
	row0 = _mm_movelh_ps(lo01, lo23);
	row1 = _mm_movehl_ps(lo23, lo01);
	row2 = _mm_movelh_ps(hi01, hi23);
	row3 = _mm_movehl_ps(hi23, hi01);
}

#endif

