#ifndef CBM_KF_F64vec2P4_H
#define CBM_KF_F64vec2P4_H

#include <iostream>
#include <cmath>
using namespace std;

#include <xmmintrin.h>
#include <emmintrin.h> // SSE2 intrinsics: __m128d, _mm_*_pd
#include "vec_arithmetic.h"
#include "align16.h" // ALIGNMENT_PREFIX, ALIGNMENT_SUFFIX

/**********************************
 *
 *  Vector of two double floats
 *
 **********************************/

#pragma pack(push,16) /* Must ensure class & union are 16-byte aligned */

typedef ALIGNMENT_PREFIX __m128d VectorDouble ALIGNMENT_SUFFIX;

const union
{
  double d;
  unsigned long long i;
} __d_one = { (double)1. };

const union
{
  unsigned long long i[2];
  __m128d m;
}
__f64vec2_abs_mask_cheat = {{ 0x7fffffffffffffffull, 0x7fffffffffffffffull }},
__f64vec2_sgn_mask_cheat = {{ 0x8000000000000000ull, 0x8000000000000000ull }},
__f64vec2_zero_cheat     = {{ 0, 0 }},
__f64vec2_one_cheat      = {{ __d_one.i, __d_one.i }},
__f64vec2_true_cheat     = {{ 0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull }},
__f64vec2_false_cheat    = {{ 0x0000000000000000ull, 0x0000000000000000ull }};

#define _f64vec2_abs_mask (static_cast<F64vec2>(__f64vec2_abs_mask_cheat.m))
#define _f64vec2_sgn_mask (static_cast<F64vec2>(__f64vec2_sgn_mask_cheat.m))
#define _f64vec2_zero     (static_cast<F64vec2>(__f64vec2_zero_cheat.m))
#define _f64vec2_one      (static_cast<F64vec2>(__f64vec2_one_cheat.m))
#define _f64vec2_true     (static_cast<F64vec2>(__f64vec2_true_cheat.m))
#define _f64vec2_false    (static_cast<F64vec2>(__f64vec2_false_cheat.m))

ALIGNMENT_PREFIX class F64vec2
{
 public:

  __m128d v;

  double & operator[]( int i )       { return ((double*)&v)[i]; }
  double   operator[]( int i ) const { return ((double*)&v)[i]; }

  F64vec2( ) {}
  F64vec2( const __m128d &a ) { v = a; }
  F64vec2( const double  &a ) { v = _mm_set1_pd(a); }
  //F64vec2( const double &a ) { v = _mm_set_pd1(a); }

  F64vec2( const double &f0, const double &f1 ) { v = _mm_set_pd(f1,f0); }

  /* Conversion function */
  operator __m128d() const { return v; } /* Convert to __m128d */

  /* Arithmetic Operators */
  friend F64vec2 operator +(const F64vec2 &a, const F64vec2 &b) { return _mm_add_pd(a,b); }
  friend F64vec2 operator -(const F64vec2 &a, const F64vec2 &b) { return _mm_sub_pd(a,b); }
  friend F64vec2 operator *(const F64vec2 &a, const F64vec2 &b) { return _mm_mul_pd(a,b); }
  friend F64vec2 operator /(const F64vec2 &a, const F64vec2 &b) { return _mm_div_pd(a,b); }

  /* Functions */
  friend F64vec2 min( const F64vec2 &a, const F64vec2 &b ){ return _mm_min_pd(a, b); }
  friend F64vec2 max( const F64vec2 &a, const F64vec2 &b ){ return _mm_max_pd(a, b); }

  /* Square Root */
  friend F64vec2 sqrt( const F64vec2 &a ){ return _mm_sqrt_pd(a); }

  /* Reciprocal (inverse) Square Root */
  /* Intrinsic does not exist for double */
  friend F64vec2 rsqrt( const F64vec2 &a ){ return 1. / sqrt(a); }
  //friend F64vec2 rsqrt( const F64vec2 &a ){ return _f64vec2_one / _mm_sqrt_pd(a); }

  /* Reciprocal (inversion) */
  /* Intrinsic does not exist for double */
  friend F64vec2 rcp( const F64vec2 &a ){ return 1. / a; }
  //friend F64vec2 rcp( const F64vec2 &a ){ return _f64vec2_one / a; }
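  // Note: SSE provides fast approximations (_mm_rsqrt_ps, _mm_rcp_ps) for single
  // precision only; the rsqrt()/rcp() above therefore fall back to a full-precision
  // divide (plus a square root for rsqrt), which is exact but slower.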
  /* Absolute value */
  friend F64vec2 fabs( const F64vec2 &a ){ return _mm_and_pd(a, _f64vec2_abs_mask); }

  /* Sign */
  friend F64vec2 sgn  ( const F64vec2 &a ){ return _mm_or_pd(_mm_and_pd(a, _f64vec2_sgn_mask), _f64vec2_one); }
  friend F64vec2 asgnb( const F64vec2 &a, const F64vec2 &b ){ return _mm_or_pd(_mm_and_pd(b, _f64vec2_sgn_mask), a); }

  /* Logical */
  friend F64vec2 operator&( const F64vec2 &a, const F64vec2 &b ){ // mask returned
    return _mm_and_pd(a, b);
  }
  friend F64vec2 operator|( const F64vec2 &a, const F64vec2 &b ){ // mask returned
    return _mm_or_pd(a, b);
  }
  friend F64vec2 operator^( const F64vec2 &a, const F64vec2 &b ){ // mask returned
    return _mm_xor_pd(a, b);
  }
  friend F64vec2 operator!( const F64vec2 &a ){ // mask returned
    return _mm_xor_pd(a, _f64vec2_true);
  }
  friend F64vec2 operator||( const F64vec2 &a, const F64vec2 &b ){ // mask returned
    return _mm_or_pd(a, b);
  }

  /* Comparison */
  friend F64vec2 operator<( const F64vec2 &a, const F64vec2 &b ){ // mask returned
    return _mm_cmplt_pd(a, b);
  }
  friend F64vec2 operator>( const F64vec2 &a, const F64vec2 &b ){ // mask returned
    return _mm_cmpgt_pd(a, b);
  }

#define if3(a, b, c) ((a)&(b)) | ((!(a))&(c)) // analogue of (a) ? (b) : (c)

  // F64vec2 has only two lanes; testing lanes 2 and 3 (as the four-lane
  // float version does) would read out of bounds.
#define NotEmpty(a) ( bool((a)[0]) | bool((a)[1]) )
#define Empty(a)   !( bool((a)[0]) | bool((a)[1]) )

  /* Non-intrinsic functions */
#define _f1(A,F) F64vec2( F(A[0]), F(A[1]) )

  friend F64vec2 exp( const F64vec2 &a ){ return _f1( a, exp ); }
  friend F64vec2 log( const F64vec2 &a ){ return _f1( a, log ); }
  friend F64vec2 sin( const F64vec2 &a ){ return _f1( a, sin ); }
  friend F64vec2 cos( const F64vec2 &a ){ return _f1( a, cos ); }
#undef _f1

  // RRMOD: Define these operations explicitly instead of using the vec_arithmetic macro.
  // This prevents temporary vectors from being allocated.
  F64vec2& operator +=(const F64vec2 &a) { return *this = _mm_add_pd(v,a); }
  F64vec2& operator -=(const F64vec2 &a) { return *this = _mm_sub_pd(v,a); }
  F64vec2& operator *=(const F64vec2 &a) { return *this = _mm_mul_pd(v,a); }
  F64vec2& operator /=(const F64vec2 &a) { return *this = _mm_div_pd(v,a); }
  F64vec2& operator &=(const F64vec2 &a) { return *this = _mm_and_pd(v,a); }
  F64vec2& operator |=(const F64vec2 &a) { return *this = _mm_or_pd(v,a); }
  F64vec2& operator ^=(const F64vec2 &a) { return *this = _mm_xor_pd(v,a); }

  /* Define all operators for consistency */
  vec_arithmetic1(F64vec2);
  vec_arithmetic(F64vec2,double);

  friend ostream & operator<<( ostream &strm, const F64vec2 &a ){
    strm << a[0] << " " << a[1];
    return strm;
  }

  friend istream & operator>>( istream &strm, F64vec2 &a ){
    double tmp;
    strm >> tmp;
    a = tmp;
    return strm;
  }

} ALIGNMENT_SUFFIX;

#pragma pack(pop) // restore default packing (balances the push above)

#endif // CBM_KF_F64vec2P4_H
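// ---------------------------------------------------------------------------
// Usage sketch (editor's addition, not part of the original header): shows
// elementwise arithmetic, the bit-mask functions, and the branchless if3()
// select. The guard macro F64VEC2_EXAMPLE is hypothetical; compile a
// translation unit with -DF64VEC2_EXAMPLE (and SSE2 enabled) to build it.
// ---------------------------------------------------------------------------
#ifdef F64VEC2_EXAMPLE
int main()
{
  F64vec2 a( 1., -4. );            // lanes: a[0] = 1,  a[1] = -4
  F64vec2 b( 3.,  2. );

  F64vec2 sum  = a + b;            // {  4., -2. }
  F64vec2 root = sqrt( fabs(a) );  // {  1.,  2. }

  // Branchless per-lane select: where a > b take a, elsewhere take b.
  F64vec2 m = if3( a > b, a, b );  // {  3.,  2. }

  if( Empty( a > b ) )             // true here: no lane of a exceeds b
    cout << "a <= b in every lane" << endl;

  cout << sum << " | " << root << " | " << m << endl;
  return 0;
}
#endif // F64VEC2_EXAMPLE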