32 #ifndef PLANCK_SSE_UTILS_CXX_H 33 #define PLANCK_SSE_UTILS_CXX_H 35 template<
typename T,
int sz>
class svec;
37 #if (defined(__SSE2__)) 39 #include <xmmintrin.h> 40 #include <emmintrin.h> 42 template<>
class svec<int, 4>
47 typedef union { Tv v; Ts d[4]; } Tu;
51 svec (
const svec &b) : v(b.v) {}
52 svec (
const Tv &b) : v(b) {}
53 svec (
const Ts &val) : v(_mm_set1_epi32(val)) {}
54 svec (
const Ts &val1,
const Ts &val2,
const Ts &val3,
const Ts &val4)
55 : v(_mm_set_epi32(val4,val3,val2,val1)) {}
57 const svec &operator= (
const Ts &val)
58 { v=_mm_set1_epi32(val);
return *
this; }
59 const svec &operator= (
const svec &b)
60 { v=b.v;
return *
this; }
62 Ts operator[] (
int p)
const 63 { Tu u; u.v=v;
return u.d[p]; }
64 void set (
int p, Ts val)
65 { Tu u; u.v=v; u.d[p]=val; v=u.v; }
67 const svec &operator+= (
const svec &b)
68 { v=_mm_add_epi32(v,b.v);
return *
this; }
69 const svec &operator-= (
const svec &b)
70 { v=_mm_sub_epi32(v,b.v);
return *
this; }
71 svec operator+ (
const svec &b)
const 72 {
return svec(_mm_add_epi32(v,b.v)); }
73 svec operator- (
const svec &b)
const 74 {
return svec(_mm_sub_epi32(v,b.v)); }
76 const svec &operator&= (
const svec &b)
77 { v=_mm_and_si128(v,b.v);
return *
this; }
78 const svec &operator|= (
const svec &b)
79 { v=_mm_or_si128(v,b.v);
return *
this; }
80 const svec &operator^= (
const svec &b)
81 { v=_mm_xor_si128(v,b.v);
return *
this; }
82 svec operator& (
const svec &b)
const 83 {
return svec(_mm_and_si128(v,b.v)); }
84 svec operator| (
const svec &b)
const 85 {
return svec(_mm_or_si128(v,b.v)); }
86 svec operator^ (
const svec &b)
const 87 {
return svec(_mm_xor_si128(v,b.v)); }
88 svec andnot (
const svec &b)
const 89 {
return svec(_mm_andnot_si128(v,b.v)); }
91 const svec &operator<<= (
int b)
92 { v=_mm_slli_epi32(v,b);
return *
this; }
93 svec operator<< (
int b)
const 94 {
return svec(_mm_slli_epi32(v,b)); }
95 const svec &operator>>= (
int b)
96 { v=_mm_srai_epi32(v,b);
return *
this; }
97 svec operator>> (
int b)
const 98 {
return svec(_mm_srai_epi32(v,b)); }
100 svec eq (
const svec &b)
const 101 {
return svec(_mm_cmpeq_epi32(v,b.v)); }
102 svec gt (
const svec &b)
const 103 {
return svec(_mm_cmpgt_epi32(v,b.v)); }
104 svec lt (
const svec &b)
const 105 {
return svec(_mm_cmplt_epi32(v,b.v)); }
108 typedef svec<int,4> V4si;
111 template<>
class svec<long long , 2>
114 typedef long long Ts;
116 typedef union { Tv v; Ts d[2]; } Tu;
120 svec (
const svec &b) : v(b.v) {}
121 svec (
const Tv &b) : v(b) {}
122 svec (
const Ts &val) : v(_mm_set1_epi64x(val)) {}
123 svec (
const Ts &val1,
const Ts &val2)
124 : v(_mm_set_epi64x(val2,val1)) {}
126 const svec &operator= (
const Ts &val)
127 { v=_mm_set1_epi64x(val);
return *
this; }
128 const svec &operator= (
const svec &b)
129 { v=b.v;
return *
this; }
131 int operator[] (
int p)
const 132 { Tu u; u.v=v;
return u.d[p]; }
133 void set (
int p,
int val)
134 { Tu u; u.v=v; u.d[p]=val; v=u.v; }
136 const svec &operator+= (
const svec &b)
137 { v=_mm_add_epi64(v,b.v);
return *
this; }
138 const svec &operator-= (
const svec &b)
139 { v=_mm_sub_epi64(v,b.v);
return *
this; }
140 svec operator+ (
const svec &b)
const 141 {
return svec(_mm_add_epi64(v,b.v)); }
142 svec operator- (
const svec &b)
const 143 {
return svec(_mm_sub_epi64(v,b.v)); }
145 const svec &operator&= (
const svec &b)
146 { v=_mm_and_si128(v,b.v);
return *
this; }
147 const svec &operator|= (
const svec &b)
148 { v=_mm_or_si128(v,b.v);
return *
this; }
149 const svec &operator^= (
const svec &b)
150 { v=_mm_xor_si128(v,b.v);
return *
this; }
151 svec operator& (
const svec &b)
const 152 {
return svec(_mm_and_si128(v,b.v)); }
153 svec operator| (
const svec &b)
const 154 {
return svec(_mm_or_si128(v,b.v)); }
155 svec operator^ (
const svec &b)
const 156 {
return svec(_mm_xor_si128(v,b.v)); }
157 svec andnot (
const svec &b)
const 158 {
return svec(_mm_andnot_si128(v,b.v)); }
160 const svec &operator<<= (
int b)
161 { v=_mm_slli_epi64(v,b);
return *
this; }
162 svec operator<< (
int b)
const 163 {
return svec(_mm_slli_epi64(v,b)); }
166 typedef svec<long long,2> V2di;
169 template<>
class svec<float, 4>
174 typedef union { Tv v; Ts d[4]; } Tu;
178 svec (
const svec &b) : v(b.v) {}
179 svec (
const Tv &b) : v(b) {}
180 svec (
const Ts &val) : v(_mm_set1_ps(val)) {}
181 svec (Ts val1, Ts val2, Ts val3, Ts val4)
182 : v(_mm_set_ps(val4,val3,val2,val1)) {}
183 explicit svec (
const svec<int,4> &b) : v(_mm_cvtepi32_ps(b.v)) {}
185 operator svec<int,4>()
const 186 {
return svec<int,4> (_mm_cvtps_epi32(v)); }
187 const svec &operator= (
const Ts &val)
188 { v=_mm_set1_ps(val);
return *
this; }
189 const svec &operator= (
const svec &b)
190 { v=b.v;
return *
this; }
192 Ts operator[] (
int p)
const 193 { Tu u; u.v=v;
return u.d[p]; }
194 void set (
int p, Ts val)
195 { Tu u; u.v=v; u.d[p]=val; v=u.v; }
197 const svec &operator+= (
const svec &b)
198 { v=_mm_add_ps(v,b.v);
return *
this; }
199 const svec &operator-= (
const svec &b)
200 { v=_mm_sub_ps(v,b.v);
return *
this; }
201 const svec &operator*= (
const svec &b)
202 { v=_mm_mul_ps(v,b.v);
return *
this; }
203 const svec &operator/= (
const svec &b)
204 { v=_mm_div_ps(v,b.v);
return *
this; }
206 svec operator+ (
const svec &b)
const 207 {
return svec(_mm_add_ps(v,b.v)); }
208 svec operator- (
const svec &b)
const 209 {
return svec(_mm_sub_ps(v,b.v)); }
210 svec operator* (
const svec &b)
const 211 {
return svec(_mm_mul_ps(v,b.v)); }
212 svec operator/ (
const svec &b)
const 213 {
return svec(_mm_div_ps(v,b.v)); }
215 const svec &operator&= (
const svec &b)
216 { v=_mm_and_ps(v,b.v);
return *
this; }
217 const svec &operator|= (
const svec &b)
218 { v=_mm_or_ps(v,b.v);
return *
this; }
219 const svec &operator^= (
const svec &b)
220 { v=_mm_xor_ps(v,b.v);
return *
this; }
221 svec operator& (
const svec &b)
const 222 {
return svec(_mm_and_ps(v,b.v)); }
223 svec andnot (
const svec &b)
const 224 {
return svec(_mm_andnot_ps(v,b.v)); }
225 svec operator| (
const svec &b)
const 226 {
return svec(_mm_or_ps(v,b.v)); }
227 svec operator^ (
const svec &b)
const 228 {
return svec(_mm_xor_ps(v,b.v)); }
230 svec operator- ()
const 231 {
return svec(_mm_xor_ps(_mm_set1_ps(-0.),v)); }
233 svec eq (
const svec &b)
const 234 {
return svec(_mm_cmpeq_ps(v,b.v)); }
235 svec neq (
const svec &b)
const 236 {
return svec(_mm_cmpneq_ps(v,b.v)); }
237 svec lt (
const svec &b)
const 238 {
return svec(_mm_cmplt_ps(v,b.v)); }
239 svec le (
const svec &b)
const 240 {
return svec(_mm_cmple_ps(v,b.v)); }
241 svec gt (
const svec &b)
const 242 {
return svec(_mm_cmpgt_ps(v,b.v)); }
243 svec ge (
const svec &b)
const 244 {
return svec(_mm_cmpge_ps(v,b.v)); }
246 void writeTo (Ts *val)
const 247 { _mm_storeu_ps (val, v); }
248 void writeTo (Ts &a, Ts &b, Ts &c, Ts &d)
const 249 { Tu u; u.v=v; a=u.d[0]; b=u.d[1]; c=u.d[2]; d=u.d[3]; }
250 void readFrom (
const Ts *val)
251 { v=_mm_loadu_ps(val); }
252 void readFrom (Ts a, Ts b, Ts c, Ts d)
253 { v=_mm_set_ps(d,c,b,a); }
256 typedef svec<float,4> V4sf;
258 inline V4sf sqrt(
const V4sf &v)
259 {
return V4sf(_mm_sqrt_ps(v.v)); }
260 inline V4sf abs(
const V4sf &v)
261 {
return V4sf(_mm_andnot_ps(_mm_set1_ps(-0.),v.v)); }
262 inline V4sf blend(
const V4sf &mask,
const V4sf &a,
const V4sf &b)
263 {
return (mask&a)|(mask.andnot(b)); }
264 inline bool any (
const V4sf &a)
265 {
return _mm_movemask_ps(a.v)!=0; }
266 inline bool all (
const V4sf &a)
267 {
return _mm_movemask_ps(a.v)==15; }
268 inline bool none (
const V4sf &a)
269 {
return _mm_movemask_ps(a.v)==0; }
270 inline V4sf min (
const V4sf &a,
const V4sf &b)
271 {
return _mm_min_ps(a.v,b.v); }
272 inline V4sf max (
const V4sf &a,
const V4sf &b)
273 {
return _mm_max_ps(a.v,b.v); }
275 template<>
class svec<double, 2>
280 typedef union { Tv v; Ts d[2]; } Tu;
284 svec (
const svec &b) : v(b.v) {}
285 svec (
const Tv &b) : v(b) {}
286 svec (
const Ts &val) : v(_mm_set1_pd(val)) {}
287 svec (
const Ts &val1,
const Ts &val2)
288 : v(_mm_set_pd(val2,val1)) {}
289 explicit svec (
const svec<int,4> &b) : v(_mm_cvtepi32_pd(b.v)) {}
291 operator svec<int,4>()
const 292 {
return svec<int,4> (_mm_cvtpd_epi32(v)); }
293 const svec &operator= (
const Ts &val)
294 { v=_mm_set1_pd(val);
return *
this; }
295 const svec &operator= (
const svec &b)
296 { v=b.v;
return *
this; }
298 Ts operator[] (
int p)
const 299 { Tu u; u.v=v;
return u.d[p]; }
300 void set (
int p, Ts val)
301 { Tu u; u.v=v; u.d[p]=val; v=u.v; }
303 const svec &operator+= (
const svec &b)
304 { v=_mm_add_pd(v,b.v);
return *
this; }
305 const svec &operator-= (
const svec &b)
306 { v=_mm_sub_pd(v,b.v);
return *
this; }
307 const svec &operator*= (
const svec &b)
308 { v=_mm_mul_pd(v,b.v);
return *
this; }
309 const svec &operator/= (
const svec &b)
310 { v=_mm_div_pd(v,b.v);
return *
this; }
312 svec operator+ (
const svec &b)
const 313 {
return svec(_mm_add_pd(v,b.v)); }
314 svec operator- (
const svec &b)
const 315 {
return svec(_mm_sub_pd(v,b.v)); }
316 svec operator* (
const svec &b)
const 317 {
return svec(_mm_mul_pd(v,b.v)); }
318 svec operator/ (
const svec &b)
const 319 {
return svec(_mm_div_pd(v,b.v)); }
321 const svec &operator&= (
const svec &b)
322 { v=_mm_and_pd(v,b.v);
return *
this; }
323 const svec &operator|= (
const svec &b)
324 { v=_mm_or_pd(v,b.v);
return *
this; }
325 const svec &operator^= (
const svec &b)
326 { v=_mm_xor_pd(v,b.v);
return *
this; }
327 svec operator& (
const svec &b)
const 328 {
return svec(_mm_and_pd(v,b.v)); }
329 svec operator| (
const svec &b)
const 330 {
return svec(_mm_or_pd(v,b.v)); }
331 svec operator^ (
const svec &b)
const 332 {
return svec(_mm_xor_pd(v,b.v)); }
334 svec operator- ()
const 335 {
return svec(_mm_xor_pd(_mm_set1_pd(-0.),v)); }
337 svec eq (
const svec &b)
const 338 {
return svec(_mm_cmpeq_pd(v,b.v)); }
339 svec neq (
const svec &b)
const 340 {
return svec(_mm_cmpneq_pd(v,b.v)); }
341 svec lt (
const svec &b)
const 342 {
return svec(_mm_cmplt_pd(v,b.v)); }
343 svec le (
const svec &b)
const 344 {
return svec(_mm_cmple_pd(v,b.v)); }
345 svec gt (
const svec &b)
const 346 {
return svec(_mm_cmpgt_pd(v,b.v)); }
347 svec ge (
const svec &b)
const 348 {
return svec(_mm_cmpge_pd(v,b.v)); }
350 void writeTo (Ts *val)
const 351 { _mm_storeu_pd (val, v); }
352 void writeTo (Ts &a, Ts &b)
const 353 { _mm_store_sd(&a,v); _mm_storeh_pd(&b,v); }
354 void readFrom (
const Ts *val)
355 { v=_mm_loadu_pd(val); }
356 void readFrom (
const Ts &a,
const Ts &b)
357 { v=_mm_set_pd(b,a); }
360 typedef svec<double,2> V2df;
362 inline V2df sqrt(
const V2df &v)
363 {
return V2df(_mm_sqrt_pd(v.v)); }
364 inline V2df abs(
const V2df &v)
365 {
return V2df(_mm_andnot_pd(_mm_set1_pd(-0.),v.v)); }
366 inline V2df blend(
const V2df &mask,
const V2df &a,
const V2df &b)
367 {
return V2df(_mm_or_pd(_mm_and_pd(a.v,mask.v),_mm_andnot_pd(mask.v,b.v))); }
368 inline bool any (
const V2df &a)
369 {
return _mm_movemask_pd(a.v)!=0; }
370 inline bool all (
const V2df &a)
371 {
return _mm_movemask_pd(a.v)==3; }
372 inline bool none (
const V2df &a)
373 {
return _mm_movemask_pd(a.v)==0; }
375 template<
typename T>
inline T vcast(
const V4si &a);
376 template<
typename T>
inline T vcast(
const V4sf &a);
377 template<
typename T>
inline T vcast(
const V2df &a);
379 template<>
inline V4si vcast (
const V4sf &a)
380 {
return V4si (_mm_castps_si128(a.v)); }
381 template<>
inline V4sf vcast (
const V4si &a)
382 {
return V4sf (_mm_castsi128_ps(a.v)); }
383 template<>
inline V2df vcast (
const V4si &a)
384 {
return V2df (_mm_castsi128_pd(a.v)); }