libflame revision_anchor
FLA_Apply_G_mx2_asm.h
Go to the documentation of this file.
/*

   Copyright (C) 2014, The University of Texas at Austin

   This file is part of libflame and is available under the 3-Clause
   BSD license, which can be found in the LICENSE file at the top-level
   directory, or at http://opensource.org/licenses/BSD-3-Clause

*/
10
11
12#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS
13
/*
   Build without vector intrinsics (FLA_NO_INTRINSICS): each intrinsics-based
   "as?" kernel name is simply an alias for the same-precision "op?" macro
   (s = float, d = double, c = scomplex, z = dcomplex). The "op?" macros are
   expected to be defined elsewhere in the project.
*/
#define MAC_Apply_G_mx2_ass  MAC_Apply_G_mx2_ops
#define MAC_Apply_G_mx2_asd  MAC_Apply_G_mx2_opd
#define MAC_Apply_G_mx2_asc  MAC_Apply_G_mx2_opc
#define MAC_Apply_G_mx2_asz  MAC_Apply_G_mx2_opz
18
19#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS
20
/*
   MAC_Apply_G_mx2_ass

   Single-precision real SSE kernel. Applies the plane rotation given by the
   scalars *gamma12 and *sigma12 to two float vectors, in place:

       a1[k] := gamma12 * a1[k] + sigma12 * a2[k]
       a2[k] := gamma12 * a2[k] - sigma12 * a1[k]     (a1[k] = old value)

   Parameters
     m_A              number of elements in each vector.
     gamma12, sigma12 pointers to the float rotation scalars.
     a1, a2           pointers to the float vectors; updated in place. They
                      are accessed through restrict-qualified pointers, so
                      they must not overlap, and through _mm_load_ps /
                      _mm_store_ps, which require 16-byte alignment.
     inc_a1, inc_a2   element increments. Each vector step loads 4
                      consecutive floats and the scalar tail advances by 1,
                      so only unit increments give a meaningful result
                      (presumably all callers pass 1 -- confirm).

   Work is split into an 8x hand-unrolled loop (32 floats per trip), a loop
   over the remaining whole 4-float vectors, and a scalar tail of up to
   3 elements.

   Every macro parameter is parenthesized at its point of use so that
   expression arguments (e.g. "n - k", "inc + 0", "g + 1") expand correctly.
   Arguments may still be evaluated more than once; pass expressions without
   side effects.
*/
#define MAC_Apply_G_mx2_ass( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{ \
    int              n_iter32 = ( m_A ) / ( 4 * 8 ); \
    int              n_left32 = ( m_A ) % ( 4 * 8 ); \
    int              n_iter4  = n_left32 / ( 4 * 1 ); \
    int              n_left   = n_left32 % ( 4 * 1 ); \
    int              i; \
\
    /* Pointer advance per 4-float vector. */ \
    const int        step_a1 = ( inc_a1 ) * 4; \
    const int        step_a2 = ( inc_a2 ) * 4; \
\
    float* restrict  alpha1 = ( a1 ); \
    float* restrict  alpha2 = ( a2 ); \
\
    v4sf_t           a1v, a2v; \
    v4sf_t           g12v, s12v; \
    v4sf_t           t1v; \
\
    /* Broadcast the two scalars into all four lanes. */ \
    g12v.v = _mm_load1_ps( ( gamma12 ) ); \
    s12v.v = _mm_load1_ps( ( sigma12 ) ); \
\
    for ( i = 0; i < n_iter32; ++i ) \
    { \
        /* Unrolled step 1 of 8. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 2 of 8. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 3 of 8. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 4 of 8. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 5 of 8. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 6 of 8. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 7 of 8. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 8 of 8. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    /* Remaining whole vectors (fewer than 8 of them). */ \
    for ( i = 0; i < n_iter4; ++i ) \
    { \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    /* Scalar tail: at most 3 leftover elements. */ \
    for ( i = 0; i < n_left; ++i ) \
    { \
        float ga12 = *( gamma12 ); \
        float si12 = *( sigma12 ); \
        float temp1; \
        float temp2; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        alpha1 += 1; \
        alpha2 += 1; \
    } \
}
188
/*
   MAC_Apply_G_mx2_asd

   Double-precision real SSE kernel. Applies the plane rotation given by the
   scalars *gamma12 and *sigma12 to two double vectors, in place:

       a1[k] := gamma12 * a1[k] + sigma12 * a2[k]
       a2[k] := gamma12 * a2[k] - sigma12 * a1[k]     (a1[k] = old value)

   Parameters
     m_A              number of elements in each vector.
     gamma12, sigma12 pointers to the double rotation scalars.
     a1, a2           pointers to the double vectors; updated in place. They
                      are accessed through restrict-qualified pointers, so
                      they must not overlap, and through _mm_load_pd /
                      _mm_store_pd, which require 16-byte alignment.
     inc_a1, inc_a2   element increments. Each vector step loads 2
                      consecutive doubles, so only unit increments give a
                      meaningful result (presumably all callers pass 1 --
                      confirm).

   Work is split into an 8x hand-unrolled loop (16 doubles per trip), a loop
   over the remaining whole 2-double vectors, and one scalar element if m_A
   is odd.

   Every macro parameter is parenthesized at its point of use so that
   expression arguments (e.g. "n - k", "inc + 0", "g + 1") expand correctly.
   Arguments may still be evaluated more than once; pass expressions without
   side effects.
*/
#define MAC_Apply_G_mx2_asd( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{ \
    int               n_iter16 = ( m_A ) / ( 2 * 8 ); \
    int               n_left16 = ( m_A ) % ( 2 * 8 ); \
    int               n_iter2  = n_left16 / ( 2 * 1 ); \
    int               n_left   = n_left16 % ( 2 * 1 ); \
    int               i; \
\
    /* Pointer advance per 2-double vector. */ \
    const int         step_a1 = ( inc_a1 ) * 2; \
    const int         step_a2 = ( inc_a2 ) * 2; \
\
    double* restrict  alpha1 = ( a1 ); \
    double* restrict  alpha2 = ( a2 ); \
\
    v2df_t            a1v, a2v; \
    v2df_t            g12v, s12v; \
    v2df_t            t1v; \
\
    /* Broadcast each scalar into both lanes. _mm_load1_pd yields the same */ \
    /* value as _mm_loaddup_pd but only needs SSE2 rather than SSE3.       */ \
    g12v.v = _mm_load1_pd( ( gamma12 ) ); \
    s12v.v = _mm_load1_pd( ( sigma12 ) ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
        /* Unrolled step 1 of 8. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 2 of 8. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 3 of 8. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 4 of 8. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 5 of 8. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 6 of 8. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 7 of 8. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 8 of 8. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    /* Remaining whole vectors (fewer than 8 of them). */ \
    for ( i = 0; i < n_iter2; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    /* Odd m_A: one last element, handled in scalar code. */ \
    if ( n_left == 1 ) \
    { \
        double ga12 = *( gamma12 ); \
        double si12 = *( sigma12 ); \
        double temp1; \
        double temp2; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
    } \
}
353
/*
   MAC_Apply_G_mx2_asc

   Single-precision complex SSE kernel. Applies the plane rotation given by
   the real scalars *gamma12 and *sigma12 to two scomplex vectors, in place.
   Because both scalars are real, the real and imaginary parts are updated
   by the same formula:

       a1[k] := gamma12 * a1[k] + sigma12 * a2[k]
       a2[k] := gamma12 * a2[k] - sigma12 * a1[k]     (a1[k] = old value)

   Parameters
     m_A              number of scomplex elements in each vector.
     gamma12, sigma12 pointers to the float rotation scalars.
     a1, a2           pointers to the scomplex vectors; updated in place.
                      They are accessed through restrict-qualified pointers,
                      so they must not overlap, and through _mm_load_ps /
                      _mm_store_ps, which require 16-byte alignment.
     inc_a1, inc_a2   element increments. Each vector step loads 2
                      consecutive scomplex values (4 floats), so only unit
                      increments give a meaningful result (presumably all
                      callers pass 1 -- confirm).

   Work is split into an 8x hand-unrolled loop (16 scomplex per trip), a
   loop over the remaining whole 2-element vectors, and one scalar element
   if m_A is odd.

   Every macro parameter is parenthesized at its point of use so that
   expression arguments (e.g. "n - k", "inc + 0", "g + 1") expand correctly.
   Arguments may still be evaluated more than once; pass expressions without
   side effects.
*/
#define MAC_Apply_G_mx2_asc( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{ \
    int                 n_iter16 = ( m_A ) / ( 2 * 8 ); \
    int                 n_left16 = ( m_A ) % ( 2 * 8 ); \
    int                 n_iter2  = n_left16 / ( 2 * 1 ); \
    int                 n_left   = n_left16 % ( 2 * 1 ); \
    int                 i; \
\
    /* Pointer advance per vector of 2 scomplex elements. */ \
    const int           step_a1 = ( inc_a1 ) * 2; \
    const int           step_a2 = ( inc_a2 ) * 2; \
\
    scomplex* restrict  alpha1 = ( a1 ); \
    scomplex* restrict  alpha2 = ( a2 ); \
\
    v4sf_t              a1v, a2v; \
    v4sf_t              g12v, s12v; \
    v4sf_t              t1v; \
\
    /* Broadcast the two real scalars into all four lanes. */ \
    g12v.v = _mm_load1_ps( ( gamma12 ) ); \
    s12v.v = _mm_load1_ps( ( sigma12 ) ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
        /* Unrolled step 1 of 8. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 2 of 8. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 3 of 8. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 4 of 8. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 5 of 8. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 6 of 8. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 7 of 8. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 8 of 8. */ \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    /* Remaining whole vectors (fewer than 8 of them). */ \
    for ( i = 0; i < n_iter2; ++i ) \
    { \
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    /* Odd m_A: one last complex element, handled in scalar code. */ \
    if ( n_left == 1 ) \
    { \
        float    ga12 = *( gamma12 ); \
        float    si12 = *( sigma12 ); \
        scomplex temp1; \
        scomplex temp2; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        alpha1->real = temp1.real * ga12 + temp2.real * si12; \
        alpha2->real = temp2.real * ga12 - temp1.real * si12; \
\
        alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
        alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
    } \
}
521
/*
   MAC_Apply_G_mx2_asz

   Double-precision complex SSE kernel. Applies the plane rotation given by
   the real scalars *gamma12 and *sigma12 to two dcomplex vectors, in place.
   Because both scalars are real, the real and imaginary parts are updated
   by the same formula:

       a1[k] := gamma12 * a1[k] + sigma12 * a2[k]
       a2[k] := gamma12 * a2[k] - sigma12 * a1[k]     (a1[k] = old value)

   Parameters
     m_A              number of dcomplex elements in each vector.
     gamma12, sigma12 pointers to the double rotation scalars.
     a1, a2           pointers to the dcomplex vectors; updated in place.
                      They are accessed through restrict-qualified pointers,
                      so they must not overlap, and through _mm_load_pd /
                      _mm_store_pd, which require 16-byte alignment.
     inc_a1, inc_a2   element increments; the pointers advance by this many
                      dcomplex elements after each element is processed.

   One dcomplex (two doubles) fills one SSE register, so every element is a
   single vector operation: an 8x hand-unrolled loop followed by a loop over
   the remaining m_A % 8 elements.

   Every macro parameter is parenthesized at its point of use so that
   expression arguments (e.g. "n - k", "inc + 0", "g + 1") expand correctly.
   Arguments may still be evaluated more than once; pass expressions without
   side effects.
*/
#define MAC_Apply_G_mx2_asz( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{ \
    int                 n_iter = ( m_A ) / 8; \
    int                 n_left = ( m_A ) % 8; \
    int                 i; \
\
    /* Pointer advance per dcomplex element. */ \
    const int           step_a1 = ( inc_a1 ) * 1; \
    const int           step_a2 = ( inc_a2 ) * 1; \
\
    dcomplex* restrict  alpha1 = ( a1 ); \
    dcomplex* restrict  alpha2 = ( a2 ); \
\
    v2df_t              a1v, a2v; \
    v2df_t              g12v, s12v; \
    v2df_t              t1v; \
\
    /* Broadcast each scalar into both lanes. _mm_load1_pd yields the same */ \
    /* value as _mm_loaddup_pd but only needs SSE2 rather than SSE3.       */ \
    g12v.v = _mm_load1_pd( ( gamma12 ) ); \
    s12v.v = _mm_load1_pd( ( sigma12 ) ); \
\
    for ( i = 0; i < n_iter; ++i ) \
    { \
        /* Unrolled step 1 of 8. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 2 of 8. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 3 of 8. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 4 of 8. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 5 of 8. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 6 of 8. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 7 of 8. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        /* Unrolled step 8 of 8. */ \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    /* Remaining elements (fewer than 8), one vector operation each. */ \
    for ( i = 0; i < n_left; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
}
669
670#endif