libflame revision_anchor
FLA_Apply_G_mx4s_asm.h
Go to the documentation of this file.
1/*
2
3 Copyright (C) 2014, The University of Texas at Austin
4
5 This file is part of libflame and is available under the 3-Clause
6 BSD license, which can be found in the LICENSE file at the top-level
7 directory, or at http://opensource.org/licenses/BSD-3-Clause
8
9*/
10
11
12#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS
13
/* When FLA_VECTOR_INTRINSIC_TYPE is FLA_NO_INTRINSICS, each
   MAC_Apply_G_mx4s_as* name is only an alias: it expands to the
   MAC_Apply_G_mx4s_op* macro that carries the same trailing type letter
   (s, d, c, z). The op* macros are not defined in the visible part of
   this file, so they must be provided elsewhere before any use. */
14#define MAC_Apply_G_mx4s_ass MAC_Apply_G_mx4s_ops
15#define MAC_Apply_G_mx4s_asd MAC_Apply_G_mx4s_opd
16#define MAC_Apply_G_mx4s_asc MAC_Apply_G_mx4s_opc
17#define MAC_Apply_G_mx4s_asz MAC_Apply_G_mx4s_opz
18
19#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS
20
/*
 * MAC_Apply_G_mx4s_ass -- single-precision real SSE kernel.
 *
 * Applies four plane rotations, row by row, to the four columns a1..a4,
 * each holding m_A rows. For every row the rotations are applied in this
 * order (x, y are the two column entries, g/s the gamma/sigma pair):
 *
 *     x' = g * x + s * y
 *     y' = g * y - s * x
 *
 *   1. columns (a2, a3) with gamma23_k1 / sigma23_k1
 *   2. columns (a3, a4) with gamma34_k1 / sigma34_k1
 *   3. columns (a1, a2) with gamma12_k2 / sigma12_k2
 *   4. columns (a2, a3) with gamma23_k2 / sigma23_k2
 *
 * Parameters
 *   m_A                 number of rows to process (int expression).
 *   gammaNN_kK,
 *   sigmaNN_kK          pointers to the float rotation coefficients.
 *   a1..a4              pointers to the first float of each column; they
 *                       are bound to restrict-qualified locals, so the
 *                       columns must not overlap.
 *   inc_a1..inc_a4      column strides. They are only used to compute the
 *                       pointer advance after each 4-float vector
 *                       (inc * 4); every vector access touches 4
 *                       contiguous floats and the scalar tail advances by
 *                       1, so unit stride is presumably what callers
 *                       pass -- confirm before using any other value.
 *
 * Structure
 *   - main loop: 32 rows per iteration, unrolled into 8 groups of one
 *     4-float vector. Two register sets (a*v / b*v) alternate so that the
 *     loads of columns 2 and 3 for the next group are issued before the
 *     current group's results are stored;
 *   - second loop: one 4-float vector per iteration;
 *   - tail loop: the remaining (m_A % 4) rows in scalar code.
 *
 * Notes
 *   - _mm_load_ps / _mm_store_ps are the aligned load/store forms, so each
 *     column pointer has to satisfy their alignment requirement.
 *   - The macro expands to a bare { ... } block and mentions several of
 *     its arguments more than once; pass side-effect-free expressions.
 *   - Every look-ahead load of column 2 advances by step_a2. (The 4th
 *     group used to add step_a3 here, which read the wrong rows whenever
 *     inc_a2 differed from inc_a3.)
 */
#define MAC_Apply_G_mx4s_ass( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
  /* Row bookkeeping: 32-row blocks, then 4-row vectors, then scalars. */ \
  int n_iter32 = ( m_A ) / ( 4 * 8 ); \
  int n_left32 = ( m_A ) % ( 4 * 8 ); \
  int n_iter4  = n_left32 / ( 4 * 1 ); \
  int n_left   = n_left32 % ( 4 * 1 ); \
  int i; \
\
  /* Pointer advance (in floats) after one 4-float vector per column. */ \
  const int step_a1 = ( inc_a1 ) * 4; \
  const int step_a2 = ( inc_a2 ) * 4; \
  const int step_a3 = ( inc_a3 ) * 4; \
  const int step_a4 = ( inc_a4 ) * 4; \
\
  float* restrict alpha1 = ( a1 ); \
  float* restrict alpha2 = ( a2 ); \
  float* restrict alpha3 = ( a3 ); \
  float* restrict alpha4 = ( a4 ); \
\
  v4sf_t a1v, a2v, a3v, a4v; \
  v4sf_t b1v, b2v, b3v, b4v; \
  v4sf_t g23_k1v, s23_k1v; \
  v4sf_t g34_k1v, s34_k1v; \
  v4sf_t g12_k2v, s12_k2v; \
  v4sf_t g23_k2v, s23_k2v; \
  v4sf_t t1v, t2v, t3v; \
\
  /* Broadcast every rotation coefficient to all four vector lanes. */ \
  g23_k1v.v = _mm_load1_ps( gamma23_k1 ); \
  s23_k1v.v = _mm_load1_ps( sigma23_k1 ); \
  g34_k1v.v = _mm_load1_ps( gamma34_k1 ); \
  s34_k1v.v = _mm_load1_ps( sigma34_k1 ); \
  g12_k2v.v = _mm_load1_ps( gamma12_k2 ); \
  s12_k2v.v = _mm_load1_ps( sigma12_k2 ); \
  g23_k2v.v = _mm_load1_ps( gamma23_k2 ); \
  s23_k2v.v = _mm_load1_ps( sigma23_k2 ); \
\
  for ( i = 0; i < n_iter32; ++i ) \
  { \
\
    /* ---- group 1 of 8 (a-registers; columns 2..4 loaded here) ---- */ \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    /* Look-ahead: column 2 of the next group. */ \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    /* Look-ahead: column 3 of the next group (a3v is stored later). */ \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ---- group 2 of 8 (b-registers) ---- */ \
\
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    /* Deferred store of the previous group's column 3. */ \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ---- group 3 of 8 (a-registers) ---- */ \
\
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ---- group 4 of 8 (b-registers) ---- */ \
\
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    /* Column 2 must advance by its own step (step_a2, not step_a3). */ \
    a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ---- group 5 of 8 (a-registers) ---- */ \
\
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ---- group 6 of 8 (b-registers) ---- */ \
\
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ---- group 7 of 8 (a-registers) ---- */ \
\
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ---- group 8 of 8 (b-registers; no look-ahead loads) ---- */ \
\
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
\
    /* Last group: column 3 is stored immediately, nothing is pending. */ \
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
\
  } \
\
  /* Leftover full vectors: one 4-row group per iteration. */ \
  for ( i = 0; i < n_iter4; ++i ) \
  { \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
  } \
\
  /* Scalar tail: the last (m_A % 4) rows, one row per iteration. */ \
  for ( i = 0; i < n_left; ++i ) \
  { \
    float ga23_k1 = *( gamma23_k1 ); \
    float si23_k1 = *( sigma23_k1 ); \
    float ga34_k1 = *( gamma34_k1 ); \
    float si34_k1 = *( sigma34_k1 ); \
    float ga12_k2 = *( gamma12_k2 ); \
    float si12_k2 = *( sigma12_k2 ); \
    float ga23_k2 = *( gamma23_k2 ); \
    float si23_k2 = *( sigma23_k2 ); \
    float temp1; \
    float temp2; \
    float temp3; \
    float temp4; \
\
    temp2 = *alpha2; \
    temp3 = *alpha3; \
\
    *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
    *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
\
    temp3 = *alpha3; \
    temp4 = *alpha4; \
\
    *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
    *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
\
    temp1 = *alpha1; \
    temp2 = *alpha2; \
\
    *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
    *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
\
    temp2 = *alpha2; \
    temp3 = *alpha3; \
\
    *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
    *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
\
    alpha1 += 1; \
    alpha2 += 1; \
    alpha3 += 1; \
    alpha4 += 1; \
  } \
}
436
/*
 * MAC_Apply_G_mx4s_asd -- double-precision real SSE kernel.
 *
 * Applies four plane rotations, row by row, to the four columns a1..a4,
 * each holding m_A rows. For every row the rotations are applied in this
 * order (x, y are the two column entries, g/s the gamma/sigma pair):
 *
 *     x' = g * x + s * y
 *     y' = g * y - s * x
 *
 *   1. columns (a2, a3) with gamma23_k1 / sigma23_k1
 *   2. columns (a3, a4) with gamma34_k1 / sigma34_k1
 *   3. columns (a1, a2) with gamma12_k2 / sigma12_k2
 *   4. columns (a2, a3) with gamma23_k2 / sigma23_k2
 *
 * Parameters
 *   m_A                 number of rows to process (int expression).
 *   gammaNN_kK,
 *   sigmaNN_kK          pointers to the double rotation coefficients.
 *   a1..a4              pointers to the first double of each column; they
 *                       are bound to restrict-qualified locals, so the
 *                       columns must not overlap.
 *   inc_a1..inc_a4      column strides. They are only used to compute the
 *                       pointer advance after each 2-double vector
 *                       (inc * 2); every vector access touches 2
 *                       contiguous doubles, so unit stride is presumably
 *                       what callers pass -- confirm before using any
 *                       other value.
 *
 * Structure
 *   - main loop: 16 rows per iteration, unrolled into 8 groups of one
 *     2-double vector. Two register sets (a*v / b*v) alternate so that the
 *     loads of columns 2 and 3 for the next group are issued before the
 *     current group's results are stored;
 *   - second loop: one 2-double vector per iteration;
 *   - tail: at most one remaining row (m_A odd), handled in scalar code.
 *
 * Notes
 *   - _mm_load_pd / _mm_store_pd are the aligned load/store forms, so each
 *     column pointer has to satisfy their alignment requirement.
 *   - The macro expands to a bare { ... } block and mentions several of
 *     its arguments more than once; pass side-effect-free expressions.
 *   - Every look-ahead load of column 2 advances by step_a2. (The 4th
 *     group used to add step_a3 here, which read the wrong rows whenever
 *     inc_a2 differed from inc_a3.)
 */
#define MAC_Apply_G_mx4s_asd( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
  /* Row bookkeeping: 16-row blocks, then 2-row vectors, then one scalar. */ \
  int n_iter16 = ( m_A ) / ( 2 * 8 ); \
  int n_left16 = ( m_A ) % ( 2 * 8 ); \
  int n_iter2  = n_left16 / ( 2 * 1 ); \
  int n_left   = n_left16 % ( 2 * 1 ); \
  int i; \
\
  /* Pointer advance (in doubles) after one 2-double vector per column. */ \
  const int step_a1 = ( inc_a1 ) * 2; \
  const int step_a2 = ( inc_a2 ) * 2; \
  const int step_a3 = ( inc_a3 ) * 2; \
  const int step_a4 = ( inc_a4 ) * 2; \
\
  double* restrict alpha1 = ( a1 ); \
  double* restrict alpha2 = ( a2 ); \
  double* restrict alpha3 = ( a3 ); \
  double* restrict alpha4 = ( a4 ); \
\
  v2df_t a1v, a2v, a3v, a4v; \
  v2df_t b1v, b2v, b3v, b4v; \
  v2df_t g23_k1v, s23_k1v; \
  v2df_t g34_k1v, s34_k1v; \
  v2df_t g12_k2v, s12_k2v; \
  v2df_t g23_k2v, s23_k2v; \
  v2df_t t1v, t2v, t3v; \
\
  /* Broadcast every rotation coefficient to both vector lanes. */ \
  g23_k1v.v = _mm_loaddup_pd( gamma23_k1 ); \
  s23_k1v.v = _mm_loaddup_pd( sigma23_k1 ); \
  g34_k1v.v = _mm_loaddup_pd( gamma34_k1 ); \
  s34_k1v.v = _mm_loaddup_pd( sigma34_k1 ); \
  g12_k2v.v = _mm_loaddup_pd( gamma12_k2 ); \
  s12_k2v.v = _mm_loaddup_pd( sigma12_k2 ); \
  g23_k2v.v = _mm_loaddup_pd( gamma23_k2 ); \
  s23_k2v.v = _mm_loaddup_pd( sigma23_k2 ); \
\
  for ( i = 0; i < n_iter16; ++i ) \
  { \
\
    /* ---- group 1 of 8 (a-registers; columns 2..4 loaded here) ---- */ \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    /* Look-ahead: column 2 of the next group. */ \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    /* Look-ahead: column 3 of the next group (a3v is stored later). */ \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ---- group 2 of 8 (b-registers) ---- */ \
\
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    /* Deferred store of the previous group's column 3. */ \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ---- group 3 of 8 (a-registers) ---- */ \
\
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ---- group 4 of 8 (b-registers) ---- */ \
\
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    /* Column 2 must advance by its own step (step_a2, not step_a3). */ \
    a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ---- group 5 of 8 (a-registers) ---- */ \
\
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ---- group 6 of 8 (b-registers) ---- */ \
\
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ---- group 7 of 8 (a-registers) ---- */ \
\
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ---- group 8 of 8 (b-registers; no look-ahead loads) ---- */ \
\
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
\
    /* Last group: column 3 is stored immediately, nothing is pending. */ \
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
\
  } \
\
  /* Leftover full vectors: one 2-row group per iteration. */ \
  for ( i = 0; i < n_iter2; ++i ) \
  { \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
  } \
\
  /* Scalar tail: n_left is m_A % 2, so at most one row remains and the */ \
  /* column pointers do not need to be advanced afterwards. */ \
  if ( n_left == 1 ) \
  { \
    double ga23_k1 = *( gamma23_k1 ); \
    double si23_k1 = *( sigma23_k1 ); \
    double ga34_k1 = *( gamma34_k1 ); \
    double si34_k1 = *( sigma34_k1 ); \
    double ga12_k2 = *( gamma12_k2 ); \
    double si12_k2 = *( sigma12_k2 ); \
    double ga23_k2 = *( gamma23_k2 ); \
    double si23_k2 = *( sigma23_k2 ); \
    double temp1; \
    double temp2; \
    double temp3; \
    double temp4; \
\
    temp2 = *alpha2; \
    temp3 = *alpha3; \
\
    *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
    *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
\
    temp3 = *alpha3; \
    temp4 = *alpha4; \
\
    *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
    *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
\
    temp1 = *alpha1; \
    temp2 = *alpha2; \
\
    *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
    *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
\
    temp2 = *alpha2; \
    temp3 = *alpha3; \
\
    *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
    *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
\
  } \
}
848
849#define MAC_Apply_G_mx4s_asc( m_A, \
850 gamma23_k1, \
851 sigma23_k1, \
852 gamma34_k1, \
853 sigma34_k1, \
854 gamma12_k2, \
855 sigma12_k2, \
856 gamma23_k2, \
857 sigma23_k2, \
858 a1, inc_a1, \
859 a2, inc_a2, \
860 a3, inc_a3, \
861 a4, inc_a4 ) \
862{\
863 int n_iter16 = m_A / ( 2 * 8 ); \
864 int n_left16 = m_A % ( 2 * 8 ); \
865 int n_iter2 = n_left16 / ( 2 * 1 ); \
866 int n_left = n_left16 % ( 2 * 1 ); \
867 int i; \
868\
869 const int step_a1 = inc_a1 * 2; \
870 const int step_a2 = inc_a2 * 2; \
871 const int step_a3 = inc_a3 * 2; \
872 const int step_a4 = inc_a4 * 2; \
873\
874 scomplex* restrict alpha1 = a1; \
875 scomplex* restrict alpha2 = a2; \
876 scomplex* restrict alpha3 = a3; \
877 scomplex* restrict alpha4 = a4; \
878\
879 v4sf_t a1v, a2v, a3v, a4v; \
880 v4sf_t b1v, b2v, b3v, b4v; \
881 v4sf_t g23_k1v, s23_k1v; \
882 v4sf_t g34_k1v, s34_k1v; \
883 v4sf_t g12_k2v, s12_k2v; \
884 v4sf_t g23_k2v, s23_k2v; \
885 v4sf_t t1v, t2v, t3v; \
886\
887 g23_k1v.v = _mm_load1_ps( gamma23_k1 ); \
888 s23_k1v.v = _mm_load1_ps( sigma23_k1 ); \
889 g34_k1v.v = _mm_load1_ps( gamma34_k1 ); \
890 s34_k1v.v = _mm_load1_ps( sigma34_k1 ); \
891 g12_k2v.v = _mm_load1_ps( gamma12_k2 ); \
892 s12_k2v.v = _mm_load1_ps( sigma12_k2 ); \
893 g23_k2v.v = _mm_load1_ps( gamma23_k2 ); \
894 s23_k2v.v = _mm_load1_ps( sigma23_k2 ); \
895\
896 for ( i = 0; i < n_iter16; ++i ) \
897 { \
898\
899 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
900 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
901 a4v.v = _mm_load_ps( ( float* )alpha4 ); \
902\
903 t2v.v = a2v.v; \
904 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
905 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
906\
907 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
908\
909 t3v.v = a3v.v; \
910 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
911 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
912\
913 _mm_store_ps( ( float* )alpha4, a4v.v ); \
914 alpha4 += step_a4; \
915\
916 t1v.v = a1v.v; \
917 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
918 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
919\
920 _mm_store_ps( ( float* )alpha1, a1v.v ); \
921 alpha1 += step_a1; \
922 b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
923\
924 t2v.v = a2v.v; \
925 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
926 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
927\
928 _mm_store_ps( ( float* )alpha2, a2v.v ); \
929 alpha2 += step_a2; \
930 b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
931\
932/* ----------------------------------------------------------- */ \
933\
934 b4v.v = _mm_load_ps( ( float* )alpha4 ); \
935\
936 t2v.v = b2v.v; \
937 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
938 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
939\
940 _mm_store_ps( ( float* )alpha3, a3v.v ); \
941 alpha3 += step_a3; \
942 b1v.v = _mm_load_ps( ( float* )alpha1 ); \
943\
944 t3v.v = b3v.v; \
945 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
946 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
947\
948 _mm_store_ps( ( float* )alpha4, b4v.v ); \
949 alpha4 += step_a4; \
950\
951 t1v.v = b1v.v; \
952 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
953 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
954\
955 _mm_store_ps( ( float* )alpha1, b1v.v ); \
956 alpha1 += step_a1; \
957 a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
958\
959 t2v.v = b2v.v; \
960 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
961 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
962\
963 _mm_store_ps( ( float* )alpha2, b2v.v ); \
964 alpha2 += step_a2; \
965 a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
966\
967/* ----------------------------------------------------------- */ \
968\
969 a4v.v = _mm_load_ps( ( float* )alpha4 ); \
970\
971 t2v.v = a2v.v; \
972 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
973 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
974\
975 _mm_store_ps( ( float* )alpha3, b3v.v ); \
976 alpha3 += step_a3; \
977 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
978\
979 t3v.v = a3v.v; \
980 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
981 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
982\
983 _mm_store_ps( ( float* )alpha4, a4v.v ); \
984 alpha4 += step_a4; \
985\
986 t1v.v = a1v.v; \
987 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
988 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
989\
990 _mm_store_ps( ( float* )alpha1, a1v.v ); \
991 alpha1 += step_a1; \
992 b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
993\
994 t2v.v = a2v.v; \
995 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
996 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
997\
998 _mm_store_ps( ( float* )alpha2, a2v.v ); \
999 alpha2 += step_a2; \
1000 b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
1001\
1002/* ----------------------------------------------------------- */ \
1003\
1004 b4v.v = _mm_load_ps( ( float* )alpha4 ); \
1005\
1006 t2v.v = b2v.v; \
1007 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
1008 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
1009\
1010 _mm_store_ps( ( float* )alpha3, a3v.v ); \
1011 alpha3 += step_a3; \
1012 b1v.v = _mm_load_ps( ( float* )alpha1 ); \
1013\
1014 t3v.v = b3v.v; \
1015 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
1016 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
1017\
1018 _mm_store_ps( ( float* )alpha4, b4v.v ); \
1019 alpha4 += step_a4; \
1020\
1021 t1v.v = b1v.v; \
1022 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
1023 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
1024\
1025 _mm_store_ps( ( float* )alpha1, b1v.v ); \
1026 alpha1 += step_a1; \
1027 a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a3) ); \
1028\
1029 t2v.v = b2v.v; \
1030 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
1031 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
1032\
1033 _mm_store_ps( ( float* )alpha2, b2v.v ); \
1034 alpha2 += step_a2; \
1035 a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
1036\
1037\
1038/* ----------------------------------------------------------- */ \
1039\
1040 a4v.v = _mm_load_ps( ( float* )alpha4 ); \
1041\
1042 t2v.v = a2v.v; \
1043 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
1044 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
1045\
1046 _mm_store_ps( ( float* )alpha3, b3v.v ); \
1047 alpha3 += step_a3; \
1048 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
1049\
1050 t3v.v = a3v.v; \
1051 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
1052 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
1053\
1054 _mm_store_ps( ( float* )alpha4, a4v.v ); \
1055 alpha4 += step_a4; \
1056\
1057 t1v.v = a1v.v; \
1058 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
1059 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
1060\
1061 _mm_store_ps( ( float* )alpha1, a1v.v ); \
1062 alpha1 += step_a1; \
1063 b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
1064\
1065 t2v.v = a2v.v; \
1066 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
1067 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
1068\
1069 _mm_store_ps( ( float* )alpha2, a2v.v ); \
1070 alpha2 += step_a2; \
1071 b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
1072\
1073/* ----------------------------------------------------------- */ \
1074\
1075 b4v.v = _mm_load_ps( ( float* )alpha4 ); \
1076\
1077 t2v.v = b2v.v; \
1078 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
1079 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
1080\
1081 _mm_store_ps( ( float* )alpha3, a3v.v ); \
1082 alpha3 += step_a3; \
1083 b1v.v = _mm_load_ps( ( float* )alpha1 ); \
1084\
1085 t3v.v = b3v.v; \
1086 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
1087 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
1088\
1089 _mm_store_ps( ( float* )alpha4, b4v.v ); \
1090 alpha4 += step_a4; \
1091\
1092 t1v.v = b1v.v; \
1093 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
1094 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
1095\
1096 _mm_store_ps( ( float* )alpha1, b1v.v ); \
1097 alpha1 += step_a1; \
1098 a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
1099\
1100 t2v.v = b2v.v; \
1101 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
1102 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
1103\
1104 _mm_store_ps( ( float* )alpha2, b2v.v ); \
1105 alpha2 += step_a2; \
1106 a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
1107\
1108/* ----------------------------------------------------------- */ \
1109\
1110 a4v.v = _mm_load_ps( ( float* )alpha4 ); \
1111\
1112 t2v.v = a2v.v; \
1113 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
1114 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
1115\
1116 _mm_store_ps( ( float* )alpha3, b3v.v ); \
1117 alpha3 += step_a3; \
1118 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
1119\
1120 t3v.v = a3v.v; \
1121 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
1122 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
1123\
1124 _mm_store_ps( ( float* )alpha4, a4v.v ); \
1125 alpha4 += step_a4; \
1126\
1127 t1v.v = a1v.v; \
1128 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
1129 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
1130\
1131 _mm_store_ps( ( float* )alpha1, a1v.v ); \
1132 alpha1 += step_a1; \
1133 b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
1134\
1135 t2v.v = a2v.v; \
1136 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
1137 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
1138\
1139 _mm_store_ps( ( float* )alpha2, a2v.v ); \
1140 alpha2 += step_a2; \
1141 b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
1142\
1143/* ----------------------------------------------------------- */ \
1144\
1145 b4v.v = _mm_load_ps( ( float* )alpha4 ); \
1146\
1147 t2v.v = b2v.v; \
1148 b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
1149 b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
1150\
1151 _mm_store_ps( ( float* )alpha3, a3v.v ); \
1152 alpha3 += step_a3; \
1153 b1v.v = _mm_load_ps( ( float* )alpha1 ); \
1154\
1155 t3v.v = b3v.v; \
1156 b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
1157 b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
1158\
1159 _mm_store_ps( ( float* )alpha4, b4v.v ); \
1160 alpha4 += step_a4; \
1161\
1162 t1v.v = b1v.v; \
1163 b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
1164 b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
1165\
1166 _mm_store_ps( ( float* )alpha1, b1v.v ); \
1167 alpha1 += step_a1; \
1168\
1169 t2v.v = b2v.v; \
1170 b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
1171 b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
1172\
1173 _mm_store_ps( ( float* )alpha2, b2v.v ); \
1174 alpha2 += step_a2; \
1175\
1176 _mm_store_ps( ( float* )alpha3, b3v.v ); \
1177 alpha3 += step_a3; \
1178\
1179/* ----------------------------------------------------------- */ \
1180 } \
1181\
1182 for ( i = 0; i < n_iter2; ++i ) \
1183 { \
1184\
1185 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
1186 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
1187 a4v.v = _mm_load_ps( ( float* )alpha4 ); \
1188\
1189 t2v.v = a2v.v; \
1190 a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
1191 a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
1192\
1193 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
1194\
1195 t3v.v = a3v.v; \
1196 a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
1197 a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
1198\
1199 _mm_store_ps( ( float* )alpha4, a4v.v ); \
1200 alpha4 += step_a4; \
1201\
1202 t1v.v = a1v.v; \
1203 a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
1204 a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
1205\
1206 _mm_store_ps( ( float* )alpha1, a1v.v ); \
1207 alpha1 += step_a1; \
1208\
1209 t2v.v = a2v.v; \
1210 a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
1211 a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
1212\
1213 _mm_store_ps( ( float* )alpha2, a2v.v ); \
1214 alpha2 += step_a2; \
1215 _mm_store_ps( ( float* )alpha3, a3v.v ); \
1216 alpha3 += step_a3; \
1217 } \
1218\
1219 if ( n_left == 1 ) \
1220 { \
1221 float ga23_k1 = *gamma23_k1; \
1222 float si23_k1 = *sigma23_k1; \
1223 float ga34_k1 = *gamma34_k1; \
1224 float si34_k1 = *sigma34_k1; \
1225 float ga12_k2 = *gamma12_k2; \
1226 float si12_k2 = *sigma12_k2; \
1227 float ga23_k2 = *gamma23_k2; \
1228 float si23_k2 = *sigma23_k2; \
1229 scomplex temp1; \
1230 scomplex temp2; \
1231 scomplex temp3; \
1232 scomplex temp4; \
1233\
1234 temp2 = *alpha2; \
1235 temp3 = *alpha3; \
1236\
1237 alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
1238 alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
1239\
1240 alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
1241 alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
1242\
1243 temp3 = *alpha3; \
1244 temp4 = *alpha4; \
1245\
1246 alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
1247 alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
1248\
1249 alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
1250 alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
1251\
1252 temp1 = *alpha1; \
1253 temp2 = *alpha2; \
1254\
1255 alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
1256 alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
1257\
1258 alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
1259 alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
1260\
1261 temp2 = *alpha2; \
1262 temp3 = *alpha3; \
1263\
1264 alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
1265 alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
1266\
1267 alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
1268 alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \
1269\
1270 } \
1271}
1272
/*
   MAC_Apply_G_mx4s_asz

   Applies four plane rotations, fused into one pass, to m_A consecutive
   elements of four dcomplex columns a1..a4. For every element index the
   rotations are applied in this order:

     1. ( a2, a3 ) with ( gamma23_k1, sigma23_k1 )
     2. ( a3, a4 ) with ( gamma34_k1, sigma34_k1 )
     3. ( a1, a2 ) with ( gamma12_k2, sigma12_k2 )
     4. ( a2, a3 ) with ( gamma23_k2, sigma23_k2 )

   where rotating ( x, y ) with ( gamma, sigma ) computes

     x' = x * gamma + y * sigma
     y' = y * gamma - x * sigma

   Each gamma/sigma is read through a pointer to double and broadcast to
   both lanes of an SSE register, so the same real scalar multiplies the
   real and the imaginary part of every dcomplex element.

   Parameters:
     m_A                  number of elements to process in each column.
     gammaXY_kN, sigmaXY_kN
                          pointers to the double-precision rotation
                          coefficients (passed to _mm_loaddup_pd).
     a1..a4               pointers to the first dcomplex element of each
                          column; updated in place.
     inc_a1..inc_a4       distance, in dcomplex elements, between
                          consecutive elements of the respective column.

   Notes:
     - Elements are accessed with _mm_load_pd / _mm_store_pd, the aligned
       SSE2 forms, so every element address touched must be 16-byte
       aligned.
     - The columns are accessed through restrict-qualified pointers, so
       they must not overlap.
     - The main loop is unrolled eight times and software-pipelined:
       while element r is finished in one register set (a*v or b*v), the
       a2/a3 values of element r+1 are already loaded into the other set.
       The look-ahead loads use "alphaN + step_aN" because alphaN has not
       been advanced yet at that point. Statement order matters; do not
       reorder loads and stores.
     - The macro expands to a brace-enclosed block that declares locals
       (n_iter, n_left, i, step_a*, alpha*, *v); arguments must not rely
       on caller variables with those names.
*/
#define MAC_Apply_G_mx4s_asz( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{\
    /* Eight elements per main-loop trip; the rest go to the cleanup loop. */ \
    int                n_iter = ( m_A ) / 8; \
    int                n_left = ( m_A ) % 8; \
    int                i; \
\
    /* One dcomplex fills one 128-bit register, so a step is one element. */ \
    const int          step_a1 = ( inc_a1 ) * 1; \
    const int          step_a2 = ( inc_a2 ) * 1; \
    const int          step_a3 = ( inc_a3 ) * 1; \
    const int          step_a4 = ( inc_a4 ) * 1; \
\
    dcomplex* restrict alpha1 = ( a1 ); \
    dcomplex* restrict alpha2 = ( a2 ); \
    dcomplex* restrict alpha3 = ( a3 ); \
    dcomplex* restrict alpha4 = ( a4 ); \
\
    v2df_t             a1v, a2v, a3v, a4v; \
    v2df_t             b1v, b2v, b3v, b4v; \
    v2df_t             g23_k1v, s23_k1v; \
    v2df_t             g34_k1v, s34_k1v; \
    v2df_t             g12_k2v, s12_k2v; \
    v2df_t             g23_k2v, s23_k2v; \
    v2df_t             t1v, t2v, t3v; \
\
    /* Broadcast every rotation coefficient to both lanes. */ \
    g23_k1v.v = _mm_loaddup_pd( gamma23_k1 ); \
    s23_k1v.v = _mm_loaddup_pd( sigma23_k1 ); \
    g34_k1v.v = _mm_loaddup_pd( gamma34_k1 ); \
    s34_k1v.v = _mm_loaddup_pd( sigma34_k1 ); \
    g12_k2v.v = _mm_loaddup_pd( gamma12_k2 ); \
    s12_k2v.v = _mm_loaddup_pd( sigma12_k2 ); \
    g23_k2v.v = _mm_loaddup_pd( gamma23_k2 ); \
    s23_k2v.v = _mm_loaddup_pd( sigma23_k2 ); \
\
    for ( i = 0; i < n_iter; ++i ) \
    { \
\
/* ---- stage 1 of 8: registers a*v, look-ahead into b*v ------ */ \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
        a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t3v.v = a3v.v; \
        a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
        a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
        _mm_store_pd( ( double* )alpha4, a4v.v ); \
        alpha4 += step_a4; \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
        a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
        a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
/* ---- stage 2 of 8: registers b*v, look-ahead into a*v ------ */ \
\
        b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
        t2v.v = b2v.v; \
        b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
        b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
        /* Deferred store of the previous stage's finished a3. */ \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t3v.v = b3v.v; \
        b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
        b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
        _mm_store_pd( ( double* )alpha4, b4v.v ); \
        alpha4 += step_a4; \
\
        t1v.v = b1v.v; \
        b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
        b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
        _mm_store_pd( ( double* )alpha1, b1v.v ); \
        alpha1 += step_a1; \
        a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
        t2v.v = b2v.v; \
        b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
        b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
        _mm_store_pd( ( double* )alpha2, b2v.v ); \
        alpha2 += step_a2; \
        a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
/* ---- stage 3 of 8: registers a*v, look-ahead into b*v ------ */ \
\
        a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
        a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
        _mm_store_pd( ( double* )alpha3, b3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t3v.v = a3v.v; \
        a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
        a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
        _mm_store_pd( ( double* )alpha4, a4v.v ); \
        alpha4 += step_a4; \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
        a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
        a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
/* ---- stage 4 of 8: registers b*v, look-ahead into a*v ------ */ \
\
        b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
        t2v.v = b2v.v; \
        b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
        b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t3v.v = b3v.v; \
        b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
        b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
        _mm_store_pd( ( double* )alpha4, b4v.v ); \
        alpha4 += step_a4; \
\
        t1v.v = b1v.v; \
        b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
        b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
        _mm_store_pd( ( double* )alpha1, b1v.v ); \
        alpha1 += step_a1; \
        /* Column 2 must be advanced by its own stride (step_a2). */ \
        a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
        t2v.v = b2v.v; \
        b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
        b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
        _mm_store_pd( ( double* )alpha2, b2v.v ); \
        alpha2 += step_a2; \
        a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
/* ---- stage 5 of 8: registers a*v, look-ahead into b*v ------ */ \
\
        a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
        a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
        _mm_store_pd( ( double* )alpha3, b3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t3v.v = a3v.v; \
        a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
        a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
        _mm_store_pd( ( double* )alpha4, a4v.v ); \
        alpha4 += step_a4; \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
        a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
        a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
/* ---- stage 6 of 8: registers b*v, look-ahead into a*v ------ */ \
\
        b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
        t2v.v = b2v.v; \
        b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
        b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t3v.v = b3v.v; \
        b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
        b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
        _mm_store_pd( ( double* )alpha4, b4v.v ); \
        alpha4 += step_a4; \
\
        t1v.v = b1v.v; \
        b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
        b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
        _mm_store_pd( ( double* )alpha1, b1v.v ); \
        alpha1 += step_a1; \
        a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
        t2v.v = b2v.v; \
        b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
        b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
        _mm_store_pd( ( double* )alpha2, b2v.v ); \
        alpha2 += step_a2; \
        a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
/* ---- stage 7 of 8: registers a*v, look-ahead into b*v ------ */ \
\
        a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
        a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
        _mm_store_pd( ( double* )alpha3, b3v.v ); \
        alpha3 += step_a3; \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t3v.v = a3v.v; \
        a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
        a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
        _mm_store_pd( ( double* )alpha4, a4v.v ); \
        alpha4 += step_a4; \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
        a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
        a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
/* ---- stage 8 of 8: registers b*v, no look-ahead (drain) ---- */ \
\
        b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
        t2v.v = b2v.v; \
        b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
        b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
        b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t3v.v = b3v.v; \
        b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
        b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
        _mm_store_pd( ( double* )alpha4, b4v.v ); \
        alpha4 += step_a4; \
\
        t1v.v = b1v.v; \
        b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
        b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
        _mm_store_pd( ( double* )alpha1, b1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = b2v.v; \
        b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
        b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
        _mm_store_pd( ( double* )alpha2, b2v.v ); \
        alpha2 += step_a2; \
\
        /* Last element of the trip: a3 is stored right away. */ \
        _mm_store_pd( ( double* )alpha3, b3v.v ); \
        alpha3 += step_a3; \
\
/* ----------------------------------------------------------- */ \
    } \
\
    /* Cleanup: one element per trip, no pipelining. */ \
    for ( i = 0; i < n_left; ++i ) \
    { \
\
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
        a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
        t3v.v = a3v.v; \
        a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
        a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
        _mm_store_pd( ( double* )alpha4, a4v.v ); \
        alpha4 += step_a4; \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
        a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
        a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
}
1640
1641#endif