libflame revision_anchor
FLA_Apply_G_mx3b_asm.h
Go to the documentation of this file.
1/*
2
3 Copyright (C) 2014, The University of Texas at Austin
4
5 This file is part of libflame and is available under the 3-Clause
6 BSD license, which can be found in the LICENSE file at the top-level
7 directory, or at http://opensource.org/licenses/BSD-3-Clause
8
9*/
10
11
12#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS
13
14#define MAC_Apply_G_mx3b_ass MAC_Apply_G_mx3b_ops
15#define MAC_Apply_G_mx3b_asd MAC_Apply_G_mx3b_opd
16#define MAC_Apply_G_mx3b_asc MAC_Apply_G_mx3b_opc
17#define MAC_Apply_G_mx3b_asz MAC_Apply_G_mx3b_opz
18
19#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS
20
21#define MAC_Apply_G_mx3b_ass( m_A, \
22 gamma12, \
23 sigma12, \
24 gamma23, \
25 sigma23, \
26 a1, inc_a1, \
27 a2, inc_a2, \
28 a3, inc_a3 ) \
29{\
30 int n_iter32 = m_A / ( 4 * 8 ); \
31 int n_left32 = m_A % ( 4 * 8 ); \
32 int n_iter4 = n_left32 / ( 4 * 1 ); \
33 int n_left = n_left32 % ( 4 * 1 ); \
34 int i; \
35\
36 const int step_a1 = inc_a1 * 4; \
37 const int step_a2 = inc_a2 * 4; \
38 const int step_a3 = inc_a3 * 4; \
39\
40 float* restrict alpha1 = a1; \
41 float* restrict alpha2 = a2; \
42 float* restrict alpha3 = a3; \
43\
44 v4sf_t a1v, a2v, a3v; \
45 v4sf_t g12v, s12v; \
46 v4sf_t g23v, s23v; \
47 v4sf_t t1v, t2v; \
48\
49 g12v.v = _mm_load1_ps( gamma12 ); \
50 s12v.v = _mm_load1_ps( sigma12 ); \
51 g23v.v = _mm_load1_ps( gamma23 ); \
52 s23v.v = _mm_load1_ps( sigma23 ); \
53\
54 for ( i = 0; i < n_iter32; ++i ) \
55 { \
56\
57 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
58 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
59\
60 t2v.v = a2v.v; \
61 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
62 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
63\
64 _mm_store_ps( ( float* )alpha3, a3v.v ); \
65 alpha3 += step_a3; \
66 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
67\
68 t1v.v = a1v.v; \
69 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
70 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
71\
72 _mm_store_ps( ( float* )alpha1, a1v.v ); \
73 alpha1 += step_a1; \
74 _mm_store_ps( ( float* )alpha2, a2v.v ); \
75 alpha2 += step_a2; \
76\
77 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
78 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
79\
80 t2v.v = a2v.v; \
81 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
82 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
83\
84 _mm_store_ps( ( float* )alpha3, a3v.v ); \
85 alpha3 += step_a3; \
86 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
87\
88 t1v.v = a1v.v; \
89 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
90 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
91\
92 _mm_store_ps( ( float* )alpha1, a1v.v ); \
93 alpha1 += step_a1; \
94 _mm_store_ps( ( float* )alpha2, a2v.v ); \
95 alpha2 += step_a2; \
96\
97 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
98 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
99\
100 t2v.v = a2v.v; \
101 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
102 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
103\
104 _mm_store_ps( ( float* )alpha3, a3v.v ); \
105 alpha3 += step_a3; \
106 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
107\
108 t1v.v = a1v.v; \
109 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
110 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
111\
112 _mm_store_ps( ( float* )alpha1, a1v.v ); \
113 alpha1 += step_a1; \
114 _mm_store_ps( ( float* )alpha2, a2v.v ); \
115 alpha2 += step_a2; \
116\
117 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
118 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
119\
120 t2v.v = a2v.v; \
121 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
122 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
123\
124 _mm_store_ps( ( float* )alpha3, a3v.v ); \
125 alpha3 += step_a3; \
126 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
127\
128 t1v.v = a1v.v; \
129 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
130 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
131\
132 _mm_store_ps( ( float* )alpha1, a1v.v ); \
133 alpha1 += step_a1; \
134 _mm_store_ps( ( float* )alpha2, a2v.v ); \
135 alpha2 += step_a2; \
136\
137 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
138 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
139\
140 t2v.v = a2v.v; \
141 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
142 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
143\
144 _mm_store_ps( ( float* )alpha3, a3v.v ); \
145 alpha3 += step_a3; \
146 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
147\
148 t1v.v = a1v.v; \
149 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
150 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
151\
152 _mm_store_ps( ( float* )alpha1, a1v.v ); \
153 alpha1 += step_a1; \
154 _mm_store_ps( ( float* )alpha2, a2v.v ); \
155 alpha2 += step_a2; \
156\
157 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
158 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
159\
160 t2v.v = a2v.v; \
161 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
162 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
163\
164 _mm_store_ps( ( float* )alpha3, a3v.v ); \
165 alpha3 += step_a3; \
166 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
167\
168 t1v.v = a1v.v; \
169 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
170 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
171\
172 _mm_store_ps( ( float* )alpha1, a1v.v ); \
173 alpha1 += step_a1; \
174 _mm_store_ps( ( float* )alpha2, a2v.v ); \
175 alpha2 += step_a2; \
176\
177 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
178 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
179\
180 t2v.v = a2v.v; \
181 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
182 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
183\
184 _mm_store_ps( ( float* )alpha3, a3v.v ); \
185 alpha3 += step_a3; \
186 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
187\
188 t1v.v = a1v.v; \
189 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
190 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
191\
192 _mm_store_ps( ( float* )alpha1, a1v.v ); \
193 alpha1 += step_a1; \
194 _mm_store_ps( ( float* )alpha2, a2v.v ); \
195 alpha2 += step_a2; \
196\
197 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
198 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
199\
200 t2v.v = a2v.v; \
201 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
202 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
203\
204 _mm_store_ps( ( float* )alpha3, a3v.v ); \
205 alpha3 += step_a3; \
206 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
207\
208 t1v.v = a1v.v; \
209 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
210 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
211\
212 _mm_store_ps( ( float* )alpha1, a1v.v ); \
213 alpha1 += step_a1; \
214 _mm_store_ps( ( float* )alpha2, a2v.v ); \
215 alpha2 += step_a2; \
216 } \
217\
218 for ( i = 0; i < n_iter4; ++i ) \
219 { \
220\
221 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
222 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
223\
224 t2v.v = a2v.v; \
225 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
226 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
227\
228 _mm_store_ps( ( float* )alpha3, a3v.v ); \
229 alpha3 += step_a3; \
230 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
231\
232 t1v.v = a1v.v; \
233 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
234 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
235\
236 _mm_store_ps( ( float* )alpha1, a1v.v ); \
237 alpha1 += step_a1; \
238 _mm_store_ps( ( float* )alpha2, a2v.v ); \
239 alpha2 += step_a2; \
240 } \
241\
242 for ( i = 0; i < n_left; ++i ) \
243 { \
244 float ga12 = *gamma12; \
245 float si12 = *sigma12; \
246 float ga23 = *gamma23; \
247 float si23 = *sigma23; \
248 float temp1; \
249 float temp2; \
250 float temp3; \
251\
252 temp2 = *alpha2; \
253 temp3 = *alpha3; \
254\
255 *alpha2 = temp2 * ga23 + temp3 * si23; \
256 *alpha3 = temp3 * ga23 - temp2 * si23; \
257\
258 temp1 = *alpha1; \
259 temp2 = *alpha2; \
260\
261 *alpha1 = temp1 * ga12 + temp2 * si12; \
262 *alpha2 = temp2 * ga12 - temp1 * si12; \
263\
264 alpha1 += 1; \
265 alpha2 += 1; \
266 alpha3 += 1; \
267 } \
268}
269
270#define MAC_Apply_G_mx3b_asd( m_A, \
271 gamma12, \
272 sigma12, \
273 gamma23, \
274 sigma23, \
275 a1, inc_a1, \
276 a2, inc_a2, \
277 a3, inc_a3 ) \
278{\
279 int n_iter16 = m_A / ( 2 * 8 ); \
280 int n_left16 = m_A % ( 2 * 8 ); \
281 int n_iter2 = n_left16 / ( 2 * 1 ); \
282 int n_left = n_left16 % ( 2 * 1 ); \
283 int i; \
284\
285 const int step_a1 = inc_a1 * 2; \
286 const int step_a2 = inc_a2 * 2; \
287 const int step_a3 = inc_a3 * 2; \
288\
289 double* restrict alpha1 = a1; \
290 double* restrict alpha2 = a2; \
291 double* restrict alpha3 = a3; \
292\
293 v2df_t a1v, a2v, a3v; \
294 v2df_t g12v, s12v; \
295 v2df_t g23v, s23v; \
296 v2df_t t1v, t2v; \
297\
298 g12v.v = _mm_loaddup_pd( gamma12 ); \
299 s12v.v = _mm_loaddup_pd( sigma12 ); \
300 g23v.v = _mm_loaddup_pd( gamma23 ); \
301 s23v.v = _mm_loaddup_pd( sigma23 ); \
302\
303 for ( i = 0; i < n_iter16; ++i ) \
304 { \
305\
306 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
307 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
308\
309 t2v.v = a2v.v; \
310 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
311 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
312\
313 _mm_store_pd( ( double* )alpha3, a3v.v ); \
314 alpha3 += step_a3; \
315 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
316\
317 t1v.v = a1v.v; \
318 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
319 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
320\
321 _mm_store_pd( ( double* )alpha1, a1v.v ); \
322 alpha1 += step_a1; \
323 _mm_store_pd( ( double* )alpha2, a2v.v ); \
324 alpha2 += step_a2; \
325\
326 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
327 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
328\
329 t2v.v = a2v.v; \
330 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
331 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
332\
333 _mm_store_pd( ( double* )alpha3, a3v.v ); \
334 alpha3 += step_a3; \
335 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
336\
337 t1v.v = a1v.v; \
338 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
339 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
340\
341 _mm_store_pd( ( double* )alpha1, a1v.v ); \
342 alpha1 += step_a1; \
343 _mm_store_pd( ( double* )alpha2, a2v.v ); \
344 alpha2 += step_a2; \
345\
346 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
347 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
348\
349 t2v.v = a2v.v; \
350 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
351 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
352\
353 _mm_store_pd( ( double* )alpha3, a3v.v ); \
354 alpha3 += step_a3; \
355 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
356\
357 t1v.v = a1v.v; \
358 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
359 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
360\
361 _mm_store_pd( ( double* )alpha1, a1v.v ); \
362 alpha1 += step_a1; \
363 _mm_store_pd( ( double* )alpha2, a2v.v ); \
364 alpha2 += step_a2; \
365\
366 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
367 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
368\
369 t2v.v = a2v.v; \
370 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
371 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
372\
373 _mm_store_pd( ( double* )alpha3, a3v.v ); \
374 alpha3 += step_a3; \
375 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
376\
377 t1v.v = a1v.v; \
378 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
379 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
380\
381 _mm_store_pd( ( double* )alpha1, a1v.v ); \
382 alpha1 += step_a1; \
383 _mm_store_pd( ( double* )alpha2, a2v.v ); \
384 alpha2 += step_a2; \
385\
386 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
387 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
388\
389 t2v.v = a2v.v; \
390 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
391 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
392\
393 _mm_store_pd( ( double* )alpha3, a3v.v ); \
394 alpha3 += step_a3; \
395 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
396\
397 t1v.v = a1v.v; \
398 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
399 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
400\
401 _mm_store_pd( ( double* )alpha1, a1v.v ); \
402 alpha1 += step_a1; \
403 _mm_store_pd( ( double* )alpha2, a2v.v ); \
404 alpha2 += step_a2; \
405\
406 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
407 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
408\
409 t2v.v = a2v.v; \
410 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
411 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
412\
413 _mm_store_pd( ( double* )alpha3, a3v.v ); \
414 alpha3 += step_a3; \
415 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
416\
417 t1v.v = a1v.v; \
418 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
419 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
420\
421 _mm_store_pd( ( double* )alpha1, a1v.v ); \
422 alpha1 += step_a1; \
423 _mm_store_pd( ( double* )alpha2, a2v.v ); \
424 alpha2 += step_a2; \
425\
426 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
427 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
428\
429 t2v.v = a2v.v; \
430 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
431 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
432\
433 _mm_store_pd( ( double* )alpha3, a3v.v ); \
434 alpha3 += step_a3; \
435 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
436\
437 t1v.v = a1v.v; \
438 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
439 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
440\
441 _mm_store_pd( ( double* )alpha1, a1v.v ); \
442 alpha1 += step_a1; \
443 _mm_store_pd( ( double* )alpha2, a2v.v ); \
444 alpha2 += step_a2; \
445\
446 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
447 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
448\
449 t2v.v = a2v.v; \
450 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
451 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
452\
453 _mm_store_pd( ( double* )alpha3, a3v.v ); \
454 alpha3 += step_a3; \
455 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
456\
457 t1v.v = a1v.v; \
458 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
459 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
460\
461 _mm_store_pd( ( double* )alpha1, a1v.v ); \
462 alpha1 += step_a1; \
463 _mm_store_pd( ( double* )alpha2, a2v.v ); \
464 alpha2 += step_a2; \
465 } \
466\
467 for ( i = 0; i < n_iter2; ++i ) \
468 { \
469\
470 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
471 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
472\
473 t2v.v = a2v.v; \
474 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
475 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
476\
477 _mm_store_pd( ( double* )alpha3, a3v.v ); \
478 alpha3 += step_a3; \
479 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
480\
481 t1v.v = a1v.v; \
482 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
483 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
484\
485 _mm_store_pd( ( double* )alpha1, a1v.v ); \
486 alpha1 += step_a1; \
487 _mm_store_pd( ( double* )alpha2, a2v.v ); \
488 alpha2 += step_a2; \
489 } \
490\
491 if ( n_left == 1 ) \
492 { \
493 double ga12 = *gamma12; \
494 double si12 = *sigma12; \
495 double ga23 = *gamma23; \
496 double si23 = *sigma23; \
497 double temp1; \
498 double temp2; \
499 double temp3; \
500\
501 temp2 = *alpha2; \
502 temp3 = *alpha3; \
503\
504 *alpha2 = temp2 * ga23 + temp3 * si23; \
505 *alpha3 = temp3 * ga23 - temp2 * si23; \
506\
507 temp1 = *alpha1; \
508 temp2 = *alpha2; \
509\
510 *alpha1 = temp1 * ga12 + temp2 * si12; \
511 *alpha2 = temp2 * ga12 - temp1 * si12; \
512 } \
513}
514
515#define MAC_Apply_G_mx3b_asc( m_A, \
516 gamma12, \
517 sigma12, \
518 gamma23, \
519 sigma23, \
520 a1, inc_a1, \
521 a2, inc_a2, \
522 a3, inc_a3 ) \
523{\
524 int n_iter16 = m_A / ( 2 * 8 ); \
525 int n_left16 = m_A % ( 2 * 8 ); \
526 int n_iter2 = n_left16 / ( 2 * 1 ); \
527 int n_left = n_left16 % ( 2 * 1 ); \
528 int i; \
529\
530 const int step_a1 = inc_a1 * 2; \
531 const int step_a2 = inc_a2 * 2; \
532 const int step_a3 = inc_a3 * 2; \
533\
534 scomplex* restrict alpha1 = a1; \
535 scomplex* restrict alpha2 = a2; \
536 scomplex* restrict alpha3 = a3; \
537\
538 v4sf_t a1v, a2v, a3v; \
539 v4sf_t g12v, s12v; \
540 v4sf_t g23v, s23v; \
541 v4sf_t t1v, t2v; \
542\
543 g12v.v = _mm_load1_ps( gamma12 ); \
544 s12v.v = _mm_load1_ps( sigma12 ); \
545 g23v.v = _mm_load1_ps( gamma23 ); \
546 s23v.v = _mm_load1_ps( sigma23 ); \
547\
548 for ( i = 0; i < n_iter16; ++i ) \
549 { \
550\
551 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
552 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
553\
554 t2v.v = a2v.v; \
555 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
556 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
557\
558 _mm_store_ps( ( float* )alpha3, a3v.v ); \
559 alpha3 += step_a3; \
560 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
561\
562 t1v.v = a1v.v; \
563 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
564 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
565\
566 _mm_store_ps( ( float* )alpha1, a1v.v ); \
567 alpha1 += step_a1; \
568 _mm_store_ps( ( float* )alpha2, a2v.v ); \
569 alpha2 += step_a2; \
570\
571 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
572 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
573\
574 t2v.v = a2v.v; \
575 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
576 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
577\
578 _mm_store_ps( ( float* )alpha3, a3v.v ); \
579 alpha3 += step_a3; \
580 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
581\
582 t1v.v = a1v.v; \
583 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
584 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
585\
586 _mm_store_ps( ( float* )alpha1, a1v.v ); \
587 alpha1 += step_a1; \
588 _mm_store_ps( ( float* )alpha2, a2v.v ); \
589 alpha2 += step_a2; \
590\
591 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
592 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
593\
594 t2v.v = a2v.v; \
595 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
596 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
597\
598 _mm_store_ps( ( float* )alpha3, a3v.v ); \
599 alpha3 += step_a3; \
600 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
601\
602 t1v.v = a1v.v; \
603 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
604 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
605\
606 _mm_store_ps( ( float* )alpha1, a1v.v ); \
607 alpha1 += step_a1; \
608 _mm_store_ps( ( float* )alpha2, a2v.v ); \
609 alpha2 += step_a2; \
610\
611 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
612 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
613\
614 t2v.v = a2v.v; \
615 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
616 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
617\
618 _mm_store_ps( ( float* )alpha3, a3v.v ); \
619 alpha3 += step_a3; \
620 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
621\
622 t1v.v = a1v.v; \
623 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
624 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
625\
626 _mm_store_ps( ( float* )alpha1, a1v.v ); \
627 alpha1 += step_a1; \
628 _mm_store_ps( ( float* )alpha2, a2v.v ); \
629 alpha2 += step_a2; \
630\
631 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
632 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
633\
634 t2v.v = a2v.v; \
635 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
636 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
637\
638 _mm_store_ps( ( float* )alpha3, a3v.v ); \
639 alpha3 += step_a3; \
640 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
641\
642 t1v.v = a1v.v; \
643 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
644 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
645\
646 _mm_store_ps( ( float* )alpha1, a1v.v ); \
647 alpha1 += step_a1; \
648 _mm_store_ps( ( float* )alpha2, a2v.v ); \
649 alpha2 += step_a2; \
650\
651 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
652 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
653\
654 t2v.v = a2v.v; \
655 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
656 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
657\
658 _mm_store_ps( ( float* )alpha3, a3v.v ); \
659 alpha3 += step_a3; \
660 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
661\
662 t1v.v = a1v.v; \
663 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
664 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
665\
666 _mm_store_ps( ( float* )alpha1, a1v.v ); \
667 alpha1 += step_a1; \
668 _mm_store_ps( ( float* )alpha2, a2v.v ); \
669 alpha2 += step_a2; \
670\
671 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
672 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
673\
674 t2v.v = a2v.v; \
675 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
676 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
677\
678 _mm_store_ps( ( float* )alpha3, a3v.v ); \
679 alpha3 += step_a3; \
680 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
681\
682 t1v.v = a1v.v; \
683 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
684 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
685\
686 _mm_store_ps( ( float* )alpha1, a1v.v ); \
687 alpha1 += step_a1; \
688 _mm_store_ps( ( float* )alpha2, a2v.v ); \
689 alpha2 += step_a2; \
690\
691 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
692 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
693\
694 t2v.v = a2v.v; \
695 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
696 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
697\
698 _mm_store_ps( ( float* )alpha3, a3v.v ); \
699 alpha3 += step_a3; \
700 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
701\
702 t1v.v = a1v.v; \
703 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
704 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
705\
706 _mm_store_ps( ( float* )alpha1, a1v.v ); \
707 alpha1 += step_a1; \
708 _mm_store_ps( ( float* )alpha2, a2v.v ); \
709 alpha2 += step_a2; \
710 } \
711\
712 for ( i = 0; i < n_iter2; ++i ) \
713 { \
714\
715 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
716 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
717\
718 t2v.v = a2v.v; \
719 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
720 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
721\
722 _mm_store_ps( ( float* )alpha3, a3v.v ); \
723 alpha3 += step_a3; \
724 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
725\
726 t1v.v = a1v.v; \
727 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
728 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
729\
730 _mm_store_ps( ( float* )alpha1, a1v.v ); \
731 alpha1 += step_a1; \
732 _mm_store_ps( ( float* )alpha2, a2v.v ); \
733 alpha2 += step_a2; \
734 } \
735\
736 if ( n_left == 1 ) \
737 { \
738 float ga12 = *gamma12; \
739 float si12 = *sigma12; \
740 float ga23 = *gamma23; \
741 float si23 = *sigma23; \
742 scomplex temp1; \
743 scomplex temp2; \
744 scomplex temp3; \
745\
746 temp1 = *alpha1; \
747 temp2 = *alpha2; \
748\
749 alpha1->real = temp1.real * ga12 + temp2.real * si12; \
750 alpha2->real = temp2.real * ga12 - temp1.real * si12; \
751\
752 alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
753 alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
754\
755 temp2 = *alpha2; \
756 temp3 = *alpha3; \
757\
758 alpha2->real = temp2.real * ga23 + temp3.real * si23; \
759 alpha3->real = temp3.real * ga23 - temp2.real * si23; \
760\
761 alpha2->imag = temp2.imag * ga23 + temp3.imag * si23; \
762 alpha3->imag = temp3.imag * ga23 - temp2.imag * si23; \
763 } \
764}
765
766#define MAC_Apply_G_mx3b_asz( m_A, \
767 gamma12, \
768 sigma12, \
769 gamma23, \
770 sigma23, \
771 a1, inc_a1, \
772 a2, inc_a2, \
773 a3, inc_a3 ) \
774{\
775 int n_iter = m_A / 8; \
776 int n_left = m_A % 8; \
777 int i; \
778\
779 const int step_a1 = inc_a1 * 1; \
780 const int step_a2 = inc_a2 * 1; \
781 const int step_a3 = inc_a3 * 1; \
782\
783 dcomplex* restrict alpha1 = a1; \
784 dcomplex* restrict alpha2 = a2; \
785 dcomplex* restrict alpha3 = a3; \
786\
787 v2df_t a1v, a2v, a3v; \
788 v2df_t g12v, s12v; \
789 v2df_t g23v, s23v; \
790 v2df_t t1v, t2v; \
791\
792 g12v.v = _mm_loaddup_pd( gamma12 ); \
793 s12v.v = _mm_loaddup_pd( sigma12 ); \
794 g23v.v = _mm_loaddup_pd( gamma23 ); \
795 s23v.v = _mm_loaddup_pd( sigma23 ); \
796\
797 for ( i = 0; i < n_iter; ++i ) \
798 { \
799\
800 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
801 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
802\
803 t2v.v = a2v.v; \
804 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
805 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
806\
807 _mm_store_pd( ( double* )alpha3, a3v.v ); \
808 alpha3 += step_a3; \
809 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
810\
811 t1v.v = a1v.v; \
812 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
813 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
814\
815 _mm_store_pd( ( double* )alpha1, a1v.v ); \
816 alpha1 += step_a1; \
817 _mm_store_pd( ( double* )alpha2, a2v.v ); \
818 alpha2 += step_a2; \
819\
820 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
821 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
822\
823 t2v.v = a2v.v; \
824 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
825 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
826\
827 _mm_store_pd( ( double* )alpha3, a3v.v ); \
828 alpha3 += step_a3; \
829 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
830\
831 t1v.v = a1v.v; \
832 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
833 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
834\
835 _mm_store_pd( ( double* )alpha1, a1v.v ); \
836 alpha1 += step_a1; \
837 _mm_store_pd( ( double* )alpha2, a2v.v ); \
838 alpha2 += step_a2; \
839\
840 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
841 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
842\
843 t2v.v = a2v.v; \
844 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
845 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
846\
847 _mm_store_pd( ( double* )alpha3, a3v.v ); \
848 alpha3 += step_a3; \
849 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
850\
851 t1v.v = a1v.v; \
852 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
853 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
854\
855 _mm_store_pd( ( double* )alpha1, a1v.v ); \
856 alpha1 += step_a1; \
857 _mm_store_pd( ( double* )alpha2, a2v.v ); \
858 alpha2 += step_a2; \
859\
860 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
861 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
862\
863 t2v.v = a2v.v; \
864 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
865 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
866\
867 _mm_store_pd( ( double* )alpha3, a3v.v ); \
868 alpha3 += step_a3; \
869 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
870\
871 t1v.v = a1v.v; \
872 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
873 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
874\
875 _mm_store_pd( ( double* )alpha1, a1v.v ); \
876 alpha1 += step_a1; \
877 _mm_store_pd( ( double* )alpha2, a2v.v ); \
878 alpha2 += step_a2; \
879\
880 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
881 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
882\
883 t2v.v = a2v.v; \
884 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
885 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
886\
887 _mm_store_pd( ( double* )alpha3, a3v.v ); \
888 alpha3 += step_a3; \
889 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
890\
891 t1v.v = a1v.v; \
892 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
893 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
894\
895 _mm_store_pd( ( double* )alpha1, a1v.v ); \
896 alpha1 += step_a1; \
897 _mm_store_pd( ( double* )alpha2, a2v.v ); \
898 alpha2 += step_a2; \
899\
900 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
901 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
902\
903 t2v.v = a2v.v; \
904 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
905 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
906\
907 _mm_store_pd( ( double* )alpha3, a3v.v ); \
908 alpha3 += step_a3; \
909 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
910\
911 t1v.v = a1v.v; \
912 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
913 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
914\
915 _mm_store_pd( ( double* )alpha1, a1v.v ); \
916 alpha1 += step_a1; \
917 _mm_store_pd( ( double* )alpha2, a2v.v ); \
918 alpha2 += step_a2; \
919\
920 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
921 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
922\
923 t2v.v = a2v.v; \
924 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
925 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
926\
927 _mm_store_pd( ( double* )alpha3, a3v.v ); \
928 alpha3 += step_a3; \
929 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
930\
931 t1v.v = a1v.v; \
932 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
933 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
934\
935 _mm_store_pd( ( double* )alpha1, a1v.v ); \
936 alpha1 += step_a1; \
937 _mm_store_pd( ( double* )alpha2, a2v.v ); \
938 alpha2 += step_a2; \
939\
940 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
941 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
942\
943 t2v.v = a2v.v; \
944 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
945 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
946\
947 _mm_store_pd( ( double* )alpha3, a3v.v ); \
948 alpha3 += step_a3; \
949 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
950\
951 t1v.v = a1v.v; \
952 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
953 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
954\
955 _mm_store_pd( ( double* )alpha1, a1v.v ); \
956 alpha1 += step_a1; \
957 _mm_store_pd( ( double* )alpha2, a2v.v ); \
958 alpha2 += step_a2; \
959 } \
960\
961 for ( i = 0; i < n_left; ++i ) \
962 { \
963\
964 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
965 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
966\
967 t2v.v = a2v.v; \
968 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
969 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
970\
971 _mm_store_pd( ( double* )alpha3, a3v.v ); \
972 alpha3 += step_a3; \
973 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
974\
975 t1v.v = a1v.v; \
976 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
977 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
978\
979 _mm_store_pd( ( double* )alpha1, a1v.v ); \
980 alpha1 += step_a1; \
981 _mm_store_pd( ( double* )alpha2, a2v.v ); \
982 alpha2 += step_a2; \
983 } \
984}
985
986#endif