libflame revision_anchor
FLA_Apply_G_mx3_asm.h
Go to the documentation of this file.
1/*
2
3 Copyright (C) 2014, The University of Texas at Austin
4
5 This file is part of libflame and is available under the 3-Clause
6 BSD license, which can be found in the LICENSE file at the top-level
7 directory, or at http://opensource.org/licenses/BSD-3-Clause
8
9*/
10
11
12#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS
13
14#define MAC_Apply_G_mx3_ass MAC_Apply_G_mx3_ops
15#define MAC_Apply_G_mx3_asd MAC_Apply_G_mx3_opd
16#define MAC_Apply_G_mx3_asc MAC_Apply_G_mx3_opc
17#define MAC_Apply_G_mx3_asz MAC_Apply_G_mx3_opz
18
19#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS
20
21#define MAC_Apply_G_mx3_ass( m_A, \
22 gamma12, \
23 sigma12, \
24 gamma23, \
25 sigma23, \
26 a1, inc_a1, \
27 a2, inc_a2, \
28 a3, inc_a3 ) \
29{\
30 int n_iter32 = m_A / ( 4 * 8 ); \
31 int n_left32 = m_A % ( 4 * 8 ); \
32 int n_iter4 = n_left32 / ( 4 * 1 ); \
33 int n_left = n_left32 % ( 4 * 1 ); \
34 int i; \
35\
36 const int step_a1 = inc_a1 * 4; \
37 const int step_a2 = inc_a1 * 4; \
38 const int step_a3 = inc_a1 * 4; \
39\
40 float* restrict alpha1 = a1; \
41 float* restrict alpha2 = a2; \
42 float* restrict alpha3 = a3; \
43\
44 v4sf_t a1v, a2v, a3v; \
45 v4sf_t g12v, s12v; \
46 v4sf_t g23v, s23v; \
47 v4sf_t t1v, t2v; \
48\
49 g12v.v = _mm_load1_ps( gamma12 ); \
50 s12v.v = _mm_load1_ps( sigma12 ); \
51 g23v.v = _mm_load1_ps( gamma23 ); \
52 s23v.v = _mm_load1_ps( sigma23 ); \
53\
54 for ( i = 0; i < n_iter32; ++i ) \
55 { \
56\
57 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
58 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
59\
60 t1v.v = a1v.v; \
61 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
62 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
63\
64 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
65 _mm_store_ps( ( float* )alpha1, a1v.v ); \
66 alpha1 += step_a1; \
67\
68 t2v.v = a2v.v; \
69 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
70 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
71\
72 _mm_store_ps( ( float* )alpha2, a2v.v ); \
73 alpha2 += step_a2; \
74 _mm_store_ps( ( float* )alpha3, a3v.v ); \
75 alpha3 += step_a3; \
76\
77 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
78 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
79\
80 t1v.v = a1v.v; \
81 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
82 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
83\
84 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
85 _mm_store_ps( ( float* )alpha1, a1v.v ); \
86 alpha1 += step_a1; \
87\
88 t2v.v = a2v.v; \
89 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
90 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
91\
92 _mm_store_ps( ( float* )alpha2, a2v.v ); \
93 alpha2 += step_a2; \
94 _mm_store_ps( ( float* )alpha3, a3v.v ); \
95 alpha3 += step_a3; \
96\
97 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
98 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
99\
100 t1v.v = a1v.v; \
101 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
102 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
103\
104 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
105 _mm_store_ps( ( float* )alpha1, a1v.v ); \
106 alpha1 += step_a1; \
107\
108 t2v.v = a2v.v; \
109 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
110 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
111\
112 _mm_store_ps( ( float* )alpha2, a2v.v ); \
113 alpha2 += step_a2; \
114 _mm_store_ps( ( float* )alpha3, a3v.v ); \
115 alpha3 += step_a3; \
116\
117 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
118 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
119\
120 t1v.v = a1v.v; \
121 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
122 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
123\
124 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
125 _mm_store_ps( ( float* )alpha1, a1v.v ); \
126 alpha1 += step_a1; \
127\
128 t2v.v = a2v.v; \
129 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
130 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
131\
132 _mm_store_ps( ( float* )alpha2, a2v.v ); \
133 alpha2 += step_a2; \
134 _mm_store_ps( ( float* )alpha3, a3v.v ); \
135 alpha3 += step_a3; \
136\
137 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
138 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
139\
140 t1v.v = a1v.v; \
141 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
142 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
143\
144 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
145 _mm_store_ps( ( float* )alpha1, a1v.v ); \
146 alpha1 += step_a1; \
147\
148 t2v.v = a2v.v; \
149 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
150 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
151\
152 _mm_store_ps( ( float* )alpha2, a2v.v ); \
153 alpha2 += step_a2; \
154 _mm_store_ps( ( float* )alpha3, a3v.v ); \
155 alpha3 += step_a3; \
156\
157 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
158 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
159\
160 t1v.v = a1v.v; \
161 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
162 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
163\
164 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
165 _mm_store_ps( ( float* )alpha1, a1v.v ); \
166 alpha1 += step_a1; \
167\
168 t2v.v = a2v.v; \
169 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
170 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
171\
172 _mm_store_ps( ( float* )alpha2, a2v.v ); \
173 alpha2 += step_a2; \
174 _mm_store_ps( ( float* )alpha3, a3v.v ); \
175 alpha3 += step_a3; \
176\
177 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
178 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
179\
180 t1v.v = a1v.v; \
181 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
182 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
183\
184 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
185 _mm_store_ps( ( float* )alpha1, a1v.v ); \
186 alpha1 += step_a1; \
187\
188 t2v.v = a2v.v; \
189 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
190 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
191\
192 _mm_store_ps( ( float* )alpha2, a2v.v ); \
193 alpha2 += step_a2; \
194 _mm_store_ps( ( float* )alpha3, a3v.v ); \
195 alpha3 += step_a3; \
196\
197 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
198 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
199\
200 t1v.v = a1v.v; \
201 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
202 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
203\
204 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
205 _mm_store_ps( ( float* )alpha1, a1v.v ); \
206 alpha1 += step_a1; \
207\
208 t2v.v = a2v.v; \
209 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
210 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
211\
212 _mm_store_ps( ( float* )alpha2, a2v.v ); \
213 alpha2 += step_a2; \
214 _mm_store_ps( ( float* )alpha3, a3v.v ); \
215 alpha3 += step_a3; \
216 } \
217\
218 for ( i = 0; i < n_iter4; ++i ) \
219 { \
220\
221 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
222 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
223\
224 t1v.v = a1v.v; \
225 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
226 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
227\
228 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
229 _mm_store_ps( ( float* )alpha1, a1v.v ); \
230 alpha1 += step_a1; \
231\
232 t2v.v = a2v.v; \
233 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
234 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
235\
236 _mm_store_ps( ( float* )alpha2, a2v.v ); \
237 alpha2 += step_a2; \
238 _mm_store_ps( ( float* )alpha3, a3v.v ); \
239 alpha3 += step_a3; \
240 } \
241\
242 for ( i = 0; i < n_left; ++i ) \
243 { \
244 float ga12 = *gamma12; \
245 float si12 = *sigma12; \
246 float ga23 = *gamma23; \
247 float si23 = *sigma23; \
248 float temp1; \
249 float temp2; \
250 float temp3; \
251\
252 temp1 = *alpha1; \
253 temp2 = *alpha2; \
254\
255 *alpha1 = temp1 * ga12 + temp2 * si12; \
256 *alpha2 = temp2 * ga12 - temp1 * si12; \
257\
258 temp2 = *alpha2; \
259 temp3 = *alpha3; \
260\
261 *alpha2 = temp2 * ga23 + temp3 * si23; \
262 *alpha3 = temp3 * ga23 - temp2 * si23; \
263\
264 alpha1 += 1; \
265 alpha2 += 1; \
266 alpha3 += 1; \
267 } \
268}
269
270#define MAC_Apply_G_mx3_asd( m_A, \
271 gamma12, \
272 sigma12, \
273 gamma23, \
274 sigma23, \
275 a1, inc_a1, \
276 a2, inc_a2, \
277 a3, inc_a3 ) \
278{\
279 int n_iter16 = m_A / ( 2 * 8 ); \
280 int n_left16 = m_A % ( 2 * 8 ); \
281 int n_iter2 = n_left16 / ( 2 * 1 ); \
282 int n_left = n_left16 % ( 2 * 1 ); \
283 int i; \
284\
285 const int step_a1 = inc_a1 * 2; \
286 const int step_a2 = inc_a1 * 2; \
287 const int step_a3 = inc_a1 * 2; \
288\
289 double* restrict alpha1 = a1; \
290 double* restrict alpha2 = a2; \
291 double* restrict alpha3 = a3; \
292\
293 v2df_t a1v, a2v, a3v; \
294 v2df_t g12v, s12v; \
295 v2df_t g23v, s23v; \
296 v2df_t t1v, t2v; \
297\
298 g12v.v = _mm_loaddup_pd( gamma12 ); \
299 s12v.v = _mm_loaddup_pd( sigma12 ); \
300 g23v.v = _mm_loaddup_pd( gamma23 ); \
301 s23v.v = _mm_loaddup_pd( sigma23 ); \
302\
303 for ( i = 0; i < n_iter16; ++i ) \
304 { \
305\
306 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
307 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
308\
309 t1v.v = a1v.v; \
310 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
311 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
312\
313 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
314 _mm_store_pd( ( double* )alpha1, a1v.v ); \
315 alpha1 += step_a1; \
316\
317 t2v.v = a2v.v; \
318 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
319 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
320\
321 _mm_store_pd( ( double* )alpha2, a2v.v ); \
322 alpha2 += step_a2; \
323 _mm_store_pd( ( double* )alpha3, a3v.v ); \
324 alpha3 += step_a3; \
325\
326 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
327 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
328\
329 t1v.v = a1v.v; \
330 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
331 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
332\
333 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
334 _mm_store_pd( ( double* )alpha1, a1v.v ); \
335 alpha1 += step_a1; \
336\
337 t2v.v = a2v.v; \
338 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
339 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
340\
341 _mm_store_pd( ( double* )alpha2, a2v.v ); \
342 alpha2 += step_a2; \
343 _mm_store_pd( ( double* )alpha3, a3v.v ); \
344 alpha3 += step_a3; \
345\
346 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
347 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
348\
349 t1v.v = a1v.v; \
350 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
351 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
352\
353 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
354 _mm_store_pd( ( double* )alpha1, a1v.v ); \
355 alpha1 += step_a1; \
356\
357 t2v.v = a2v.v; \
358 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
359 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
360\
361 _mm_store_pd( ( double* )alpha2, a2v.v ); \
362 alpha2 += step_a2; \
363 _mm_store_pd( ( double* )alpha3, a3v.v ); \
364 alpha3 += step_a3; \
365\
366 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
367 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
368\
369 t1v.v = a1v.v; \
370 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
371 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
372\
373 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
374 _mm_store_pd( ( double* )alpha1, a1v.v ); \
375 alpha1 += step_a1; \
376\
377 t2v.v = a2v.v; \
378 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
379 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
380\
381 _mm_store_pd( ( double* )alpha2, a2v.v ); \
382 alpha2 += step_a2; \
383 _mm_store_pd( ( double* )alpha3, a3v.v ); \
384 alpha3 += step_a3; \
385\
386 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
387 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
388\
389 t1v.v = a1v.v; \
390 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
391 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
392\
393 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
394 _mm_store_pd( ( double* )alpha1, a1v.v ); \
395 alpha1 += step_a1; \
396\
397 t2v.v = a2v.v; \
398 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
399 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
400\
401 _mm_store_pd( ( double* )alpha2, a2v.v ); \
402 alpha2 += step_a2; \
403 _mm_store_pd( ( double* )alpha3, a3v.v ); \
404 alpha3 += step_a3; \
405\
406 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
407 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
408\
409 t1v.v = a1v.v; \
410 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
411 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
412\
413 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
414 _mm_store_pd( ( double* )alpha1, a1v.v ); \
415 alpha1 += step_a1; \
416\
417 t2v.v = a2v.v; \
418 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
419 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
420\
421 _mm_store_pd( ( double* )alpha2, a2v.v ); \
422 alpha2 += step_a2; \
423 _mm_store_pd( ( double* )alpha3, a3v.v ); \
424 alpha3 += step_a3; \
425\
426 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
427 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
428\
429 t1v.v = a1v.v; \
430 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
431 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
432\
433 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
434 _mm_store_pd( ( double* )alpha1, a1v.v ); \
435 alpha1 += step_a1; \
436\
437 t2v.v = a2v.v; \
438 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
439 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
440\
441 _mm_store_pd( ( double* )alpha2, a2v.v ); \
442 alpha2 += step_a2; \
443 _mm_store_pd( ( double* )alpha3, a3v.v ); \
444 alpha3 += step_a3; \
445\
446 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
447 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
448\
449 t1v.v = a1v.v; \
450 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
451 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
452\
453 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
454 _mm_store_pd( ( double* )alpha1, a1v.v ); \
455 alpha1 += step_a1; \
456\
457 t2v.v = a2v.v; \
458 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
459 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
460\
461 _mm_store_pd( ( double* )alpha2, a2v.v ); \
462 alpha2 += step_a2; \
463 _mm_store_pd( ( double* )alpha3, a3v.v ); \
464 alpha3 += step_a3; \
465 } \
466\
467 for ( i = 0; i < n_iter2; ++i ) \
468 { \
469\
470 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
471 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
472\
473 t1v.v = a1v.v; \
474 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
475 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
476\
477 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
478 _mm_store_pd( ( double* )alpha1, a1v.v ); \
479 alpha1 += step_a1; \
480\
481 t2v.v = a2v.v; \
482 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
483 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
484\
485 _mm_store_pd( ( double* )alpha2, a2v.v ); \
486 alpha2 += step_a2; \
487 _mm_store_pd( ( double* )alpha3, a3v.v ); \
488 alpha3 += step_a3; \
489 } \
490\
491 if ( n_left == 1 ) \
492 { \
493 double ga12 = *gamma12; \
494 double si12 = *sigma12; \
495 double ga23 = *gamma23; \
496 double si23 = *sigma23; \
497 double temp1; \
498 double temp2; \
499 double temp3; \
500\
501 temp1 = *alpha1; \
502 temp2 = *alpha2; \
503\
504 *alpha1 = temp1 * ga12 + temp2 * si12; \
505 *alpha2 = temp2 * ga12 - temp1 * si12; \
506\
507 temp2 = *alpha2; \
508 temp3 = *alpha3; \
509\
510 *alpha2 = temp2 * ga23 + temp3 * si23; \
511 *alpha3 = temp3 * ga23 - temp2 * si23; \
512 } \
513}
514
515#define MAC_Apply_G_mx3_asc( m_A, \
516 gamma12, \
517 sigma12, \
518 gamma23, \
519 sigma23, \
520 a1, inc_a1, \
521 a2, inc_a2, \
522 a3, inc_a3 ) \
523{ \
524 int n_iter16 = m_A / ( 2 * 8 ); \
525 int n_left16 = m_A % ( 2 * 8 ); \
526 int n_iter2 = n_left16 / ( 2 * 1 ); \
527 int n_left = n_left16 % ( 2 * 1 ); \
528 int i; \
529\
530 const int step_a1 = inc_a1 * 2; \
531 const int step_a2 = inc_a1 * 2; \
532 const int step_a3 = inc_a1 * 2; \
533\
534 scomplex* restrict alpha1 = a1; \
535 scomplex* restrict alpha2 = a2; \
536 scomplex* restrict alpha3 = a3; \
537\
538 v4sf_t a1v, a2v, a3v; \
539 v4sf_t g12v, s12v; \
540 v4sf_t g23v, s23v; \
541 v4sf_t t1v, t2v; \
542\
543 g12v.v = _mm_load1_ps( gamma12 ); \
544 s12v.v = _mm_load1_ps( sigma12 ); \
545 g23v.v = _mm_load1_ps( gamma23 ); \
546 s23v.v = _mm_load1_ps( sigma23 ); \
547\
548 for ( i = 0; i < n_iter16; ++i ) \
549 { \
550\
551 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
552 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
553\
554 t1v.v = a1v.v; \
555 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
556 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
557\
558 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
559 _mm_store_ps( ( float* )alpha1, a1v.v ); \
560 alpha1 += step_a1; \
561\
562 t2v.v = a2v.v; \
563 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
564 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
565\
566 _mm_store_ps( ( float* )alpha2, a2v.v ); \
567 alpha2 += step_a2; \
568 _mm_store_ps( ( float* )alpha3, a3v.v ); \
569 alpha3 += step_a3; \
570\
571 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
572 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
573\
574 t1v.v = a1v.v; \
575 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
576 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
577\
578 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
579 _mm_store_ps( ( float* )alpha1, a1v.v ); \
580 alpha1 += step_a1; \
581\
582 t2v.v = a2v.v; \
583 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
584 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
585\
586 _mm_store_ps( ( float* )alpha2, a2v.v ); \
587 alpha2 += step_a2; \
588 _mm_store_ps( ( float* )alpha3, a3v.v ); \
589 alpha3 += step_a3; \
590\
591 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
592 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
593\
594 t1v.v = a1v.v; \
595 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
596 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
597\
598 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
599 _mm_store_ps( ( float* )alpha1, a1v.v ); \
600 alpha1 += step_a1; \
601\
602 t2v.v = a2v.v; \
603 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
604 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
605\
606 _mm_store_ps( ( float* )alpha2, a2v.v ); \
607 alpha2 += step_a2; \
608 _mm_store_ps( ( float* )alpha3, a3v.v ); \
609 alpha3 += step_a3; \
610\
611 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
612 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
613\
614 t1v.v = a1v.v; \
615 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
616 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
617\
618 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
619 _mm_store_ps( ( float* )alpha1, a1v.v ); \
620 alpha1 += step_a1; \
621\
622 t2v.v = a2v.v; \
623 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
624 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
625\
626 _mm_store_ps( ( float* )alpha2, a2v.v ); \
627 alpha2 += step_a2; \
628 _mm_store_ps( ( float* )alpha3, a3v.v ); \
629 alpha3 += step_a3; \
630\
631 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
632 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
633\
634 t1v.v = a1v.v; \
635 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
636 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
637\
638 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
639 _mm_store_ps( ( float* )alpha1, a1v.v ); \
640 alpha1 += step_a1; \
641\
642 t2v.v = a2v.v; \
643 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
644 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
645\
646 _mm_store_ps( ( float* )alpha2, a2v.v ); \
647 alpha2 += step_a2; \
648 _mm_store_ps( ( float* )alpha3, a3v.v ); \
649 alpha3 += step_a3; \
650\
651 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
652 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
653\
654 t1v.v = a1v.v; \
655 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
656 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
657\
658 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
659 _mm_store_ps( ( float* )alpha1, a1v.v ); \
660 alpha1 += step_a1; \
661\
662 t2v.v = a2v.v; \
663 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
664 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
665\
666 _mm_store_ps( ( float* )alpha2, a2v.v ); \
667 alpha2 += step_a2; \
668 _mm_store_ps( ( float* )alpha3, a3v.v ); \
669 alpha3 += step_a3; \
670\
671 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
672 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
673\
674 t1v.v = a1v.v; \
675 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
676 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
677\
678 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
679 _mm_store_ps( ( float* )alpha1, a1v.v ); \
680 alpha1 += step_a1; \
681\
682 t2v.v = a2v.v; \
683 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
684 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
685\
686 _mm_store_ps( ( float* )alpha2, a2v.v ); \
687 alpha2 += step_a2; \
688 _mm_store_ps( ( float* )alpha3, a3v.v ); \
689 alpha3 += step_a3; \
690\
691 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
692 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
693\
694 t1v.v = a1v.v; \
695 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
696 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
697\
698 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
699 _mm_store_ps( ( float* )alpha1, a1v.v ); \
700 alpha1 += step_a1; \
701\
702 t2v.v = a2v.v; \
703 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
704 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
705\
706 _mm_store_ps( ( float* )alpha2, a2v.v ); \
707 alpha2 += step_a2; \
708 _mm_store_ps( ( float* )alpha3, a3v.v ); \
709 alpha3 += step_a3; \
710 } \
711\
712 for ( i = 0; i < n_iter2; ++i ) \
713 { \
714\
715 a1v.v = _mm_load_ps( ( float* )alpha1 ); \
716 a2v.v = _mm_load_ps( ( float* )alpha2 ); \
717\
718 t1v.v = a1v.v; \
719 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
720 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
721\
722 a3v.v = _mm_load_ps( ( float* )alpha3 ); \
723 _mm_store_ps( ( float* )alpha1, a1v.v ); \
724 alpha1 += step_a1; \
725\
726 t2v.v = a2v.v; \
727 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
728 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
729\
730 _mm_store_ps( ( float* )alpha2, a2v.v ); \
731 alpha2 += step_a2; \
732 _mm_store_ps( ( float* )alpha3, a3v.v ); \
733 alpha3 += step_a3; \
734 } \
735\
736 if ( n_left == 1 ) \
737 { \
738 float ga12 = *gamma12; \
739 float si12 = *sigma12; \
740 float ga23 = *gamma23; \
741 float si23 = *sigma23; \
742 scomplex temp1; \
743 scomplex temp2; \
744 scomplex temp3; \
745\
746 temp1 = *alpha1; \
747 temp2 = *alpha2; \
748\
749 alpha1->real = temp1.real * ga12 + temp2.real * si12; \
750 alpha2->real = temp2.real * ga12 - temp1.real * si12; \
751\
752 alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
753 alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
754\
755 temp2 = *alpha2; \
756 temp3 = *alpha3; \
757\
758 alpha2->real = temp2.real * ga23 + temp3.real * si23; \
759 alpha3->real = temp3.real * ga23 - temp2.real * si23; \
760\
761 alpha2->imag = temp2.imag * ga23 + temp3.imag * si23; \
762 alpha3->imag = temp3.imag * ga23 - temp2.imag * si23; \
763 } \
764}
765
766#define MAC_Apply_G_mx3_asz( m_A, \
767 gamma12, \
768 sigma12, \
769 gamma23, \
770 sigma23, \
771 a1, inc_a1, \
772 a2, inc_a2, \
773 a3, inc_a3 ) \
774{\
775 int n_iter = m_A / 8; \
776 int n_left = m_A % 8; \
777 int i; \
778\
779 const int step_a1 = inc_a1 * 1; \
780 const int step_a2 = inc_a1 * 1; \
781 const int step_a3 = inc_a1 * 1; \
782\
783 dcomplex* restrict alpha1 = a1; \
784 dcomplex* restrict alpha2 = a2; \
785 dcomplex* restrict alpha3 = a3; \
786\
787 v2df_t a1v, a2v, a3v; \
788 v2df_t g12v, s12v; \
789 v2df_t g23v, s23v; \
790 v2df_t t1v, t2v; \
791\
792 g12v.v = _mm_loaddup_pd( gamma12 ); \
793 s12v.v = _mm_loaddup_pd( sigma12 ); \
794 g23v.v = _mm_loaddup_pd( gamma23 ); \
795 s23v.v = _mm_loaddup_pd( sigma23 ); \
796\
797 for ( i = 0; i < n_iter; ++i ) \
798 { \
799\
800 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
801 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
802\
803 t1v.v = a1v.v; \
804 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
805 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
806\
807 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
808 _mm_store_pd( ( double* )alpha1, a1v.v ); \
809 alpha1 += step_a1; \
810\
811 t2v.v = a2v.v; \
812 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
813 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
814\
815 _mm_store_pd( ( double* )alpha2, a2v.v ); \
816 _mm_store_pd( ( double* )alpha3, a3v.v ); \
817 alpha2 += step_a2; \
818 alpha3 += step_a3; \
819\
820 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
821 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
822\
823 t1v.v = a1v.v; \
824 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
825 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
826\
827 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
828 _mm_store_pd( ( double* )alpha1, a1v.v ); \
829 alpha1 += step_a1; \
830\
831 t2v.v = a2v.v; \
832 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
833 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
834\
835 _mm_store_pd( ( double* )alpha2, a2v.v ); \
836 _mm_store_pd( ( double* )alpha3, a3v.v ); \
837 alpha2 += step_a2; \
838 alpha3 += step_a3; \
839\
840 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
841 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
842\
843 t1v.v = a1v.v; \
844 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
845 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
846\
847 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
848 _mm_store_pd( ( double* )alpha1, a1v.v ); \
849 alpha1 += step_a1; \
850\
851 t2v.v = a2v.v; \
852 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
853 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
854\
855 _mm_store_pd( ( double* )alpha2, a2v.v ); \
856 _mm_store_pd( ( double* )alpha3, a3v.v ); \
857 alpha2 += step_a2; \
858 alpha3 += step_a3; \
859\
860 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
861 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
862\
863 t1v.v = a1v.v; \
864 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
865 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
866\
867 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
868 _mm_store_pd( ( double* )alpha1, a1v.v ); \
869 alpha1 += step_a1; \
870\
871 t2v.v = a2v.v; \
872 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
873 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
874\
875 _mm_store_pd( ( double* )alpha2, a2v.v ); \
876 _mm_store_pd( ( double* )alpha3, a3v.v ); \
877 alpha2 += step_a2; \
878 alpha3 += step_a3; \
879\
880 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
881 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
882\
883 t1v.v = a1v.v; \
884 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
885 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
886\
887 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
888 _mm_store_pd( ( double* )alpha1, a1v.v ); \
889 alpha1 += step_a1; \
890\
891 t2v.v = a2v.v; \
892 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
893 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
894\
895 _mm_store_pd( ( double* )alpha2, a2v.v ); \
896 _mm_store_pd( ( double* )alpha3, a3v.v ); \
897 alpha2 += step_a2; \
898 alpha3 += step_a3; \
899\
900 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
901 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
902\
903 t1v.v = a1v.v; \
904 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
905 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
906\
907 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
908 _mm_store_pd( ( double* )alpha1, a1v.v ); \
909 alpha1 += step_a1; \
910\
911 t2v.v = a2v.v; \
912 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
913 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
914\
915 _mm_store_pd( ( double* )alpha2, a2v.v ); \
916 _mm_store_pd( ( double* )alpha3, a3v.v ); \
917 alpha2 += step_a2; \
918 alpha3 += step_a3; \
919\
920 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
921 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
922\
923 t1v.v = a1v.v; \
924 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
925 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
926\
927 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
928 _mm_store_pd( ( double* )alpha1, a1v.v ); \
929 alpha1 += step_a1; \
930\
931 t2v.v = a2v.v; \
932 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
933 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
934\
935 _mm_store_pd( ( double* )alpha2, a2v.v ); \
936 _mm_store_pd( ( double* )alpha3, a3v.v ); \
937 alpha2 += step_a2; \
938 alpha3 += step_a3; \
939\
940 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
941 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
942\
943 t1v.v = a1v.v; \
944 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
945 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
946\
947 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
948 _mm_store_pd( ( double* )alpha1, a1v.v ); \
949 alpha1 += step_a1; \
950\
951 t2v.v = a2v.v; \
952 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
953 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
954\
955 _mm_store_pd( ( double* )alpha2, a2v.v ); \
956 _mm_store_pd( ( double* )alpha3, a3v.v ); \
957 alpha2 += step_a2; \
958 alpha3 += step_a3; \
959 } \
960\
961 for ( i = 0; i < n_left; ++i ) \
962 { \
963 a1v.v = _mm_load_pd( ( double* )alpha1 ); \
964 a2v.v = _mm_load_pd( ( double* )alpha2 ); \
965\
966 t1v.v = a1v.v; \
967 a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
968 a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
969\
970 _mm_store_pd( ( double* )alpha1, a1v.v ); \
971 alpha1 += step_a1; \
972 a3v.v = _mm_load_pd( ( double* )alpha3 ); \
973\
974 t2v.v = a2v.v; \
975 a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
976 a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
977\
978 _mm_store_pd( ( double* )alpha2, a2v.v ); \
979 alpha2 += step_a2; \
980 _mm_store_pd( ( double* )alpha3, a3v.v ); \
981 alpha3 += step_a3; \
982 } \
983}
984
985#endif