libflame revision_anchor
FLA_Apply_G_mx4s_opt.h
Go to the documentation of this file.
/*

   Copyright (C) 2014, The University of Texas at Austin

   This file is part of libflame and is available under the 3-Clause
   BSD license, which can be found in the LICENSE file at the top-level
   directory, or at http://opensource.org/licenses/BSD-3-Clause

*/
10
/*
 * MAC_Apply_G_mx4s_ops
 *
 * Single-precision real kernel. For each of m_A rows it updates, in place,
 * one element taken from each of the four strided vectors a1, a2, a3, a4 by
 * applying four 2x2 plane rotations (presumably Givens rotations, going by
 * the macro name) in this fixed order:
 *
 *   1. ( a2, a3 ) with ( gamma23_k1, sigma23_k1 )
 *   2. ( a3, a4 ) with ( gamma34_k1, sigma34_k1 )
 *   3. ( a1, a2 ) with ( gamma12_k2, sigma12_k2 )
 *   4. ( a2, a3 ) with ( gamma23_k2, sigma23_k2 )
 *
 * Applying ( gamma, sigma ) to a pair ( x, y ) computes, from the values
 * held before the update:
 *
 *   x := x * gamma + y * sigma
 *   y := y * gamma - x * sigma
 *
 * Parameters:
 *   m_A               row count; compared against an int loop counter.
 *   gammaXX, sigmaXX  pointers to float; each is dereferenced exactly once,
 *                     before the loop starts.
 *   a1 .. a4          pointers to the first element of each vector. They are
 *                     copied into restrict-qualified pointers, so the
 *                     elements reached through them must not overlap.
 *   inc_a1 .. inc_a4  element stride added to the matching pointer after
 *                     every row.
 *
 * Notes:
 *   - Every parameter is parenthesized where it is expanded, so expression
 *     arguments such as "G + k" (pointer) or "n - 1" (count) keep their
 *     intended meaning.
 *   - m_A and the strides are evaluated once per iteration; pass expressions
 *     without side effects.
 *   - The expansion is a brace-enclosed block declaring locals named
 *     ga.., si.., alpha1..alpha4, temp1..temp4 and i. Arguments must not
 *     refer to caller variables that share those names.
 */
#define MAC_Apply_G_mx4s_ops( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
    /* Load the rotation coefficients once, ahead of the loop. */ \
    float           ga23_k1 = *(gamma23_k1); \
    float           si23_k1 = *(sigma23_k1); \
    float           ga34_k1 = *(gamma34_k1); \
    float           si34_k1 = *(sigma34_k1); \
    float           ga12_k2 = *(gamma12_k2); \
    float           si12_k2 = *(sigma12_k2); \
    float           ga23_k2 = *(gamma23_k2); \
    float           si23_k2 = *(sigma23_k2); \
    float* restrict alpha1  = (a1); \
    float* restrict alpha2  = (a2); \
    float* restrict alpha3  = (a3); \
    float* restrict alpha4  = (a4); \
    float           temp1; \
    float           temp2; \
    float           temp3; \
    float           temp4; \
    int             i; \
\
    for ( i = 0; i < (m_A); ++i ) \
    { \
        /* Rotation 1: ( a2, a3 ) with ( gamma23_k1, sigma23_k1 ). */ \
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
        *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
\
        /* Rotation 2: ( a3, a4 ) with ( gamma34_k1, sigma34_k1 ). */ \
        temp3 = *alpha3; \
        temp4 = *alpha4; \
\
        *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
        *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
\
        /* Rotation 3: ( a1, a2 ) with ( gamma12_k2, sigma12_k2 ). */ \
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
        *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
\
        /* Rotation 4: ( a2, a3 ) with ( gamma23_k2, sigma23_k2 ). */ \
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
        *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
\
        /* Step every vector to its next row. */ \
        alpha1 += (inc_a1); \
        alpha2 += (inc_a2); \
        alpha3 += (inc_a3); \
        alpha4 += (inc_a4); \
    } \
}
75
/*
 * MAC_Apply_G_mx4s_opc
 *
 * Single-precision complex kernel. For each of m_A rows it updates, in
 * place, one scomplex element taken from each of the four strided vectors
 * a1, a2, a3, a4 by applying four 2x2 plane rotations (presumably Givens
 * rotations, going by the macro name) in this fixed order:
 *
 *   1. ( a2, a3 ) with ( gamma23_k1, sigma23_k1 )
 *   2. ( a3, a4 ) with ( gamma34_k1, sigma34_k1 )
 *   3. ( a1, a2 ) with ( gamma12_k2, sigma12_k2 )
 *   4. ( a2, a3 ) with ( gamma23_k2, sigma23_k2 )
 *
 * The coefficients are real (float). Applying ( gamma, sigma ) to a pair
 * ( x, y ) computes, separately for the .real and .imag members and from
 * the values held before the update:
 *
 *   x := x * gamma + y * sigma
 *   y := y * gamma - x * sigma
 *
 * Parameters:
 *   m_A               row count; compared against an int loop counter.
 *   gammaXX, sigmaXX  pointers to float; each is dereferenced exactly once,
 *                     before the loop starts.
 *   a1 .. a4          pointers to the first scomplex element of each
 *                     vector. They are copied into restrict-qualified
 *                     pointers, so the elements reached through them must
 *                     not overlap.
 *   inc_a1 .. inc_a4  element stride added to the matching pointer after
 *                     every row.
 *
 * Notes:
 *   - Every parameter is parenthesized where it is expanded, so expression
 *     arguments such as "G + k" (pointer) or "n - 1" (count) keep their
 *     intended meaning.
 *   - m_A and the strides are evaluated once per iteration; pass expressions
 *     without side effects.
 *   - The expansion is a brace-enclosed block declaring locals named
 *     ga.., si.., alpha1..alpha4, temp1..temp4 and i. Arguments must not
 *     refer to caller variables that share those names.
 */
#define MAC_Apply_G_mx4s_opc( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
    /* Load the (real) rotation coefficients once, ahead of the loop. */ \
    float              ga23_k1 = *(gamma23_k1); \
    float              si23_k1 = *(sigma23_k1); \
    float              ga34_k1 = *(gamma34_k1); \
    float              si34_k1 = *(sigma34_k1); \
    float              ga12_k2 = *(gamma12_k2); \
    float              si12_k2 = *(sigma12_k2); \
    float              ga23_k2 = *(gamma23_k2); \
    float              si23_k2 = *(sigma23_k2); \
    scomplex* restrict alpha1  = (a1); \
    scomplex* restrict alpha2  = (a2); \
    scomplex* restrict alpha3  = (a3); \
    scomplex* restrict alpha4  = (a4); \
    scomplex           temp1; \
    scomplex           temp2; \
    scomplex           temp3; \
    scomplex           temp4; \
    int                i; \
\
    for ( i = 0; i < (m_A); ++i ) \
    { \
        /* Rotation 1: ( a2, a3 ) with ( gamma23_k1, sigma23_k1 ). */ \
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
        alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
\
        alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
        alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
\
        /* Rotation 2: ( a3, a4 ) with ( gamma34_k1, sigma34_k1 ). */ \
        temp3 = *alpha3; \
        temp4 = *alpha4; \
\
        alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
        alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
\
        alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
        alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
\
        /* Rotation 3: ( a1, a2 ) with ( gamma12_k2, sigma12_k2 ). */ \
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
        alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
\
        alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
        alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
\
        /* Rotation 4: ( a2, a3 ) with ( gamma23_k2, sigma23_k2 ). */ \
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
        alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
\
        alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
        alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \
\
        /* Step every vector to its next row. */ \
        alpha1 += (inc_a1); \
        alpha2 += (inc_a2); \
        alpha3 += (inc_a3); \
        alpha4 += (inc_a4); \
    } \
}
153
/*
 * MAC_Apply_G_mx4s_opd
 *
 * Double-precision real kernel. For each of m_A rows it updates, in place,
 * one element taken from each of the four strided vectors a1, a2, a3, a4 by
 * applying four 2x2 plane rotations (presumably Givens rotations, going by
 * the macro name) in this fixed order:
 *
 *   1. ( a2, a3 ) with ( gamma23_k1, sigma23_k1 )
 *   2. ( a3, a4 ) with ( gamma34_k1, sigma34_k1 )
 *   3. ( a1, a2 ) with ( gamma12_k2, sigma12_k2 )
 *   4. ( a2, a3 ) with ( gamma23_k2, sigma23_k2 )
 *
 * Applying ( gamma, sigma ) to a pair ( x, y ) computes, from the values
 * held before the update:
 *
 *   x := x * gamma + y * sigma
 *   y := y * gamma - x * sigma
 *
 * Parameters:
 *   m_A               row count; compared against an int loop counter.
 *   gammaXX, sigmaXX  pointers to double; each is dereferenced exactly
 *                     once, before the loop starts.
 *   a1 .. a4          pointers to the first element of each vector. They are
 *                     copied into restrict-qualified pointers, so the
 *                     elements reached through them must not overlap.
 *   inc_a1 .. inc_a4  element stride added to the matching pointer after
 *                     every row.
 *
 * Notes:
 *   - Every parameter is parenthesized where it is expanded, so expression
 *     arguments such as "G + k" (pointer) or "n - 1" (count) keep their
 *     intended meaning.
 *   - m_A and the strides are evaluated once per iteration; pass expressions
 *     without side effects.
 *   - The expansion is a brace-enclosed block declaring locals named
 *     ga.., si.., alpha1..alpha4, temp1..temp4 and i. Arguments must not
 *     refer to caller variables that share those names.
 */
#define MAC_Apply_G_mx4s_opd( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
    /* Load the rotation coefficients once, ahead of the loop. */ \
    double           ga23_k1 = *(gamma23_k1); \
    double           si23_k1 = *(sigma23_k1); \
    double           ga34_k1 = *(gamma34_k1); \
    double           si34_k1 = *(sigma34_k1); \
    double           ga12_k2 = *(gamma12_k2); \
    double           si12_k2 = *(sigma12_k2); \
    double           ga23_k2 = *(gamma23_k2); \
    double           si23_k2 = *(sigma23_k2); \
    double* restrict alpha1  = (a1); \
    double* restrict alpha2  = (a2); \
    double* restrict alpha3  = (a3); \
    double* restrict alpha4  = (a4); \
    double           temp1; \
    double           temp2; \
    double           temp3; \
    double           temp4; \
    int              i; \
\
    for ( i = 0; i < (m_A); ++i ) \
    { \
        /* Rotation 1: ( a2, a3 ) with ( gamma23_k1, sigma23_k1 ). */ \
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
        *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
\
        /* Rotation 2: ( a3, a4 ) with ( gamma34_k1, sigma34_k1 ). */ \
        temp3 = *alpha3; \
        temp4 = *alpha4; \
\
        *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
        *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
\
        /* Rotation 3: ( a1, a2 ) with ( gamma12_k2, sigma12_k2 ). */ \
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
        *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
\
        /* Rotation 4: ( a2, a3 ) with ( gamma23_k2, sigma23_k2 ). */ \
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
        *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
\
        /* Step every vector to its next row. */ \
        alpha1 += (inc_a1); \
        alpha2 += (inc_a2); \
        alpha3 += (inc_a3); \
        alpha4 += (inc_a4); \
    } \
}
218
/*
 * MAC_Apply_G_mx4s_opz
 *
 * Double-precision complex kernel. For each of m_A rows it updates, in
 * place, one dcomplex element taken from each of the four strided vectors
 * a1, a2, a3, a4 by applying four 2x2 plane rotations (presumably Givens
 * rotations, going by the macro name) in this fixed order:
 *
 *   1. ( a2, a3 ) with ( gamma23_k1, sigma23_k1 )
 *   2. ( a3, a4 ) with ( gamma34_k1, sigma34_k1 )
 *   3. ( a1, a2 ) with ( gamma12_k2, sigma12_k2 )
 *   4. ( a2, a3 ) with ( gamma23_k2, sigma23_k2 )
 *
 * The coefficients are real (double). Applying ( gamma, sigma ) to a pair
 * ( x, y ) computes, separately for the .real and .imag members and from
 * the values held before the update:
 *
 *   x := x * gamma + y * sigma
 *   y := y * gamma - x * sigma
 *
 * Parameters:
 *   m_A               row count; compared against an int loop counter.
 *   gammaXX, sigmaXX  pointers to double; each is dereferenced exactly
 *                     once, before the loop starts.
 *   a1 .. a4          pointers to the first dcomplex element of each
 *                     vector. They are copied into restrict-qualified
 *                     pointers, so the elements reached through them must
 *                     not overlap.
 *   inc_a1 .. inc_a4  element stride added to the matching pointer after
 *                     every row.
 *
 * Notes:
 *   - Every parameter is parenthesized where it is expanded, so expression
 *     arguments such as "G + k" (pointer) or "n - 1" (count) keep their
 *     intended meaning.
 *   - m_A and the strides are evaluated once per iteration; pass expressions
 *     without side effects.
 *   - The expansion is a brace-enclosed block declaring locals named
 *     ga.., si.., alpha1..alpha4, temp1..temp4 and i. Arguments must not
 *     refer to caller variables that share those names.
 */
#define MAC_Apply_G_mx4s_opz( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
    /* Load the (real) rotation coefficients once, ahead of the loop. */ \
    double             ga23_k1 = *(gamma23_k1); \
    double             si23_k1 = *(sigma23_k1); \
    double             ga34_k1 = *(gamma34_k1); \
    double             si34_k1 = *(sigma34_k1); \
    double             ga12_k2 = *(gamma12_k2); \
    double             si12_k2 = *(sigma12_k2); \
    double             ga23_k2 = *(gamma23_k2); \
    double             si23_k2 = *(sigma23_k2); \
    dcomplex* restrict alpha1  = (a1); \
    dcomplex* restrict alpha2  = (a2); \
    dcomplex* restrict alpha3  = (a3); \
    dcomplex* restrict alpha4  = (a4); \
    dcomplex           temp1; \
    dcomplex           temp2; \
    dcomplex           temp3; \
    dcomplex           temp4; \
    int                i; \
\
    for ( i = 0; i < (m_A); ++i ) \
    { \
        /* Rotation 1: ( a2, a3 ) with ( gamma23_k1, sigma23_k1 ). */ \
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
        alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
\
        alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
        alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
\
        /* Rotation 2: ( a3, a4 ) with ( gamma34_k1, sigma34_k1 ). */ \
        temp3 = *alpha3; \
        temp4 = *alpha4; \
\
        alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
        alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
\
        alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
        alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
\
        /* Rotation 3: ( a1, a2 ) with ( gamma12_k2, sigma12_k2 ). */ \
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
        alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
\
        alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
        alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
\
        /* Rotation 4: ( a2, a3 ) with ( gamma23_k2, sigma23_k2 ). */ \
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
        alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
\
        alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
        alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \
\
        /* Step every vector to its next row. */ \
        alpha1 += (inc_a1); \
        alpha2 += (inc_a2); \
        alpha3 += (inc_a3); \
        alpha4 += (inc_a4); \
    } \
}
296