libflame revision_anchor
Functions | Variables
bl1_axpyv2bdotaxpy.c File Reference

(r)

Functions

void bl1_saxpyv2bdotaxpy (int n, float *beta, float *u, int inc_u, float *gamma, float *z, int inc_z, float *a, int inc_a, float *x, int inc_x, float *kappa, float *rho, float *w, int inc_w)
 
void bl1_daxpyv2bdotaxpy (int n, double *beta, double *u, int inc_u, double *gamma, double *z, int inc_z, double *a, int inc_a, double *x, int inc_x, double *kappa, double *rho, double *w, int inc_w)
 
 if (n_pre==1)
 
 for (i=0;i< n_run;++i)
 
 if (n_left > 0)
 
void bl1_caxpyv2bdotaxpy (int n, scomplex *beta, scomplex *u, int inc_u, scomplex *gamma, scomplex *z, int inc_z, scomplex *a, int inc_a, scomplex *x, int inc_x, scomplex *kappa, scomplex *rho, scomplex *w, int inc_w)
 
void bl1_zaxpyv2bdotaxpy (int n, dcomplex *beta, dcomplex *u, int inc_u, dcomplex *gamma, dcomplex *z, int inc_z, dcomplex *a, int inc_a, dcomplex *x, int inc_x, dcomplex *kappa, dcomplex *rho, dcomplex *w, int inc_w)
 

Variables

double *restrict zeta1
 
double *restrict alpha1 = a
 
double *restrict chi1 = x
 
double *restrict omega1 = w
 
double beta_c = *beta
 
double gamma_c = *gamma
 
double kappa_c = *kappa
 
double rho_c = 0.0
 
int i
 
int n_pre = 0
 
int n_run = ( n - n_pre ) / 2
 
int n_left = ( n - n_pre ) % 2
 
 upsilon1 = u
 
rho = rho_c
 
rho_c real = 0.0
 
rho_c imag = 0.0
 

Function Documentation

◆ bl1_caxpyv2bdotaxpy()

void bl1_caxpyv2bdotaxpy ( int  n,
scomplex * beta,
scomplex * u,
int  inc_u,
scomplex * gamma,
scomplex * z,
int  inc_z,
scomplex * a,
int  inc_a,
scomplex * x,
int  inc_x,
scomplex * kappa,
scomplex * rho,
scomplex * w,
int  inc_w 
)
337{
338 bl1_abort();
339}
void bl1_abort(void)
Definition bl1_abort.c:13

References bl1_abort().

◆ bl1_daxpyv2bdotaxpy()

void bl1_daxpyv2bdotaxpy ( int  n,
double * beta,
double * u,
int  inc_u,
double * gamma,
double * z,
int  inc_z,
double * a,
int  inc_a,
double * x,
int  inc_x,
double * kappa,
double * rho,
double * w,
int  inc_w 
)
47{
48 double* restrict upsilon1;
49 double* restrict zeta1;
50 double* restrict alpha1;
51 double* restrict chi1;
52 double* restrict omega1;
53 double rho_c;
54 int i;
55 v2df_t b1v, g1v, k1v;
57 v2df_t u1v, z1v, a1v;
58 v2df_t u2v, z2v, a2v;
59 v2df_t x1v, w1v;
60 v2df_t x2v, w2v;
61
62 int n_pre;
63 int n_run;
64 int n_left;
65
66 n_pre = 0;
67 if ( ( unsigned long ) a % 16 != 0 )
68 {
69 if ( ( unsigned long ) u % 16 == 0 ||
70 ( unsigned long ) z % 16 == 0 ||
71 ( unsigned long ) x % 16 == 0 ||
72 ( unsigned long ) w % 16 == 0 ) bl1_abort();
73
74 n_pre = 1;
75 }
76
77 n_run = ( n - n_pre ) / 4;
78 n_left = ( n - n_pre ) % 4;
79
80 upsilon1 = u;
81 zeta1 = z;
82 alpha1 = a;
83 chi1 = x;
84 omega1 = w;
85
86
87 rho_c = 0.0;
88
89 if ( n_pre == 1 )
90 {
91 double beta_c = *beta;
92 double gamma_c = *gamma;
93 double kappa_c = *kappa;
94
95 double upsilon1_c = *upsilon1;
96 double zeta1_c = *zeta1;
97 double alpha1_c = *alpha1;
98 double chi1_c = *chi1;
99 double omega1_c = *omega1;
100
101 alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
102 rho_c += alpha1_c * chi1_c;
103 omega1_c += kappa_c * alpha1_c;
104
105 *alpha1 = alpha1_c;
106 *omega1 = omega1_c;
107
108 upsilon1 += inc_u;
109 zeta1 += inc_z;
110 alpha1 += inc_a;
111 chi1 += inc_x;
112 omega1 += inc_w;
113 }
114
115 b1v.v = _mm_loaddup_pd( ( double* )beta );
116 g1v.v = _mm_loaddup_pd( ( double* )gamma );
117 k1v.v = _mm_loaddup_pd( ( double* )kappa );
118
119 rhov.v = _mm_setzero_pd();
120
121 for ( i = 0; i < n_run; ++i )
122 {
123 u1v.v = _mm_load_pd( ( double* )upsilon1 );
124 z1v.v = _mm_load_pd( ( double* )zeta1 );
125 a1v.v = _mm_load_pd( ( double* )alpha1 );
126
127 a1v.v += b1v.v * u1v.v + g1v.v * z1v.v;
128
129 u2v.v = _mm_load_pd( ( double* )(upsilon1 + 2) );
130 z2v.v = _mm_load_pd( ( double* )(zeta1 + 2) );
131 a2v.v = _mm_load_pd( ( double* )(alpha1 + 2) );
132
133 a2v.v += b1v.v * u2v.v + g1v.v * z2v.v;
134
135 x1v.v = _mm_load_pd( ( double* )chi1 );
136 x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );
137
138 w1v.v = _mm_load_pd( ( double* )omega1 );
139 w2v.v = _mm_load_pd( ( double* )(omega1 + 2) );
140
141 rhov.v += a1v.v * x1v.v;
142 rhov.v += a2v.v * x2v.v;
143
144 w1v.v += k1v.v * a1v.v;
145 w2v.v += k1v.v * a2v.v;
146
147 _mm_store_pd( ( double* )alpha1, a1v.v );
148 _mm_store_pd( ( double* )(alpha1 + 2), a2v.v );
149
150 _mm_store_pd( ( double* )omega1, w1v.v );
151 _mm_store_pd( ( double* )(omega1 + 2), w2v.v );
152
153
154 upsilon1 += 4;
155 zeta1 += 4;
156 alpha1 += 4;
157 chi1 += 4;
158 omega1 += 4;
159 }
160
161 rho_c += rhov.d[0] + rhov.d[1];
162
163 if ( n_left > 0 )
164 {
165 double beta_c = *beta;
166 double gamma_c = *gamma;
167 double kappa_c = *kappa;
168
169 for ( i = 0; i < n_left; ++i )
170 {
171 double upsilon1_c = *upsilon1;
172 double zeta1_c = *zeta1;
173 double alpha1_c = *alpha1;
174 double chi1_c = *chi1;
175 double omega1_c = *omega1;
176
177 alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
178 rho_c += alpha1_c * chi1_c;
179 omega1_c += kappa_c * alpha1_c;
180
181 *alpha1 = alpha1_c;
182 *omega1 = omega1_c;
183
184 upsilon1 += inc_u;
185 zeta1 += inc_z;
186 alpha1 += inc_a;
187 chi1 += inc_x;
188 omega1 += inc_w;
189 }
190 }
191
192 *rho = rho_c;
193}
double alpha1_c
Definition bl1_axpyv2b.c:144
int n_left
Definition bl1_axpyv2bdotaxpy.c:209
double *restrict chi1
Definition bl1_axpyv2bdotaxpy.c:199
upsilon1
Definition bl1_axpyv2bdotaxpy.c:225
double beta_c
Definition bl1_axpyv2bdotaxpy.c:201
double rho_c
Definition bl1_axpyv2bdotaxpy.c:204
double *restrict alpha1
Definition bl1_axpyv2bdotaxpy.c:198
double kappa_c
Definition bl1_axpyv2bdotaxpy.c:203
* rho
Definition bl1_axpyv2bdotaxpy.c:322
double *restrict zeta1
Definition bl1_axpyv2bdotaxpy.c:197
int i
Definition bl1_axpyv2bdotaxpy.c:205
int n_pre
Definition bl1_axpyv2bdotaxpy.c:207
double gamma_c
Definition bl1_axpyv2bdotaxpy.c:202
double *restrict omega1
Definition bl1_axpyv2bdotaxpy.c:200
int n_run
Definition bl1_axpyv2bdotaxpy.c:208
Definition blis_type_defs.h:117

References alpha1, alpha1_c, beta_c, bl1_abort(), chi1, v2df_t::d, gamma_c, i, kappa_c, n_left, n_pre, n_run, omega1, rho, rho_c, upsilon1, v2df_t::v, and zeta1.

Referenced by FLA_Fused_Gerc2_Ahx_Ax_opd_var1(), and FLA_Fused_Her2_Ax_l_opd_var1().

◆ bl1_saxpyv2bdotaxpy()

void bl1_saxpyv2bdotaxpy ( int  n,
float * beta,
float * u,
int  inc_u,
float * gamma,
float * z,
int  inc_z,
float * a,
int  inc_a,
float * x,
int  inc_x,
float * kappa,
float * rho,
float * w,
int  inc_w 
)
31{
32 bl1_abort();
33}

References bl1_abort().

◆ bl1_zaxpyv2bdotaxpy()

void bl1_zaxpyv2bdotaxpy ( int  n,
dcomplex * beta,
dcomplex * u,
int  inc_u,
dcomplex * gamma,
dcomplex * z,
int  inc_z,
dcomplex * a,
int  inc_a,
dcomplex * x,
int  inc_x,
dcomplex * kappa,
dcomplex * rho,
dcomplex * w,
int  inc_w 
)
353{
359 int i;
360
361 //v2df_t beta1v, beta1rv;
362 //v2df_t gamma1v, gamma1rv;
363 //v2df_t kappa1v, kappa1rv;
365 //v2df_t u11v, u12v;
366 //v2df_t z11v, z12v;
368 v2df_t x1v, x1rv;
369 v2df_t w1v;
372
373 v2df_t a1v, a1rv;
374 v2df_t u1v, u1rv;
375 v2df_t z1v, z1rv;
378
379 upsilon1 = u;
380 zeta1 = z;
381 alpha1 = a;
382 chi1 = x;
383 omega1 = w;
384
385 if ( inc_u != 1 ||
386 inc_z != 1 ||
387 inc_a != 1 ||
388 inc_x != 1 ||
389 inc_w != 1 ) bl1_abort();
390
391
392 beta11v.v = _mm_loaddup_pd( ( double* )&(beta->real) );
393 beta12v.v = _mm_loaddup_pd( ( double* )&(beta->imag) );
394 gamma11v.v = _mm_loaddup_pd( ( double* )&(gamma->real) );
395 gamma12v.v = _mm_loaddup_pd( ( double* )&(gamma->imag) );
396 kappa11v.v = _mm_loaddup_pd( ( double* )&(kappa->real) );
397 kappa12v.v = _mm_loaddup_pd( ( double* )&(kappa->imag) );
398
399 rho1v.v = _mm_setzero_pd();
400
401 for ( i = 0; i < n; ++i )
402 {
403 //alpha_c = *alpha1;
404 a1v.v = _mm_load_pd( ( double* )alpha1 );
405
406 //alpha1_c.real += beta_c.real * upsilon1_c.real - beta_c.imag * upsilon1_c.imag;
407 //alpha1_c.imag += beta_c.real * upsilon1_c.imag + beta_c.imag * upsilon1_c.real;
408 u1v.v = _mm_load_pd( ( double* )upsilon1 );
409 u1rv.v = _mm_shuffle_pd( u1v.v, u1v.v, _MM_SHUFFLE2 (0,1) );
410 acbc.v = beta11v.v * u1v.v;
411 bdad.v = beta12v.v * u1rv.v;
412 a1v.v += _mm_addsub_pd( acbc.v, bdad.v );
413
414 //alpha1_c.real += gamma_c.real * zeta1_c.real - gamma_c.imag * zeta1_c.imag;
415 //alpha1_c.imag += gamma_c.real * zeta1_c.imag + gamma_c.imag * zeta1_c.real;
416 z1v.v = _mm_load_pd( ( double* )zeta1 );
417 z1rv.v = _mm_shuffle_pd( z1v.v, z1v.v, _MM_SHUFFLE2 (0,1) );
418 acbc.v = gamma11v.v * z1v.v;
419 bdad.v = gamma12v.v * z1rv.v;
420 a1v.v += _mm_addsub_pd( acbc.v, bdad.v );
421
422 //*alpha1 = alpha1_c;
423 _mm_store_pd( ( double* )alpha1, a1v.v );
424
425 //rho_c.real += alpha1_c.real * chi1_c.real - -alpha1_c.imag * chi1_c.imag;
426 //rho_c.imag += alpha1_c.real * chi1_c.imag + -alpha1_c.imag * chi1_c.real;
427 x1v.v = _mm_load_pd( ( double* )chi1 );
428 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
429 a11v.v = a1v.v;
430 a12v.v = _mm_shuffle_pd( a11v.v, a11v.v, _MM_SHUFFLE2 (1,1) );
431 a11v.v = _mm_shuffle_pd( a11v.v, a11v.v, _MM_SHUFFLE2 (0,0) );
432 adac.v = a11v.v * x1rv.v;
433 bcbd.v = a12v.v * x1v.v;
434 rho1v.v = rho1v.v + _mm_addsub_pd( adac.v, bcbd.v );
435
436 //omega_c = *omega1;
437 w1v.v = _mm_load_pd( ( double* )omega1 );
438
439 //omega1_c.real += kappa_c.real * alpha1_c.real - kappa_c.imag * alpha1_c.imag;
440 //omega1_c.imag += kappa_c.real * alpha1_c.imag + kappa_c.imag * alpha1_c.real;
441 a1rv.v = _mm_shuffle_pd( a1v.v, a1v.v, _MM_SHUFFLE2 (0,1) );
442 acbc.v = kappa11v.v * a1v.v;
443 bdad.v = kappa12v.v * a1rv.v;
444 w1v.v += _mm_addsub_pd( acbc.v, bdad.v );
445
446 // *omega1 = omega1_c;
447 _mm_store_pd( ( double* )omega1, w1v.v );
448
449
450 upsilon1 += 1;
451 zeta1 += 1;
452 alpha1 += 1;
453 chi1 += 1;
454 omega1 += 1;
455 }
456
457 rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
458
459 //rho->real = rho_c.real;
460 //rho->imag = rho_c.imag;
461 _mm_store_pd( ( double* )rho, rho1v.v );
462}
Definition blis_type_defs.h:138

References alpha1, bl1_abort(), chi1, i, dcomplex::imag, omega1, dcomplex::real, rho, upsilon1, v2df_t::v, and zeta1.

◆ for()

for ( i = 0; i < n_run; ++i )
260 {
261 double upsilon1_c = *upsilon1;
262 double upsilon2_c = *(upsilon1 + 1);
263 double zeta1_c = *zeta1;
264 double zeta2_c = *(zeta1 + 1);
265 double alpha1_c = *alpha1;
266 double alpha2_c = *(alpha1 + 1);
267 double chi1_c = *chi1;
268 double chi2_c = *(chi1 + 1);
269 double omega1_c = *omega1;
270 double omega2_c = *(omega1 + 1);
271
272 // alpha1 += beta * upsilon1 + gamma * zeta1;
275
276 // rho += conj(alpha1) * chi1 +
277 // conj(alpha2) * chi2;
279
280 // omega1 += kappa * alpha1;
283
284 *alpha1 = alpha1_c;
285 *(alpha1 + 1) = alpha2_c;
286 *omega1 = omega1_c;
287 *(omega1 + 1) = omega2_c;
288
289 upsilon1 += 2*inc_u;
290 zeta1 += 2*inc_z;
291 alpha1 += 2*inc_a;
292 chi1 += 2*inc_x;
293 omega1 += 2*inc_w;
294 }
double alpha2_c
Definition bl1_axpyv2b.c:145

References alpha1, alpha1_c, alpha2_c, beta_c, chi1, gamma_c, kappa_c, omega1, rho_c, upsilon1, and zeta1.

◆ if() [1/2]

if ( n_left > 0 )
297 {
298
299 for ( i = 0; i < n_left; ++i )
300 {
301 double upsilon1_c = *upsilon1;
302 double zeta1_c = *zeta1;
303 double alpha1_c = *alpha1;
304 double chi1_c = *chi1;
305 double omega1_c = *omega1;
306
307 alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
308 rho_c += alpha1_c * chi1_c;
309 omega1_c += kappa_c * alpha1_c;
310
311 *alpha1 = alpha1_c;
312 *omega1 = omega1_c;
313
314 upsilon1 += inc_u;
315 zeta1 += inc_z;
316 alpha1 += inc_a;
317 chi1 += inc_x;
318 omega1 += inc_w;
319 }
320 }

References alpha1, alpha1_c, beta_c, chi1, gamma_c, i, kappa_c, n_left, omega1, rho_c, upsilon1, and zeta1.

◆ if() [2/2]

if ( n_pre == 1 )
238 {
239 double upsilon1_c = *upsilon1;
240 double zeta1_c = *zeta1;
241 double alpha1_c = *alpha1;
242 double chi1_c = *chi1;
243 double omega1_c = *omega1;
244
245 alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
246 rho_c += alpha1_c * chi1_c;
247 omega1_c += kappa_c * alpha1_c;
248
249 *alpha1 = alpha1_c;
250 *omega1 = omega1_c;
251
252 upsilon1 += inc_u;
253 zeta1 += inc_z;
254 alpha1 += inc_a;
255 chi1 += inc_x;
256 omega1 += inc_w;
257 }

References alpha1, alpha1_c, beta_c, chi1, gamma_c, kappa_c, omega1, rho_c, upsilon1, and zeta1.

Variable Documentation

◆ alpha1

dcomplex *restrict alpha1 = a

◆ beta_c

beta_c = *beta

Referenced by bl1_daxpyv2bdotaxpy(), for(), if(), and if().

◆ chi1

chi1 = x

◆ gamma_c

dcomplex gamma_c = *gamma

Referenced by bl1_daxpyv2bdotaxpy(), for(), if(), and if().

◆ i

int i

◆ imag

rho2_c imag = 0.0

◆ kappa_c

dcomplex kappa_c = *kappa

Referenced by bl1_daxpyv2bdotaxpy(), for(), if(), and if().

◆ n_left

n_left = ( n - n_pre ) % 2

Referenced by bl1_daxpyv2bdotaxpy(), and if().

◆ n_pre

n_pre = 0

Referenced by bl1_daxpyv2bdotaxpy().

◆ n_run

n_run = ( n - n_pre ) / 2

Referenced by bl1_daxpyv2bdotaxpy().

◆ omega1

dcomplex *restrict omega1 = w

◆ real

rho real = 0.0

◆ rho

* rho = rho_c

◆ rho_c

dcomplex rho_c = 0.0

Referenced by bl1_daxpyv2bdotaxpy(), for(), if(), and if().

◆ upsilon1

dcomplex *restrict upsilon1 = u

◆ zeta1

zeta1
Initial value:
{

Referenced by bl1_daxpyv2bdotaxpy(), bl1_zaxpyv2bdotaxpy(), for(), if(), and if().