libflame revision_anchor
Functions | Variables
bl1_dotv2axpyv2b.c File Reference

(r)

Functions

void bl1_sdotv2axpyv2b (int n, float *a1, int inc_a1, float *a2, int inc_a2, float *x, int inc_x, float *kappa1, float *kappa2, float *rho1, float *rho2, float *w, int inc_w)
 
void bl1_ddotv2axpyv2b (int n, double *a1, int inc_a1, double *a2, int inc_a2, double *x, int inc_x, double *kappa1, double *kappa2, double *rho1, double *rho2, double *w, int inc_w)
 
 if (inc_a1 !=1||inc_a2 !=1||inc_x !=1||inc_w !=1)
 
 for (i=0;i< n_run;++i)
 
 if (n_left > 0)
 
void bl1_cdotv2axpyv2b (int n, scomplex *a1, int inc_a1, scomplex *a2, int inc_a2, scomplex *x, int inc_x, scomplex *kappa1, scomplex *kappa2, scomplex *rho1, scomplex *rho2, scomplex *w, int inc_w)
 
void bl1_zdotv2axpyv2b (int n, dcomplex *a1, int inc_a1, dcomplex *a2, int inc_a2, dcomplex *x, int inc_x, dcomplex *kappa1, dcomplex *kappa2, dcomplex *rho1, dcomplex *rho2, dcomplex *w, int inc_w)
 

Variables

double *restrict alpha2
 
double *restrict chi1 = x
 
double *restrict omega1 = w
 
double kappa1_c = *kappa1
 
double kappa2_c = *kappa2
 
double rho1_c
 
double rho2_c
 
int i
 
int n_pre
 
int n_run
 
int n_left
 
rho1 = rho1_c
 
rho2 = rho2_c
 
 alpha1 = a1
 
rho1_c real = 0.0
 
rho1_c imag = 0.0
 

Function Documentation

◆ bl1_cdotv2axpyv2b()

void bl1_cdotv2axpyv2b ( int  n,
scomplex * a1,
int  inc_a1,
scomplex * a2,
int  inc_a2,
scomplex * x,
int  inc_x,
scomplex * kappa1,
scomplex * kappa2,
scomplex * rho1,
scomplex * rho2,
scomplex * w,
int  inc_w 
)
326{
327 bl1_abort();
328}
void bl1_abort(void)
Definition bl1_abort.c:13

References bl1_abort().

◆ bl1_ddotv2axpyv2b()

void bl1_ddotv2axpyv2b ( int  n,
double * a1,
int  inc_a1,
double * a2,
int  inc_a2,
double * x,
int  inc_x,
double * kappa1,
double * kappa2,
double * rho1,
double * rho2,
double * w,
int  inc_w 
)
46{
47 double* restrict alpha1;
48 double* restrict alpha2;
49 double* restrict chi1;
50 double* restrict omega1;
51 double rho1_c;
52 double rho2_c;
53 int i;
54
55 int n_pre;
56 int n_run;
57 int n_left;
58
63
64 if ( inc_a1 != 1 ||
65 inc_a2 != 1 ||
66 inc_x != 1 ||
67 inc_w != 1 ) bl1_abort();
68
69 n_pre = 0;
70 if ( ( unsigned long ) a1 % 16 != 0 )
71 {
72 if ( ( unsigned long ) a2 % 16 == 0 ||
73 ( unsigned long ) x % 16 == 0 ||
74 ( unsigned long ) w % 16 == 0 ) bl1_abort();
75
76 n_pre = 1;
77 }
78
79 n_run = ( n - n_pre ) / 4;
80 n_left = ( n - n_pre ) % 4;
81
82 alpha1 = a1;
83 alpha2 = a2;
84 chi1 = x;
85 omega1 = w;
86
87 rho1_c = 0.0;
88 rho2_c = 0.0;
89
90 if ( n_pre == 1 )
91 {
92 double kappa1_c = *kappa1;
93 double kappa2_c = *kappa2;
94 double alpha1_c = *alpha1;
95 double alpha2_c = *alpha2;
96 double chi1_c = *chi1;
97 double omega1_c = *omega1;
98
101
104
105 *omega1 = omega1_c;
106
107 alpha1 += inc_a1;
108 alpha2 += inc_a2;
109 chi1 += inc_x;
110 omega1 += inc_w;
111 }
112
113 rho1v.v = _mm_setzero_pd();
114 rho2v.v = _mm_setzero_pd();
115
116 k1v.v = _mm_loaddup_pd( ( double* )kappa1 );
117 k2v.v = _mm_loaddup_pd( ( double* )kappa2 );
118
119 for ( i = 0; i < n_run; ++i )
120 {
121 a11v.v = _mm_load_pd( ( double* )alpha1 );
122 a12v.v = _mm_load_pd( ( double* )alpha2 );
123 x1v.v = _mm_load_pd( ( double* )chi1 );
124 w1v.v = _mm_load_pd( ( double* )omega1 );
125
126 rho1v.v += a11v.v * x1v.v;
127 w1v.v += k1v.v * a11v.v;
128
129 rho2v.v += a12v.v * x1v.v;
130 w1v.v += k2v.v * a12v.v;
131
132 _mm_store_pd( ( double* )omega1, w1v.v );
133
134 a21v.v = _mm_load_pd( ( double* )(alpha1 + 2) );
135 a22v.v = _mm_load_pd( ( double* )(alpha2 + 2) );
136 x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );
137 w2v.v = _mm_load_pd( ( double* )(omega1 + 2) );
138
139 rho1v.v += a21v.v * x2v.v;
140 w2v.v += k1v.v * a21v.v;
141
142 rho2v.v += a22v.v * x2v.v;
143 w2v.v += k2v.v * a22v.v;
144
145 _mm_store_pd( ( double* )(omega1 + 2), w2v.v );
146
147 alpha1 += 4;
148 alpha2 += 4;
149 chi1 += 4;
150 omega1 += 4;
151 }
152
153 if ( n_left > 0 )
154 {
155 for ( i = 0; i < n_left; ++i )
156 {
157 double kappa1_c = *kappa1;
158 double kappa2_c = *kappa2;
159 double alpha1_c = *alpha1;
160 double alpha2_c = *alpha2;
161 double chi1_c = *chi1;
162 double omega1_c = *omega1;
163
166
169
170 *omega1 = omega1_c;
171
172 alpha1 += inc_a1;
173 alpha2 += inc_a2;
174 chi1 += inc_x;
175 omega1 += inc_w;
176 }
177 }
178
179 rho1_c += rho1v.d[0] + rho1v.d[1];
180 rho2_c += rho2v.d[0] + rho2v.d[1];
181
182 *rho1 = rho1_c;
183 *rho2 = rho2_c;
184}
double alpha1_c
Definition bl1_axpyv2b.c:144
double alpha2_c
Definition bl1_axpyv2b.c:145
double *restrict omega1
Definition bl1_dotv2axpyv2b.c:190
double rho2_c
Definition bl1_dotv2axpyv2b.c:194
double *restrict chi1
Definition bl1_dotv2axpyv2b.c:189
double kappa1_c
Definition bl1_dotv2axpyv2b.c:191
alpha1
Definition bl1_dotv2axpyv2b.c:456
* rho2
Definition bl1_dotv2axpyv2b.c:312
int n_left
Definition bl1_dotv2axpyv2b.c:199
double rho1_c
Definition bl1_dotv2axpyv2b.c:193
int n_pre
Definition bl1_dotv2axpyv2b.c:197
* rho1
Definition bl1_dotv2axpyv2b.c:311
int n_run
Definition bl1_dotv2axpyv2b.c:198
double kappa2_c
Definition bl1_dotv2axpyv2b.c:192
int i
Definition bl1_dotv2axpyv2b.c:195
double *restrict alpha2
Definition bl1_dotv2axpyv2b.c:188
Definition blis_type_defs.h:117

References alpha1, alpha1_c, alpha2, alpha2_c, bl1_abort(), chi1, v2df_t::d, i, kappa1_c, kappa2_c, n_left, n_pre, n_run, omega1, rho1, rho1_c, rho2, rho2_c, and v2df_t::v.

Referenced by FLA_Fused_Ahx_Ax_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().

◆ bl1_sdotv2axpyv2b()

void bl1_sdotv2axpyv2b ( int  n,
float * a1,
int  inc_a1,
float * a2,
int  inc_a2,
float * x,
int  inc_x,
float * kappa1,
float * kappa2,
float * rho1,
float * rho2,
float * w,
int  inc_w 
)
31{
32 bl1_abort();
33}

References bl1_abort().

◆ bl1_zdotv2axpyv2b()

void bl1_zdotv2axpyv2b ( int  n,
dcomplex * a1,
int  inc_a1,
dcomplex * a2,
int  inc_a2,
dcomplex * x,
int  inc_x,
dcomplex * kappa1,
dcomplex * kappa2,
dcomplex * rho1,
dcomplex * rho2,
dcomplex * w,
int  inc_w 
)
341{
346 int i;
347
354 v2df_t x1v, x1rv;
355 v2df_t w1v;
358
359 if ( inc_a1 != 1 ||
360 inc_a2 != 1 ||
361 inc_x != 1 ||
362 inc_w != 1 ) bl1_abort();
363
364 alpha1 = a1;
365 alpha2 = a2;
366 chi1 = x;
367 omega1 = w;
368
369 rho1v.v = _mm_setzero_pd();
370 rho2v.v = _mm_setzero_pd();
371
372 kappa1v.v = _mm_load_pd( ( double* )kappa1 );
374 kappa2v.v = _mm_load_pd( ( double* )kappa2 );
376
377 for ( i = 0; i < n; ++i )
378 {
379 //dcomplex omega1_c = *omega1;
380 w1v.v = _mm_load_pd( ( double* )omega1 );
381
382 //dcomplex chi1_c = *chi1;
383 x1v.v = _mm_load_pd( ( double* )chi1 );
384
385
386 //dcomplex alpha1_c = *alpha1;
387 a11v.v = _mm_loaddup_pd( ( double* )&(alpha1->real) );
388 a12v.v = _mm_loaddup_pd( ( double* )&(alpha1->imag) );
389
390 //rho1_c.real += alpha1_c.real * chi1_c.real - -alpha1_c.imag * chi1_c.imag;
391 //rho1_c.imag += alpha1_c.real * chi1_c.imag + -alpha1_c.imag * chi1_c.real;
392 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
393 adac.v = a11v.v * x1rv.v;
394 bcbd.v = a12v.v * x1v.v;
395 rho1v.v = rho1v.v + _mm_addsub_pd( adac.v, bcbd.v );
396
397 //omega1_c.real += kappa1_c.real * alpha1_c.real - kappa1_c.imag * alpha1_c.imag;
398 //omega1_c.imag += kappa1_c.real * alpha1_c.imag + kappa1_c.imag * alpha1_c.real;
399 acbc.v = kappa1v.v * a11v.v;
400 bdad.v = kappa1rv.v * a12v.v;
401 w1v.v += _mm_addsub_pd( acbc.v, bdad.v );
402
403
404 //dcomplex alpha2_c = *alpha2;
405 a21v.v = _mm_loaddup_pd( ( double* )&(alpha2->real) );
406 a22v.v = _mm_loaddup_pd( ( double* )&(alpha2->imag) );
407
408 //rho2_c.real += alpha2_c.real * chi1_c.real - -alpha2_c.imag * chi1_c.imag;
409 //rho2_c.imag += alpha2_c.real * chi1_c.imag + -alpha2_c.imag * chi1_c.real;
410 x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
411 adac.v = a21v.v * x1rv.v;
412 bcbd.v = a22v.v * x1v.v;
413 rho2v.v = rho2v.v + _mm_addsub_pd( adac.v, bcbd.v );
414
415 //omega1_c.real += kappa2_c.real * alpha2_c.real - kappa2_c.imag * alpha2_c.imag;
416 //omega1_c.imag += kappa2_c.real * alpha2_c.imag + kappa2_c.imag * alpha2_c.real;
417 acbc.v = kappa2v.v * a21v.v;
418 bdad.v = kappa2rv.v * a22v.v;
419 w1v.v += _mm_addsub_pd( acbc.v, bdad.v );
420
421
422 //*omega1 = omega1_c;
423 _mm_store_pd( ( double* )omega1, w1v.v );
424
425
426 //alpha1 += inc_a1;
427 //alpha2 += inc_a2;
428 //chi1 += inc_x;
429 //omega1 += inc_w;
430 alpha1 += 1;
431 alpha2 += 1;
432 chi1 += 1;
433 omega1 += 1;
434 }
435
436 rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
437 rho2v.v = _mm_shuffle_pd( rho2v.v, rho2v.v, _MM_SHUFFLE2 (0,1) );
438
439 //*rho1 = rho1_c;
440 //*rho2 = rho2_c;
441 _mm_store_pd( ( double* )rho1, rho1v.v );
442 _mm_store_pd( ( double* )rho2, rho2v.v );
443}
Definition blis_type_defs.h:138

References alpha1, alpha2, bl1_abort(), chi1, i, omega1, rho1, rho2, and v2df_t::v.

Referenced by FLA_Fused_Ahx_Ax_opz_var1().

◆ for()

for ( i = 0; i < n_run; ++i )
252 {
253 double alpha11_c = *alpha1;
254 double alpha21_c = *(alpha1 + 1);
255 double alpha12_c = *alpha2;
256 double alpha22_c = *(alpha2 + 1);
257 double chi1_c = *chi1;
258 double chi2_c = *(chi1 + 1);
259 double omega1_c = *omega1;
260 double omega2_c = *(omega1 + 1);
261
262 // rho1 += conj(alpha1) * chi1;
265
266 // omega1 += kappa1 * alpha1;
269
270 // rho2 += conj(alpha2) * chi1;
273
274 // omega1 += kappa2 * alpha2;
277
278 *omega1 = omega1_c;
279 *(omega1 + 1) = omega2_c;
280
281 alpha1 += 2*inc_a1;
282 alpha2 += 2*inc_a2;
283 chi1 += 2*inc_x;
284 omega1 += 2*inc_w;
285 }

References alpha1, alpha2, chi1, kappa1_c, kappa2_c, omega1, rho1_c, and rho2_c.

◆ if() [1/2]

if ( inc_a1 != 1 || inc_a2 != 1 || inc_x != 1 || inc_w != 1 )
231 {
232 double alpha1_c = *alpha1;
233 double alpha2_c = *alpha2;
234 double chi1_c = *chi1;
235 double omega1_c = *omega1;
236
239
242
243 *omega1 = omega1_c;
244
245 alpha1 += inc_a1;
246 alpha2 += inc_a2;
247 chi1 += inc_x;
248 omega1 += inc_w;
249 }

◆ if() [2/2]

if ( n_left > 0 )
288 {
289 for ( i = 0; i < n_left; ++i )
290 {
291 double alpha1_c = *alpha1;
292 double alpha2_c = *alpha2;
293 double chi1_c = *chi1;
294 double omega1_c = *omega1;
295
298
301
302 *omega1 = omega1_c;
303
304 alpha1 += inc_a1;
305 alpha2 += inc_a2;
306 chi1 += inc_x;
307 omega1 += inc_w;
308 }
309 }

References alpha1, alpha1_c, alpha2, alpha2_c, chi1, i, kappa1_c, kappa2_c, n_left, omega1, rho1_c, and rho2_c.

Variable Documentation

◆ alpha1

alpha1 = a1

◆ alpha2

dcomplex *restrict alpha2

◆ chi1

chi1 = x

◆ i

int i

◆ imag

rho2_c imag = 0.0

◆ kappa1_c

dcomplex kappa1_c = *kappa1

Referenced by bl1_ddotv2axpyv2b(), for(), and if().

◆ kappa2_c

dcomplex kappa2_c = *kappa2

Referenced by bl1_ddotv2axpyv2b(), for(), and if().

◆ n_left

int n_left

Referenced by bl1_ddotv2axpyv2b(), and if().

◆ n_pre

int n_pre

Referenced by bl1_ddotv2axpyv2b().

◆ n_run

int n_run

Referenced by bl1_ddotv2axpyv2b().

◆ omega1

omega1 = w

◆ real

rho2_c real = 0.0

◆ rho1

* rho1 = rho1_c

◆ rho1_c

dcomplex rho1_c

Referenced by bl1_ddotv2axpyv2b(), for(), and if().

◆ rho2

* rho2 = rho2_c

◆ rho2_c

dcomplex rho2_c

Referenced by bl1_ddotv2axpyv2b(), for(), and if().